From 64235468ce4e6081bdc5f3d45df5157a7b50cf37 Mon Sep 17 00:00:00 2001 From: Christian Holm Date: Sat, 9 Mar 2013 13:03:55 +0100 Subject: [PATCH 001/479] Handle invalid literals for start attribute --- html2text.py | 9 ++++++--- test/invalid_start.html | 8 ++++++++ test/invalid_start.md | 3 +++ 3 files changed, 17 insertions(+), 3 deletions(-) create mode 100644 test/invalid_start.html create mode 100644 test/invalid_start.md diff --git a/html2text.py b/html2text.py index 17528901..0d32f9c8 100755 --- a/html2text.py +++ b/html2text.py @@ -174,9 +174,12 @@ def google_fixed_width_font(style): def list_numbering_start(attrs): """extract numbering from list element attributes""" if 'start' in attrs: - return int(attrs['start']) - 1 - else: - return 0 + try: + return int(attrs['start']) - 1 + except ValueError: + pass + + return 0 class HTML2Text(HTMLParser.HTMLParser): def __init__(self, out=None, baseurl=''): diff --git a/test/invalid_start.html b/test/invalid_start.html new file mode 100644 index 00000000..0e49fe53 --- /dev/null +++ b/test/invalid_start.html @@ -0,0 +1,8 @@ + + +
    +
  1. The ol has an invalid start
  2. +
  3. This should just be ignored
  4. +
+ + \ No newline at end of file diff --git a/test/invalid_start.md b/test/invalid_start.md new file mode 100644 index 00000000..bed039c4 --- /dev/null +++ b/test/invalid_start.md @@ -0,0 +1,3 @@ + 1. The ol has an invalid start + 2. This should just be ignored + From 04858919624170f56b8ec473aebe74f77dbf924c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= Date: Sun, 17 Feb 2013 02:09:42 +0100 Subject: [PATCH 002/479] Make html2text.py compatible with python 3.* MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Don't expect u'' strings to work. * Fix nbsp_unicode test. * Also remove workaround for python < 2.3 ... we really don’t need it anymore. * Fix
 in list tests.
  Replace onlywhite function with RE.

  I believe that Aaron by mistake put his contribution to the
  Obfuscated Python Code Contest into html2text.py.

  For reasons I have no chance of deciphering, I have to use an RE.
  The more obvious replacement with para.isspace() doesn't work.

Signed-off-by: Matěj Cepl 
---
 html2text.py      | 85 +++++++++++++++++++++++++++++------------------
 test/run_tests.py |  8 ++---
 2 files changed, 57 insertions(+), 36 deletions(-)

diff --git a/html2text.py b/html2text.py
index 17528901..1f099c71 100755
--- a/html2text.py
+++ b/html2text.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 """html2text: Turn HTML into equivalent Markdown-structured text."""
+from __future__ import division
 __version__ = "3.200.3"
 __author__ = "Aaron Swartz (me@aaronsw.com)"
 __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
@@ -62,15 +63,13 @@ def has_key(x, y):
 IGNORE_EMPHASIS = False
 
 ### Entity Nonsense ###
+# For checking space-only lines on line 771
+SPACE_RE = re.compile(r'\s\+')
 
 def name2cp(k):
     if k == 'apos': return ord("'")
-    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
-        return htmlentitydefs.name2codepoint[k]
-    else:
-        k = htmlentitydefs.entitydefs[k]
-        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
-        return ord(codecs.latin_1_decode(k)[0])
+    return htmlentitydefs.name2codepoint[k]
+
 
 unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
 'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
@@ -88,14 +87,6 @@ def name2cp(k):
     unifiable_n[name2cp(k)] = unifiable[k]
 
 ### End Entity Nonsense ###
-
-def onlywhite(line):
-    """Return true if the line does only consist of whitespace characters."""
-    for c in line:
-        if c is not ' ' and c is not '  ':
-            return c is ' '
-    return line
-
 def hn(tag):
     if tag[0] == 'h' and len(tag) == 2:
         try:
@@ -105,7 +96,10 @@ def hn(tag):
 
 def dumb_property_dict(style):
     """returns a hash of css attributes"""
-    return dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z]]);
+    out = dict([(x.strip(), y.strip()) for x, y in
+        [z.split(':', 1) for z in
+            style.split(';') if ':' in z]]);
+    return out
 
 def dumb_css_parser(data):
     """returns a hash of css selectors, each of which contains a hash of css attributes"""
@@ -265,10 +259,19 @@ def close(self):
 
         self.outtext = self.outtext.join(self.outtextlist)
         if self.unicode_snob:
-            nbsp = unichr(name2cp('nbsp'))
+            try:
+                nbsp = unichr(name2cp('nbsp'))
+            except NameError:
+                nbsp = chr(name2cp('nbsp'))
         else:
-            nbsp = u' '
-        self.outtext = self.outtext.replace(u' _place_holder;', nbsp)
+            try:
+                nbsp = unichr(32)
+            except NameError:
+                nbsp = chr(32)
+        try:
+            self.outtext = self.outtext.replace(unicode(' _place_holder;'), nbsp)
+        except NameError:
+            self.outtext = self.outtext.replace(' _place_holder;', nbsp)
 
         return self.outtext
 
@@ -376,6 +379,7 @@ def handle_emphasis(self, start, tag_style, parent_style):
 
     def handle_tag(self, tag, attrs, start):
         #attrs = fixattrs(attrs)
+        # attrs is None for endtags
         if attrs is None:
             attrs = {}
         else:
@@ -578,7 +582,8 @@ def o(self, data, puredata=0, force=0):
 
         if not self.quiet:
             if self.google_doc:
-                # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
+                # prevent white space immediately after 'begin emphasis'
+                # marks ('**' and '_')
                 lstripped_data = data.lstrip()
                 if self.drop_white_space and not (self.pre or self.code):
                     data = lstripped_data
@@ -586,7 +591,10 @@ def o(self, data, puredata=0, force=0):
                     self.drop_white_space = 0
 
             if puredata and not self.pre:
-                data = re.sub('\s+', ' ', data)
+                # This is a very dangerous call ... it could mess up
+                # all handling of   when not handled properly
+                # (see entityref)
+                data = re.sub(r'\s+', r' ', data)
                 if data and data[0] == ' ':
                     self.space = 1
                     data = data[1:]
@@ -598,13 +606,14 @@ def o(self, data, puredata=0, force=0):
                     data = "\n" + data
 
             bq = (">" * self.blockquote)
-            if not (force and data and data[0] == ">") and self.blockquote: bq += " "
+            if not (force and data and data[0] == ">") and self.blockquote:
+                bq += " "
 
             if self.pre:
                 if not self.list:
                     bq += "    "
                 #else: list content is already partially indented
-                for i in xrange(len(self.list)):
+                for i in range(len(self.list)):
                     bq += "    "
                 data = data.replace("\n", "\n"+bq)
 
@@ -633,19 +642,24 @@ def o(self, data, puredata=0, force=0):
                 if not self.lastWasNL: self.out(' ')
                 self.space = 0
 
-            if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
+            if self.a and ((self.p_p == 2 and self.links_each_paragraph)
+                    or force == "end"):
                 if force == "end": self.out("\n")
 
                 newa = []
                 for link in self.a:
                     if self.outcount > link['outcount']:
-                        self.out("   ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
-                        if has_key(link, 'title'): self.out(" ("+link['title']+")")
+                        self.out("   ["+ str(link['count']) +"]: " +
+                            urlparse.urljoin(self.baseurl, link['href']))
+                        if 'title' in link:
+                            self.out(" ("+link['title']+")")
                         self.out("\n")
                     else:
                         newa.append(link)
 
-                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
+                # Don't need an extra line when nothing was done.
+                if self.a != newa:
+                    self.out("\n")
 
                 self.a = newa
 
@@ -699,10 +713,13 @@ def entityref(self, c):
             try: name2cp(c)
             except KeyError: return "&" + c + ';'
             else:
-                try:
-                    return unichr(name2cp(c))
-                except NameError: #Python3
-                    return chr(name2cp(c))
+                if c == 'nbsp':
+                    return unifiable[c]
+                else:
+                    try:
+                        return unichr(name2cp(c))
+                    except NameError: #Python3
+                        return chr(name2cp(c))
 
     def replaceEntities(self, s):
         s = s.group(1)
@@ -718,7 +735,7 @@ def google_nest_count(self, style):
         """calculate the nesting count of google doc lists"""
         nest_count = 0
         if 'margin-left' in style:
-            nest_count = int(style['margin-left'][:-2]) / self.google_list_indent
+            nest_count = int(style['margin-left'][:-2]) // self.google_list_indent
         return nest_count
 
 
@@ -741,7 +758,11 @@ def optwrap(self, text):
                         result += "\n\n"
                         newlines = 2
                 else:
-                    if not onlywhite(para):
+                    # Warning for the tempted!!!
+                    # Be aware that obvious replacement of this with
+                    # line.isspace()
+                    # DOES NOT work! Explanations are welcome.
+                    if not SPACE_RE.match(para):
                         result += para + "\n"
                         newlines = 1
             else:
diff --git a/test/run_tests.py b/test/run_tests.py
index 7ebfd394..6aec3eb6 100644
--- a/test/run_tests.py
+++ b/test/run_tests.py
@@ -20,11 +20,11 @@ def test_module(fn, google_doc=False, **kwargs):
         h.body_width = 0
         h.hide_strikethrough = True
 
-    for k, v in kwargs.iteritems():
+    for k, v in kwargs.items():
         setattr(h, k, v)
 
     result = get_baseline(fn)
-    actual = h.handle(file(fn).read())
+    actual = h.handle(open(fn).read())
     return print_result(fn, 'module', result, actual)
 
 def test_command(fn, *args):
@@ -51,7 +51,7 @@ def test_command(fn, *args):
         actual = re.sub(r'\r+', '\r', actual)
         actual = actual.replace('\r\n', '\n')
 
-    return print_result(fn, 'command', result, actual)
+    return print_result(fn, 'command', result, actual.decode('utf8'))
 
 def print_conditions(mode, *args, **kwargs):
     format = " * %s %s, %s: "
@@ -70,7 +70,7 @@ def print_result(fn, mode, result, actual):
         dump_name = get_dump_name(fn, mode)
 
         f = codecs.open(dump_name, encoding='utf-8', mode='w+')
-        f.write(actual)
+        f.write(actual.decode('utf8'))
 
         print("  Use: diff -u %s %s" % (get_baseline_name(fn), dump_name))
         return False

From 9125b6c5261f5cb5fde5b9d22ee121d2920813bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= 
Date: Fri, 8 Mar 2013 10:44:52 +0100
Subject: [PATCH 003/479] Switch tests to unittest (w/generated test cases)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Specifically:
  * add test generator
  * remove all direct printings to stdout (reporting should be done by
    the unittest)
  * close all open files (unfortunately, we cannot use the with
    statement while keeping python 2.4 compatibility)
  * iteritems -> items ... to be 3.* compatible (and we really don't
    need generators here)
  * fix Travis-CI metadata to work on whole supported range of python
    versions.
  * make installation of dependencies in travis-CI and other testing
    related stuff working independently from python version and both
    with unittest (for python >= 2.7) and unittest2 (for python < 2.7)

Signed-off-by: Matěj Cepl 
---
 .travis.yml            |   6 +-
 README.md              |   5 +-
 html2text.py           |   4 +-
 install_deps.py        |   6 ++
 setup.py               |  37 ++++++++++--
 test/__init__.py       |   0
 test/run_tests.py      | 124 ----------------------------------------
 test/test_html2text.py | 125 +++++++++++++++++++++++++++++++++++++++++
 8 files changed, 174 insertions(+), 133 deletions(-)
 create mode 100644 install_deps.py
 create mode 100644 test/__init__.py
 delete mode 100644 test/run_tests.py
 create mode 100644 test/test_html2text.py

diff --git a/.travis.yml b/.travis.yml
index e077f58d..8b6a6fd3 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,4 +4,8 @@ python:
   - "2.6"
   - "2.7"
   - "pypy"
-script: cd test/; python run_tests.py
+  - "3.2"
+  - "3.3"
+before_script:
+  - "python install_deps.py"
+script: PYTHONPATH=$PYTHONPATH:. python test/test_html2text.py -v
diff --git a/README.md b/README.md
index e9b44605..0dcb5626 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,6 @@ _Originally written by Aaron Swartz. This code is distributed under the GPLv3._
 
 ## How to run unit tests
 
-    cd test/
-    python run_tests.py
+    python test/test_html2text.py -v
 
-[![Build Status](https://secure.travis-ci.org/aaronsw/html2text.png)](http://travis-ci.org/aaronsw/html2text)
+[![Build Status](https://secure.travis-ci.org/mcepl/html2text.png)](http://travis-ci.org/mcepl/html2text)
diff --git a/html2text.py b/html2text.py
index 1f099c71..c42e8e66 100755
--- a/html2text.py
+++ b/html2text.py
@@ -1,10 +1,12 @@
 #!/usr/bin/env python
+# coding: utf-8
 """html2text: Turn HTML into equivalent Markdown-structured text."""
 from __future__ import division
 __version__ = "3.200.3"
 __author__ = "Aaron Swartz (me@aaronsw.com)"
 __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
-__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
+__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes",
+    "Kevin Jay North", "Matěj Cepl"]
 
 # TODO:
 #   Support decoded entities with unifiable.
diff --git a/install_deps.py b/install_deps.py
new file mode 100644
index 00000000..05c04333
--- /dev/null
+++ b/install_deps.py
@@ -0,0 +1,6 @@
+#!/usr/bin/python
+import sys
+import subprocess
+
+if sys.version_info[:2] < (2,7):
+    subprocess.call('pip install unittest2 --use-mirrors', shell=True)
diff --git a/setup.py b/setup.py
index dd3d9bc2..7281e54e 100644
--- a/setup.py
+++ b/setup.py
@@ -1,20 +1,47 @@
+# coding: utf-8
 import sys
-from setuptools import setup, find_packages
+from setuptools import setup, find_packages, Command
+requires_list = []
+try:
+    import unittest2 as unittest
+except ImportError:
+    import unittest
+else:
+    if sys.version_info <= (2 , 6):
+        requires_list.append("unittest2")
+
+class RunTests(Command):
+    """New setup.py command to run all tests for the package.
+    """
+    description = "run all tests for the package"
+
+    user_options = []
+
+    def initialize_options(self):
+        pass
+
+    def finalize_options(self):
+        pass
+
+    def run(self):
+        tests = unittest.TestLoader().discover('.')
+        runner = unittest.TextTestRunner()
+        runner.run(tests)
 
 setup(
    name = "html2text",
-   version = "3.200.3",
+   version = "3.200.3.1",  # Temporarily adding the fourth revision number
    description = "Turn HTML into equivalent Markdown-structured text.",
    author = "Aaron Swartz",
    author_email = "me@aaronsw.com",
-   url='http://www.aaronsw.com/2002/html2text/',
+   url='https://github.com/aaronsw/html2text/',
+   cmdclass={'test': RunTests},
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: GNU General Public License (GPL)',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
-       'Programming Language :: Python :: 2.3',
        'Programming Language :: Python :: 2.4',
        'Programming Language :: Python :: 2.5',
        'Programming Language :: Python :: 2.6',
@@ -23,12 +50,14 @@
        'Programming Language :: Python :: 3.0',
        'Programming Language :: Python :: 3.1',
        'Programming Language :: Python :: 3.2'
+       'Programming Language :: Python :: 3.3'
      ],
     entry_points="""
         [console_scripts]
         html2text=html2text:main
     """,
    license='GNU GPL 3',
+   requires=requires_list,
    packages=find_packages(),
    py_modules=['html2text'],
    include_package_data=True,
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/test/run_tests.py b/test/run_tests.py
deleted file mode 100644
index 6aec3eb6..00000000
--- a/test/run_tests.py
+++ /dev/null
@@ -1,124 +0,0 @@
-import codecs
-import glob
-import os
-import re
-import subprocess
-import sys
-
-sys.path.insert(0, '..')
-import html2text
-
-
-def test_module(fn, google_doc=False, **kwargs):
-    print_conditions('module', google_doc=google_doc, **kwargs)
-
-    h = html2text.HTML2Text()
-
-    if google_doc:
-        h.google_doc = True
-        h.ul_item_mark = '-'
-        h.body_width = 0
-        h.hide_strikethrough = True
-
-    for k, v in kwargs.items():
-        setattr(h, k, v)
-
-    result = get_baseline(fn)
-    actual = h.handle(open(fn).read())
-    return print_result(fn, 'module', result, actual)
-
-def test_command(fn, *args):
-    print_conditions('command', *args)
-    args = list(args)
-
-    cmd = [sys.executable or 'python', '../html2text.py']
-
-    if '--googledoc' in args:
-        args.remove('--googledoc')
-        cmd += ['-g', '-d', '-b', '0', '-s']
-
-    if args:
-        cmd.extend(args)
-
-    cmd += [fn]
-
-    result = get_baseline(fn)
-    actual = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout.read()
-
-    if os.name == 'nt':
-        # Fix the unwanted CR to CRCRLF replacement
-        # during text pipelining on Windows/cygwin
-        actual = re.sub(r'\r+', '\r', actual)
-        actual = actual.replace('\r\n', '\n')
-
-    return print_result(fn, 'command', result, actual.decode('utf8'))
-
-def print_conditions(mode, *args, **kwargs):
-    format = " * %s %s, %s: "
-    sys.stdout.write(format % (mode, args, kwargs))
-
-def print_result(fn, mode, result, actual):
-    if result == actual:
-        print('PASS')
-        return True
-    else:
-        print('FAIL')
-
-        if mode == 'command':
-            print(len(result), len(actual))
-
-        dump_name = get_dump_name(fn, mode)
-
-        f = codecs.open(dump_name, encoding='utf-8', mode='w+')
-        f.write(actual.decode('utf8'))
-
-        print("  Use: diff -u %s %s" % (get_baseline_name(fn), dump_name))
-        return False
-
-def get_dump_name(fn, suffix):
-    return '%s-%s_output.md' % (os.path.splitext(fn)[0], suffix)
-
-def get_baseline_name(fn):
-    return os.path.splitext(fn)[0] + '.md'
-
-def get_baseline(fn):
-    name = get_baseline_name(fn)
-    f = codecs.open(name, mode='r', encoding='utf8')
-    return f.read()
-
-def run_all_tests():
-    html_files = glob.glob("*.html")
-    passing = True
-    for fn in html_files:
-        module_args = {}
-        cmdline_args = []
-
-        if fn.lower().startswith('google'):
-            module_args['google_doc'] = True
-            cmdline_args.append('--googledoc')
-
-        if fn.lower().find('unicode') >= 0:
-            module_args['unicode_snob'] = True
-
-        if fn.lower().find('flip_emphasis') >= 0:
-            module_args['emphasis_mark'] = '*'
-            module_args['strong_mark'] = '__'
-            cmdline_args.append('-e')
-
-        if fn.lower().find('escape_snob') >= 0:
-            module_args['escape_snob'] = True
-            cmdline_args.append('--escape-all')
-
-        print('\n' + fn + ':')
-        passing = passing and test_module(fn, **module_args)
-
-        if not 'unicode_snob' in module_args: # Because there is no command-line option to control unicode_snob
-            passing = passing and test_command(fn, *cmdline_args)
-    if passing:
-        print("ALL TESTS PASSED")
-    else:
-        print("Fail.")
-        sys.exit(1)
-
-if __name__ == "__main__":
-    run_all_tests()
diff --git a/test/test_html2text.py b/test/test_html2text.py
new file mode 100644
index 00000000..8f227bc0
--- /dev/null
+++ b/test/test_html2text.py
@@ -0,0 +1,125 @@
+import codecs
+import glob
+import os
+import re
+import subprocess
+import sys
+if sys.version_info[:2] < (2,7):
+    import unittest2 as unittest
+else:
+    import unittest
+import logging
+logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
+    level=logging.DEBUG)
+
+import html2text
+
+def test_module(fn, google_doc=False, **kwargs):
+    h = html2text.HTML2Text()
+    h.fn = fn
+
+    if google_doc:
+        h.google_doc = True
+        h.ul_item_mark = '-'
+        h.body_width = 0
+        h.hide_strikethrough = True
+
+    for k, v in kwargs.items():
+        setattr(h, k, v)
+
+    result = get_baseline(fn)
+    inf = open(fn)
+    actual = h.handle(inf.read())
+    inf.close()
+    return result, actual
+
+def test_command(fn, *args):
+    args = list(args)
+    cmd_name = os.path.join(os.path.dirname(fn), '..', 'html2text.py')
+    cmd = [sys.executable, cmd_name]
+
+    if '--googledoc' in args:
+        args.remove('--googledoc')
+        cmd += ['-g', '-d', '-b', '0', '-s']
+
+    if args:
+        cmd.extend(args)
+
+    cmd += [fn]
+
+    result = get_baseline(fn)
+    pid = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+    out, _ = pid.communicate()
+    actual = out.decode()
+
+    if os.name == 'nt':
+        # Fix the unwanted CR to CRCRLF replacement
+        # during text pipelining on Windows/cygwin
+        actual = re.sub(r'\r+', '\r', actual)
+        actual = actual.replace('\r\n', '\n')
+
+    return result, actual
+
+def get_dump_name(fn, suffix):
+    return '%s-%s_output.md' % (os.path.splitext(fn)[0], suffix)
+
+def get_baseline_name(fn):
+    return os.path.splitext(fn)[0] + '.md'
+
+def get_baseline(fn):
+    name = get_baseline_name(fn)
+    f = codecs.open(name, mode='r', encoding='utf8')
+    out = f.read()
+    f.close()
+    return out
+
+class TestHTML2Text(unittest.TestCase):
+    pass
+
+def generate_test(fn):
+    def test_mod(self):
+        self.maxDiff = None
+        result, actual = test_module(fn, **module_args)
+        self.assertEqual(result, actual)
+
+    def test_cmd(self):
+        # Because there is no command-line option to control unicode_snob
+        if not 'unicode_snob' in module_args:
+            self.maxDiff = None
+            result, actual = test_command(fn, *cmdline_args)
+            self.assertEqual(result, actual)
+
+    module_args = {}
+    cmdline_args = []
+    base_fn = os.path.basename(fn).lower()
+
+    if base_fn.startswith('google'):
+        module_args['google_doc'] = True
+        cmdline_args.append('--googledoc')
+
+    if base_fn.find('unicode') >= 0:
+        module_args['unicode_snob'] = True
+
+    if base_fn.find('flip_emphasis') >= 0:
+        module_args['emphasis_mark'] = '*'
+        module_args['strong_mark'] = '__'
+        cmdline_args.append('-e')
+
+    if base_fn.find('escape_snob') >= 0:
+        module_args['escape_snob'] = True
+        cmdline_args.append('--escape-all')
+
+    return test_mod, test_cmd
+
+# Originally from http://stackoverflow.com/questions/32899/\
+#    how-to-generate-dynamic-parametrized-unit-tests-in-python
+test_dir_name = os.path.dirname(os.path.realpath(__file__))
+for fn in glob.glob("%s/*.html" % test_dir_name):
+    test_name = 'test_%s' % os.path.splitext(os.path.basename(fn))[0].lower()
+    test_m, test_c = generate_test(fn)
+    setattr(TestHTML2Text, test_name + "_mod", test_m)
+    if test_c:
+        setattr(TestHTML2Text, test_name + "_cmd", test_c)
+
+if __name__ == "__main__":
+    unittest.main()

From 8959f9770b6986022d1d2014c58ba4612641c7af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= 
Date: Fri, 8 Nov 2013 15:53:35 +0100
Subject: [PATCH 004/479] Remove .editorconfig

html2text.py doesn't comply with it anyway (e.g., html2text.py has Unix
EOLs, not MS-DOS ones), and it breaks things for those who actually
have support for editorconfig in their $EDITOR.
---
 .editorconfig | 11 -----------
 1 file changed, 11 deletions(-)
 delete mode 100644 .editorconfig

diff --git a/.editorconfig b/.editorconfig
deleted file mode 100644
index d66cdedd..00000000
--- a/.editorconfig
+++ /dev/null
@@ -1,11 +0,0 @@
-; top-most EditorConfig file
-root = true
-
-; Unix-style newlines
-[*]
-end_of_line = CRLF
-
-; 4 space indentation
-[*.py]
-indent_style = space
-indent_size = 4

From 20e8e5e56783419a8b0dbbb30d479e878bed9d9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= 
Date: Tue, 30 Apr 2013 18:00:56 +0100
Subject: [PATCH 005/479] Remove deprecated function has_key.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Matěj Cepl 
---
 html2text.py | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/html2text.py b/html2text.py
index c42e8e66..9ff5a4d3 100755
--- a/html2text.py
+++ b/html2text.py
@@ -17,9 +17,6 @@
     setattr(__builtins__, 'True', 1)
     setattr(__builtins__, 'False', 0)
 
-def has_key(x, y):
-    if hasattr(x, 'has_key'): return x.has_key(y)
-    else: return y in x
 
 try:
     import htmlentitydefs
@@ -295,16 +292,16 @@ def previousIndex(self, attrs):
 
             If the set of attributes is not found, returns None
         """
-        if not has_key(attrs, 'href'): return None
+        if 'href' not in attrs: return None
 
         i = -1
         for a in self.a:
             i += 1
             match = 0
 
-            if has_key(a, 'href') and a['href'] == attrs['href']:
-                if has_key(a, 'title') or has_key(attrs, 'title'):
-                        if (has_key(a, 'title') and has_key(attrs, 'title') and
+            if ('href' in a) and a['href'] == attrs['href']:
+                if ('title' in a) or ('title' in attrs):
+                        if (('title' in a) and ('title' in attrs) and
                             a['title'] == attrs['title']):
                             match = True
                 else:
@@ -465,7 +462,7 @@ def handle_tag(self, tag, attrs, start):
             if start:
                 self.abbr_title = None
                 self.abbr_data = ''
-                if has_key(attrs, 'title'):
+                if ('title' in attrs):
                     self.abbr_title = attrs['title']
             else:
                 if self.abbr_title != None:
@@ -475,7 +472,7 @@ def handle_tag(self, tag, attrs, start):
 
         if tag == "a" and not self.ignore_links:
             if start:
-                if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')):
+                if ('href' in attrs) and not (self.skip_internal_links and attrs['href'].startswith('#')):
                     self.astack.append(attrs)
                     self.maybe_automatic_link = attrs['href']
                 else:
@@ -500,7 +497,7 @@ def handle_tag(self, tag, attrs, start):
                             self.o("][" + str(a['count']) + "]")
 
         if tag == "img" and start and not self.ignore_images:
-            if has_key(attrs, 'src'):
+            if ('src' in attrs):
                 attrs['href'] = attrs['src']
                 alt = attrs.get('alt', '')
                 self.o("![" + escape_md(alt) + "]")

From 113852e72321601ea25e7d8dd88f40ce07a93aab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= 
Date: Wed, 13 Nov 2013 00:42:19 +0100
Subject: [PATCH 006/479] PEP8ize the script.

This should generally be a no-op: just running the script through flake8
and fixing all highlighted warnings.
---
 html2text.py | 327 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 209 insertions(+), 118 deletions(-)

diff --git a/html2text.py b/html2text.py
index 9ff5a4d3..895a243a 100755
--- a/html2text.py
+++ b/html2text.py
@@ -6,7 +6,7 @@
 __author__ = "Aaron Swartz (me@aaronsw.com)"
 __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
 __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes",
-    "Kevin Jay North", "Matěj Cepl"]
+                    "Kevin Jay North", "Matěj Cepl"]
 
 # TODO:
 #   Support decoded entities with unifiable.
@@ -22,23 +22,28 @@
     import htmlentitydefs
     import urlparse
     import HTMLParser
-except ImportError: #Python3
+except ImportError:  # Python3
     import html.entities as htmlentitydefs
     import urllib.parse as urlparse
     import html.parser as HTMLParser
-try: #Python3
+try:  # Python3
     import urllib.request as urllib
 except:
     import urllib
-import optparse, re, sys, codecs, types
+import optparse
+import re
+import sys
 
-try: from textwrap import wrap
-except: pass
+try:
+    from textwrap import wrap
+except:
+    pass
 
 # Use Unicode characters instead of their ascii psuedo-replacements
 UNICODE_SNOB = 0
 
-# Escape all special characters.  Output is less readable, but avoids corner case formatting issues.
+# Escape all special characters.  Output is less readable, but avoids
+# corner case formatting issues.
 ESCAPE_SNOB = 0
 
 # Put the links after each paragraph instead of at the end.
@@ -47,8 +52,8 @@
 # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
 BODY_WIDTH = 78
 
-# Don't show internal links (href="#local-anchor") -- corresponding link targets
-# won't be visible in the plain text file anyway.
+# Don't show internal links (href="#local-anchor") -- corresponding link
+# targets won't be visible in the plain text file anyway.
 SKIP_INTERNAL_LINKS = True
 
 # Use inline, rather than reference, formatting for images and links
@@ -65,43 +70,52 @@
 # For checking space-only lines on line 771
 SPACE_RE = re.compile(r'\s\+')
 
+
 def name2cp(k):
-    if k == 'apos': return ord("'")
+    if k == 'apos':
+        return ord("'")
     return htmlentitydefs.name2codepoint[k]
 
 
-unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
-'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
-'ndash':'-', 'oelig':'oe', 'aelig':'ae',
-'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
-'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
-'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
-'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
-'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u',
-'lrm':'', 'rlm':''}
+unifiable = {'rsquo': "'", 'lsquo': "'", 'rdquo': '"', 'ldquo': '"',
+             'copy': '(C)', 'mdash': '--', 'nbsp': ' ', 'rarr': '->',
+             'larr': '<-', 'middot': '*', 'ndash': '-', 'oelig': 'oe',
+             'aelig': 'ae', 'agrave': 'a', 'aacute': 'a', 'acirc': 'a',
+             'atilde': 'a', 'auml': 'a', 'aring': 'a', 'egrave': 'e',
+             'eacute': 'e', 'ecirc': 'e', 'euml': 'e', 'igrave': 'i',
+             'iacute': 'i', 'icirc': 'i', 'iuml': 'i', 'ograve': 'o',
+             'oacute': 'o', 'ocirc': 'o', 'otilde': 'o', 'ouml': 'o',
+             'ugrave': 'u', 'uacute': 'u', 'ucirc': 'u', 'uuml': 'u',
+             'lrm': '', 'rlm': ''}
 
 unifiable_n = {}
 
 for k in unifiable.keys():
     unifiable_n[name2cp(k)] = unifiable[k]
 
+
 ### End Entity Nonsense ###
 def hn(tag):
     if tag[0] == 'h' and len(tag) == 2:
         try:
             n = int(tag[1])
-            if n in range(1, 10): return n
-        except ValueError: return 0
+            if n in range(1, 10):
+                return n
+        except ValueError:
+            return 0
+
 
 def dumb_property_dict(style):
     """returns a hash of css attributes"""
     out = dict([(x.strip(), y.strip()) for x, y in
-        [z.split(':', 1) for z in
-            style.split(';') if ':' in z]]);
+               [z.split(':', 1) for z in
+               style.split(';') if ':' in z]])
     return out
 
+
 def dumb_css_parser(data):
-    """returns a hash of css selectors, each of which contains a hash of css attributes"""
+    """returns a hash of css selectors, each of which contains a hash of
+    css attributes"""
     # remove @import sentences
     data += ';'
     importIndex = data.find('@import')
@@ -109,15 +123,18 @@ def dumb_css_parser(data):
         data = data[0:importIndex] + data[data.find(';', importIndex) + 1:]
         importIndex = data.find('@import')
 
-    # parse the css. reverted from dictionary compehension in order to support older pythons
-    elements =  [x.split('{') for x in data.split('}') if '{' in x.strip()]
+    # parse the css. reverted from dictionary comprehension in order to
+    # support older pythons
+    elements = [x.split('{') for x in data.split('}') if '{' in x.strip()]
     try:
-        elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements])
+        elements = dict([(a.strip(), dumb_property_dict(b))
+                        for a, b in elements])
     except ValueError:
-        elements = {} # not that important
+        elements = {}  # not that important
 
     return elements
 
+
 def element_style(attrs, style_def, parent_style):
     """returns a hash of the 'final' style attributes of the element"""
     style = parent_style.copy()
@@ -130,6 +147,7 @@ def element_style(attrs, style_def, parent_style):
         style.update(immediate_style)
     return style
 
+
 def google_list_style(style):
     """finds out whether this is an ordered or unordered list"""
     if 'list-style-type' in style:
@@ -138,12 +156,15 @@ def google_list_style(style):
             return 'ul'
     return 'ol'
 
+
 def google_has_height(style):
-    """check if the style of the element has the 'height' attribute explicitly defined"""
+    """check if the style of the element has the 'height' attribute
+    explicitly defined"""
     if 'height' in style:
         return True
     return False
 
+
 def google_text_emphasis(style):
     """return a list of all emphasis modifiers of the element"""
     emphasis = []
@@ -155,6 +176,7 @@ def google_text_emphasis(style):
         emphasis.append(style['font-weight'])
     return emphasis
 
+
 def google_fixed_width_font(style):
     """check if the css of the current element defines a fixed width font"""
     font_family = ''
@@ -164,6 +186,7 @@ def google_fixed_width_font(style):
         return True
     return False
 
+
 def list_numbering_start(attrs):
     """extract numbering from list element attributes"""
     if 'start' in attrs:
@@ -171,6 +194,7 @@ def list_numbering_start(attrs):
     else:
         return 0
 
+
 class HTML2Text(HTMLParser.HTMLParser):
     def __init__(self, out=None, baseurl=''):
         HTMLParser.HTMLParser.__init__(self)
@@ -196,7 +220,8 @@ def __init__(self, out=None, baseurl=''):
         else:
             self.out = out
 
-        self.outtextlist = []  # empty list to store output characters before they are "joined"
+        # empty list to store output characters before they are "joined"
+        self.outtextlist = []
 
         try:
             self.outtext = unicode()
@@ -232,11 +257,12 @@ def __init__(self, out=None, baseurl=''):
         self.abbr_list = {}  # stack of abbreviations to write later
         self.baseurl = baseurl
 
-        try: del unifiable_n[name2cp('nbsp')]
-        except KeyError: pass
+        try:
+            del unifiable_n[name2cp('nbsp')]
+        except KeyError:
+            pass
         unifiable['nbsp'] = ' _place_holder;'
 
-
     def feed(self, data):
         data = data.replace("</' + 'script>", "</ignore>")
         HTMLParser.HTMLParser.feed(self, data)
@@ -248,7 +274,8 @@ def handle(self, data):
 
     def outtextf(self, s):
         self.outtextlist.append(s)
-        if s: self.lastWasNL = s[-1] == '\n'
+        if s:
+            self.lastWasNL = s[-1] == '\n'
 
     def close(self):
         HTMLParser.HTMLParser.close(self)
@@ -268,7 +295,8 @@ def close(self):
             except NameError:
                 nbsp = chr(32)
         try:
-            self.outtext = self.outtext.replace(unicode(' _place_holder;'), nbsp)
+            self.outtext = self.outtext.replace(unicode(' _place_holder;'),
+                                                nbsp)
         except NameError:
             self.outtext = self.outtext.replace(' _place_holder;', nbsp)
 
@@ -292,7 +320,8 @@ def previousIndex(self, attrs):
 
             If the set of attributes is not found, returns None
         """
-        if 'href' not in attrs: return None
+        if 'href' not in attrs:
+            return None
 
         i = -1
         for a in self.a:
@@ -302,12 +331,13 @@ def previousIndex(self, attrs):
             if ('href' in a) and a['href'] == attrs['href']:
                 if ('title' in a) or ('title' in attrs):
                         if (('title' in a) and ('title' in attrs) and
-                            a['title'] == attrs['title']):
+                                a['title'] == attrs['title']):
                             match = True
                 else:
                     match = True
 
-            if match: return i
+            if match:
+                return i
 
     def drop_last(self, nLetters):
         if not self.quiet:
@@ -319,11 +349,12 @@ def handle_emphasis(self, start, tag_style, parent_style):
         parent_emphasis = google_text_emphasis(parent_style)
 
         # handle Google's text emphasis
-        strikethrough =  'line-through' in tag_emphasis and self.hide_strikethrough
+        strikethrough = 'line-through' in \
+            tag_emphasis and self.hide_strikethrough
         bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
         italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
         fixed = google_fixed_width_font(tag_style) and not \
-                google_fixed_width_font(parent_style) and not self.pre
+            google_fixed_width_font(parent_style) and not self.pre
 
         if start:
             # crossed-out text must be handled before other attributes
@@ -392,7 +423,7 @@ def handle_tag(self, tag, attrs, start):
             parent_style = {}
             if start:
                 if self.tag_stack:
-                  parent_style = self.tag_stack[-1][2]
+                    parent_style = self.tag_stack[-1][2]
                 tag_style = element_style(attrs, self.style_def, parent_style)
                 self.tag_stack.append((tag, attrs, tag_style))
             else:
@@ -404,10 +435,10 @@ def handle_tag(self, tag, attrs, start):
             self.p()
             if start:
                 self.inheader = True
-                self.o(hn(tag)*"#" + ' ')
+                self.o(hn(tag) * "#" + ' ')
             else:
                 self.inheader = False
-                return # prevent redundant emphasis marks on headers
+                return  # prevent redundant emphasis marks on headers
 
         if tag in ['p', 'div']:
             if self.google_doc:
@@ -418,7 +449,8 @@ def handle_tag(self, tag, attrs, start):
             else:
                 self.p()
 
-        if tag == "br" and start: self.o("  \n")
+        if tag == "br" and start:
+            self.o("  \n")
 
         if tag == "hr" and start:
             self.p()
@@ -426,38 +458,47 @@ def handle_tag(self, tag, attrs, start):
             self.p()
 
         if tag in ["head", "style", 'script']:
-            if start: self.quiet += 1
-            else: self.quiet -= 1
+            if start:
+                self.quiet += 1
+            else:
+                self.quiet -= 1
 
         if tag == "style":
-            if start: self.style += 1
-            else: self.style -= 1
+            if start:
+                self.style += 1
+            else:
+                self.style -= 1
 
         if tag in ["body"]:
-            self.quiet = 0 # sites like 9rules.com never close <head>
+            self.quiet = 0  # sites like 9rules.com never close <head>
 
         if tag == "blockquote":
             if start:
-                self.p(); self.o('> ', 0, 1); self.start = 1
+                self.p()
+                self.o('> ', 0, 1)
+                self.start = 1
                 self.blockquote += 1
             else:
                 self.blockquote -= 1
                 self.p()
 
-        if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: self.o(self.emphasis_mark)
-        if tag in ['strong', 'b'] and not self.ignore_emphasis: self.o(self.strong_mark)
+        if tag in ['em', 'i', 'u'] and not self.ignore_emphasis:
+            self.o(self.emphasis_mark)
+        if tag in ['strong', 'b'] and not self.ignore_emphasis:
+            self.o(self.strong_mark)
         if tag in ['del', 'strike', 's']:
             if start:
-                self.o("<"+tag+">")
+                self.o("<" + tag + ">")
             else:
-                self.o("</"+tag+">")
+                self.o("</" + tag + ">")
 
         if self.google_doc:
             if not self.inheader:
                 # handle some font attributes, but leave headers clean
                 self.handle_emphasis(start, tag_style, parent_style)
 
-        if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` ``
+        if tag in ["code", "tt"] and not self.pre:
+            self.o('`')  # TODO: `` `this` ``
         if tag == "abbr":
             if start:
                 self.abbr_title = None
@@ -465,14 +506,16 @@ def handle_tag(self, tag, attrs, start):
                 if ('title' in attrs):
                     self.abbr_title = attrs['title']
             else:
-                if self.abbr_title != None:
+                if self.abbr_title is not None:
                     self.abbr_list[self.abbr_data] = self.abbr_title
                     self.abbr_title = None
                 self.abbr_data = ''
 
         if tag == "a" and not self.ignore_links:
             if start:
-                if ('href' in attrs) and not (self.skip_internal_links and attrs['href'].startswith('#')):
+                if ('href' in attrs) and \
+                        not (self.skip_internal_links and
+                             attrs['href'].startswith('#')):
                     self.astack.append(attrs)
                     self.maybe_automatic_link = attrs['href']
                 else:
@@ -515,10 +558,14 @@ def handle_tag(self, tag, attrs, start):
                         self.a.append(attrs)
                     self.o("[" + str(attrs['count']) + "]")
 
-        if tag == 'dl' and start: self.p()
-        if tag == 'dt' and not start: self.pbr()
-        if tag == 'dd' and start: self.o('    ')
-        if tag == 'dd' and not start: self.pbr()
+        if tag == 'dl' and start:
+            self.p()
+        if tag == 'dt' and not start:
+            self.pbr()
+        if tag == 'dd' and start:
+            self.o('    ')
+        if tag == 'dd' and not start:
+            self.pbr()
 
         if tag in ["ol", "ul"]:
             # Google Docs create sub lists as top level lists
@@ -530,9 +577,13 @@ def handle_tag(self, tag, attrs, start):
                 else:
                     list_style = tag
                 numbering_start = list_numbering_start(attrs)
-                self.list.append({'name':list_style, 'num':numbering_start})
+                self.list.append({
+                    'name': list_style,
+                    'num': numbering_start
+                })
             else:
-                if self.list: self.list.pop()
+                if self.list:
+                    self.list.pop()
             self.lastWasList = True
         else:
             self.lastWasList = False
@@ -540,21 +591,27 @@ def handle_tag(self, tag, attrs, start):
         if tag == 'li':
             self.pbr()
             if start:
-                if self.list: li = self.list[-1]
-                else: li = {'name':'ul', 'num':0}
+                if self.list:
+                    li = self.list[-1]
+                else:
+                    li = {'name': 'ul', 'num': 0}
                 if self.google_doc:
                     nest_count = self.google_nest_count(tag_style)
                 else:
                     nest_count = len(self.list)
-                self.o("  " * nest_count) #TODO: line up 
  1. s > 9 correctly. - if li['name'] == "ul": self.o(self.ul_item_mark + " ") + # TODO: line up
    1. s > 9 correctly. + self.o(" " * nest_count) + if li['name'] == "ul": + self.o(self.ul_item_mark + " ") elif li['name'] == "ol": li['num'] += 1 - self.o(str(li['num'])+". ") + self.o(str(li['num']) + ". ") self.start = 1 - if tag in ["table", "tr"] and start: self.p() - if tag == 'td': self.pbr() + if tag in ["table", "tr"] and start: + self.p() + if tag == 'td': + self.pbr() if tag == "pre": if start: @@ -597,7 +654,8 @@ def o(self, data, puredata=0, force=0): if data and data[0] == ' ': self.space = 1 data = data[1:] - if not data and not force: return + if not data and not force: + return if self.startpre: #self.out(" :") #TODO: not output when already one there @@ -614,12 +672,13 @@ def o(self, data, puredata=0, force=0): #else: list content is already partially indented for i in range(len(self.list)): bq += " " - data = data.replace("\n", "\n"+bq) + data = data.replace("\n", "\n" + bq) if self.startpre: self.startpre = 0 if self.list: - data = data.lstrip("\n") # use existing initial indentation + # use existing initial indentation + data = data.lstrip("\n") if self.start: self.space = 0 @@ -633,25 +692,27 @@ def o(self, data, puredata=0, force=0): self.space = 0 if self.p_p: - self.out((self.br_toggle+'\n'+bq)*self.p_p) + self.out((self.br_toggle + '\n' + bq) * self.p_p) self.space = 0 self.br_toggle = '' if self.space: - if not self.lastWasNL: self.out(' ') + if not self.lastWasNL: + self.out(' ') self.space = 0 if self.a and ((self.p_p == 2 and self.links_each_paragraph) - or force == "end"): - if force == "end": self.out("\n") + or force == "end"): + if force == "end": + self.out("\n") newa = [] for link in self.a: if self.outcount > link['outcount']: - self.out(" ["+ str(link['count']) +"]: " + - urlparse.urljoin(self.baseurl, link['href'])) + self.out(" [" + str(link['count']) + "]: " + + urlparse.urljoin(self.baseurl, link['href'])) if 'title' in link: - self.out(" ("+link['title']+")") + self.out(" (" + link['title'] + ")") self.out("\n") else: 
newa.append(link) @@ -671,7 +732,8 @@ def o(self, data, puredata=0, force=0): self.outcount += 1 def handle_data(self, data): - if r'\/script>' in data: self.quiet -= 1 + if r'\/script>' in data: + self.quiet -= 1 if self.style: self.style_def.update(dumb_css_parser(data)) @@ -689,10 +751,11 @@ def handle_data(self, data): data = escape_md_section(data, snob=self.escape_snob) self.o(data, 1) - def unknown_decl(self, data): pass + def unknown_decl(self, data): + pass def charref(self, name): - if name[0] in ['x','X']: + if name[0] in ['x', 'X']: c = int(name[1:], 16) else: c = int(name) @@ -702,31 +765,35 @@ def charref(self, name): else: try: return unichr(c) - except NameError: #Python3 + except NameError: # Python3 return chr(c) def entityref(self, c): if not self.unicode_snob and c in unifiable.keys(): return unifiable[c] else: - try: name2cp(c) - except KeyError: return "&" + c + ';' + try: + name2cp(c) + except KeyError: + return "&" + c + ';' else: if c == 'nbsp': return unifiable[c] else: try: return unichr(name2cp(c)) - except NameError: #Python3 + except NameError: # Python3 return chr(name2cp(c)) def replaceEntities(self, s): s = s.group(1) if s[0] == "#": return self.charref(s[1:]) - else: return self.entityref(s) + else: + return self.entityref(s) r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));") + def unescape(self, s): return self.r_unescape.sub(self.replaceEntities, s) @@ -734,10 +801,10 @@ def google_nest_count(self, style): """calculate the nesting count of google doc lists""" nest_count = 0 if 'margin-left' in style: - nest_count = int(style['margin-left'][:-2]) // self.google_list_indent + nest_count = int(style['margin-left'][:-2]) \ + // self.google_list_indent return nest_count - def optwrap(self, text): """Wrap all paragraphs in the provided text.""" if not self.body_width: @@ -800,46 +867,56 @@ def optwrap(self, text): ''' % re.escape(slash_chars), flags=re.VERBOSE) + def skipwrap(para): - # If the text begins with four spaces 
or one tab, it's a code block; don't wrap + # If the text begins with four spaces or one tab, it's a code block; + # don't wrap if para[0:4] == ' ' or para[0] == '\t': return True - # If the text begins with only two "--", possibly preceded by whitespace, that's - # an emdash; so wrap. + # If the text begins with only two "--", possibly preceded by + # whitespace, that's an emdash; so wrap. stripped = para.lstrip() if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-": return False - # I'm not sure what this is for; I thought it was to detect lists, but there's - # a
      -inside- case in one of the tests that also depends upon it. + # I'm not sure what this is for; I thought it was to detect lists, + # but there's a
      -inside- case in one of the tests that + # also depends upon it. if stripped[0:1] == '-' or stripped[0:1] == '*': return True - # If the text begins with a single -, *, or +, followed by a space, or an integer, - # followed by a ., followed by a space (in either case optionally preceeded by - # whitespace), it's a list; don't wrap. - if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped): + # If the text begins with a single -, *, or +, followed by a space, + # or an integer, followed by a ., followed by a space (in either + # case optionally preceeded by whitespace), it's a list; don't wrap. + if ordered_list_matcher.match(stripped) or \ + unordered_list_matcher.match(stripped): return True return False + def wrapwrite(text): text = text.encode('utf-8') - try: #Python3 + try: # Python3 sys.stdout.buffer.write(text) except AttributeError: sys.stdout.write(text) + def html2text(html, baseurl=''): h = HTML2Text(baseurl=baseurl) return h.handle(html) + def unescape(s, unicode_snob=False): h = HTML2Text() h.unicode_snob = unicode_snob return h.unescape(s) + def escape_md(text): - """Escapes markdown-sensitive characters within other markdown constructs.""" + """Escapes markdown-sensitive characters within other markdown + constructs.""" return md_chars_matcher.sub(r"\\\1", text) + def escape_md_section(text, snob=False): """Escapes markdown-sensitive characters across whole document sections.""" text = md_backslash_matcher.sub(r"\\\1", text) @@ -856,26 +933,39 @@ def main(): p = optparse.OptionParser('%prog [(filename|url) [encoding]]', version='%prog ' + __version__) - p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true", - default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis") + p.add_option("--ignore-emphasis", dest="ignore_emphasis", + action="store_true", default=IGNORE_EMPHASIS, + help="don't include any formatting for emphasis") p.add_option("--ignore-links", dest="ignore_links", 
action="store_true", - default=IGNORE_ANCHORS, help="don't include any formatting for links") + default=IGNORE_ANCHORS, + help="don't include any formatting for links") p.add_option("--ignore-images", dest="ignore_images", action="store_true", - default=IGNORE_IMAGES, help="don't include any formatting for images") + default=IGNORE_IMAGES, + help="don't include any formatting for images") p.add_option("-g", "--google-doc", action="store_true", dest="google_doc", - default=False, help="convert an html-exported Google Document") - p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash", - default=False, help="use a dash rather than a star for unordered list items") - p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk", - default=False, help="use an asterisk rather than an underscore for emphasized text") - p.add_option("-b", "--body-width", dest="body_width", action="store", type="int", - default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap") - p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int", - default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists") - p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough", - default=False, help="hide strike-through text. 
only relevant when -g is specified as well") + default=False, + help="convert an html-exported Google Document") + p.add_option("-d", "--dash-unordered-list", action="store_true", + dest="ul_style_dash", default=False, + help="use a dash rather than a star for unordered list items") + p.add_option("-e", "--asterisk-emphasis", action="store_true", + dest="em_style_asterisk", default=False, + help="use an asterisk rather than an underscore " + + "for emphasized text") + p.add_option("-b", "--body-width", dest="body_width", action="store", + type="int", default=BODY_WIDTH, + help="number of characters per output line, 0 for no wrap") + p.add_option("-i", "--google-list-indent", dest="list_indent", + action="store", type="int", default=GOOGLE_LIST_INDENT, + help="number of pixels Google indents nested lists") + p.add_option("-s", "--hide-strikethrough", action="store_true", + dest="hide_strikethrough", default=False, + help="hide strike-through text. only relevant when -g is " + + "specified as well") p.add_option("--escape-all", action="store_true", dest="escape_snob", - default=False, help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.") + default=False, + help="Escape all special characters. Output is less " + + "readable, but avoids corner case formatting issues.") (options, args) = p.parse_args() # process input @@ -913,7 +1003,8 @@ def main(): data = data.decode(encoding) h = HTML2Text(baseurl=baseurl) # handle options - if options.ul_style_dash: h.ul_item_mark = '-' + if options.ul_style_dash: + h.ul_item_mark = '-' if options.em_style_asterisk: h.emphasis_mark = '*' h.strong_mark = '__' From 5820c6aa90c225915a99915f949244e7f97a2c47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= Date: Wed, 15 Jan 2014 19:29:22 +0100 Subject: [PATCH 007/479] Add test case from the issue #2. 
I rejected the test_html method, because html2text never actually promised that the output will be correct Markdown, even less it promised it will be Markdown formatted according to some particular way. However, we can try to fix this. --- test/bodywidth_newline.html | 6 ++++++ test/bodywidth_newline.md | 1 + test/test_html2text.py | 12 +++++++++--- 3 files changed, 16 insertions(+), 3 deletions(-) create mode 100644 test/bodywidth_newline.html create mode 100644 test/bodywidth_newline.md diff --git a/test/bodywidth_newline.html b/test/bodywidth_newline.html new file mode 100644 index 00000000..cd5f614b --- /dev/null +++ b/test/bodywidth_newline.html @@ -0,0 +1,6 @@ +

      Another theory is that magician and occultist Aliester Crowley created the beast while + attempting to summon evil spirits at his house on the edge of the + lake in the early 1900′s. I met a local woman who + prefers this explanation.

      diff --git a/test/bodywidth_newline.md b/test/bodywidth_newline.md new file mode 100644 index 00000000..e495c3ee --- /dev/null +++ b/test/bodywidth_newline.md @@ -0,0 +1 @@ +Another theory is that magician and occultist [Aliester Crowley](http://en.wikipedia.org/wiki/Aleister_Crowley) created the beast while attempting to summon evil spirits at his house on the edge of the lake in the early 1900′s. **I met a local woman who prefers this explanation.** diff --git a/test/test_html2text.py b/test/test_html2text.py index 8f227bc0..afae9108 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -4,13 +4,13 @@ import re import subprocess import sys -if sys.version_info[:2] < (2,7): +if sys.version_info[:2] < (2, 7): import unittest2 as unittest else: import unittest import logging logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', - level=logging.DEBUG) + level=logging.DEBUG) import html2text @@ -50,7 +50,8 @@ def test_command(fn, *args): result = get_baseline(fn) pid = subprocess.Popen(cmd, stdout=subprocess.PIPE) out, _ = pid.communicate() - actual = out.decode() + + actual = out.decode('utf8') if os.name == 'nt': # Fix the unwanted CR to CRCRLF replacement @@ -109,6 +110,11 @@ def test_cmd(self): module_args['escape_snob'] = True cmdline_args.append('--escape-all') + if base_fn.startswith('bodywidth'): + #module_args['unicode_snob'] = True + module_args['body_width'] = 0 + cmdline_args.append('--body-width=0') + return test_mod, test_cmd # Originally from http://stackoverflow.com/questions/32899/\ From 1ba191f11d9e0f01e9a271320b8efdaaa271d199 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= Date: Wed, 15 Jan 2014 19:44:20 +0100 Subject: [PATCH 008/479] Add bodywidth paramter instead of module-wide BODY_WIDTH. 
--- html2text.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/html2text.py b/html2text.py index 7922450d..99475868 100755 --- a/html2text.py +++ b/html2text.py @@ -199,14 +199,14 @@ def list_numbering_start(attrs): class HTML2Text(HTMLParser.HTMLParser): - def __init__(self, out=None, baseurl=''): + def __init__(self, out=None, baseurl='', bodywidth=BODY_WIDTH): HTMLParser.HTMLParser.__init__(self) # Config options self.unicode_snob = UNICODE_SNOB self.escape_snob = ESCAPE_SNOB self.links_each_paragraph = LINKS_EACH_PARAGRAPH - self.body_width = BODY_WIDTH + self.body_width = bodywidth self.skip_internal_links = SKIP_INTERNAL_LINKS self.inline_links = INLINE_LINKS self.google_list_indent = GOOGLE_LIST_INDENT @@ -903,8 +903,8 @@ def wrapwrite(text): sys.stdout.write(text) -def html2text(html, baseurl=''): - h = HTML2Text(baseurl=baseurl) +def html2text(html, baseurl='', bodywidth=BODY_WIDTH): + h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth) return h.handle(html) From ada62d1366851aa836b43ed794b697d18a3db8cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= Date: Thu, 16 Jan 2014 12:13:50 +0100 Subject: [PATCH 009/479] =?UTF-8?q?Travis-CI=20doesn=E2=80=99t=20support?= =?UTF-8?q?=202.5=20anymore.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .travis.yml | 1 - install_deps.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8b6a6fd3..02b10c83 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,5 @@ language: python python: - - "2.5" - "2.6" - "2.7" - "pypy" diff --git a/install_deps.py b/install_deps.py index 05c04333..51ec53aa 100644 --- a/install_deps.py +++ b/install_deps.py @@ -2,5 +2,5 @@ import sys import subprocess -if sys.version_info[:2] < (2,7): +if sys.version_info[:2] < (2, 7): subprocess.call('pip install unittest2 --use-mirrors', shell=True) From ef5e350b18ea9c86e5516d69022ecae21e68b549 Mon Sep 17 00:00:00 
2001 From: Alireza Savand Date: Sat, 8 Mar 2014 19:09:45 +0400 Subject: [PATCH 010/479] Add .idea to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index f6059d29..5fa75b9b 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ test/*output.md build dist *.egg-info +.idea From b2c2bb7d718d70333f0e2c1b82f3007689c062bd Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 8 Mar 2014 19:13:44 +0400 Subject: [PATCH 011/479] PEP8 on setup.py --- setup.py | 55 +++++++++++++++++++++++++++---------------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/setup.py b/setup.py index dd3d9bc2..f03fa228 100644 --- a/setup.py +++ b/setup.py @@ -1,36 +1,35 @@ -import sys from setuptools import setup, find_packages setup( - name = "html2text", - version = "3.200.3", - description = "Turn HTML into equivalent Markdown-structured text.", - author = "Aaron Swartz", - author_email = "me@aaronsw.com", - url='http://www.aaronsw.com/2002/html2text/', - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: GNU General Public License (GPL)', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.3', - 'Programming Language :: Python :: 2.4', - 'Programming Language :: Python :: 2.5', - 'Programming Language :: Python :: 2.6', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.0', - 'Programming Language :: Python :: 3.1', - 'Programming Language :: Python :: 3.2' - ], + name="html2text", + version="3.200.3", + description="Turn HTML into equivalent Markdown-structured text.", + author="Aaron Swartz", + author_email="me@aaronsw.com", + url='http://www.aaronsw.com/2002/html2text/', + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'Intended Audience :: Developers', + 'License :: OSI 
Approved :: GNU General Public License (GPL)', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.3', + 'Programming Language :: Python :: 2.4', + 'Programming Language :: Python :: 2.5', + 'Programming Language :: Python :: 2.6', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.0', + 'Programming Language :: Python :: 3.1', + 'Programming Language :: Python :: 3.2' + ], entry_points=""" [console_scripts] html2text=html2text:main """, - license='GNU GPL 3', - packages=find_packages(), - py_modules=['html2text'], - include_package_data=True, - zip_safe=False, + license='GNU GPL 3', + packages=find_packages(), + py_modules=['html2text'], + include_package_data=True, + zip_safe=False, ) From de67aef08101dbf1c6279e28a0941c45f8b8a594 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 9 Mar 2014 01:30:19 +0400 Subject: [PATCH 012/479] Just a simple pep8 here --- setup.py | 63 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/setup.py b/setup.py index 7281e54e..c5c8b8e8 100644 --- a/setup.py +++ b/setup.py @@ -1,15 +1,17 @@ # coding: utf-8 import sys from setuptools import setup, find_packages, Command + requires_list = [] try: import unittest2 as unittest except ImportError: import unittest else: - if sys.version_info <= (2 , 6): + if sys.version_info <= (2, 6): requires_list.append("unittest2") + class RunTests(Command): """New setup.py command to run all tests for the package. 
""" @@ -28,38 +30,39 @@ def run(self): runner = unittest.TextTestRunner() runner.run(tests) + setup( - name = "html2text", - version = "3.200.3.1", # Temporarily adding the fourth revision number - description = "Turn HTML into equivalent Markdown-structured text.", - author = "Aaron Swartz", - author_email = "me@aaronsw.com", - url='https://github.com/aaronsw/html2text/', - cmdclass={'test': RunTests}, - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: GNU General Public License (GPL)', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.4', - 'Programming Language :: Python :: 2.5', - 'Programming Language :: Python :: 2.6', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.0', - 'Programming Language :: Python :: 3.1', - 'Programming Language :: Python :: 3.2' - 'Programming Language :: Python :: 3.3' - ], + name="html2text", + version="3.200.3.1", # Temporarily adding the fourth revision number + description="Turn HTML into equivalent Markdown-structured text.", + author="Aaron Swartz", + author_email="me@aaronsw.com", + url='https://github.com/aaronsw/html2text/', + cmdclass={'test': RunTests}, + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: GNU General Public License (GPL)', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.4', + 'Programming Language :: Python :: 2.5', + 'Programming Language :: Python :: 2.6', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.0', + 'Programming Language :: Python :: 3.1', + 'Programming Language :: Python :: 3.2' + 'Programming Language :: Python :: 3.3' + ], entry_points=""" 
[console_scripts] html2text=html2text:main """, - license='GNU GPL 3', - requires=requires_list, - packages=find_packages(), - py_modules=['html2text'], - include_package_data=True, - zip_safe=False, + license='GNU GPL 3', + requires=requires_list, + packages=find_packages(), + py_modules=['html2text'], + include_package_data=True, + zip_safe=False, ) From 47ac633b7a5d1c70ea7c942818d594fe3b725975 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 9 Mar 2014 01:31:23 +0400 Subject: [PATCH 013/479] Set the Alir3z4's travis build profile --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0dcb5626..78ea8a7b 100644 --- a/README.md +++ b/README.md @@ -45,4 +45,4 @@ _Originally written by Aaron Swartz. This code is distributed under the GPLv3._ python test/test_html2text.py -v -[![Build Status](https://secure.travis-ci.org/mcepl/html2text.png)](http://travis-ci.org/mcepl/html2text) +[![Build Status](https://secure.travis-ci.org/Alir3z4/html2text.png)](http://travis-ci.org/Alir3z4/html2text) From 0695a99ac528ea0c0e36e0f38af71c57545d48ff Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 9 Mar 2014 01:55:39 +0400 Subject: [PATCH 014/479] Add maintainer info --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index c5c8b8e8..6c011f5b 100644 --- a/setup.py +++ b/setup.py @@ -37,6 +37,8 @@ def run(self): description="Turn HTML into equivalent Markdown-structured text.", author="Aaron Swartz", author_email="me@aaronsw.com", + maintainer='Alireza Savand', + maintainer_email='alireza.savand@gmail.com', url='https://github.com/aaronsw/html2text/', cmdclass={'test': RunTests}, classifiers=[ From 2931c4ed5bc995a04ffde14f81a83a6f498b8170 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 9 Mar 2014 01:56:59 +0400 Subject: [PATCH 015/479] Add platform meta data --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 6c011f5b..15ddaad5 100644 
--- a/setup.py +++ b/setup.py @@ -41,10 +41,12 @@ def run(self): maintainer_email='alireza.savand@gmail.com', url='https://github.com/aaronsw/html2text/', cmdclass={'test': RunTests}, + platforms='OS Independent', classifiers=[ 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', 'License :: OSI Approved :: GNU General Public License (GPL)', + 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.4', From 6f5a8224c0495fcd57eedc8697794900b413433a Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 5 Apr 2014 18:03:09 +0400 Subject: [PATCH 016/479] Add ChangeLog.rst file. --- ChangeLog.rst | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 ChangeLog.rst diff --git a/ChangeLog.rst b/ChangeLog.rst new file mode 100644 index 00000000..77eb3554 --- /dev/null +++ b/ChangeLog.rst @@ -0,0 +1,5 @@ +0000.0.0 +======== +---- + +* Fix #1: Add ChangeLog.rst file. From 7c6d78c175eb5de2a2d86ca5386dcf6cf11929e3 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 5 Apr 2014 18:14:33 +0400 Subject: [PATCH 017/479] Add AUTHORS.rst file. --- AUTHORS.rst | 14 ++++++++++++++ ChangeLog.rst | 3 ++- 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 AUTHORS.rst diff --git a/AUTHORS.rst b/AUTHORS.rst new file mode 100644 index 00000000..c7ea1ce3 --- /dev/null +++ b/AUTHORS.rst @@ -0,0 +1,14 @@ +``html2text`` was Originally written by Aaron Swartz. + +The AUTHORS are (and/or have been): + + * Aaron Swartz + * Alireza Savand + * Yariv Barkan + * Alex Musayev + * Matěj Cepl + * Stefano Rivera + +Maintainer: + + * Alireza Savand diff --git a/ChangeLog.rst b/ChangeLog.rst index 77eb3554..088b8c85 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -2,4 +2,5 @@ ======== ---- -* Fix #1: Add ChangeLog.rst file. +* Fix #1: Add ``ChangeLog.rst`` file. +* Fix #2: Add ``AUTHORS.rst`` file. 
From 05feed1e22e65ebad825d0f7acba32ef6971e58b Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 5 Apr 2014 18:15:03 +0400 Subject: [PATCH 018/479] Add changelog.rst and authors.rst file to manifest.in --- MANIFEST.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MANIFEST.in b/MANIFEST.in index 2301d362..1845310c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,4 @@ include COPYING include README.md +include ChangeLog.rst +include AUTHORS.rst From 104059506bca9814b8aa2441a865f56a6dd60eac Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 5 Apr 2014 18:18:10 +0400 Subject: [PATCH 019/479] Update to Alir3z4's account --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 15ddaad5..5b778eb2 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ def run(self): author_email="me@aaronsw.com", maintainer='Alireza Savand', maintainer_email='alireza.savand@gmail.com', - url='https://github.com/aaronsw/html2text/', + url='https://github.com/Alir3z4/html2text/', cmdclass={'test': RunTests}, platforms='OS Independent', classifiers=[ From 567a795accc94d7c42a01bf5b5bede3bf6e3d41b Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 5 Apr 2014 18:19:55 +0400 Subject: [PATCH 020/479] Bump version number --- html2text.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/html2text.py b/html2text.py index 99475868..6e79b43f 100755 --- a/html2text.py +++ b/html2text.py @@ -2,7 +2,7 @@ # coding: utf-8 """html2text: Turn HTML into equivalent Markdown-structured text.""" from __future__ import division -__version__ = "3.200.3" +__version__ = "2014.4.5" __author__ = "Aaron Swartz (me@aaronsw.com)" __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." 
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", diff --git a/setup.py b/setup.py index 5b778eb2..3de8b4df 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ def run(self): setup( name="html2text", - version="3.200.3.1", # Temporarily adding the fourth revision number + version="2014.4.5", description="Turn HTML into equivalent Markdown-structured text.", author="Aaron Swartz", author_email="me@aaronsw.com", From 878b147a7b23bd10ce4433b36f74691610ad282f Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 5 Apr 2014 18:25:01 +0400 Subject: [PATCH 021/479] Add "How to install" section to readme --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 78ea8a7b..e1add60b 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,16 @@ Or with some configuration options: _Originally written by Aaron Swartz. This code is distributed under the GPLv3._ +## How to install + +`html2text` is available on pypi +https://pypi.python.org/pypi/html2text + +``` +$ pip install html2text +``` + + ## How to do a release 1. Update the version in `html2text.py` From 79db293c34d1ed4948b3ba5e4710432ed366e1eb Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 5 Apr 2014 18:25:20 +0400 Subject: [PATCH 022/479] Move travis-ci build status image to top of the readme --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e1add60b..229e3e5c 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ # [html2text](http://www.aaronsw.com/2002/html2text/) +[![Build Status](https://secure.travis-ci.org/Alir3z4/html2text.png)](http://travis-ci.org/Alir3z4/html2text) + + html2text is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). 
Usage: `html2text.py [(filename|url) [encoding]]` @@ -54,5 +57,3 @@ $ pip install html2text ## How to run unit tests python test/test_html2text.py -v - -[![Build Status](https://secure.travis-ci.org/Alir3z4/html2text.png)](http://travis-ci.org/Alir3z4/html2text) From a352b55187a8d8dd241384b44ff8ffb7b74e192b Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 5 Apr 2014 18:26:50 +0400 Subject: [PATCH 023/479] Update version number to today for the release --- ChangeLog.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 088b8c85..56adc71a 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,5 +1,5 @@ -0000.0.0 -======== +2014.4.5 - 2014-04-05 +===================== ---- * Fix #1: Add ``ChangeLog.rst`` file. From 0fc7276a70b00769b23d622c7ffa69529a2d9069 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 5 Apr 2014 18:28:20 +0400 Subject: [PATCH 024/479] Fix classifier --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3de8b4df..3b7085de 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ def run(self): 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.0', 'Programming Language :: Python :: 3.1', - 'Programming Language :: Python :: 3.2' + 'Programming Language :: Python :: 3.2', 'Programming Language :: Python :: 3.3' ], entry_points=""" From 1d9d9bb4ede0474acc19ccd5d097f2448da22b5b Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 10 Apr 2014 02:42:11 +0400 Subject: [PATCH 025/479] Remove `How to do a release` section from README.md I don't think people need to do that usually, --- README.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/README.md b/README.md index 229e3e5c..f1208afd 100644 --- a/README.md +++ b/README.md @@ -48,12 +48,6 @@ $ pip install html2text ``` -## How to do a release - -1. Update the version in `html2text.py` -2. Update the version in `setup.py` -3. 
Run `python setup.py sdist upload` - ## How to run unit tests python test/test_html2text.py -v From 3232f42dc2d0d1ae43000919e9b7668526153081 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 10 Apr 2014 02:43:32 +0400 Subject: [PATCH 026/479] Update the changlog for fixing #8 --- ChangeLog.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 56adc71a..40bbf38b 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,10 @@ +0000.0.0 - 0000-00-00 +===================== +---- + +* Fix #8: Remove ``How to do a release`` section from README.md. + + 2014.4.5 - 2014-04-05 ===================== ---- From 082e6112b4b60dab981fdc90e4830e71648345a3 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Tue, 22 Apr 2014 20:30:43 +0400 Subject: [PATCH 027/479] Include test directory markdown, html files. --- ChangeLog.rst | 1 + MANIFEST.in | 1 + 2 files changed, 2 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 40bbf38b..17442458 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -3,6 +3,7 @@ ---- * Fix #8: Remove ``How to do a release`` section from README.md. +* Fix #11: Include test directory markdown, html files. 
2014.4.5 - 2014-04-05 diff --git a/MANIFEST.in b/MANIFEST.in index 1845310c..fe59ca1d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,3 +2,4 @@ include COPYING include README.md include ChangeLog.rst include AUTHORS.rst +recursive-include test *.html *.md From cefe76cf31f5e01f0da6ffdf44c13be55f829efe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sz=C3=A9pe=20Viktor?= Date: Sat, 24 May 2014 14:57:51 +0200 Subject: [PATCH 028/479] handle undefined href attribute --- html2text.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/html2text.py b/html2text.py index 6e79b43f..59fc1e41 100755 --- a/html2text.py +++ b/html2text.py @@ -517,8 +517,9 @@ def handle_tag(self, tag, attrs, start): if tag == "a" and not self.ignore_links: if start: if ('href' in attrs) and \ - not (self.skip_internal_links and - attrs['href'].startswith('#')): + (attrs['href'] is not None) and \ + not (self.skip_internal_links and + attrs['href'].startswith('#')): self.astack.append(attrs) self.maybe_automatic_link = attrs['href'] else: From 5bcb656541a2c12cec16147ababf40a18f64b6e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sz=C3=A9pe=20Viktor?= Date: Sat, 24 May 2014 17:27:59 +0200 Subject: [PATCH 029/479] the html --- anchor-undefined-href.html | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 anchor-undefined-href.html diff --git a/anchor-undefined-href.html b/anchor-undefined-href.html new file mode 100644 index 00000000..20197513 --- /dev/null +++ b/anchor-undefined-href.html @@ -0,0 +1,5 @@ + + + anchor + + From eff61949743c588734fdaa2647b899e0cef430e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sz=C3=A9pe=20Viktor?= Date: Sat, 24 May 2014 17:30:09 +0200 Subject: [PATCH 030/479] the markdown --- test/anchor-undefined-href.md | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 test/anchor-undefined-href.md diff --git a/test/anchor-undefined-href.md b/test/anchor-undefined-href.md new file mode 100644 index 00000000..12e6c627 --- /dev/null +++ 
b/test/anchor-undefined-href.md @@ -0,0 +1,2 @@ +anchor + From e0cf94ec35c7a9cc393cf761b86dd8994d6d0059 Mon Sep 17 00:00:00 2001 From: Arfrever Date: Thu, 29 May 2014 18:07:21 +0200 Subject: [PATCH 031/479] Don't install tests (but include them in PyPI package). Fixes #9 --- MANIFEST.in | 1 + setup.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index 1845310c..6217360c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,3 +2,4 @@ include COPYING include README.md include ChangeLog.rst include AUTHORS.rst +recursive-include test *.html *.md *.py diff --git a/setup.py b/setup.py index 3b7085de..0b2cf08a 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,6 @@ def run(self): """, license='GNU GPL 3', requires=requires_list, - packages=find_packages(), py_modules=['html2text'], include_package_data=True, zip_safe=False, From f91a6aa2928378e89b8cdceec2f4ada5abfd9801 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= Date: Wed, 2 Jul 2014 08:03:22 +0200 Subject: [PATCH 032/479] Don't bring value of previous calls of handle() to the new one. Fix #13 --- html2text.py | 11 ++++++++++- test/test_memleak.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 test/test_memleak.py diff --git a/html2text.py b/html2text.py index 59fc1e41..f1e39849 100755 --- a/html2text.py +++ b/html2text.py @@ -283,10 +283,15 @@ def outtextf(self, s): def close(self): HTMLParser.HTMLParser.close(self) + try: + nochr = unicode('') + except NameError: + nochr = str('') + self.pbr() self.o('', 0, 'end') - self.outtext = self.outtext.join(self.outtextlist) + self.outtext = nochr.join(self.outtextlist) if self.unicode_snob: try: nbsp = unichr(name2cp('nbsp')) @@ -303,6 +308,10 @@ def close(self): except NameError: self.outtext = self.outtext.replace(' _place_holder;', nbsp) + # Clear self.outtextlist to avoid memory leak of its content to + # the next handling. 
+ self.outtextlist = [] + return self.outtext def handle_charref(self, c): diff --git a/test/test_memleak.py b/test/test_memleak.py new file mode 100644 index 00000000..fd2ae18e --- /dev/null +++ b/test/test_memleak.py @@ -0,0 +1,32 @@ +import sys +if sys.version_info[:2] < (2, 7): + import unittest2 as unittest +else: + import unittest +import logging +logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', + level=logging.DEBUG) + +import html2text + + +class TestMemleak(unittest.TestCase): + """ + See https://github.com/Alir3z4/html2text/issues/13 for more + information on this. + """ + + def setUp(self): + self.instr = 'miow ' + + def test_same_string(self): + h2t = html2text.HTML2Text() + result = h2t.handle(self.instr) + # Now, we shouldn't get leak of the previous run to the new one + self.assertEqual(h2t.handle(self.instr), result) + + def test_empty_string(self): + h2t = html2text.HTML2Text() + h2t.handle(self.instr) + # And even less when the input is empty + self.assertEqual(h2t.handle(''), u'\n\n') From 4ef61c52ee6b644695f8a617e75d1c7ae9f0b153 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= Date: Wed, 2 Jul 2014 19:13:32 +0200 Subject: [PATCH 033/479] Remove self.drop_last and make self.outtext into local variable. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I am persuaded that self.drop_last is completely dead code, which never makes any change on anything (self.outtext is in the moment when it is run always empty string), so its removal doesn’t influence anything. Therefore, it is possible to make outtext in close() method into a local variable, which makes whole situation with memory leaks way more simple. 
--- html2text.py | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/html2text.py b/html2text.py index f1e39849..e0fcec87 100755 --- a/html2text.py +++ b/html2text.py @@ -226,11 +226,6 @@ def __init__(self, out=None, baseurl='', bodywidth=BODY_WIDTH): # empty list to store output characters before they are "joined" self.outtextlist = [] - try: - self.outtext = unicode() - except NameError: # Python3 - self.outtext = str() - self.quiet = 0 self.p_p = 0 # number of newline character to print before next output self.outcount = 0 @@ -291,7 +286,7 @@ def close(self): self.pbr() self.o('', 0, 'end') - self.outtext = nochr.join(self.outtextlist) + outtext = nochr.join(self.outtextlist) if self.unicode_snob: try: nbsp = unichr(name2cp('nbsp')) @@ -303,16 +298,15 @@ def close(self): except NameError: nbsp = chr(32) try: - self.outtext = self.outtext.replace(unicode(' _place_holder;'), - nbsp) + outtext = outtext.replace(unicode(' _place_holder;'), nbsp) except NameError: - self.outtext = self.outtext.replace(' _place_holder;', nbsp) + outtext = outtext.replace(' _place_holder;', nbsp) # Clear self.outtextlist to avoid memory leak of its content to # the next handling. 
self.outtextlist = [] - return self.outtext + return outtext def handle_charref(self, c): self.o(self.charref(c), 1) @@ -351,10 +345,6 @@ def previousIndex(self, attrs): if match: return i - def drop_last(self, nLetters): - if not self.quiet: - self.outtext = self.outtext[:-nLetters] - def handle_emphasis(self, start, tag_style, parent_style): """handles various text emphases""" tag_emphasis = google_text_emphasis(tag_style) @@ -394,7 +384,6 @@ def handle_emphasis(self, start, tag_style, parent_style): if fixed: if self.drop_white_space: # empty emphasis, drop it - self.drop_last(1) self.drop_white_space -= 1 else: self.o('`') @@ -402,14 +391,12 @@ def handle_emphasis(self, start, tag_style, parent_style): if bold: if self.drop_white_space: # empty emphasis, drop it - self.drop_last(2) self.drop_white_space -= 1 else: self.o(self.strong_mark) if italic: if self.drop_white_space: # empty emphasis, drop it - self.drop_last(1) self.drop_white_space -= 1 else: self.o(self.emphasis_mark) From d8a3cf5a6d2c2523d8b70d4f6842c13968f1c8ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= Date: Wed, 2 Jul 2014 08:07:56 +0200 Subject: [PATCH 034/479] More PEP8ization and added some docstrings Also, removed (apparently) useless change of self.outtext on line 381 (self.outtext shouldn't be even initialized at that point) --- html2text.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/html2text.py b/html2text.py index e0fcec87..3ace69ed 100755 --- a/html2text.py +++ b/html2text.py @@ -200,6 +200,12 @@ def list_numbering_start(attrs): class HTML2Text(HTMLParser.HTMLParser): def __init__(self, out=None, baseurl='', bodywidth=BODY_WIDTH): + """ + Input parameters: + out: possible custom replacement for self.outtextf (which + appends lines of text). 
+ baseurl: base URL of the document we process + """ HTMLParser.HTMLParser.__init__(self) # Config options @@ -380,7 +386,6 @@ def handle_emphasis(self, start, tag_style, parent_style): # there must not be whitespace before closing emphasis mark self.emphasis -= 1 self.space = 0 - self.outtext = self.outtext.rstrip() if fixed: if self.drop_white_space: # empty emphasis, drop it @@ -514,8 +519,8 @@ def handle_tag(self, tag, attrs, start): if start: if ('href' in attrs) and \ (attrs['href'] is not None) and \ - not (self.skip_internal_links and - attrs['href'].startswith('#')): + not (self.skip_internal_links and + attrs['href'].startswith('#')): self.astack.append(attrs) self.maybe_automatic_link = attrs['href'] else: @@ -633,6 +638,9 @@ def soft_br(self): self.br_toggle = ' ' def o(self, data, puredata=0, force=0): + """ + Deal with indentation and whitespace + """ if self.abbr_data is not None: self.abbr_data += data From 0b679d15daff75ecfd501c4eed7caabd48564eca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= Date: Wed, 2 Jul 2014 11:29:27 +0200 Subject: [PATCH 035/479] True/False are built-in constants since 2.3. Really? --- html2text.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/html2text.py b/html2text.py index 3ace69ed..f7f25915 100755 --- a/html2text.py +++ b/html2text.py @@ -11,13 +11,6 @@ # TODO: # Support decoded entities with unifiable. 
-try: - True -except NameError: - setattr(__builtins__, 'True', 1) - setattr(__builtins__, 'False', 0) - - try: import htmlentitydefs import urlparse From d1c591bbaf16e30a0c5dd7374bda62089da4a778 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= Date: Wed, 2 Jul 2014 11:10:46 +0200 Subject: [PATCH 036/479] Add coveralls to .travis.yml --- .travis.yml | 9 +++++++-- README.md | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 02b10c83..79639e30 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,11 @@ python: - "pypy" - "3.2" - "3.3" +install: + - pip install coveralls before_script: - - "python install_deps.py" -script: PYTHONPATH=$PYTHONPATH:. python test/test_html2text.py -v + - '[ "${TRAVIS_PYTHON_VERSION}" = "2.6" ] && pip install --use-mirrors unittest2 || /bin/true' +script: + PYTHONPATH=$PYTHONPATH:. coverage run --source=html2text setup.py test -v +after_success: + coveralls diff --git a/README.md b/README.md index 229e3e5c..2e418f41 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # [html2text](http://www.aaronsw.com/2002/html2text/) -[![Build Status](https://secure.travis-ci.org/Alir3z4/html2text.png)](http://travis-ci.org/Alir3z4/html2text) +[![Build Status](https://secure.travis-ci.org/html2text/html2text.png)](http://travis-ci.org/html2text/html2text) +[![Coverage Status](https://coveralls.io/repos/htmL2text/html2text/badge.png)](https://coveralls.io/r/htmL2text/html2text) html2text is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). 
From 06c2d4e9cafe060a08b7ceb2c243bfb5b2fe2544 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 3 Jul 2014 21:59:54 +0400 Subject: [PATCH 037/479] Update changelog --- ChangeLog.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 17442458..12cca7fb 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -4,6 +4,7 @@ * Fix #8: Remove ``How to do a release`` section from README.md. * Fix #11: Include test directory markdown, html files. +* Fix memory leak in using ``handle`` while keeping the old instance of ``html2text``. 2014.4.5 - 2014-04-05 From 9bc91a85e6ffc8170caf3044289cffe2653ade1c Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 3 Jul 2014 22:03:26 +0400 Subject: [PATCH 038/479] Remove the weirdness --- AUTHORS.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index c7ea1ce3..b62454d1 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -3,7 +3,6 @@ The AUTHORS are (and/or have been): * Aaron Swartz - * Alireza Savand * Yariv Barkan * Alex Musayev * Matěj Cepl From 20f87643f7af660566729f6c4c1e3b8f67908b12 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 3 Jul 2014 22:03:42 +0400 Subject: [PATCH 039/479] Fix copy typo --- html2text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/html2text.py b/html2text.py index f7f25915..8439019c 100755 --- a/html2text.py +++ b/html2text.py @@ -116,7 +116,7 @@ def dumb_css_parser(data): data = data[0:importIndex] + data[data.find(';', importIndex) + 1:] importIndex = data.find('@import') - # parse the css. reverted from dictionary compehension in order to + # parse the css. 
reverted from dictionary comprehension in order to # support older pythons elements = [x.split('{') for x in data.split('}') if '{' in x.strip()] try: @@ -886,7 +886,7 @@ def skipwrap(para): return True # If the text begins with a single -, *, or +, followed by a space, # or an integer, followed by a ., followed by a space (in either - # case optionally preceeded by whitespace), it's a list; don't wrap. + # case optionally proceeded by whitespace), it's a list; don't wrap. if ordered_list_matcher.match(stripped) or \ unordered_list_matcher.match(stripped): return True From 7cd5f309807c0f61f54a2ad5a1af16df77edc7fa Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 3 Jul 2014 22:04:00 +0400 Subject: [PATCH 040/479] Remove the parentheses since they're unnecessary --- html2text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text.py b/html2text.py index 8439019c..578c51dd 100755 --- a/html2text.py +++ b/html2text.py @@ -538,7 +538,7 @@ def handle_tag(self, tag, attrs, start): self.o("][" + str(a['count']) + "]") if tag == "img" and start and not self.ignore_images: - if ('src' in attrs): + if 'src' in attrs: attrs['href'] = attrs['src'] alt = attrs.get('alt', '') self.o("![" + escape_md(alt) + "]") From a1a66d70d2b154b81e5708002caf5506ea8cf262 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 3 Jul 2014 22:08:40 +0400 Subject: [PATCH 041/479] Move anchor-undefined-href to test module --- anchor-undefined-href.html => test/anchor-undefined-href.html | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename anchor-undefined-href.html => test/anchor-undefined-href.html (100%) diff --git a/anchor-undefined-href.html b/test/anchor-undefined-href.html similarity index 100% rename from anchor-undefined-href.html rename to test/anchor-undefined-href.html From 7bb3e60658248262a769c554ee420efc231481f8 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 3 Jul 2014 22:09:36 +0400 Subject: [PATCH 042/479] Update version number --- ChangeLog.rst | 
2 +- html2text.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 12cca7fb..09e12ea5 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,4 +1,4 @@ -0000.0.0 - 0000-00-00 +2014.7.3 - 2014-07-03 ===================== ---- diff --git a/html2text.py b/html2text.py index 578c51dd..dd82b63d 100755 --- a/html2text.py +++ b/html2text.py @@ -2,7 +2,7 @@ # coding: utf-8 """html2text: Turn HTML into equivalent Markdown-structured text.""" from __future__ import division -__version__ = "2014.4.5" +__version__ = "2014.7.3" __author__ = "Aaron Swartz (me@aaronsw.com)" __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", From bdbc9ff55ec4595d7a0dcd57fbdccaa5bf6b4ed5 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 3 Jul 2014 22:10:26 +0400 Subject: [PATCH 043/479] Ignore .coverage --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 5fa75b9b..0eaa5553 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ build dist *.egg-info .idea +.coverage From aaa52040d00115fab5a6a6e4672217f9c24ff7b0 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 3 Jul 2014 22:24:17 +0400 Subject: [PATCH 044/479] bump the version in setup.py this should be from __init__ file not hardcoded --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0b2cf08a..c696851e 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ def run(self): setup( name="html2text", - version="2014.4.5", + version="2014.7.3", description="Turn HTML into equivalent Markdown-structured text.", author="Aaron Swartz", author_email="me@aaronsw.com", From 96c2b096dedff6b32996c23fdc9e333c5c6c34af Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 10:06:30 +0400 Subject: [PATCH 045/479] Options in table --- README.md | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 
deletions(-) diff --git a/README.md b/README.md index ac142837..a51c1f66 100644 --- a/README.md +++ b/README.md @@ -6,23 +6,22 @@ html2text is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). + Usage: `html2text.py [(filename|url) [encoding]]` - Options: - --version show program's version number and exit - -h, --help show this help message and exit - --ignore-links don't include any formatting for links - --ignore-images don't include any formatting for images - -g, --google-doc convert an html-exported Google Document - -d, --dash-unordered-list - use a dash rather than a star for unordered list items - -b BODY_WIDTH, --body-width=BODY_WIDTH - number of characters per output line, 0 for no wrap - -i LIST_INDENT, --google-list-indent=LIST_INDENT - number of pixels Google indents nested lists - -s, --hide-strikethrough - hide strike-through text. only relevent when -g is - specified as well + +| Option | Description +|--------------------------------------------------------|-------------------------------------------------- +| `--version` | Show program's version number and exit +| `-h`, `--help` | Show this help message and exit +| `--ignore-links` | Don't include any formatting for links +|`--ignore-images` | Don't include any formatting for images +|`-g`, `--google-doc` | Convert an html-exported Google Document +|`-d`, `--dash-unordered-list` | Use a dash rather than a star for unordered list items +|`-b` `BODY_WIDTH`, `--body-width`=`BODY_WIDTH` | Number of characters per output line, `0` for no wrap +|`-i` `LIST_INDENT`, `--google-list-indent`=`LIST_INDENT`| Number of pixels Google indents nested lists +|`-s`, `--hide-strikethrough` | Hide strike-through text. 
only relevent when `-g` is specified as well + Or you can use it from within Python: From 4118a447488b3995b38b2eacdde5348ac31d5f04 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 10:08:46 +0400 Subject: [PATCH 046/479] Some markdown eye candy job --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a51c1f66..7b640b75 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Usage: `html2text.py [(filename|url) [encoding]]` |`-s`, `--hide-strikethrough` | Hide strike-through text. only relevent when `-g` is specified as well -Or you can use it from within Python: +Or you can use it from within `Python`: import html2text print html2text.html2text("

      Hello, world.

      ") From 1fa339cfb6496c0323da9c187230908a8b3971f2 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 10:09:58 +0400 Subject: [PATCH 047/479] Fix running tests command Without this I would get an error on any machine :| --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7b640b75..ddcb745e 100644 --- a/README.md +++ b/README.md @@ -50,4 +50,4 @@ $ pip install html2text ## How to run unit tests - python test/test_html2text.py -v + PYTHONPATH=$PYTHONPATH:. coverage run --source=html2text setup.py test -v From 53c683b06e35b309fefaceeedcd09365259b8877 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 10:53:45 +0400 Subject: [PATCH 048/479] Remove meta vars --- html2text.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/html2text.py b/html2text.py index dd82b63d..0e98fec6 100755 --- a/html2text.py +++ b/html2text.py @@ -2,11 +2,10 @@ # coding: utf-8 """html2text: Turn HTML into equivalent Markdown-structured text.""" from __future__ import division + + __version__ = "2014.7.3" -__author__ = "Aaron Swartz (me@aaronsw.com)" -__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." -__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", - "Kevin Jay North", "Matěj Cepl"] + # TODO: # Support decoded entities with unifiable. 
From 8e05335c5d914792ce08a9289f431cf39cd90b6a Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 10:54:57 +0400 Subject: [PATCH 049/479] Explicit except --- html2text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/html2text.py b/html2text.py index 0e98fec6..306cd5a5 100755 --- a/html2text.py +++ b/html2text.py @@ -20,7 +20,7 @@ import html.parser as HTMLParser try: # Python3 import urllib.request as urllib -except: +except ImportError: import urllib import optparse import re @@ -28,7 +28,7 @@ try: from textwrap import wrap -except: +except ImportError: pass # Use Unicode characters instead of their ascii psuedo-replacements From ca7354b5be2b18bb91a2a26d62efef72e22de570 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 10:59:31 +0400 Subject: [PATCH 050/479] Add docstring to dumb_property_dict --- html2text.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/html2text.py b/html2text.py index 306cd5a5..d2dbce2a 100755 --- a/html2text.py +++ b/html2text.py @@ -98,7 +98,8 @@ def hn(tag): def dumb_property_dict(style): - """returns a hash of css attributes""" + """ + :returns: A hash of css attributes""" out = dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z]]) From 939f8eddd105570271503b127bb7b2bcf6e06b81 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 10:59:39 +0400 Subject: [PATCH 051/479] Add docstring to dumb_css_parser --- html2text.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/html2text.py b/html2text.py index d2dbce2a..f70e6362 100755 --- a/html2text.py +++ b/html2text.py @@ -107,8 +107,13 @@ def dumb_property_dict(style): def dumb_css_parser(data): - """returns a hash of css selectors, each of which contains a hash of - css attributes""" + """ + :type data: str + + :returns: A hash of css selectors, each of which contains a hash of + css attributes. 
+ :rtype: dict + """ # remove @import sentences data += ';' importIndex = data.find('@import') From 0c20cf12ef95906f9e595ccfb5c06673c9c0d440 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 10:59:50 +0400 Subject: [PATCH 052/479] Add docstring to element_style --- html2text.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/html2text.py b/html2text.py index f70e6362..a873cbda 100755 --- a/html2text.py +++ b/html2text.py @@ -134,7 +134,14 @@ def dumb_css_parser(data): def element_style(attrs, style_def, parent_style): - """returns a hash of the 'final' style attributes of the element""" + """ + :type attrs: dict + :type style_def: dict + :type style_def: dict + + :returns: A hash of the 'final' style attributes of the element + :rtype: dict + """ style = parent_style.copy() if 'class' in attrs: for css_class in attrs['class'].split(): From 7eb2e4eba74129b29c0ce1c5d70774228099f2a0 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 11:09:17 +0400 Subject: [PATCH 053/479] Move html2text to separate python package --- html2text.py => html2text/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename html2text.py => html2text/__init__.py (100%) mode change 100755 => 100644 diff --git a/html2text.py b/html2text/__init__.py old mode 100755 new mode 100644 similarity index 100% rename from html2text.py rename to html2text/__init__.py From 60c3709f858b0694441fdfbd1e2390d02207cb3d Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 11:09:30 +0400 Subject: [PATCH 054/479] Fix cmd_name --- test/test_html2text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_html2text.py b/test/test_html2text.py index afae9108..28667fcf 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -35,7 +35,7 @@ def test_module(fn, google_doc=False, **kwargs): def test_command(fn, *args): args = list(args) - cmd_name = os.path.join(os.path.dirname(fn), '..', 
'html2text.py') + cmd_name = os.path.join(os.path.dirname(fn), '..', 'html2text/__init__.py') cmd = [sys.executable, cmd_name] if '--googledoc' in args: From c2da06b05d25c195eeb3112f0e76f1495abc8989 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 11:10:23 +0400 Subject: [PATCH 055/479] Tiny pep8 --- test/test_html2text.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/test_html2text.py b/test/test_html2text.py index 28667fcf..929a6afd 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -4,16 +4,19 @@ import re import subprocess import sys + if sys.version_info[:2] < (2, 7): import unittest2 as unittest else: import unittest import logging + logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', level=logging.DEBUG) import html2text + def test_module(fn, google_doc=False, **kwargs): h = html2text.HTML2Text() h.fn = fn @@ -33,6 +36,7 @@ def test_module(fn, google_doc=False, **kwargs): inf.close() return result, actual + def test_command(fn, *args): args = list(args) cmd_name = os.path.join(os.path.dirname(fn), '..', 'html2text/__init__.py') @@ -61,12 +65,15 @@ def test_command(fn, *args): return result, actual + def get_dump_name(fn, suffix): return '%s-%s_output.md' % (os.path.splitext(fn)[0], suffix) + def get_baseline_name(fn): return os.path.splitext(fn)[0] + '.md' + def get_baseline(fn): name = get_baseline_name(fn) f = codecs.open(name, mode='r', encoding='utf8') @@ -74,9 +81,11 @@ def get_baseline(fn): f.close() return out + class TestHTML2Text(unittest.TestCase): pass + def generate_test(fn): def test_mod(self): self.maxDiff = None From 02d126c0260692338ce9075e0711e0967c960f4e Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 11:19:47 +0400 Subject: [PATCH 056/479] Move const and conf vars to config.py module --- html2text/__init__.py | 43 ----------------------------------------- html2text/config.py | 45 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 
insertions(+), 43 deletions(-) create mode 100644 html2text/config.py diff --git a/html2text/__init__.py b/html2text/__init__.py index a873cbda..8f7ef281 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -31,55 +31,12 @@ except ImportError: pass -# Use Unicode characters instead of their ascii psuedo-replacements -UNICODE_SNOB = 0 - -# Escape all special characters. Output is less readable, but avoids -# corner case formatting issues. -ESCAPE_SNOB = 0 - -# Put the links after each paragraph instead of at the end. -LINKS_EACH_PARAGRAPH = 0 - -# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) -BODY_WIDTH = 78 - -# Don't show internal links (href="#local-anchor") -- corresponding link -# targets won't be visible in the plain text file anyway. -SKIP_INTERNAL_LINKS = True - -# Use inline, rather than reference, formatting for images and links -INLINE_LINKS = True - -# Number of pixels Google indents nested lists -GOOGLE_LIST_INDENT = 36 - -IGNORE_ANCHORS = False -IGNORE_IMAGES = False -IGNORE_EMPHASIS = False - -### Entity Nonsense ### -# For checking space-only lines on line 771 -SPACE_RE = re.compile(r'\s\+') - def name2cp(k): if k == 'apos': return ord("'") return htmlentitydefs.name2codepoint[k] - -unifiable = {'rsquo': "'", 'lsquo': "'", 'rdquo': '"', 'ldquo': '"', - 'copy': '(C)', 'mdash': '--', 'nbsp': ' ', 'rarr': '->', - 'larr': '<-', 'middot': '*', 'ndash': '-', 'oelig': 'oe', - 'aelig': 'ae', 'agrave': 'a', 'aacute': 'a', 'acirc': 'a', - 'atilde': 'a', 'auml': 'a', 'aring': 'a', 'egrave': 'e', - 'eacute': 'e', 'ecirc': 'e', 'euml': 'e', 'igrave': 'i', - 'iacute': 'i', 'icirc': 'i', 'iuml': 'i', 'ograve': 'o', - 'oacute': 'o', 'ocirc': 'o', 'otilde': 'o', 'ouml': 'o', - 'ugrave': 'u', 'uacute': 'u', 'ucirc': 'u', 'uuml': 'u', - 'lrm': '', 'rlm': ''} - unifiable_n = {} for k in unifiable.keys(): diff --git a/html2text/config.py b/html2text/config.py new file mode 100644 index 00000000..2d056726 --- /dev/null +++ 
b/html2text/config.py @@ -0,0 +1,45 @@ +# Use Unicode characters instead of their ascii psuedo-replacements +import re + +UNICODE_SNOB = 0 + +# Escape all special characters. Output is less readable, but avoids +# corner case formatting issues. +ESCAPE_SNOB = 0 + +# Put the links after each paragraph instead of at the end. +LINKS_EACH_PARAGRAPH = 0 + +# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) +BODY_WIDTH = 78 + +# Don't show internal links (href="#local-anchor") -- corresponding link +# targets won't be visible in the plain text file anyway. +SKIP_INTERNAL_LINKS = True + +# Use inline, rather than reference, formatting for images and links +INLINE_LINKS = True + +# Number of pixels Google indents nested lists +GOOGLE_LIST_INDENT = 36 + +IGNORE_ANCHORS = False +IGNORE_IMAGES = False +IGNORE_EMPHASIS = False + +### Entity Nonsense ### +# For checking space-only lines on line 771 +SPACE_RE = re.compile(r'\s\+') + + + +unifiable = {'rsquo': "'", 'lsquo': "'", 'rdquo': '"', 'ldquo': '"', + 'copy': '(C)', 'mdash': '--', 'nbsp': ' ', 'rarr': '->', + 'larr': '<-', 'middot': '*', 'ndash': '-', 'oelig': 'oe', + 'aelig': 'ae', 'agrave': 'a', 'aacute': 'a', 'acirc': 'a', + 'atilde': 'a', 'auml': 'a', 'aring': 'a', 'egrave': 'e', + 'eacute': 'e', 'ecirc': 'e', 'euml': 'e', 'igrave': 'i', + 'iacute': 'i', 'icirc': 'i', 'iuml': 'i', 'ograve': 'o', + 'oacute': 'o', 'ocirc': 'o', 'otilde': 'o', 'ouml': 'o', + 'ugrave': 'u', 'uacute': 'u', 'ucirc': 'u', 'uuml': 'u', + 'lrm': '', 'rlm': ''} From 192ca940f8f65b4489d46ae523e9689fd1ca0366 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 11:20:21 +0400 Subject: [PATCH 057/479] Using config module --- html2text/__init__.py | 48 ++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 8f7ef281..985967c5 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -2,6 +2,7 @@ # coding: 
utf-8 """html2text: Turn HTML into equivalent Markdown-structured text.""" from __future__ import division +from html2text import config __version__ = "2014.7.3" @@ -39,8 +40,9 @@ def name2cp(k): unifiable_n = {} -for k in unifiable.keys(): - unifiable_n[name2cp(k)] = unifiable[k] +for k in config.unifiable.keys(): + unifiable_n[name2cp(k)] = config.unifiable[k] + ### End Entity Nonsense ### @@ -161,7 +163,7 @@ def list_numbering_start(attrs): class HTML2Text(HTMLParser.HTMLParser): - def __init__(self, out=None, baseurl='', bodywidth=BODY_WIDTH): + def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): """ Input parameters: out: possible custom replacement for self.outtextf (which @@ -171,16 +173,16 @@ def __init__(self, out=None, baseurl='', bodywidth=BODY_WIDTH): HTMLParser.HTMLParser.__init__(self) # Config options - self.unicode_snob = UNICODE_SNOB - self.escape_snob = ESCAPE_SNOB - self.links_each_paragraph = LINKS_EACH_PARAGRAPH + self.unicode_snob = config.UNICODE_SNOB + self.escape_snob = config.ESCAPE_SNOB + self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH self.body_width = bodywidth - self.skip_internal_links = SKIP_INTERNAL_LINKS - self.inline_links = INLINE_LINKS - self.google_list_indent = GOOGLE_LIST_INDENT - self.ignore_links = IGNORE_ANCHORS - self.ignore_images = IGNORE_IMAGES - self.ignore_emphasis = IGNORE_EMPHASIS + self.skip_internal_links = config.SKIP_INTERNAL_LINKS + self.inline_links = config.INLINE_LINKS + self.google_list_indent = config.GOOGLE_LIST_INDENT + self.ignore_links = config.IGNORE_ANCHORS + self.ignore_images = config.IGNORE_IMAGES + self.ignore_emphasis = config.IGNORE_EMPHASIS self.google_doc = False self.ul_item_mark = '*' self.emphasis_mark = '_' @@ -227,7 +229,7 @@ def __init__(self, out=None, baseurl='', bodywidth=BODY_WIDTH): del unifiable_n[name2cp('nbsp')] except KeyError: pass - unifiable['nbsp'] = ' _place_holder;' + config.unifiable['nbsp'] = ' _place_holder;' def feed(self, data): data = 
data.replace("", "") @@ -739,8 +741,8 @@ def charref(self, name): return chr(c) def entityref(self, c): - if not self.unicode_snob and c in unifiable.keys(): - return unifiable[c] + if not self.unicode_snob and c in config.unifiable.keys(): + return config.unifiable[c] else: try: name2cp(c) @@ -748,7 +750,7 @@ def entityref(self, c): return "&" + c + ';' else: if c == 'nbsp': - return unifiable[c] + return config.unifiable[c] else: try: return unichr(name2cp(c)) @@ -798,7 +800,7 @@ def optwrap(self, text): # Be aware that obvious replacement of this with # line.isspace() # DOES NOT work! Explanations are welcome. - if not SPACE_RE.match(para): + if not config.SPACE_RE.match(para): result += para + "\n" newlines = 1 else: @@ -870,7 +872,7 @@ def wrapwrite(text): sys.stdout.write(text) -def html2text(html, baseurl='', bodywidth=BODY_WIDTH): +def html2text(html, baseurl='', bodywidth=config.BODY_WIDTH): h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth) return h.handle(html) @@ -904,13 +906,13 @@ def main(): p = optparse.OptionParser('%prog [(filename|url) [encoding]]', version='%prog ' + __version__) p.add_option("--ignore-emphasis", dest="ignore_emphasis", - action="store_true", default=IGNORE_EMPHASIS, + action="store_true", default=config.IGNORE_EMPHASIS, help="don't include any formatting for emphasis") p.add_option("--ignore-links", dest="ignore_links", action="store_true", - default=IGNORE_ANCHORS, + default=config.IGNORE_ANCHORS, help="don't include any formatting for links") p.add_option("--ignore-images", dest="ignore_images", action="store_true", - default=IGNORE_IMAGES, + default=config.IGNORE_IMAGES, help="don't include any formatting for images") p.add_option("-g", "--google-doc", action="store_true", dest="google_doc", default=False, @@ -923,10 +925,10 @@ def main(): help="use an asterisk rather than an underscore " + "for emphasized text") p.add_option("-b", "--body-width", dest="body_width", action="store", - type="int", default=BODY_WIDTH, + 
type="int", default=config.BODY_WIDTH, help="number of characters per output line, 0 for no wrap") p.add_option("-i", "--google-list-indent", dest="list_indent", - action="store", type="int", default=GOOGLE_LIST_INDENT, + action="store", type="int", default=config.GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists") p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough", default=False, From 75dfb51b436e5a6888be6ba0207fb625f0176d73 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 11:22:04 +0400 Subject: [PATCH 058/479] Code eye candy everybody --- html2text/config.py | 54 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/html2text/config.py b/html2text/config.py index 2d056726..9c4d9817 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -1,6 +1,6 @@ -# Use Unicode characters instead of their ascii psuedo-replacements import re +# Use Unicode characters instead of their ascii psuedo-replacements UNICODE_SNOB = 0 # Escape all special characters. 
Output is less readable, but avoids @@ -31,15 +31,43 @@ # For checking space-only lines on line 771 SPACE_RE = re.compile(r'\s\+') - - -unifiable = {'rsquo': "'", 'lsquo': "'", 'rdquo': '"', 'ldquo': '"', - 'copy': '(C)', 'mdash': '--', 'nbsp': ' ', 'rarr': '->', - 'larr': '<-', 'middot': '*', 'ndash': '-', 'oelig': 'oe', - 'aelig': 'ae', 'agrave': 'a', 'aacute': 'a', 'acirc': 'a', - 'atilde': 'a', 'auml': 'a', 'aring': 'a', 'egrave': 'e', - 'eacute': 'e', 'ecirc': 'e', 'euml': 'e', 'igrave': 'i', - 'iacute': 'i', 'icirc': 'i', 'iuml': 'i', 'ograve': 'o', - 'oacute': 'o', 'ocirc': 'o', 'otilde': 'o', 'ouml': 'o', - 'ugrave': 'u', 'uacute': 'u', 'ucirc': 'u', 'uuml': 'u', - 'lrm': '', 'rlm': ''} +unifiable = { + 'rsquo': "'", + 'lsquo': "'", + 'rdquo': '"', + 'ldquo': '"', + 'copy': '(C)', + 'mdash': '--', + 'nbsp': ' ', + 'rarr': '->', + 'larr': '<-', + 'middot': '*', + 'ndash': '-', + 'oelig': 'oe', + 'aelig': 'ae', + 'agrave': 'a', + 'aacute': 'a', + 'acirc': 'a', + 'atilde': 'a', + 'auml': 'a', + 'aring': 'a', + 'egrave': 'e', + 'eacute': 'e', + 'ecirc': 'e', + 'euml': 'e', + 'igrave': 'i', + 'iacute': 'i', + 'icirc': 'i', + 'iuml': 'i', + 'ograve': 'o', + 'oacute': 'o', + 'ocirc': 'o', + 'otilde': 'o', + 'ouml': 'o', + 'ugrave': 'u', + 'uacute': 'u', + 'ucirc': 'u', + 'uuml': 'u', + 'lrm': '', + 'rlm': '' +} From f93441c184fd5efeff6abccfc8b4c180cc197be9 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 11:41:53 +0400 Subject: [PATCH 059/479] Eye candy and docstring --- html2text/__init__.py | 67 ++++++++++++++++++++++++++++++++----------- html2text/config.py | 1 - 2 files changed, 51 insertions(+), 17 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 985967c5..95175967 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -44,8 +44,6 @@ def name2cp(k): unifiable_n[name2cp(k)] = config.unifiable[k] - -### End Entity Nonsense ### def hn(tag): if tag[0] == 'h' and len(tag) == 2: try: @@ -58,7 +56,8 @@ def 
hn(tag): def dumb_property_dict(style): """ - :returns: A hash of css attributes""" + :returns: A hash of css attributes + """ out = dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z]]) @@ -113,7 +112,13 @@ def element_style(attrs, style_def, parent_style): def google_list_style(style): - """finds out whether this is an ordered or unordered list""" + """ + Finds out whether this is an ordered or unordered list + + :type style: dict + + :rtype: str + """ if 'list-style-type' in style: list_style = style['list-style-type'] if list_style in ['disc', 'circle', 'square', 'none']: @@ -122,15 +127,26 @@ def google_list_style(style): def google_has_height(style): - """check if the style of the element has the 'height' attribute - explicitly defined""" + """ + Check if the style of the element has the 'height' attribute + explicitly defined + + :type style: dict + + :rtype: bool + """ if 'height' in style: return True return False def google_text_emphasis(style): - """return a list of all emphasis modifiers of the element""" + """ + :type style: dict + + :returns: A list of all emphasis modifiers of the element + :rtype: list + """ emphasis = [] if 'text-decoration' in style: emphasis.append(style['text-decoration']) @@ -142,7 +158,13 @@ def google_text_emphasis(style): def google_fixed_width_font(style): - """check if the css of the current element defines a fixed width font""" + """ + Check if the css of the current element defines a fixed width font + + :type style: dict + + :rtype: bool + """ font_family = '' if 'font-family' in style: font_family = style['font-family'] @@ -152,7 +174,13 @@ def google_fixed_width_font(style): def list_numbering_start(attrs): - """extract numbering from list element attributes""" + """ + Extract numbering from list element attributes + + :type attrs: dict + + :rtype: int or None + """ if 'start' in attrs: try: return int(attrs['start']) - 1 @@ -291,10 +319,12 @@ def handle_endtag(self, tag): 
self.handle_tag(tag, None, 0) def previousIndex(self, attrs): - """ returns the index of certain set of attributes (of a link) in the - self.a list + """ + :type attrs: dict - If the set of attributes is not found, returns None + :returns: The index of certain set of attributes (of a link) in the + self.a list. If the set of attributes is not found, returns None + :rtype: int """ if 'href' not in attrs: return None @@ -316,7 +346,9 @@ def previousIndex(self, attrs): return i def handle_emphasis(self, start, tag_style, parent_style): - """handles various text emphases""" + """ + Handles various text emphases + """ tag_emphasis = google_text_emphasis(tag_style) parent_emphasis = google_text_emphasis(parent_style) @@ -376,7 +408,6 @@ def handle_emphasis(self, start, tag_style, parent_style): self.quiet -= 1 def handle_tag(self, tag, attrs, start): - #attrs = fixattrs(attrs) # attrs is None for endtags if attrs is None: attrs = {} @@ -770,7 +801,9 @@ def unescape(self, s): return self.r_unescape.sub(self.replaceEntities, s) def google_nest_count(self, style): - """calculate the nesting count of google doc lists""" + """ + Calculate the nesting count of google doc lists + """ nest_count = 0 if 'margin-left' in style: nest_count = int(style['margin-left'][:-2]) \ @@ -778,7 +811,9 @@ def google_nest_count(self, style): return nest_count def optwrap(self, text): - """Wrap all paragraphs in the provided text.""" + """ + Wrap all paragraphs in the provided text. 
+ """ if not self.body_width: return text diff --git a/html2text/config.py b/html2text/config.py index 9c4d9817..03d96c00 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -27,7 +27,6 @@ IGNORE_IMAGES = False IGNORE_EMPHASIS = False -### Entity Nonsense ### # For checking space-only lines on line 771 SPACE_RE = re.compile(r'\s\+') From b1733bf8a160558eb77eaf3d5a0fb95424a72205 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 11:43:20 +0400 Subject: [PATCH 060/479] Docstring on escape_md_section --- html2text/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 95175967..07b6abe5 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -925,8 +925,11 @@ def escape_md(text): def escape_md_section(text, snob=False): - """Escapes markdown-sensitive characters across whole document sections.""" - text = md_backslash_matcher.sub(r"\\\1", text) + """ + Escapes markdown-sensitive characters across whole document sections. + """ + text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text) + if snob: text = md_chars_matcher_all.sub(r"\\\1", text) text = md_dot_matcher.sub(r"\1\\\2", text) From ce58eb5bb04e01d5d2a66e432fbbf0497710a145 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 11:43:36 +0400 Subject: [PATCH 061/479] Docstring on escape_md --- html2text/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 07b6abe5..35424637 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -919,9 +919,11 @@ def unescape(s, unicode_snob=False): def escape_md(text): - """Escapes markdown-sensitive characters within other markdown - constructs.""" - return md_chars_matcher.sub(r"\\\1", text) + """ + Escapes markdown-sensitive characters within other markdown + constructs. 
+ """ + return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text) def escape_md_section(text, snob=False): From dc3dfc6b6de5b8dc190470aabbe85ed42628281b Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 11:44:03 +0400 Subject: [PATCH 062/479] remove regex compiled vars to config --- html2text/__init__.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 35424637..e86501e8 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -844,36 +844,6 @@ def optwrap(self, text): newlines += 1 return result -ordered_list_matcher = re.compile(r'\d+\.\s') -unordered_list_matcher = re.compile(r'[-\*\+]\s') -md_chars_matcher = re.compile(r"([\\\[\]\(\)])") -md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])") -md_dot_matcher = re.compile(r""" - ^ # start of line - (\s*\d+) # optional whitespace and a number - (\.) # dot - (?=\s) # lookahead assert whitespace - """, re.MULTILINE | re.VERBOSE) -md_plus_matcher = re.compile(r""" - ^ - (\s*) - (\+) - (?=\s) - """, flags=re.MULTILINE | re.VERBOSE) -md_dash_matcher = re.compile(r""" - ^ - (\s*) - (-) - (?=\s|\-) # followed by whitespace (bullet list, or spaced out hr) - # or another dash (header or hr) - """, flags=re.MULTILINE | re.VERBOSE) -slash_chars = r'\`*_{}[]()#+-.!' 
-md_backslash_matcher = re.compile(r''' - (\\) # match one slash - (?=[%s]) # followed by a char that requires escaping - ''' % re.escape(slash_chars), - flags=re.VERBOSE) - def skipwrap(para): # If the text begins with four spaces or one tab, it's a code block; From c5c89e9a25cb2f5b6d7a7dee4219b3b2211b7b36 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 11:44:21 +0400 Subject: [PATCH 063/479] config var names refactoring --- html2text/__init__.py | 41 ++++++++++++++++++++++++----------------- html2text/config.py | 37 +++++++++++++++++++++++++++++++++++-- 2 files changed, 59 insertions(+), 19 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index e86501e8..21ad4fd2 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -9,7 +9,7 @@ # TODO: -# Support decoded entities with unifiable. +# Support decoded entities with UNIFIABLE. try: import htmlentitydefs @@ -40,8 +40,8 @@ def name2cp(k): unifiable_n = {} -for k in config.unifiable.keys(): - unifiable_n[name2cp(k)] = config.unifiable[k] +for k in config.UNIFIABLE.keys(): + unifiable_n[name2cp(k)] = config.UNIFIABLE[k] def hn(tag): @@ -257,7 +257,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): del unifiable_n[name2cp('nbsp')] except KeyError: pass - config.unifiable['nbsp'] = ' _place_holder;' + config.UNIFIABLE['nbsp'] = ' _place_holder;' def feed(self, data): data = data.replace("", "") @@ -772,8 +772,8 @@ def charref(self, name): return chr(c) def entityref(self, c): - if not self.unicode_snob and c in config.unifiable.keys(): - return config.unifiable[c] + if not self.unicode_snob and c in config.UNIFIABLE.keys(): + return config.UNIFIABLE[c] else: try: name2cp(c) @@ -781,7 +781,7 @@ def entityref(self, c): return "&" + c + ';' else: if c == 'nbsp': - return config.unifiable[c] + return config.UNIFIABLE[c] else: try: return unichr(name2cp(c)) @@ -795,10 +795,8 @@ def replaceEntities(self, s): else: return self.entityref(s) - 
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));") - def unescape(self, s): - return self.r_unescape.sub(self.replaceEntities, s) + return config.RE_UNESCAPE.sub(self.replaceEntities, s) def google_nest_count(self, style): """ @@ -808,6 +806,7 @@ def google_nest_count(self, style): if 'margin-left' in style: nest_count = int(style['margin-left'][:-2]) \ // self.google_list_indent + return nest_count def optwrap(self, text): @@ -835,7 +834,7 @@ def optwrap(self, text): # Be aware that obvious replacement of this with # line.isspace() # DOES NOT work! Explanations are welcome. - if not config.SPACE_RE.match(para): + if not config.RE_SPACE.match(para): result += para + "\n" newlines = 1 else: @@ -850,22 +849,26 @@ def skipwrap(para): # don't wrap if para[0:4] == ' ' or para[0] == '\t': return True + # If the text begins with only two "--", possibly preceded by # whitespace, that's an emdash; so wrap. stripped = para.lstrip() if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-": return False + # I'm not sure what this is for; I thought it was to detect lists, # but there's a
      -inside- case in one of the tests that # also depends upon it. if stripped[0:1] == '-' or stripped[0:1] == '*': return True + # If the text begins with a single -, *, or +, followed by a space, # or an integer, followed by a ., followed by a space (in either # case optionally proceeded by whitespace), it's a list; don't wrap. - if ordered_list_matcher.match(stripped) or \ - unordered_list_matcher.match(stripped): + if config.RE_ORDERED_LIST_MATCHER.match(stripped) or \ + config.RE_UNORDERED_LIST_MATCHER.match(stripped): return True + return False @@ -879,12 +882,14 @@ def wrapwrite(text): def html2text(html, baseurl='', bodywidth=config.BODY_WIDTH): h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth) + return h.handle(html) def unescape(s, unicode_snob=False): h = HTML2Text() h.unicode_snob = unicode_snob + return h.unescape(s) @@ -903,10 +908,12 @@ def escape_md_section(text, snob=False): text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text) if snob: - text = md_chars_matcher_all.sub(r"\\\1", text) - text = md_dot_matcher.sub(r"\1\\\2", text) - text = md_plus_matcher.sub(r"\1\\\2", text) - text = md_dash_matcher.sub(r"\1\\\2", text) + text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text) + + text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text) + text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text) + text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text) + return text diff --git a/html2text/config.py b/html2text/config.py index 03d96c00..f87b5241 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -28,9 +28,42 @@ IGNORE_EMPHASIS = False # For checking space-only lines on line 771 -SPACE_RE = re.compile(r'\s\+') +RE_SPACE = re.compile(r'\s\+') -unifiable = { +RE_UNESCAPE = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));") +RE_ORDERED_LIST_MATCHER = re.compile(r'\d+\.\s') +RE_UNORDERED_LIST_MATCHER = re.compile(r'[-\*\+]\s') +RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])") +RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])") 
+RE_MD_DOT_MATCHER = re.compile(r""" + ^ # start of line + (\s*\d+) # optional whitespace and a number + (\.) # dot + (?=\s) # lookahead assert whitespace + """, re.MULTILINE | re.VERBOSE) +RE_MD_PLUS_MATCHER = re.compile(r""" + ^ + (\s*) + (\+) + (?=\s) + """, flags=re.MULTILINE | re.VERBOSE) +RE_MD_DASH_MATCHER = re.compile(r""" + ^ + (\s*) + (-) + (?=\s|\-) # followed by whitespace (bullet list, or spaced out hr) + # or another dash (header or hr) + """, flags=re.MULTILINE | re.VERBOSE) +RE_SLASH_CHARS = r'\`*_{}[]()#+-.!' +RE_MD_BACKSLASH_MATCHER = re.compile(r''' + (\\) # match one slash + (?=[%s]) # followed by a char that requires escaping + ''' % re.escape(RE_SLASH_CHARS), + flags=re.VERBOSE) + + + +UNIFIABLE = { 'rsquo': "'", 'lsquo': "'", 'rdquo': '"', From 08c136697e1006b1b45ef7f80b9813ce74b9af8f Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 11:47:47 +0400 Subject: [PATCH 064/479] Eye candy on defining options --- html2text/__init__.py | 106 +++++++++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 33 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 21ad4fd2..d0515539 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -922,39 +922,79 @@ def main(): p = optparse.OptionParser('%prog [(filename|url) [encoding]]', version='%prog ' + __version__) - p.add_option("--ignore-emphasis", dest="ignore_emphasis", - action="store_true", default=config.IGNORE_EMPHASIS, - help="don't include any formatting for emphasis") - p.add_option("--ignore-links", dest="ignore_links", action="store_true", - default=config.IGNORE_ANCHORS, - help="don't include any formatting for links") - p.add_option("--ignore-images", dest="ignore_images", action="store_true", - default=config.IGNORE_IMAGES, - help="don't include any formatting for images") - p.add_option("-g", "--google-doc", action="store_true", dest="google_doc", - default=False, - help="convert an html-exported Google Document") - 
p.add_option("-d", "--dash-unordered-list", action="store_true", - dest="ul_style_dash", default=False, - help="use a dash rather than a star for unordered list items") - p.add_option("-e", "--asterisk-emphasis", action="store_true", - dest="em_style_asterisk", default=False, - help="use an asterisk rather than an underscore " + - "for emphasized text") - p.add_option("-b", "--body-width", dest="body_width", action="store", - type="int", default=config.BODY_WIDTH, - help="number of characters per output line, 0 for no wrap") - p.add_option("-i", "--google-list-indent", dest="list_indent", - action="store", type="int", default=config.GOOGLE_LIST_INDENT, - help="number of pixels Google indents nested lists") - p.add_option("-s", "--hide-strikethrough", action="store_true", - dest="hide_strikethrough", default=False, - help="hide strike-through text. only relevant when -g is " + - "specified as well") - p.add_option("--escape-all", action="store_true", dest="escape_snob", - default=False, - help="Escape all special characters. 
Output is less " + - "readable, but avoids corner case formatting issues.") + p.add_option( + "--ignore-emphasis", + dest="ignore_emphasis", + action="store_true", + default=config.IGNORE_EMPHASIS, + help="don't include any formatting for emphasis" + ) + p.add_option( + "--ignore-links", + dest="ignore_links", + action="store_true", + default=config.IGNORE_ANCHORS, + help="don't include any formatting for links") + p.add_option( + "--ignore-images", + dest="ignore_images", + action="store_true", + default=config.IGNORE_IMAGES, + help="don't include any formatting for images" + ) + p.add_option( + "-g", "--google-doc", + action="store_true", + dest="google_doc", + default=False, + help="convert an html-exported Google Document" + ) + p.add_option( + "-d", "--dash-unordered-list", + action="store_true", + dest="ul_style_dash", + default=False, + help="use a dash rather than a star for unordered list items" + ) + p.add_option( + "-e", "--asterisk-emphasis", + action="store_true", + dest="em_style_asterisk", + default=False, + help="use an asterisk rather than an underscore for emphasized text" + ) + p.add_option( + "-b", "--body-width", + dest="body_width", + action="store", + type="int", + default=config.BODY_WIDTH, + help="number of characters per output line, 0 for no wrap" + ) + p.add_option( + "-i", "--google-list-indent", + dest="list_indent", + action="store", + type="int", + default=config.GOOGLE_LIST_INDENT, + help="number of pixels Google indents nested lists" + ) + p.add_option( + "-s", "--hide-strikethrough", + action="store_true", + dest="hide_strikethrough", + default=False, + help="hide strike-through text. only relevant when -g is " + "specified as well" + ) + p.add_option( + "--escape-all", + action="store_true", + dest="escape_snob", + default=False, + help="Escape all special characters. Output is less readable, but " + "avoids corner case formatting issues." 
+ ) (options, args) = p.parse_args() # process input From 2f33f950be23c11c810703de65507429b63b30f4 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 4 Jul 2014 11:50:31 +0400 Subject: [PATCH 065/479] Add --escape-all option to readme --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index ddcb745e..7598f7fe 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,8 @@ Usage: `html2text.py [(filename|url) [encoding]]` |`-b` `BODY_WIDTH`, `--body-width`=`BODY_WIDTH` | Number of characters per output line, `0` for no wrap |`-i` `LIST_INDENT`, `--google-list-indent`=`LIST_INDENT`| Number of pixels Google indents nested lists |`-s`, `--hide-strikethrough` | Hide strike-through text. only relevent when `-g` is specified as well +|`--escape-all` | Escape all special characters. Output is less readable, but avoids corner case formatting issues. + Or you can use it from within `Python`: From 0d50cac958c144eaa147b923453340a1d0b19b29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= Date: Fri, 4 Jul 2014 12:14:14 +0200 Subject: [PATCH 066/479] Fix setup.py test to actually fail when it should. We should really move to setuptools proper. --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 0b2cf08a..cc7bf103 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ # coding: utf-8 import sys -from setuptools import setup, find_packages, Command +from setuptools import setup, Command requires_list = [] try: @@ -28,7 +28,8 @@ def finalize_options(self): def run(self): tests = unittest.TestLoader().discover('.') runner = unittest.TextTestRunner() - runner.run(tests) + results = runner.run(tests) + sys.exit(not results.wasSuccessful()) setup( From 30aec19fd5b8d382bfe842ffa99572237d2ead2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= Date: Fri, 4 Jul 2014 14:03:45 +0200 Subject: [PATCH 067/479] Python 3.2 doesn't have unicode strings. 
Fix #14 --- test/test_memleak.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_memleak.py b/test/test_memleak.py index fd2ae18e..27999fbc 100644 --- a/test/test_memleak.py +++ b/test/test_memleak.py @@ -29,4 +29,4 @@ def test_empty_string(self): h2t = html2text.HTML2Text() h2t.handle(self.instr) # And even less when the input is empty - self.assertEqual(h2t.handle(''), u'\n\n') + self.assertEqual(h2t.handle(''), '\n\n') From 2d62f7837fd6943f6aeeca52ec6822cf7b7b5282 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Boulogne?= Date: Fri, 4 Jul 2014 21:22:17 -0400 Subject: [PATCH 068/479] add python 3.4 --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 79639e30..66b5fe40 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ python: - "pypy" - "3.2" - "3.3" + - "3.4" install: - pip install coveralls before_script: From 01d21b37cf7b1e17062fdd8caa8d72fdbac7988b Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 5 Jul 2014 05:34:05 +0400 Subject: [PATCH 069/479] Fix badge status before going for pypim --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7598f7fe..58a7fb42 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # [html2text](http://www.aaronsw.com/2002/html2text/) -[![Build Status](https://secure.travis-ci.org/html2text/html2text.png)](http://travis-ci.org/html2text/html2text) -[![Coverage Status](https://coveralls.io/repos/htmL2text/html2text/badge.png)](https://coveralls.io/r/htmL2text/html2text) +[![Build Status](https://secure.travis-ci.org/Alir3z4/html2text.png)](http://travis-ci.org/Alir3z4/html2text) +[![Coverage Status](https://coveralls.io/repos/Alir3z4/html2text/badge.png)](https://coveralls.io/r/Alir3z4/html2text) html2text is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). 
From 9b0223d57ca0699a8e99492c27407facf2cfccc9 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 7 Jul 2014 03:41:45 +0400 Subject: [PATCH 070/479] Add more docstring to optwrap --- html2text/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/html2text/__init__.py b/html2text/__init__.py index d0515539..0b641892 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -812,6 +812,10 @@ def google_nest_count(self, style): def optwrap(self, text): """ Wrap all paragraphs in the provided text. + + :type text: str + + :rtype: str """ if not self.body_width: return text From f69a8e09f689a6ba162d43faca40206769bd4b55 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 7 Jul 2014 03:41:56 +0400 Subject: [PATCH 071/479] Add more docstring to google_nest_count --- html2text/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/html2text/__init__.py b/html2text/__init__.py index 0b641892..4a3bdc75 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -801,6 +801,10 @@ def unescape(self, s): def google_nest_count(self, style): """ Calculate the nesting count of google doc lists + + :type style: dict + + :rtype: int """ nest_count = 0 if 'margin-left' in style: From c340b99bb0a808803b7d222a9cdf557ba0513f34 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 7 Jul 2014 03:42:18 +0400 Subject: [PATCH 072/479] Removed 2 blank lines to make the file fully PEP8 ;) --- html2text/config.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/html2text/config.py b/html2text/config.py index f87b5241..8fe02a23 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -61,8 +61,6 @@ ''' % re.escape(RE_SLASH_CHARS), flags=re.VERBOSE) - - UNIFIABLE = { 'rsquo': "'", 'lsquo': "'", From c1cb8cb3f9912ac6e11dd1b6d4039ea086d9990e Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 7 Jul 2014 04:10:15 +0400 Subject: [PATCH 073/479] Set the issue number --- ChangeLog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/ChangeLog.rst b/ChangeLog.rst index 09e12ea5..36fb24d1 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -4,7 +4,7 @@ * Fix #8: Remove ``How to do a release`` section from README.md. * Fix #11: Include test directory markdown, html files. -* Fix memory leak in using ``handle`` while keeping the old instance of ``html2text``. +* Fix #13: memory leak in using ``handle`` while keeping the old instance of ``html2text``. 2014.4.5 - 2014-04-05 From 10181387695af6682c6168168a827b877c0c5b68 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 7 Jul 2014 04:10:33 +0400 Subject: [PATCH 074/479] Update the changelog --- ChangeLog.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 36fb24d1..dca93c11 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,13 @@ +0000.0.0 - 0000-00-00 +===================== +---- + +* Fix ``unicode``/``type`` error in memory leak unit-test. +* Feature #18: Add ``Python`` ``3.4`` to travis config file. +* Feature #19: Bring ``html2text`` to a separate module and take out the ``conf``/``constant`` variables. +* Feature #21: Remove meta vars from ``html2text.py`` file header. +* + 2014.7.3 - 2014-07-03 ===================== ---- From efa5200d15304dba5ae1b4fd290b0c9758cc28fc Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 7 Jul 2014 04:15:59 +0400 Subject: [PATCH 075/479] Remove install_deps.py --- ChangeLog.rst | 3 ++- install_deps.py | 6 ------ 2 files changed, 2 insertions(+), 7 deletions(-) delete mode 100644 install_deps.py diff --git a/ChangeLog.rst b/ChangeLog.rst index dca93c11..0344551b 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -3,10 +3,11 @@ ---- * Fix ``unicode``/``type`` error in memory leak unit-test. +* Feature #16: Remove ``install_deps.py``. * Feature #18: Add ``Python`` ``3.4`` to travis config file. * Feature #19: Bring ``html2text`` to a separate module and take out the ``conf``/``constant`` variables. 
* Feature #21: Remove meta vars from ``html2text.py`` file header. -* + 2014.7.3 - 2014-07-03 ===================== diff --git a/install_deps.py b/install_deps.py deleted file mode 100644 index 51ec53aa..00000000 --- a/install_deps.py +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/python -import sys -import subprocess - -if sys.version_info[:2] < (2, 7): - subprocess.call('pip install unittest2 --use-mirrors', shell=True) From a020e4207b2a25ff6ca42e5ed9dd5eefd860432b Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 7 Jul 2014 04:37:05 +0400 Subject: [PATCH 076/479] Add Downloads --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 58a7fb42..09316977 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ [![Build Status](https://secure.travis-ci.org/Alir3z4/html2text.png)](http://travis-ci.org/Alir3z4/html2text) [![Coverage Status](https://coveralls.io/repos/Alir3z4/html2text/badge.png)](https://coveralls.io/r/Alir3z4/html2text) +[![Downloads](https://pypip.in/d/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) html2text is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). 
From c17e48f5b52fb7e986f6a46997fec02bd501a36d Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 7 Jul 2014 04:37:13 +0400 Subject: [PATCH 077/479] Add version badge --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 09316977..e9628c7b 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ [![Build Status](https://secure.travis-ci.org/Alir3z4/html2text.png)](http://travis-ci.org/Alir3z4/html2text) [![Coverage Status](https://coveralls.io/repos/Alir3z4/html2text/badge.png)](https://coveralls.io/r/Alir3z4/html2text) [![Downloads](https://pypip.in/d/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) +[![Version](https://pypip.in/v/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) html2text is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). From 84c73f7d6cddd592d24dc4c62cb2e865c6e17439 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 7 Jul 2014 04:37:25 +0400 Subject: [PATCH 078/479] Add Egg badge --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e9628c7b..4b9a36f8 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ [![Coverage Status](https://coveralls.io/repos/Alir3z4/html2text/badge.png)](https://coveralls.io/r/Alir3z4/html2text) [![Downloads](https://pypip.in/d/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) [![Version](https://pypip.in/v/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) +[![Egg?](https://pypip.in/egg/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) html2text is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). 
From 7f0d6dec88c1ba8af5ef2d027a6a6026ccffc642 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 7 Jul 2014 04:37:35 +0400 Subject: [PATCH 079/479] Add Wheel badge --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4b9a36f8..4311049b 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ [![Downloads](https://pypip.in/d/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) [![Version](https://pypip.in/v/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) [![Egg?](https://pypip.in/egg/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) +[![Wheel?](https://pypip.in/wheel/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) html2text is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). From a8e73a82b3a3bfe91b6eed5227d8870d5b0b4177 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 7 Jul 2014 04:37:45 +0400 Subject: [PATCH 080/479] Add format badge --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4311049b..54557ef4 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ [![Version](https://pypip.in/v/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) [![Egg?](https://pypip.in/egg/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) [![Wheel?](https://pypip.in/wheel/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) +[![Format](https://pypip.in/format/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) html2text is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). 
From e47b9ce86146014ebc01ad63d8d7c1bc19ca215c Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 7 Jul 2014 04:37:57 +0400 Subject: [PATCH 081/479] Add License badge --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 54557ef4..ceafb3c2 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ [![Egg?](https://pypip.in/egg/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) [![Wheel?](https://pypip.in/wheel/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) [![Format](https://pypip.in/format/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) +[![License](https://pypip.in/license/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) html2text is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). From d2de82921abbb5fa9e5e704c676c62e9c2c72c0c Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 7 Jul 2014 04:39:29 +0400 Subject: [PATCH 082/479] Update changelog --- ChangeLog.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 0344551b..83cc1f5b 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -4,6 +4,7 @@ * Fix ``unicode``/``type`` error in memory leak unit-test. * Feature #16: Remove ``install_deps.py``. +* Feature #17: Add status badges via pypin. * Feature #18: Add ``Python`` ``3.4`` to travis config file. * Feature #19: Bring ``html2text`` to a separate module and take out the ``conf``/``constant`` variables. * Feature #21: Remove meta vars from ``html2text.py`` file header. From aa9ae9cda56d6f3e38544bd6cff080fb59a37be6 Mon Sep 17 00:00:00 2001 From: Eben Freeman Date: Fri, 8 Aug 2014 17:21:33 -0700 Subject: [PATCH 083/479] Fix TypeError when parsing tags like <img src='foo' alt> In this case, doing attrs.get('alt', '') would return None, because attrs actually looks like {'src': 'foo', 'alt': None}.
This caused an unhandled TypeError in escape_md when trying to do regular-expression matching on the None value. --- html2text/__init__.py | 2 +- test/url-escaping.html | 1 + test/url-escaping.md | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 4a3bdc75..1d1b3ec6 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -542,7 +542,7 @@ def handle_tag(self, tag, attrs, start): if tag == "img" and start and not self.ignore_images: if 'src' in attrs: attrs['href'] = attrs['src'] - alt = attrs.get('alt', '') + alt = attrs.get('alt') or '' self.o("![" + escape_md(alt) + "]") if self.inline_links: diff --git a/test/url-escaping.html b/test/url-escaping.html index 5c5693ce..c4f68b9c 100644 --- a/test/url-escaping.html +++ b/test/url-escaping.html @@ -16,3 +16,4 @@

      Markdown-sensible characters processing

      [banana]
      {banana}
      ([{}]) + diff --git a/test/url-escaping.md b/test/url-escaping.md index ffb2bc87..f409d406 100644 --- a/test/url-escaping.md +++ b/test/url-escaping.md @@ -16,4 +16,5 @@ And here are images with tricky attribute values: ![\[banana\]](http://placehold.it/350x150#\[banana\]) ![{banana}](http://placehold.it/350x150#{banana}) ![\(\[{}\]\)](http://placehold.it/350x150#\(\[{}\]\)) +![](http://placehold.it/350x150#\(\[{}\]\)) From 53eb2a5884ed9e489bee079e4a6f9071ca072515 Mon Sep 17 00:00:00 2001 From: Eben Freeman Date: Fri, 8 Aug 2014 18:07:44 -0700 Subject: [PATCH 084/479] Fix TypeError when attrs['href'] is None in handle_tag. --- html2text/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 1d1b3ec6..e6f61200 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -546,7 +546,8 @@ def handle_tag(self, tag, attrs, start): self.o("![" + escape_md(alt) + "]") if self.inline_links: - self.o("(" + escape_md(attrs['href']) + ")") + href = attrs.get('href') or '' + self.o("(" + escape_md(href) + ")") else: i = self.previousIndex(attrs) if i is not None: From 750a66d7074cc13cfdf8a736df40a2f6ed637a8a Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 7 Sep 2014 17:21:20 +0400 Subject: [PATCH 085/479] Update changelog for #25 --- ChangeLog.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 83cc1f5b..01bb6264 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -8,6 +8,7 @@ * Feature #18: Add ``Python`` ``3.4`` to travis config file. * Feature #19: Bring ``html2text`` to a separate module and take out the ``conf``/``constant`` variables. * Feature #21: Remove meta vars from ``html2text.py`` file header. +* Fix: Fix TypeError when parsing tags like . Fixed in #25. 
2014.7.3 - 2014-07-03 From db19569c9bcbfc96e8c1d61aa03232a608c34191 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 7 Sep 2014 17:21:57 +0400 Subject: [PATCH 086/479] Better to be authors/contributors ;) --- AUTHORS.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index b62454d1..bf17c10f 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -1,6 +1,6 @@ ``html2text`` was Originally written by Aaron Swartz. -The AUTHORS are (and/or have been): +The AUTHORS/Contributors are (and/or have been): * Aaron Swartz * Yariv Barkan From f24283171c663502ce90d5e4bc418fdb575bf8de Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 7 Sep 2014 17:22:18 +0400 Subject: [PATCH 087/479] Bump version for release 2014.9.7 --- ChangeLog.rst | 2 +- html2text/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 01bb6264..a0632086 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,4 +1,4 @@ -0000.0.0 - 0000-00-00 +2014.9.7 - 2014-09-07 ===================== ---- diff --git a/html2text/__init__.py b/html2text/__init__.py index e6f61200..6d0b7007 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -5,7 +5,7 @@ from html2text import config -__version__ = "2014.7.3" +__version__ = "2014.9.7" # TODO: diff --git a/setup.py b/setup.py index 550e4f70..439d2a4d 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def run(self): setup( name="html2text", - version="2014.7.3", + version="2014.9.7", description="Turn HTML into equivalent Markdown-structured text.", author="Aaron Swartz", author_email="me@aaronsw.com", From 4d145e3f801ac54b15e23cf60e1ad0436717c0b1 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 8 Sep 2014 17:08:59 +0400 Subject: [PATCH 088/479] Include html2text pkg Fixes #28 --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 439d2a4d..63c3f382 100644 --- a/setup.py +++ 
b/setup.py @@ -1,6 +1,6 @@ # coding: utf-8 import sys -from setuptools import setup, Command +from setuptools import setup, Command, find_packages requires_list = [] try: @@ -67,6 +67,7 @@ def run(self): license='GNU GPL 3', requires=requires_list, py_modules=['html2text'], + packages=find_packages(), include_package_data=True, zip_safe=False, ) From 09fb2d4d46accd71887f32db7655df11e1aa1fc0 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 8 Sep 2014 17:14:33 +0400 Subject: [PATCH 089/479] Bump the version number * Update the changelog --- ChangeLog.rst | 7 +++++++ html2text/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index a0632086..52a440fa 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,10 @@ +2014.9.8 - 2014-09-08 +===================== +---- + +* Fix #28: missing ``html2text`` package in installation. + + 2014.9.7 - 2014-09-07 ===================== ---- diff --git a/html2text/__init__.py b/html2text/__init__.py index 6d0b7007..6f1badbf 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -5,7 +5,7 @@ from html2text import config -__version__ = "2014.9.7" +__version__ = "2014.9.8" # TODO: diff --git a/setup.py b/setup.py index 63c3f382..7a475634 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def run(self): setup( name="html2text", - version="2014.9.7", + version="2014.9.8", description="Turn HTML into equivalent Markdown-structured text.", author="Aaron Swartz", author_email="me@aaronsw.com", From a2190c53f3dd7089e1b1022826fc829b32a24fb9 Mon Sep 17 00:00:00 2001 From: Ivan Gromov Date: Thu, 11 Sep 2014 23:50:33 +0600 Subject: [PATCH 090/479] Added simple tables support with bypass option --- html2text/__init__.py | 43 ++++++++++++++++++++++++++++++--- html2text/config.py | 2 ++ test/doc_with_table.html | 32 ++++++++++++++++++++++++ test/doc_with_table.md | 12 +++++++++ test/doc_with_table_bypass.html | 32 ++++++++++++++++++++++++ 
test/doc_with_table_bypass.md | 13 ++++++++++ test/test_html2text.py | 4 +++ 7 files changed, 134 insertions(+), 4 deletions(-) create mode 100644 test/doc_with_table.html create mode 100644 test/doc_with_table.md create mode 100644 test/doc_with_table_bypass.html create mode 100644 test/doc_with_table_bypass.md diff --git a/html2text/__init__.py b/html2text/__init__.py index 6f1badbf..a9155beb 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -201,6 +201,9 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): HTMLParser.HTMLParser.__init__(self) # Config options + self.split_next_td = False + self.th_count = 0 + self.th_start = False self.unicode_snob = config.UNICODE_SNOB self.escape_snob = config.ESCAPE_SNOB self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH @@ -211,6 +214,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.ignore_links = config.IGNORE_ANCHORS self.ignore_images = config.IGNORE_IMAGES self.ignore_emphasis = config.IGNORE_EMPHASIS + self.bypass_tables = config.BYPASS_TABLES self.google_doc = False self.ul_item_mark = '*' self.emphasis_mark = '_' @@ -609,10 +613,33 @@ def handle_tag(self, tag, attrs, start): self.o(str(li['num']) + ". 
") self.start = 1 - if tag in ["table", "tr"] and start: - self.p() - if tag == 'td': - self.pbr() + if tag in ["table", "tr", "td", "th"]: + if self.bypass_tables: + if tag == "tr" and start: + self.soft_br() + if start: + self.o('<{0}>'.format(tag)) + else: + self.o(''.format(tag)) + else: + if tag in ["td", "th"] and start: + if self.split_next_td: + self.o("| ") + self.split_next_td = True + + if tag == "tr" and start: + self.th_count = 0 + if tag == "tr" and not start: + self.split_next_td = False + self.soft_br() + if tag == "tr" and not start and self.th_start: + # Underline table header + self.o("|".join(["---"] * self.th_count)) + self.soft_br() + self.th_start = False + if tag == "th" and start: + self.th_start = True + self.th_count += 1 if tag == "pre": if start: @@ -1004,6 +1031,13 @@ def main(): help="Escape all special characters. Output is less readable, but " "avoids corner case formatting issues." ) + p.add_option( + "--bypass-tables", + action="store_true", + dest="bypass_tables", + default=config.BYPASS_TABLES, + help="Format tables in HTML rather than Markdown syntax." + ) (options, args) = p.parse_args() # process input @@ -1055,6 +1089,7 @@ def main(): h.google_doc = options.google_doc h.hide_strikethrough = options.hide_strikethrough h.escape_snob = options.escape_snob + h.bypass_tables = options.bypass_tables wrapwrite(h.handle(data)) diff --git a/html2text/config.py b/html2text/config.py index 8fe02a23..f932024d 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -101,3 +101,5 @@ 'lrm': '', 'rlm': '' } + +BYPASS_TABLES = False \ No newline at end of file diff --git a/test/doc_with_table.html b/test/doc_with_table.html new file mode 100644 index 00000000..fa64b370 --- /dev/null +++ b/test/doc_with_table.html @@ -0,0 +1,32 @@ + + + + + + + + +

      This is a test document

      + +With some text, code, bolds and italics. + +

      This is second header

      + +

      Displaynone text

      + + + + + + + + + + + + + +
      Header 1Header 2Header 3
      Content 1Content 2200 Image!
      + + + \ No newline at end of file diff --git a/test/doc_with_table.md b/test/doc_with_table.md new file mode 100644 index 00000000..7e06a3a0 --- /dev/null +++ b/test/doc_with_table.md @@ -0,0 +1,12 @@ +# This is a test document + +With some text, `code`, **bolds** and _italics_. + +## This is second header + +Displaynone text + +Header 1 | Header 2 | Header 3 +---|---|--- +Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! + diff --git a/test/doc_with_table_bypass.html b/test/doc_with_table_bypass.html new file mode 100644 index 00000000..fa64b370 --- /dev/null +++ b/test/doc_with_table_bypass.html @@ -0,0 +1,32 @@ + + + + + + + + +

      This is a test document

      + +With some text, code, bolds and italics. + +

      This is second header

      + +

      Displaynone text

      + + + + + + + + + + + + + +
      Header 1Header 2Header 3
      Content 1Content 2200 Image!
      + + + \ No newline at end of file diff --git a/test/doc_with_table_bypass.md b/test/doc_with_table_bypass.md new file mode 100644 index 00000000..79df6ba1 --- /dev/null +++ b/test/doc_with_table_bypass.md @@ -0,0 +1,13 @@ +# This is a test document + +With some text, `code`, **bolds** and _italics_. + +## This is second header + +Displaynone text + + + + +
      Header 1 Header 2 Header 3
      Content 1 Content 2![200](http://lorempixel.com/200/200) Image!
      + diff --git a/test/test_html2text.py b/test/test_html2text.py index 929a6afd..78b8bdd0 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -119,6 +119,10 @@ def test_cmd(self): module_args['escape_snob'] = True cmdline_args.append('--escape-all') + if base_fn.find('table_bypass') >= 0: + module_args['bypass_tables'] = True + cmdline_args.append('--bypass-tables') + if base_fn.startswith('bodywidth'): #module_args['unicode_snob'] = True module_args['body_width'] = 0 From da214cabd55c03bb2d9e3f5399f9ac397acc7bd5 Mon Sep 17 00:00:00 2001 From: Ivan Gromov Date: Sat, 13 Sep 2014 13:24:27 +0600 Subject: [PATCH 091/479] Updated table implementation --- html2text/__init__.py | 36 +++++++++++++++++++------------- test/doc_with_table.html | 6 +++++- test/doc_with_table.md | 1 + test/doc_with_table_bypass.md | 39 ++++++++++++++++++++++++++++++----- 4 files changed, 62 insertions(+), 20 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index a9155beb..59becada 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -2,7 +2,7 @@ # coding: utf-8 """html2text: Turn HTML into equivalent Markdown-structured text.""" from __future__ import division -from html2text import config +import config __version__ = "2014.9.8" @@ -202,8 +202,8 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): # Config options self.split_next_td = False - self.th_count = 0 - self.th_start = False + self.td_count = 0 + self.table_start = False self.unicode_snob = config.UNICODE_SNOB self.escape_snob = config.ESCAPE_SNOB self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH @@ -615,31 +615,39 @@ def handle_tag(self, tag, attrs, start): if tag in ["table", "tr", "td", "th"]: if self.bypass_tables: - if tag == "tr" and start: - self.soft_br() if start: - self.o('<{0}>'.format(tag)) + self.soft_br() + if tag in ["td", "th"]: + if start: + self.o('<{0}>\n\n'.format(tag)) + else: + self.o('\n'.format(tag)) else: - self.o(''.format(tag)) 
+ if start: + self.o('<{0}>'.format(tag)) + else: + self.o(''.format(tag)) + else: + if tag == "table" and start: + self.table_start = True if tag in ["td", "th"] and start: if self.split_next_td: self.o("| ") self.split_next_td = True if tag == "tr" and start: - self.th_count = 0 + self.td_count = 0 if tag == "tr" and not start: self.split_next_td = False self.soft_br() - if tag == "tr" and not start and self.th_start: + if tag == "tr" and not start and self.table_start: # Underline table header - self.o("|".join(["---"] * self.th_count)) + self.o("|".join(["---"] * self.td_count)) self.soft_br() - self.th_start = False - if tag == "th" and start: - self.th_start = True - self.th_count += 1 + self.table_start = False + if tag in ["td", "th"] and start: + self.td_count += 1 if tag == "pre": if start: diff --git a/test/doc_with_table.html b/test/doc_with_table.html index fa64b370..96b91931 100644 --- a/test/doc_with_table.html +++ b/test/doc_with_table.html @@ -25,7 +25,11 @@

      This is second header

      Content 2 200 Image! - + + Content 1 + Content 2 + 200 Image! + diff --git a/test/doc_with_table.md b/test/doc_with_table.md index 7e06a3a0..5cb1986e 100644 --- a/test/doc_with_table.md +++ b/test/doc_with_table.md @@ -8,5 +8,6 @@ Displaynone text Header 1 | Header 2 | Header 3 ---|---|--- +Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! diff --git a/test/doc_with_table_bypass.md b/test/doc_with_table_bypass.md index 79df6ba1..d3255a79 100644 --- a/test/doc_with_table_bypass.md +++ b/test/doc_with_table_bypass.md @@ -4,10 +4,39 @@ With some text, `code`, **bolds** and _italics_. ## This is second header -Displaynone text - +Displaynone text + - - -
      Header 1 Header 2 Header 3
      Content 1 Content 2![200](http://lorempixel.com/200/200) Image!
      + + + +Header 1 + + + + +Header 2 + + + + +Header 3 + + + + + +Content 1 + + + + +Content 2 + + + + +![200](http://lorempixel.com/200/200) Image! + + From fdb94c6252baf2719e6a7786d3d21054d450fd2e Mon Sep 17 00:00:00 2001 From: Ivan Gromov Date: Sat, 13 Sep 2014 13:33:00 +0600 Subject: [PATCH 092/479] Fixed import typo --- html2text/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 59becada..bbdab7b6 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -2,7 +2,7 @@ # coding: utf-8 """html2text: Turn HTML into equivalent Markdown-structured text.""" from __future__ import division -import config +from html2text import config __version__ = "2014.9.8" From 98abe1aad101f1c1e688c337570ae2dd152418dc Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 19 Sep 2014 18:05:08 +0400 Subject: [PATCH 093/479] Remove old website address --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ceafb3c2..3eabd823 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# [html2text](http://www.aaronsw.com/2002/html2text/) +# html2text [![Build Status](https://secure.travis-ci.org/Alir3z4/html2text.png)](http://travis-ci.org/Alir3z4/html2text) [![Coverage Status](https://coveralls.io/repos/Alir3z4/html2text/badge.png)](https://coveralls.io/r/Alir3z4/html2text) From ed460442caee3b649a8c3de3e6fbc0cdecee05ac Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 25 Sep 2014 19:35:47 +0400 Subject: [PATCH 094/479] Add @Alir3z4 email address --- AUTHORS.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index bf17c10f..99015845 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -10,4 +10,4 @@ The AUTHORS/Contributors are (and/or have been): Maintainer: - * Alireza Savand + * Alireza Savand From d039089a920b8d04c35226e30970b82444fdc251 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 25 
Sep 2014 19:36:10 +0400 Subject: [PATCH 095/479] Add Ivan Gromov @summerisgone to Authors Thank you for your contribution ;) --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 99015845..e4e37c3b 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -7,6 +7,7 @@ The AUTHORS/Contributors are (and/or have been): * Alex Musayev * Matěj Cepl * Stefano Rivera + * Ivan Gromov Maintainer: From 09f267fc5602a930e39814f3b5e08bb0c7f38086 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 25 Sep 2014 19:36:36 +0400 Subject: [PATCH 096/479] Add --bypass-tables options to README file --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3eabd823..d41bdc77 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Usage: `html2text.py [(filename|url) [encoding]]` |`-i` `LIST_INDENT`, `--google-list-indent`=`LIST_INDENT`| Number of pixels Google indents nested lists |`-s`, `--hide-strikethrough` | Hide strike-through text. only relevent when `-g` is specified as well |`--escape-all` | Escape all special characters. Output is less readable, but avoids corner case formatting issues. - +| `--bypass-tables` | Format tables in HTML rather than Markdown syntax. Or you can use it from within `Python`: From d2a39a31154f9475461415c324ec763138ff8be7 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 25 Sep 2014 19:36:59 +0400 Subject: [PATCH 097/479] Update the changelog for adding table support --- ChangeLog.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 52a440fa..dbd1e31b 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,10 @@ +0000.00.00 - 0000-00-00 +===================== +---- + +* Feature #29, #27: Add simple table support with bypass option. 
+ + 2014.9.8 - 2014-09-08 ===================== ---- From 1c977448fc4903c51be668f5b0ec357732cbf13c Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 25 Sep 2014 19:39:52 +0400 Subject: [PATCH 098/479] Update version number and getting ready to release new version --- ChangeLog.rst | 2 +- html2text/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index dbd1e31b..2b18a8e8 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,4 +1,4 @@ -0000.00.00 - 0000-00-00 +2014.9.25 - 2014-09-25 ===================== ---- diff --git a/html2text/__init__.py b/html2text/__init__.py index bbdab7b6..5b32a39e 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -5,7 +5,7 @@ from html2text import config -__version__ = "2014.9.8" +__version__ = "2014.9.25" # TODO: diff --git a/setup.py b/setup.py index 7a475634..247441e5 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def run(self): setup( name="html2text", - version="2014.9.8", + version="2014.9.25", description="Turn HTML into equivalent Markdown-structured text.", author="Aaron Swartz", author_email="me@aaronsw.com", From 8e9361f3e2eb026a6411167fa9f83baf3a5ae469 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 25 Sep 2014 19:43:22 +0400 Subject: [PATCH 099/479] Update changelog for new website address http://alir3z4.github.io/html2text/ --- ChangeLog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 2b18a8e8..bdbf678d 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -3,7 +3,7 @@ ---- * Feature #29, #27: Add simple table support with bypass option. - +* Fix #20: Replace project website with: http://alir3z4.github.io/html2text/ . 
2014.9.8 - 2014-09-08 ===================== From 11f3cdae50943ccef2d7a0de51116a60fac2f7bc Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 4 Oct 2014 04:36:33 +0400 Subject: [PATCH 100/479] Update code examples Better showing them in python shell style --- README.md | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index d41bdc77..459f7c28 100644 --- a/README.md +++ b/README.md @@ -33,17 +33,37 @@ Usage: `html2text.py [(filename|url) [encoding]]` Or you can use it from within `Python`: - import html2text - print html2text.html2text("

      Hello, world.

      ") +``` +>>> import html2text +>>> +>>> print(html2text.html2text("

      Zed's dead baby, Zed's dead.

      ")) +**Zed's** dead baby, _Zed's_ dead. + +``` + Or with some configuration options: +``` +>>> import html2text +>>> +>>> h = html2text.HTML2Text() +>>> # Ignore converting links from HTML +>>> h.ignore_links = True +>>> print h.handle("

      Hello, world!") +Hello, world! - import html2text - h = html2text.HTML2Text() - h.ignore_links = True - print h.handle("

      Hello, world!") +>>> print(h.handle("

      Hello, world!")) + +Hello, world! + +>>> # Don't Ignore links anymore, I like links +>>> h.ignore_links = False +>>> print(h.handle("

      Hello, world!")) +Hello, [world](http://earth.google.com/)! + +``` -_Originally written by Aaron Swartz. This code is distributed under the GPLv3._ +*Originally written by Aaron Swartz. This code is distributed under the GPLv3.* ## How to install From 540068f1b11ec5732666c48ffd1bf4419df4b019 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 5 Oct 2014 21:37:59 +0400 Subject: [PATCH 101/479] Remove py_modules Fixes #35 --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 247441e5..ce6ab6e6 100644 --- a/setup.py +++ b/setup.py @@ -66,7 +66,6 @@ def run(self): """, license='GNU GPL 3', requires=requires_list, - py_modules=['html2text'], packages=find_packages(), include_package_data=True, zip_safe=False, From f7bf5428a0831f5648a2d95e4a0bd9fedcbda0ad Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 6 Oct 2014 12:21:30 +0400 Subject: [PATCH 102/479] Do not install tests `python2.7 setup.py install` needlessly installs `test` directory as `/usr/lib/python2.7/site-packages/test`. Inclusion of `test` directory in source tarball is already provided by `recursive-include test *.html *.md *.py` line in `MANIFEST.in`. - - - Fixes #36 Thank you @Arfrever --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ce6ab6e6..22244975 100644 --- a/setup.py +++ b/setup.py @@ -66,7 +66,7 @@ def run(self): """, license='GNU GPL 3', requires=requires_list, - packages=find_packages(), + packages=find_packages(exclude=['test']), include_package_data=True, zip_safe=False, ) From 1203f62bf72311b9ca82cd18d7bb0d0c403250f4 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 6 Oct 2014 12:26:05 +0400 Subject: [PATCH 103/479] Update changlog for latest changes * Feature: Update `README.md` with usage examples. * Fix #35: Remove `py_modules` from `setup.py`. * Fix #36: Excludes tests from being installed as a separate module. 
--- ChangeLog.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index bdbf678d..e4d65aa7 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,12 @@ +0000.00.00 - 0000-00-00 +===================== +---- + +* Feature: Update `README.md` with usage examples. +* Fix #35: Remove `py_modules` from `setup.py`. +* Fix #36: Excludes tests from being installed as a separate module. + + 2014.9.25 - 2014-09-25 ===================== ---- @@ -5,6 +14,7 @@ * Feature #29, #27: Add simple table support with bypass option. * Fix #20: Replace project website with: http://alir3z4.github.io/html2text/ . + 2014.9.8 - 2014-09-08 ===================== ---- From f3523f3769a6e237d6756cb58c0fa01717a744b4 Mon Sep 17 00:00:00 2001 From: Stefano Rivera Date: Sat, 18 Oct 2014 11:45:12 -0700 Subject: [PATCH 104/479] Don't hardcode the path to the installed binary This allows us to run the same test suite against an installed version of the library. --- test/test_html2text.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_html2text.py b/test/test_html2text.py index 78b8bdd0..2ea839d1 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -39,8 +39,7 @@ def test_module(fn, google_doc=False, **kwargs): def test_command(fn, *args): args = list(args) - cmd_name = os.path.join(os.path.dirname(fn), '..', 'html2text/__init__.py') - cmd = [sys.executable, cmd_name] + cmd = [sys.executable, '-m', 'html2text.__init__'] if '--googledoc' in args: args.remove('--googledoc') From 2e1963ebf906b25e019bc0d849a00b05c025809a Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 30 Oct 2014 01:01:53 +0400 Subject: [PATCH 105/479] Fix a typo in module name for usage --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 459f7c28..ed156faa 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ html2text is a Python script that converts a page of HTML into clean, 
easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). -Usage: `html2text.py [(filename|url) [encoding]]` +Usage: `html2text [(filename|url) [encoding]]` | Option | Description From 27a00f276ca615406781f2773d5d5e26825ac215 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 1 Nov 2014 19:20:07 +0400 Subject: [PATCH 106/479] Initial cli module --- html2text/cli.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 html2text/cli.py diff --git a/html2text/cli.py b/html2text/cli.py new file mode 100644 index 00000000..e69de29b From f46847496d5928ca6a1753cd0c32c30a99bb1fe9 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 1 Nov 2014 19:24:32 +0400 Subject: [PATCH 107/479] Initial compat module --- html2text/compat.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 html2text/compat.py diff --git a/html2text/compat.py b/html2text/compat.py new file mode 100644 index 00000000..e69de29b From c8571dca2db494ba9f0e49187c376a7006a9d8a3 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 1 Nov 2014 19:28:43 +0400 Subject: [PATCH 108/479] Bring python compatibility to compat module ;) --- html2text/__init__.py | 14 +------------- html2text/compat.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 5b32a39e..56c02e32 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -11,19 +11,7 @@ # TODO: # Support decoded entities with UNIFIABLE. 
-try: - import htmlentitydefs - import urlparse - import HTMLParser -except ImportError: # Python3 - import html.entities as htmlentitydefs - import urllib.parse as urlparse - import html.parser as HTMLParser -try: # Python3 - import urllib.request as urllib -except ImportError: - import urllib -import optparse + import re import sys diff --git a/html2text/compat.py b/html2text/compat.py index e69de29b..4bb4824d 100644 --- a/html2text/compat.py +++ b/html2text/compat.py @@ -0,0 +1,12 @@ +try: + import htmlentitydefs + import urlparse + import HTMLParser +except ImportError: # Python3 + import html.entities as htmlentitydefs + import urllib.parse as urlparse + import html.parser as HTMLParser +try: # Python3 + import urllib.request as urllib +except ImportError: + import urllib From e8497a10ec089a2a2aec852cec917530c6d3bc69 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 1 Nov 2014 19:29:09 +0400 Subject: [PATCH 109/479] Extract cli to cli module --- html2text/__init__.py | 144 +---------------------------------------- html2text/cli.py | 146 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+), 141 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 56c02e32..707501aa 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -2,6 +2,7 @@ # coding: utf-8 """html2text: Turn HTML into equivalent Markdown-structured text.""" from __future__ import division +from html2text.compat import htmlentitydefs, urlparse, HTMLParser from html2text import config @@ -949,146 +950,7 @@ def escape_md_section(text, snob=False): return text -def main(): - baseurl = '' - - p = optparse.OptionParser('%prog [(filename|url) [encoding]]', - version='%prog ' + __version__) - p.add_option( - "--ignore-emphasis", - dest="ignore_emphasis", - action="store_true", - default=config.IGNORE_EMPHASIS, - help="don't include any formatting for emphasis" - ) - p.add_option( - "--ignore-links", - dest="ignore_links", - action="store_true", 
- default=config.IGNORE_ANCHORS, - help="don't include any formatting for links") - p.add_option( - "--ignore-images", - dest="ignore_images", - action="store_true", - default=config.IGNORE_IMAGES, - help="don't include any formatting for images" - ) - p.add_option( - "-g", "--google-doc", - action="store_true", - dest="google_doc", - default=False, - help="convert an html-exported Google Document" - ) - p.add_option( - "-d", "--dash-unordered-list", - action="store_true", - dest="ul_style_dash", - default=False, - help="use a dash rather than a star for unordered list items" - ) - p.add_option( - "-e", "--asterisk-emphasis", - action="store_true", - dest="em_style_asterisk", - default=False, - help="use an asterisk rather than an underscore for emphasized text" - ) - p.add_option( - "-b", "--body-width", - dest="body_width", - action="store", - type="int", - default=config.BODY_WIDTH, - help="number of characters per output line, 0 for no wrap" - ) - p.add_option( - "-i", "--google-list-indent", - dest="list_indent", - action="store", - type="int", - default=config.GOOGLE_LIST_INDENT, - help="number of pixels Google indents nested lists" - ) - p.add_option( - "-s", "--hide-strikethrough", - action="store_true", - dest="hide_strikethrough", - default=False, - help="hide strike-through text. only relevant when -g is " - "specified as well" - ) - p.add_option( - "--escape-all", - action="store_true", - dest="escape_snob", - default=False, - help="Escape all special characters. Output is less readable, but " - "avoids corner case formatting issues." - ) - p.add_option( - "--bypass-tables", - action="store_true", - dest="bypass_tables", - default=config.BYPASS_TABLES, - help="Format tables in HTML rather than Markdown syntax." 
- ) - (options, args) = p.parse_args() - - # process input - encoding = "utf-8" - if len(args) > 0: - file_ = args[0] - if len(args) == 2: - encoding = args[1] - if len(args) > 2: - p.error('Too many arguments') - - if file_.startswith('http://') or file_.startswith('https://'): - baseurl = file_ - j = urllib.urlopen(baseurl) - data = j.read() - if encoding is None: - try: - from feedparser import _getCharacterEncoding as enc - except ImportError: - enc = lambda x, y: ('utf-8', 1) - encoding = enc(j.headers, data)[0] - if encoding == 'us-ascii': - encoding = 'utf-8' - else: - data = open(file_, 'rb').read() - if encoding is None: - try: - from chardet import detect - except ImportError: - detect = lambda x: {'encoding': 'utf-8'} - encoding = detect(data)['encoding'] - else: - data = sys.stdin.read() - - data = data.decode(encoding) - h = HTML2Text(baseurl=baseurl) - # handle options - if options.ul_style_dash: - h.ul_item_mark = '-' - if options.em_style_asterisk: - h.emphasis_mark = '*' - h.strong_mark = '__' - - h.body_width = options.body_width - h.list_indent = options.list_indent - h.ignore_emphasis = options.ignore_emphasis - h.ignore_links = options.ignore_links - h.ignore_images = options.ignore_images - h.google_doc = options.google_doc - h.hide_strikethrough = options.hide_strikethrough - h.escape_snob = options.escape_snob - h.bypass_tables = options.bypass_tables - - wrapwrite(h.handle(data)) - - if __name__ == "__main__": + from html2text.cli import main + main() diff --git a/html2text/cli.py b/html2text/cli.py index e69de29b..722055cf 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -0,0 +1,146 @@ +import optparse +import sys + +from html2text.compat import urllib +from html2text import HTML2Text, wrapwrite, config, __version__ + + +def main(): + baseurl = '' + + p = optparse.OptionParser('%prog [(filename|url) [encoding]]', + version='%prog ' + __version__) + p.add_option( + "--ignore-emphasis", + dest="ignore_emphasis", + action="store_true", 
+ default=config.IGNORE_EMPHASIS, + help="don't include any formatting for emphasis" + ) + p.add_option( + "--ignore-links", + dest="ignore_links", + action="store_true", + default=config.IGNORE_ANCHORS, + help="don't include any formatting for links") + p.add_option( + "--ignore-images", + dest="ignore_images", + action="store_true", + default=config.IGNORE_IMAGES, + help="don't include any formatting for images" + ) + p.add_option( + "-g", "--google-doc", + action="store_true", + dest="google_doc", + default=False, + help="convert an html-exported Google Document" + ) + p.add_option( + "-d", "--dash-unordered-list", + action="store_true", + dest="ul_style_dash", + default=False, + help="use a dash rather than a star for unordered list items" + ) + p.add_option( + "-e", "--asterisk-emphasis", + action="store_true", + dest="em_style_asterisk", + default=False, + help="use an asterisk rather than an underscore for emphasized text" + ) + p.add_option( + "-b", "--body-width", + dest="body_width", + action="store", + type="int", + default=config.BODY_WIDTH, + help="number of characters per output line, 0 for no wrap" + ) + p.add_option( + "-i", "--google-list-indent", + dest="list_indent", + action="store", + type="int", + default=config.GOOGLE_LIST_INDENT, + help="number of pixels Google indents nested lists" + ) + p.add_option( + "-s", "--hide-strikethrough", + action="store_true", + dest="hide_strikethrough", + default=False, + help="hide strike-through text. only relevant when -g is " + "specified as well" + ) + p.add_option( + "--escape-all", + action="store_true", + dest="escape_snob", + default=False, + help="Escape all special characters. Output is less readable, but " + "avoids corner case formatting issues." + ) + p.add_option( + "--bypass-tables", + action="store_true", + dest="bypass_tables", + default=config.BYPASS_TABLES, + help="Format tables in HTML rather than Markdown syntax." 
+ ) + (options, args) = p.parse_args() + + # process input + encoding = "utf-8" + if len(args) > 0: + file_ = args[0] + if len(args) == 2: + encoding = args[1] + if len(args) > 2: + p.error('Too many arguments') + + if file_.startswith('http://') or file_.startswith('https://'): + baseurl = file_ + j = urllib.urlopen(baseurl) + data = j.read() + if encoding is None: + try: + from feedparser import _getCharacterEncoding as enc + except ImportError: + enc = lambda x, y: ('utf-8', 1) + encoding = enc(j.headers, data)[0] + if encoding == 'us-ascii': + encoding = 'utf-8' + else: + data = open(file_, 'rb').read() + if encoding is None: + try: + from chardet import detect + except ImportError: + detect = lambda x: {'encoding': 'utf-8'} + encoding = detect(data)['encoding'] + else: + data = sys.stdin.read() + + data = data.decode(encoding) + h = HTML2Text(baseurl=baseurl) + # handle options + if options.ul_style_dash: + h.ul_item_mark = '-' + if options.em_style_asterisk: + h.emphasis_mark = '*' + h.strong_mark = '__' + + h.body_width = options.body_width + h.list_indent = options.list_indent + h.ignore_emphasis = options.ignore_emphasis + h.ignore_links = options.ignore_links + h.ignore_images = options.ignore_images + h.google_doc = options.google_doc + h.hide_strikethrough = options.hide_strikethrough + h.escape_snob = options.escape_snob + h.bypass_tables = options.bypass_tables + + wrapwrite(h.handle(data)) \ No newline at end of file From 9227623330da774742234759c5aa77c14aca07ab Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 1 Nov 2014 19:30:47 +0400 Subject: [PATCH 110/479] points to html2text=html2text.cli:main --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 22244975..e8d939f5 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,7 @@ def run(self): ], entry_points=""" [console_scripts] - html2text=html2text:main + html2text=html2text.cli:main """, license='GNU GPL 3', requires=requires_list, From 
5837bdfd2ce9fb1199bd851d20bb30ddcfdfdfde Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 1 Nov 2014 19:32:12 +0400 Subject: [PATCH 111/479] Clean up imports a little bit --- html2text/__init__.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 707501aa..d1540fab 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -2,6 +2,14 @@ # coding: utf-8 """html2text: Turn HTML into equivalent Markdown-structured text.""" from __future__ import division +import re +import sys + +try: + from textwrap import wrap +except ImportError: + pass + from html2text.compat import htmlentitydefs, urlparse, HTMLParser from html2text import config @@ -13,15 +21,6 @@ # Support decoded entities with UNIFIABLE. -import re -import sys - -try: - from textwrap import wrap -except ImportError: - pass - - def name2cp(k): if k == 'apos': return ord("'") From 334fdd6e4b7797d018744a671c0b5c47b1ad73b1 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 1 Nov 2014 19:33:14 +0400 Subject: [PATCH 112/479] Syntax fix in ChangeLog.rst --- ChangeLog.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index e4d65aa7..fcd6ab4a 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,5 +1,5 @@ 0000.00.00 - 0000-00-00 -===================== +======================= ---- * Feature: Update `README.md` with usage examples. @@ -8,7 +8,7 @@ 2014.9.25 - 2014-09-25 -===================== +====================== ---- * Feature #29, #27: Add simple table support with bypass option. 
From 1d56f0ed351a870b3fa469facd7c28359be894c4 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 1 Nov 2014 19:36:33 +0400 Subject: [PATCH 113/479] Update changelog for new cool changes --- ChangeLog.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index fcd6ab4a..1fddf9eb 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -5,6 +5,11 @@ * Feature: Update `README.md` with usage examples. * Fix #35: Remove `py_modules` from `setup.py`. * Fix #36: Excludes tests from being installed as a separate module. +* Fix #37: Don't hardcode the path to the installed binary. +* Fix: Readme typo in running cli. +* Feature #40: Extract cli part to ``cli`` module. +* Feature #42: Bring python version compatibility to ``compat.py`` module. + 2014.9.25 - 2014-09-25 From 54ee07cba6ca3175faf30b20ffa08aa283ef00cc Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 1 Nov 2014 20:04:15 +0400 Subject: [PATCH 114/479] Initial utils module --- html2text/utils.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 html2text/utils.py diff --git a/html2text/utils.py b/html2text/utils.py new file mode 100644 index 00000000..e69de29b From a93c9366e5934d617788ce9014833490b88fdf85 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 1 Nov 2014 20:10:59 +0400 Subject: [PATCH 115/479] Remov utility/helpers method to utils module --- html2text/__init__.py | 247 +++--------------------------------------- html2text/utils.py | 241 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 256 insertions(+), 232 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index d1540fab..3deb7c0b 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -3,17 +3,29 @@ """html2text: Turn HTML into equivalent Markdown-structured text.""" from __future__ import division import re -import sys - try: from textwrap import wrap except ImportError: pass +from html2text.utils import ( + name2cp, + unifiable_n, + 
google_text_emphasis, + google_fixed_width_font, + element_style, + hn, + google_has_height, + escape_md, + google_list_style, + list_numbering_start, + dumb_css_parser, + escape_md_section, + skipwrap +) from html2text.compat import htmlentitydefs, urlparse, HTMLParser from html2text import config - __version__ = "2014.9.25" @@ -21,163 +33,6 @@ # Support decoded entities with UNIFIABLE. -def name2cp(k): - if k == 'apos': - return ord("'") - return htmlentitydefs.name2codepoint[k] - -unifiable_n = {} - -for k in config.UNIFIABLE.keys(): - unifiable_n[name2cp(k)] = config.UNIFIABLE[k] - - -def hn(tag): - if tag[0] == 'h' and len(tag) == 2: - try: - n = int(tag[1]) - if n in range(1, 10): - return n - except ValueError: - return 0 - - -def dumb_property_dict(style): - """ - :returns: A hash of css attributes - """ - out = dict([(x.strip(), y.strip()) for x, y in - [z.split(':', 1) for z in - style.split(';') if ':' in z]]) - return out - - -def dumb_css_parser(data): - """ - :type data: str - - :returns: A hash of css selectors, each of which contains a hash of - css attributes. - :rtype: dict - """ - # remove @import sentences - data += ';' - importIndex = data.find('@import') - while importIndex != -1: - data = data[0:importIndex] + data[data.find(';', importIndex) + 1:] - importIndex = data.find('@import') - - # parse the css. 
reverted from dictionary comprehension in order to - # support older pythons - elements = [x.split('{') for x in data.split('}') if '{' in x.strip()] - try: - elements = dict([(a.strip(), dumb_property_dict(b)) - for a, b in elements]) - except ValueError: - elements = {} # not that important - - return elements - - -def element_style(attrs, style_def, parent_style): - """ - :type attrs: dict - :type style_def: dict - :type style_def: dict - - :returns: A hash of the 'final' style attributes of the element - :rtype: dict - """ - style = parent_style.copy() - if 'class' in attrs: - for css_class in attrs['class'].split(): - css_style = style_def['.' + css_class] - style.update(css_style) - if 'style' in attrs: - immediate_style = dumb_property_dict(attrs['style']) - style.update(immediate_style) - return style - - -def google_list_style(style): - """ - Finds out whether this is an ordered or unordered list - - :type style: dict - - :rtype: str - """ - if 'list-style-type' in style: - list_style = style['list-style-type'] - if list_style in ['disc', 'circle', 'square', 'none']: - return 'ul' - return 'ol' - - -def google_has_height(style): - """ - Check if the style of the element has the 'height' attribute - explicitly defined - - :type style: dict - - :rtype: bool - """ - if 'height' in style: - return True - return False - - -def google_text_emphasis(style): - """ - :type style: dict - - :returns: A list of all emphasis modifiers of the element - :rtype: list - """ - emphasis = [] - if 'text-decoration' in style: - emphasis.append(style['text-decoration']) - if 'font-style' in style: - emphasis.append(style['font-style']) - if 'font-weight' in style: - emphasis.append(style['font-weight']) - return emphasis - - -def google_fixed_width_font(style): - """ - Check if the css of the current element defines a fixed width font - - :type style: dict - - :rtype: bool - """ - font_family = '' - if 'font-family' in style: - font_family = style['font-family'] - if 'Courier 
New' == font_family or 'Consolas' == font_family: - return True - return False - - -def list_numbering_start(attrs): - """ - Extract numbering from list element attributes - - :type attrs: dict - - :rtype: int or None - """ - if 'start' in attrs: - try: - return int(attrs['start']) - 1 - except ValueError: - pass - - return 0 - - class HTML2Text(HTMLParser.HTMLParser): def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): """ @@ -876,78 +731,6 @@ def optwrap(self, text): return result -def skipwrap(para): - # If the text begins with four spaces or one tab, it's a code block; - # don't wrap - if para[0:4] == ' ' or para[0] == '\t': - return True - - # If the text begins with only two "--", possibly preceded by - # whitespace, that's an emdash; so wrap. - stripped = para.lstrip() - if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-": - return False - - # I'm not sure what this is for; I thought it was to detect lists, - # but there's a
      -inside- case in one of the tests that - # also depends upon it. - if stripped[0:1] == '-' or stripped[0:1] == '*': - return True - - # If the text begins with a single -, *, or +, followed by a space, - # or an integer, followed by a ., followed by a space (in either - # case optionally proceeded by whitespace), it's a list; don't wrap. - if config.RE_ORDERED_LIST_MATCHER.match(stripped) or \ - config.RE_UNORDERED_LIST_MATCHER.match(stripped): - return True - - return False - - -def wrapwrite(text): - text = text.encode('utf-8') - try: # Python3 - sys.stdout.buffer.write(text) - except AttributeError: - sys.stdout.write(text) - - -def html2text(html, baseurl='', bodywidth=config.BODY_WIDTH): - h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth) - - return h.handle(html) - - -def unescape(s, unicode_snob=False): - h = HTML2Text() - h.unicode_snob = unicode_snob - - return h.unescape(s) - - -def escape_md(text): - """ - Escapes markdown-sensitive characters within other markdown - constructs. - """ - return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text) - - -def escape_md_section(text, snob=False): - """ - Escapes markdown-sensitive characters across whole document sections. 
- """ - text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text) - - if snob: - text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text) - - text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text) - text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text) - text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text) - - return text - if __name__ == "__main__": from html2text.cli import main diff --git a/html2text/utils.py b/html2text/utils.py index e69de29b..5d5afd62 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -0,0 +1,241 @@ +import sys + +from html2text.compat import htmlentitydefs +from html2text import config, HTML2Text + + +def name2cp(k): + if k == 'apos': + return ord("'") + return htmlentitydefs.name2codepoint[k] + + +unifiable_n = {} + +for k in config.UNIFIABLE.keys(): + unifiable_n[name2cp(k)] = config.UNIFIABLE[k] + + +def hn(tag): + if tag[0] == 'h' and len(tag) == 2: + try: + n = int(tag[1]) + if n in range(1, 10): + return n + except ValueError: + return 0 + + +def dumb_property_dict(style): + """ + :returns: A hash of css attributes + """ + out = dict([(x.strip(), y.strip()) for x, y in + [z.split(':', 1) for z in + style.split(';') if ':' in z]]) + + return out + + +def dumb_css_parser(data): + """ + :type data: str + + :returns: A hash of css selectors, each of which contains a hash of + css attributes. + :rtype: dict + """ + # remove @import sentences + data += ';' + importIndex = data.find('@import') + while importIndex != -1: + data = data[0:importIndex] + data[data.find(';', importIndex) + 1:] + importIndex = data.find('@import') + + # parse the css. 
reverted from dictionary comprehension in order to + # support older pythons + elements = [x.split('{') for x in data.split('}') if '{' in x.strip()] + try: + elements = dict([(a.strip(), dumb_property_dict(b)) + for a, b in elements]) + except ValueError: + elements = {} # not that important + + return elements + + +def element_style(attrs, style_def, parent_style): + """ + :type attrs: dict + :type style_def: dict + :type style_def: dict + + :returns: A hash of the 'final' style attributes of the element + :rtype: dict + """ + style = parent_style.copy() + if 'class' in attrs: + for css_class in attrs['class'].split(): + css_style = style_def['.' + css_class] + style.update(css_style) + if 'style' in attrs: + immediate_style = dumb_property_dict(attrs['style']) + style.update(immediate_style) + + return style + + +def google_list_style(style): + """ + Finds out whether this is an ordered or unordered list + + :type style: dict + + :rtype: str + """ + if 'list-style-type' in style: + list_style = style['list-style-type'] + if list_style in ['disc', 'circle', 'square', 'none']: + return 'ul' + + return 'ol' + + +def google_has_height(style): + """ + Check if the style of the element has the 'height' attribute + explicitly defined + + :type style: dict + + :rtype: bool + """ + if 'height' in style: + return True + + return False + + +def google_text_emphasis(style): + """ + :type style: dict + + :returns: A list of all emphasis modifiers of the element + :rtype: list + """ + emphasis = [] + if 'text-decoration' in style: + emphasis.append(style['text-decoration']) + if 'font-style' in style: + emphasis.append(style['font-style']) + if 'font-weight' in style: + emphasis.append(style['font-weight']) + + return emphasis + + +def google_fixed_width_font(style): + """ + Check if the css of the current element defines a fixed width font + + :type style: dict + + :rtype: bool + """ + font_family = '' + if 'font-family' in style: + font_family = style['font-family'] + if 
'Courier New' == font_family or 'Consolas' == font_family: + return True + + return False + + +def list_numbering_start(attrs): + """ + Extract numbering from list element attributes + + :type attrs: dict + + :rtype: int or None + """ + if 'start' in attrs: + try: + return int(attrs['start']) - 1 + except ValueError: + pass + + return 0 + + +def skipwrap(para): + # If the text begins with four spaces or one tab, it's a code block; + # don't wrap + if para[0:4] == ' ' or para[0] == '\t': + return True + + # If the text begins with only two "--", possibly preceded by + # whitespace, that's an emdash; so wrap. + stripped = para.lstrip() + if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-": + return False + + # I'm not sure what this is for; I thought it was to detect lists, + # but there's a
      -inside- case in one of the tests that + # also depends upon it. + if stripped[0:1] == '-' or stripped[0:1] == '*': + return True + + # If the text begins with a single -, *, or +, followed by a space, + # or an integer, followed by a ., followed by a space (in either + # case optionally proceeded by whitespace), it's a list; don't wrap. + if config.RE_ORDERED_LIST_MATCHER.match(stripped) or \ + config.RE_UNORDERED_LIST_MATCHER.match(stripped): + return True + + return False + + +def wrapwrite(text): + text = text.encode('utf-8') + try: # Python3 + sys.stdout.buffer.write(text) + except AttributeError: + sys.stdout.write(text) + + +def html2text(html, baseurl='', bodywidth=config.BODY_WIDTH): + h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth) + + return h.handle(html) + + +def unescape(s, unicode_snob=False): + h = HTML2Text() + h.unicode_snob = unicode_snob + + return h.unescape(s) + + +def escape_md(text): + """ + Escapes markdown-sensitive characters within other markdown + constructs. + """ + return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text) + + +def escape_md_section(text, snob=False): + """ + Escapes markdown-sensitive characters across whole document sections. 
+ """ + text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text) + + if snob: + text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text) + + text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text) + text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text) + text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text) + + return text From 114f72371dcb66b9ccd3c39daefe43a2c28f5860 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 1 Nov 2014 20:24:32 +0400 Subject: [PATCH 116/479] Bring back html2text, unescape to init to prevent import conflicts in utils --- html2text/__init__.py | 13 +++++++++++++ html2text/utils.py | 15 +-------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 3deb7c0b..9fce5eb4 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -731,6 +731,19 @@ def optwrap(self, text): return result +def html2text(html, baseurl='', bodywidth=config.BODY_WIDTH): + h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth) + + return h.handle(html) + + +def unescape(s, unicode_snob=False): + h = HTML2Text() + h.unicode_snob = unicode_snob + + return h.unescape(s) + + if __name__ == "__main__": from html2text.cli import main diff --git a/html2text/utils.py b/html2text/utils.py index 5d5afd62..1ab60b9c 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -1,7 +1,7 @@ import sys +from html2text import config from html2text.compat import htmlentitydefs -from html2text import config, HTML2Text def name2cp(k): @@ -204,19 +204,6 @@ def wrapwrite(text): sys.stdout.write(text) -def html2text(html, baseurl='', bodywidth=config.BODY_WIDTH): - h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth) - - return h.handle(html) - - -def unescape(s, unicode_snob=False): - h = HTML2Text() - h.unicode_snob = unicode_snob - - return h.unescape(s) - - def escape_md(text): """ Escapes markdown-sensitive characters within other markdown From 05319fba3c7598511993d8717098d0f230606c6b Mon Sep 17 00:00:00 2001 
From: Alireza Savand Date: Sat, 1 Nov 2014 20:24:49 +0400 Subject: [PATCH 117/479] Fix importing from utils --- html2text/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/html2text/cli.py b/html2text/cli.py index 722055cf..cd4ff073 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -2,7 +2,8 @@ import sys from html2text.compat import urllib -from html2text import HTML2Text, wrapwrite, config, __version__ +from html2text import HTML2Text, config, __version__ +from html2text.utils import wrapwrite def main(): From 4a7d3791397679a612ac3fd6d2c4145c62d9e780 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 1 Nov 2014 20:25:02 +0400 Subject: [PATCH 118/479] A little bit of import optimization --- html2text/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 9fce5eb4..35a8eebd 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -8,9 +8,12 @@ except ImportError: pass +from html2text.compat import htmlentitydefs, urlparse, HTMLParser +from html2text import config + from html2text.utils import ( name2cp, - unifiable_n, + unifiable_n, google_text_emphasis, google_fixed_width_font, element_style, @@ -23,8 +26,6 @@ escape_md_section, skipwrap ) -from html2text.compat import htmlentitydefs, urlparse, HTMLParser -from html2text import config __version__ = "2014.9.25" From 2b5762f4f93d659db0083e618bac52508f105a3f Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 1 Nov 2014 20:26:27 +0400 Subject: [PATCH 119/479] Some tiny code clean up as well --- html2text/__init__.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 35a8eebd..35152da4 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -3,6 +3,7 @@ """html2text: Turn HTML into equivalent Markdown-structured text.""" from __future__ import division import re + try: from textwrap import wrap 
except ImportError: @@ -31,7 +32,7 @@ # TODO: -# Support decoded entities with UNIFIABLE. +# Support decoded entities with UNIFIABLE. class HTML2Text(HTMLParser.HTMLParser): @@ -184,9 +185,9 @@ def previousIndex(self, attrs): if ('href' in a) and a['href'] == attrs['href']: if ('title' in a) or ('title' in attrs): - if (('title' in a) and ('title' in attrs) and + if (('title' in a) and ('title' in attrs) and a['title'] == attrs['title']): - match = True + match = True else: match = True @@ -202,7 +203,7 @@ def handle_emphasis(self, start, tag_style, parent_style): # handle Google's text emphasis strikethrough = 'line-through' in \ - tag_emphasis and self.hide_strikethrough + tag_emphasis and self.hide_strikethrough bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis fixed = google_fixed_width_font(tag_style) and not \ @@ -251,7 +252,7 @@ def handle_emphasis(self, start, tag_style, parent_style): self.o(self.emphasis_mark) # space is only allowed after *all* emphasis marks if (bold or italic) and not self.emphasis: - self.o(" ") + self.o(" ") if strikethrough: self.quiet -= 1 @@ -363,7 +364,7 @@ def handle_tag(self, tag, attrs, start): if ('href' in attrs) and \ (attrs['href'] is not None) and \ not (self.skip_internal_links and - attrs['href'].startswith('#')): + attrs['href'].startswith('#')): self.astack.append(attrs) self.maybe_automatic_link = attrs['href'] else: @@ -689,7 +690,7 @@ def google_nest_count(self, style): nest_count = 0 if 'margin-left' in style: nest_count = int(style['margin-left'][:-2]) \ - // self.google_list_indent + // self.google_list_indent return nest_count @@ -745,7 +746,6 @@ def unescape(s, unicode_snob=False): return h.unescape(s) - if __name__ == "__main__": from html2text.cli import main From 21b85f5dd8d68d10da5b15e16693f9eef263e52a Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 1 Nov 2014 20:27:49 +0400 Subject: [PATCH 120/479] Update the 
changelog with latest cool changes --- ChangeLog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 1fddf9eb..fda883e8 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -9,7 +9,7 @@ * Fix: Readme typo in running cli. * Feature #40: Extract cli part to ``cli`` module. * Feature #42: Bring python version compatibility to ``compat.py`` module. - +* Feature #41: Extract utility/helper methods to ``utils`` module. 2014.9.25 - 2014-09-25 From ca99d55a915aaff904562efb815debb905840ac6 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 3 Dec 2014 17:54:44 +0400 Subject: [PATCH 121/479] Add wrap_read --- html2text/utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/html2text/utils.py b/html2text/utils.py index 1ab60b9c..0946930f 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -204,6 +204,16 @@ def wrapwrite(text): sys.stdout.write(text) +def wrap_read(): + """ + :rtype: str + """ + try: + return sys.stdin.read() + except AttributeError: + return sys.stdin.buffer.read() + + def escape_md(text): """ Escapes markdown-sensitive characters within other markdown From 78f706d4da48823c19264fde0e51248b256ed89c Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 3 Dec 2014 17:55:23 +0400 Subject: [PATCH 122/479] Using wrap_read from utils to read stdin --- html2text/cli.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/html2text/cli.py b/html2text/cli.py index cd4ff073..eee3c5bc 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -3,7 +3,7 @@ from html2text.compat import urllib from html2text import HTML2Text, config, __version__ -from html2text.utils import wrapwrite +from html2text.utils import wrapwrite, wrap_read def main(): @@ -123,9 +123,11 @@ def main(): detect = lambda x: {'encoding': 'utf-8'} encoding = detect(data)['encoding'] else: - data = sys.stdin.read() + data = wrap_read() + + if hasattr(data, 'decode'): + data = data.decode(encoding) - data = 
data.decode(encoding) h = HTML2Text(baseurl=baseurl) # handle options if options.ul_style_dash: From 20d1669e6babad7c80180d0b8d11f41012be07f3 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 3 Dec 2014 17:55:32 +0400 Subject: [PATCH 123/479] Remove unused `sys` module --- html2text/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/html2text/cli.py b/html2text/cli.py index eee3c5bc..9c27df5e 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -1,5 +1,4 @@ import optparse -import sys from html2text.compat import urllib from html2text import HTML2Text, config, __version__ From 50d863fcd4d37ef4a615f0f7272249f154b19fd2 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 3 Dec 2014 17:56:28 +0400 Subject: [PATCH 124/479] Update ChangeLog for issue #45 --- ChangeLog.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index fda883e8..314627a9 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -10,6 +10,7 @@ * Feature #40: Extract cli part to ``cli`` module. * Feature #42: Bring python version compatibility to ``compat.py`` module. * Feature #41: Extract utility/helper methods to ``utils`` module. +* Fix #45: Does not accept standard input when running under Python 3. 2014.9.25 - 2014-09-25 From c3b3de64b273905b68ca7dbf9e579c73e4737a4e Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 5 Dec 2014 22:01:29 +0400 Subject: [PATCH 125/479] Feature: Clean up ``ChangeLog.rst`` for version and date numbers. --- ChangeLog.rst | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 314627a9..35f6a4e3 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -11,25 +11,26 @@ * Feature #42: Bring python version compatibility to ``compat.py`` module. * Feature #41: Extract utility/helper methods to ``utils`` module. * Fix #45: Does not accept standard input when running under Python 3. +* Feature: Clean up ``ChangeLog.rst`` for version and date numbers. 
-2014.9.25 - 2014-09-25 -====================== +2014.9.25 +========= ---- * Feature #29, #27: Add simple table support with bypass option. * Fix #20: Replace project website with: http://alir3z4.github.io/html2text/ . -2014.9.8 - 2014-09-08 -===================== +2014.9.8 +======== ---- * Fix #28: missing ``html2text`` package in installation. -2014.9.7 - 2014-09-07 -===================== +2014.9.7 +======== ---- * Fix ``unicode``/``type`` error in memory leak unit-test. @@ -41,8 +42,8 @@ * Fix: Fix TypeError when parsing tags like . Fixed in #25. -2014.7.3 - 2014-07-03 -===================== +2014.7.3 +======== ---- * Fix #8: Remove ``How to do a release`` section from README.md. @@ -50,8 +51,8 @@ * Fix #13: memory leak in using ``handle`` while keeping the old instance of ``html2text``. -2014.4.5 - 2014-04-05 -===================== +2014.4.5 +======== ---- * Fix #1: Add ``ChangeLog.rst`` file. From d3ba03b34fad1683a78e1ccc0a38d42b1a61cf4a Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 5 Dec 2014 22:02:58 +0400 Subject: [PATCH 126/479] Remove unused module imports --- html2text/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 35152da4..26d6cb38 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -9,7 +9,7 @@ except ImportError: pass -from html2text.compat import htmlentitydefs, urlparse, HTMLParser +from html2text.compat import urlparse, HTMLParser from html2text import config from html2text.utils import ( From 100ae75f7aee95e88f379fc52eafbf9d33ee1a34 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 5 Dec 2014 22:03:18 +0400 Subject: [PATCH 127/479] Bump version number for new release 2014.12.5 --- ChangeLog.rst | 4 ++-- html2text/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 35f6a4e3..1312fb14 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,5 +1,5 @@ 
-0000.00.00 - 0000-00-00 -======================= +2014.12.5 +========= ---- * Feature: Update `README.md` with usage examples. diff --git a/html2text/__init__.py b/html2text/__init__.py index 26d6cb38..d62cbcb7 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -28,7 +28,7 @@ skipwrap ) -__version__ = "2014.9.25" +__version__ = "2014.12.5" # TODO: diff --git a/setup.py b/setup.py index e8d939f5..492d6652 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def run(self): setup( name="html2text", - version="2014.9.25", + version="2014.12.5", description="Turn HTML into equivalent Markdown-structured text.", author="Aaron Swartz", author_email="me@aaronsw.com", From c5d59f7302bb04800679df8d5994120a8aa67c55 Mon Sep 17 00:00:00 2001 From: Jocelyn Delalande Date: Mon, 22 Dec 2014 16:53:12 +0100 Subject: [PATCH 128/479] added a images_to_alt option to discard images and keep only their alt --- html2text/__init__.py | 33 ++++++++++++++++++++------------- html2text/cli.py | 10 +++++++++- html2text/config.py | 3 ++- test/images_to_alt.html | 3 +++ test/images_to_alt.md | 2 ++ test/test_html2text.py | 4 ++++ 6 files changed, 40 insertions(+), 15 deletions(-) create mode 100644 test/images_to_alt.html create mode 100644 test/images_to_alt.md diff --git a/html2text/__init__.py b/html2text/__init__.py index d62cbcb7..116e690a 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -58,6 +58,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.google_list_indent = config.GOOGLE_LIST_INDENT self.ignore_links = config.IGNORE_ANCHORS self.ignore_images = config.IGNORE_IMAGES + self.images_to_alt = config.IMAGES_TO_ALT self.ignore_emphasis = config.IGNORE_EMPHASIS self.bypass_tables = config.BYPASS_TABLES self.google_doc = False @@ -390,23 +391,29 @@ def handle_tag(self, tag, attrs, start): if tag == "img" and start and not self.ignore_images: if 'src' in attrs: - attrs['href'] = attrs['src'] + if not self.images_to_alt: + 
attrs['href'] = attrs['src'] alt = attrs.get('alt') or '' - self.o("![" + escape_md(alt) + "]") - if self.inline_links: - href = attrs.get('href') or '' - self.o("(" + escape_md(href) + ")") + # If we have images_to_alt, we discard the image itself, + # considering only the alt text. + if self.images_to_alt: + self.o(escape_md(alt)) else: - i = self.previousIndex(attrs) - if i is not None: - attrs = self.a[i] + self.o("![" + escape_md(alt) + "]") + if self.inline_links: + href = attrs.get('href') or '' + self.o("(" + escape_md(href) + ")") else: - self.acount += 1 - attrs['count'] = self.acount - attrs['outcount'] = self.outcount - self.a.append(attrs) - self.o("[" + str(attrs['count']) + "]") + i = self.previousIndex(attrs) + if i is not None: + attrs = self.a[i] + else: + self.acount += 1 + attrs['count'] = self.acount + attrs['outcount'] = self.outcount + self.a.append(attrs) + self.o("[" + str(attrs['count']) + "]") if tag == 'dl' and start: self.p() diff --git a/html2text/cli.py b/html2text/cli.py index 9c27df5e..76072685 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -30,6 +30,13 @@ def main(): default=config.IGNORE_IMAGES, help="don't include any formatting for images" ) + p.add_option( + "--images-to-alt", + dest="images_to_alt", + action="store_true", + default=config.IMAGES_TO_ALT, + help="Discard image data, only keep alt text" + ) p.add_option( "-g", "--google-doc", action="store_true", @@ -140,9 +147,10 @@ def main(): h.ignore_emphasis = options.ignore_emphasis h.ignore_links = options.ignore_links h.ignore_images = options.ignore_images + h.images_to_alt = options.images_to_alt h.google_doc = options.google_doc h.hide_strikethrough = options.hide_strikethrough h.escape_snob = options.escape_snob h.bypass_tables = options.bypass_tables - wrapwrite(h.handle(data)) \ No newline at end of file + wrapwrite(h.handle(data)) diff --git a/html2text/config.py b/html2text/config.py index f932024d..9268555d 100644 --- a/html2text/config.py +++ 
b/html2text/config.py @@ -25,6 +25,7 @@ IGNORE_ANCHORS = False IGNORE_IMAGES = False +IMAGES_TO_ALT = False IGNORE_EMPHASIS = False # For checking space-only lines on line 771 @@ -102,4 +103,4 @@ 'rlm': '' } -BYPASS_TABLES = False \ No newline at end of file +BYPASS_TABLES = False diff --git a/test/images_to_alt.html b/test/images_to_alt.html new file mode 100644 index 00000000..b41b6928 --- /dev/null +++ b/test/images_to_alt.html @@ -0,0 +1,3 @@ + +ALT TEXT + diff --git a/test/images_to_alt.md b/test/images_to_alt.md new file mode 100644 index 00000000..2dd2299b --- /dev/null +++ b/test/images_to_alt.md @@ -0,0 +1,2 @@ +[ ALT TEXT ](http://example.com) + diff --git a/test/test_html2text.py b/test/test_html2text.py index 2ea839d1..fe20e839 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -127,6 +127,10 @@ def test_cmd(self): module_args['body_width'] = 0 cmdline_args.append('--body-width=0') + if base_fn.startswith('images_to_alt'): + module_args['images_to_alt'] = True + cmdline_args.append('--images-to-alt') + return test_mod, test_cmd # Originally from http://stackoverflow.com/questions/32899/\ From 22b29279c54463a0b3056ca3dc9165f336ba3215 Mon Sep 17 00:00:00 2001 From: Jocelyn Delalande Date: Mon, 22 Dec 2014 17:16:58 +0100 Subject: [PATCH 129/479] Protect links, surrounding them with angle brackets to avoid breaking them with line-wrap (inline and reference). 
--- html2text/__init__.py | 3 +++ html2text/cli.py | 9 ++++++++- html2text/config.py | 6 +++++- test/protect_links.html | 1 + test/protect_links.md | 3 +++ test/test_html2text.py | 4 ++++ 6 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 test/protect_links.html create mode 100644 test/protect_links.md diff --git a/html2text/__init__.py b/html2text/__init__.py index d62cbcb7..1d0575dd 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -55,6 +55,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.body_width = bodywidth self.skip_internal_links = config.SKIP_INTERNAL_LINKS self.inline_links = config.INLINE_LINKS + self.protect_links = config.PROTECT_LINKS self.google_list_indent = config.GOOGLE_LIST_INDENT self.ignore_links = config.IGNORE_ANCHORS self.ignore_images = config.IGNORE_IMAGES @@ -367,6 +368,8 @@ def handle_tag(self, tag, attrs, start): attrs['href'].startswith('#')): self.astack.append(attrs) self.maybe_automatic_link = attrs['href'] + if self.protect_links: + attrs['href'] = '<'+attrs['href']+'>' else: self.astack.append(None) else: diff --git a/html2text/cli.py b/html2text/cli.py index 9c27df5e..5d238f18 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -23,6 +23,12 @@ def main(): action="store_true", default=config.IGNORE_ANCHORS, help="don't include any formatting for links") + p.add_option( + "--protect-links", + dest="protect_links", + action="store_true", + default=config.PROTECT_LINKS, + help="protect links from line breaks surrounding them with angle brackets") p.add_option( "--ignore-images", dest="ignore_images", @@ -139,10 +145,11 @@ def main(): h.list_indent = options.list_indent h.ignore_emphasis = options.ignore_emphasis h.ignore_links = options.ignore_links + h.protect_links = options.protect_links h.ignore_images = options.ignore_images h.google_doc = options.google_doc h.hide_strikethrough = options.hide_strikethrough h.escape_snob = options.escape_snob h.bypass_tables 
= options.bypass_tables - wrapwrite(h.handle(data)) \ No newline at end of file + wrapwrite(h.handle(data)) diff --git a/html2text/config.py b/html2text/config.py index f932024d..719a7725 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -20,6 +20,10 @@ # Use inline, rather than reference, formatting for images and links INLINE_LINKS = True +# Protect links from line breaks surrounding them with angle brackets (in +# addition to their square brackets) +PROTECT_LINKS = False + # Number of pixels Google indents nested lists GOOGLE_LIST_INDENT = 36 @@ -102,4 +106,4 @@ 'rlm': '' } -BYPASS_TABLES = False \ No newline at end of file +BYPASS_TABLES = False diff --git a/test/protect_links.html b/test/protect_links.html new file mode 100644 index 00000000..b248d1ee --- /dev/null +++ b/test/protect_links.html @@ -0,0 +1 @@ +foo \ No newline at end of file diff --git a/test/protect_links.md b/test/protect_links.md new file mode 100644 index 00000000..23e153e9 --- /dev/null +++ b/test/protect_links.md @@ -0,0 +1,3 @@ +[foo]() + diff --git a/test/test_html2text.py b/test/test_html2text.py index 2ea839d1..811a672e 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -127,6 +127,10 @@ def test_cmd(self): module_args['body_width'] = 0 cmdline_args.append('--body-width=0') + if base_fn.startswith('protect_links'): + module_args['protect_links'] = True + cmdline_args.append('--protect-links') + return test_mod, test_cmd # Originally from http://stackoverflow.com/questions/32899/\ From 0fc58fe9af1efb11c8f79f9b446ed4881fb91dab Mon Sep 17 00:00:00 2001 From: Jocelyn Delalande Date: Tue, 23 Dec 2014 10:08:55 +0100 Subject: [PATCH 130/479] pep8 --- html2text/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/html2text/cli.py b/html2text/cli.py index 5d238f18..bbf77c38 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -28,7 +28,8 @@ def main(): dest="protect_links", action="store_true", default=config.PROTECT_LINKS, - help="protect 
links from line breaks surrounding them with angle brackets") + help=("protect links from line breaks surrounding them "+ + "with angle brackets")) p.add_option( "--ignore-images", dest="ignore_images", From c63d377a62df7b7302ffd9b8eede921d36966b62 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 24 Dec 2014 23:40:43 +0400 Subject: [PATCH 131/479] Update changelog for due to latest changes --- ChangeLog.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 1312fb14..dcd9c73f 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,13 @@ +2014.12.24 +========== +---- + +* Feature #49: Added a images_to_alt option to discard images and keep only +their alt. +* Feature #50: Protect links, surrounding them with angle brackets to avoid +breaking... + + 2014.12.5 ========= ---- From f0535caa4c27c1c55137c45350f9cfddec02c612 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 24 Dec 2014 23:42:37 +0400 Subject: [PATCH 132/479] Add @Alir3z4 to contributors --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index e4e37c3b..9722ea52 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -7,6 +7,7 @@ The AUTHORS/Contributors are (and/or have been): * Alex Musayev * Matěj Cepl * Stefano Rivera + * Alireza Savand * Ivan Gromov Maintainer: From dacc0fd1fb61fd32b57fc04c88f2001ff6db9699 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 24 Dec 2014 23:43:01 +0400 Subject: [PATCH 133/479] Add Jocelyn Delalande to Authors Thanks @JocelynDelalande --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 9722ea52..6e0f29af 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -9,6 +9,7 @@ The AUTHORS/Contributors are (and/or have been): * Stefano Rivera * Alireza Savand * Ivan Gromov + * Jocelyn Delalande Maintainer: From 26832bb33e8c9701562a867e5e3a0da8e703f80b Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 24 Dec 2014 23:44:38 +0400 Subject: 
[PATCH 134/479] Bump version number --- html2text/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 650d48d3..71e22418 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -28,7 +28,7 @@ skipwrap ) -__version__ = "2014.12.5" +__version__ = "2014.12.24" # TODO: diff --git a/setup.py b/setup.py index 492d6652..bb29c7f1 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def run(self): setup( name="html2text", - version="2014.12.5", + version="2014.12.24", description="Turn HTML into equivalent Markdown-structured text.", author="Aaron Swartz", author_email="me@aaronsw.com", From 9077c233155fc41b1a9599cebc63b91aea1cbef3 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 24 Dec 2014 23:49:08 +0400 Subject: [PATCH 135/479] Fix md syntax --- ChangeLog.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index dcd9c73f..a41d93bb 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -2,10 +2,8 @@ ========== ---- -* Feature #49: Added a images_to_alt option to discard images and keep only -their alt. -* Feature #50: Protect links, surrounding them with angle brackets to avoid -breaking... +* Feature #49: Added a images_to_alt option to discard images and keep only their alt. +* Feature #50: Protect links, surrounding them with angle brackets to avoid breaking... 
2014.12.5 From a928faaf7d4c08db2911c7b0df337c7361151206 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 25 Dec 2014 00:07:13 +0400 Subject: [PATCH 136/479] Update readme options --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ed156faa..a03eb1d7 100644 --- a/README.md +++ b/README.md @@ -17,11 +17,13 @@ Usage: `html2text [(filename|url) [encoding]]` | Option | Description -|--------------------------------------------------------|-------------------------------------------------- +|--------------------------------------------------------|--------------------------------------------------- | `--version` | Show program's version number and exit | `-h`, `--help` | Show this help message and exit | `--ignore-links` | Don't include any formatting for links +|`--protect-links` | Protect links from line breaks surrounding them "+" with angle brackets |`--ignore-images` | Don't include any formatting for images +|`--images-to-alt` | Discard image data, only keep alt text |`-g`, `--google-doc` | Convert an html-exported Google Document |`-d`, `--dash-unordered-list` | Use a dash rather than a star for unordered list items |`-b` `BODY_WIDTH`, `--body-width`=`BODY_WIDTH` | Number of characters per output line, `0` for no wrap From 125ececbdd9946b63a63f84508128ff6e43514d5 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 25 Dec 2014 00:12:38 +0400 Subject: [PATCH 137/479] Add setup.cfg --- setup.cfg | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 setup.cfg diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..b88034e4 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[metadata] +description-file = README.md From 75d10523341eabf638de7ad4e1e53b8508ec4382 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 25 Dec 2014 00:18:26 +0400 Subject: [PATCH 138/479] Update changelog for setup.cfg --- ChangeLog.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog.rst 
b/ChangeLog.rst index a41d93bb..ed6b0e18 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -4,6 +4,7 @@ * Feature #49: Added a images_to_alt option to discard images and keep only their alt. * Feature #50: Protect links, surrounding them with angle brackets to avoid breaking... +* Feature: Add ``setup.cfg`` file. 2014.12.5 From 669e667372dd927f523397f4efcf4f76cb1c8691 Mon Sep 17 00:00:00 2001 From: Matt Dorn Date: Sat, 27 Dec 2014 23:48:52 -0600 Subject: [PATCH 139/479] Add single line break configuration (use case: Evernote) --- html2text/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 71e22418..cfd43f72 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -66,6 +66,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.ul_item_mark = '*' self.emphasis_mark = '_' self.strong_mark = '**' + self.single_line_break = False if out is None: self.out = self.outtextf @@ -517,7 +518,7 @@ def pbr(self): self.p_p = 1 def p(self): - self.p_p = 2 + self.p_p = 1 if self.single_line_break else 2 def soft_br(self): self.pbr() From 137789b687fa2543bfa120917a648c3df2d8e7db Mon Sep 17 00:00:00 2001 From: Matt Dorn Date: Sun, 28 Dec 2014 20:27:05 -0600 Subject: [PATCH 140/479] Add config and command line flags for single line break option. 
--- html2text/__init__.py | 2 +- html2text/cli.py | 11 +++++++++++ html2text/config.py | 4 ++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index cfd43f72..99dd8e06 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -66,7 +66,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.ul_item_mark = '*' self.emphasis_mark = '_' self.strong_mark = '**' - self.single_line_break = False + self.single_line_break = config.SINGLE_LINE_BREAK if out is None: self.out = self.outtextf diff --git a/html2text/cli.py b/html2text/cli.py index 46aecd54..ef84e061 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -104,6 +104,16 @@ def main(): default=config.BYPASS_TABLES, help="Format tables in HTML rather than Markdown syntax." ) + p.add_option( + "--single-line-break", + action="store_true", + dest="single_line_break", + default=config.SINGLE_LINE_BREAK, + help=( + "Use a single line break after a block element rather an two line " + "breaks. NOTE: Requires --body-width=0" + ) + ) (options, args) = p.parse_args() # process input @@ -160,5 +170,6 @@ def main(): h.hide_strikethrough = options.hide_strikethrough h.escape_snob = options.escape_snob h.bypass_tables = options.bypass_tables + h.single_line_break = options.single_line_break wrapwrite(h.handle(data)) diff --git a/html2text/config.py b/html2text/config.py index 0ae9eb7a..04a8a359 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -108,3 +108,7 @@ } BYPASS_TABLES = False + +# Use a single line break after a block element rather an two line breaks. +# NOTE: Requires body width setting to be 0. +SINGLE_LINE_BREAK = False From 3b65368257fd211a8e482fbbb51a12099d4893eb Mon Sep 17 00:00:00 2001 From: Matt Dorn Date: Sun, 28 Dec 2014 20:27:24 -0600 Subject: [PATCH 141/479] Add unit test for single line break option. 
--- test/single_line_break.html | 6 ++++++ test/single_line_break.md | 2 ++ test/test_html2text.py | 6 ++++++ 3 files changed, 14 insertions(+) create mode 100644 test/single_line_break.html create mode 100644 test/single_line_break.md diff --git a/test/single_line_break.html b/test/single_line_break.html new file mode 100644 index 00000000..84ee019d --- /dev/null +++ b/test/single_line_break.html @@ -0,0 +1,6 @@ + + + +

      Hello world.
      +
      And hello html2text.
      + \ No newline at end of file diff --git a/test/single_line_break.md b/test/single_line_break.md new file mode 100644 index 00000000..31fde4fc --- /dev/null +++ b/test/single_line_break.md @@ -0,0 +1,2 @@ +Hello world. +And hello html2text. diff --git a/test/test_html2text.py b/test/test_html2text.py index c7fb2d41..167fab73 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -135,6 +135,12 @@ def test_cmd(self): module_args['images_to_alt'] = True cmdline_args.append('--images-to-alt') + if base_fn.startswith('single_line_break'): + module_args['body_width'] = 0 + cmdline_args.append('--body-width=0') + module_args['single_line_break'] = True + cmdline_args.append('--single-line-break') + return test_mod, test_cmd # Originally from http://stackoverflow.com/questions/32899/\ From 7c8a1057311cbd61ae7e421b90f04c772d7b2d21 Mon Sep 17 00:00:00 2001 From: Matt Dorn Date: Sun, 28 Dec 2014 20:30:25 -0600 Subject: [PATCH 142/479] Fix typo in command line option description --- html2text/cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/html2text/cli.py b/html2text/cli.py index ef84e061..fb951b12 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -28,7 +28,7 @@ def main(): dest="protect_links", action="store_true", default=config.PROTECT_LINKS, - help=("protect links from line breaks surrounding them "+ + help=("protect links from line breaks surrounding them " + "with angle brackets")) p.add_option( "--ignore-images", @@ -110,8 +110,8 @@ def main(): dest="single_line_break", default=config.SINGLE_LINE_BREAK, help=( - "Use a single line break after a block element rather an two line " - "breaks. NOTE: Requires --body-width=0" + "Use a single line break after a block element rather than two " + "line breaks. 
NOTE: Requires --body-width=0" ) ) (options, args) = p.parse_args() From 7e90ba8cea167dc4462d6c9595c48a86ad99f2aa Mon Sep 17 00:00:00 2001 From: Matt Dorn Date: Sun, 28 Dec 2014 20:31:36 -0600 Subject: [PATCH 143/479] Add EOF newline --- test/single_line_break.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/single_line_break.html b/test/single_line_break.html index 84ee019d..ec3598e1 100644 --- a/test/single_line_break.html +++ b/test/single_line_break.html @@ -3,4 +3,4 @@
      Hello world.
      And hello html2text.
      -
      \ No newline at end of file + From 8b0f740cf74617bff3a0f453a30e7d42aa65d556 Mon Sep 17 00:00:00 2001 From: Matt Dorn Date: Sun, 28 Dec 2014 20:38:13 -0600 Subject: [PATCH 144/479] Add single-line-break option info to README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index a03eb1d7..650ce2b4 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ Usage: `html2text [(filename|url) [encoding]]` |`-s`, `--hide-strikethrough` | Hide strike-through text. only relevent when `-g` is specified as well |`--escape-all` | Escape all special characters. Output is less readable, but avoids corner case formatting issues. | `--bypass-tables` | Format tables in HTML rather than Markdown syntax. +| `--single-line-break` | Use a single line break after a block element rather than two. Or you can use it from within `Python`: From 284df7c9c1911732192180a5c32d861138ca4615 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 29 Dec 2014 11:31:09 +0400 Subject: [PATCH 145/479] Add Matt Dorn @mdorn to Authors/Contributors --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 6e0f29af..89755139 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -10,6 +10,7 @@ The AUTHORS/Contributors are (and/or have been): * Alireza Savand * Ivan Gromov * Jocelyn Delalande + * Matt Dorn Maintainer: From 9317d6bcb0955993d5af3eeac5ffe0047b1ad6f4 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 29 Dec 2014 11:33:30 +0400 Subject: [PATCH 146/479] Update the changelog for Feature #51 --- ChangeLog.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index ed6b0e18..07155763 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,14 @@ +2014.12.29 +========== +---- + +* Feature #51: Add single line break option. 
+ This feature is useful for ensuring that lots of extra line breaks do not + end up in the resulting Markdown file in situations like Evernote .enex + exports. Note that this only works properly if ``body-width`` is set + to ``0``. + + 2014.12.24 ========== ---- From da9d1904d20742bf97a744c545d361af802e1a58 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 29 Dec 2014 11:34:01 +0400 Subject: [PATCH 147/479] Update version number for Release 2014.12.29 --- html2text/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 99dd8e06..9b127570 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -28,7 +28,7 @@ skipwrap ) -__version__ = "2014.12.24" +__version__ = "2014.12.29" # TODO: diff --git a/setup.py b/setup.py index bb29c7f1..20863f5d 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def run(self): setup( name="html2text", - version="2014.12.24", + version="2014.12.29", description="Turn HTML into equivalent Markdown-structured text.", author="Aaron Swartz", author_email="me@aaronsw.com", From 21741517fd4b803e3cf920a6c47a1f25fc1e79a4 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 29 Dec 2014 11:40:15 +0400 Subject: [PATCH 148/479] Fix formatting authors list Fixes #52 --- AUTHORS.rst | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 89755139..5b9e0dd1 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -1,17 +1,17 @@ -``html2text`` was Originally written by Aaron Swartz. +``html2text`` was originally written by Aaron Swartz. 
The AUTHORS/Contributors are (and/or have been): - * Aaron Swartz - * Yariv Barkan - * Alex Musayev - * Matěj Cepl - * Stefano Rivera - * Alireza Savand - * Ivan Gromov - * Jocelyn Delalande - * Matt Dorn +* Aaron Swartz +* Yariv Barkan +* Alex Musayev +* Matěj Cepl +* Stefano Rivera +* Alireza Savand +* Ivan Gromov +* Jocelyn Delalande +* Matt Dorn Maintainer: - * Alireza Savand +* Alireza Savand From 4185966f42d3090cf8a37bd7cf699a1320412f2d Mon Sep 17 00:00:00 2001 From: Miguel Tavares Date: Tue, 17 Feb 2015 18:34:40 +0000 Subject: [PATCH 149/479] Anchor tags that contain tags or empty text are no longer discarded. Added tests for new cases, modified text for images-to-alt to allow for inline tags. --- html2text/__init__.py | 23 ++++++++++++++++++++++- test/empty-link.html | 6 ++++++ test/empty-link.md | 8 ++++++++ test/images_to_alt.html | 2 ++ test/images_to_alt.md | 3 ++- test/img-tag-with-link.html | 9 +++++++++ test/img-tag-with-link.md | 10 ++++++++++ 7 files changed, 59 insertions(+), 2 deletions(-) create mode 100644 test/empty-link.html create mode 100644 test/empty-link.md create mode 100644 test/img-tag-with-link.html create mode 100644 test/img-tag-with-link.md diff --git a/html2text/__init__.py b/html2text/__init__.py index 9b127570..2ee872fc 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -84,6 +84,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.a = [] self.astack = [] self.maybe_automatic_link = None + self.empty_link = False self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://') self.acount = 0 self.list = [] @@ -370,6 +371,7 @@ def handle_tag(self, tag, attrs, start): attrs['href'].startswith('#')): self.astack.append(attrs) self.maybe_automatic_link = attrs['href'] + self.empty_link = True if self.protect_links: attrs['href'] = '<'+attrs['href']+'>' else: @@ -377,9 +379,13 @@ def handle_tag(self, tag, attrs, start): else: if self.astack: a = self.astack.pop() - if self.maybe_automatic_link: + 
if self.maybe_automatic_link and not self.empty_link: self.maybe_automatic_link = None elif a: + if self.empty_link: + self.o("[") + self.empty_link = False + self.maybe_automatic_link = None if self.inline_links: self.o("](" + escape_md(a['href']) + ")") else: @@ -399,6 +405,19 @@ def handle_tag(self, tag, attrs, start): attrs['href'] = attrs['src'] alt = attrs.get('alt') or '' + # If we have a link to create, output the start + if not self.maybe_automatic_link is None: + href = self.maybe_automatic_link + if self.images_to_alt and escape_md(alt) == href and \ + self.absolute_url_matcher.match(href): + self.o("<" + escape_md(alt) + ">") + self.empty_link = False + return + else: + self.o("[") + self.maybe_automatic_link = None + self.empty_link = False + # If we have images_to_alt, we discard the image itself, # considering only the alt text. if self.images_to_alt: @@ -637,10 +656,12 @@ def handle_data(self, data): href = self.maybe_automatic_link if href == data and self.absolute_url_matcher.match(href): self.o("<" + data + ">") + self.empty_link = False return else: self.o("[") self.maybe_automatic_link = None + self.empty_link = False if not self.code and not self.pre: data = escape_md_section(data, snob=self.escape_snob) diff --git a/test/empty-link.html b/test/empty-link.html new file mode 100644 index 00000000..7f1c3eb1 --- /dev/null +++ b/test/empty-link.html @@ -0,0 +1,6 @@ +

      Processing empty hyperlinks

      + +

      This test checks wheter empty hyperlinks still appear in the markdown result.

      + + +

      \ No newline at end of file diff --git a/test/empty-link.md b/test/empty-link.md new file mode 100644 index 00000000..fbaf01bc --- /dev/null +++ b/test/empty-link.md @@ -0,0 +1,8 @@ +# Processing empty hyperlinks + +This test checks wheter empty hyperlinks still appear in the markdown result. + +[](http://some.link) + +[](http://some.link) + diff --git a/test/images_to_alt.html b/test/images_to_alt.html index b41b6928..8486dd54 100644 --- a/test/images_to_alt.html +++ b/test/images_to_alt.html @@ -1,3 +1,5 @@ ALT TEXT +
      +ALT TEXT \ No newline at end of file diff --git a/test/images_to_alt.md b/test/images_to_alt.md index 2dd2299b..369eb934 100644 --- a/test/images_to_alt.md +++ b/test/images_to_alt.md @@ -1,2 +1,3 @@ -[ ALT TEXT ](http://example.com) +[ ALT TEXT ](http://example.com) +[ALT TEXT](http://example.com) diff --git a/test/img-tag-with-link.html b/test/img-tag-with-link.html new file mode 100644 index 00000000..92d3a96e --- /dev/null +++ b/test/img-tag-with-link.html @@ -0,0 +1,9 @@ +

      Processing images with links

      + +

      This test checks images with associated links.

      + +(banana) +[banana] +{banana} +([{}]) + \ No newline at end of file diff --git a/test/img-tag-with-link.md b/test/img-tag-with-link.md new file mode 100644 index 00000000..3025423e --- /dev/null +++ b/test/img-tag-with-link.md @@ -0,0 +1,10 @@ +# Processing images with links + +This test checks images with associated links. + +[![\(banana\)](http://placehold.it/350x150#\(banana\))](http://some.link) +[![\[banana\]](http://placehold.it/350x150#\[banana\])](http://some.link) +[![{banana}](http://placehold.it/350x150#{banana})](http://some.link) +[![\(\[{}\]\)](http://placehold.it/350x150#\(\[{}\]\))](http://some.link) +[![](http://placehold.it/350x150#\(\[{}\]\))](http://some.link) + From 11c508d211834a0c4e6a1289a331784928ade468 Mon Sep 17 00:00:00 2001 From: Miguel Tavares Date: Tue, 17 Feb 2015 18:41:24 +0000 Subject: [PATCH 150/479] Added missing test case to images-to-alt --- test/images_to_alt.html | 4 +++- test/images_to_alt.md | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/test/images_to_alt.html b/test/images_to_alt.html index 8486dd54..9d135458 100644 --- a/test/images_to_alt.html +++ b/test/images_to_alt.html @@ -2,4 +2,6 @@ ALT TEXT
      -ALT TEXT \ No newline at end of file +ALT TEXT +
      +http://example.com \ No newline at end of file diff --git a/test/images_to_alt.md b/test/images_to_alt.md index 369eb934..c07f1f1d 100644 --- a/test/images_to_alt.md +++ b/test/images_to_alt.md @@ -1,3 +1,4 @@ [ ALT TEXT ](http://example.com) -[ALT TEXT](http://example.com) +[ALT TEXT](http://example.com) + From 84ce93b938f6d2af63e1ba34bd542ede407c62e9 Mon Sep 17 00:00:00 2001 From: Miguel Tavares Date: Wed, 18 Feb 2015 10:52:54 +0000 Subject: [PATCH 151/479] Update authors, changelog and version number. Added latest contributor to authors list, reflected changes on changelog and bumped version number. --- AUTHORS.rst | 1 + ChangeLog.rst | 7 +++++++ setup.py | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 5b9e0dd1..662e9ec4 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -11,6 +11,7 @@ The AUTHORS/Contributors are (and/or have been): * Ivan Gromov * Jocelyn Delalande * Matt Dorn +* Miguel Tavares Maintainer: diff --git a/ChangeLog.rst b/ChangeLog.rst index 07155763..753d40db 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,10 @@ +2015.02.18 +========== +---- + +* Fix #38: Anchor tags with empty text or with `` tags inside are no longer stripped. 
+ + 2014.12.29 ========== ---- diff --git a/setup.py b/setup.py index 20863f5d..6bb1230c 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def run(self): setup( name="html2text", - version="2014.12.29", + version="2015.02.18", description="Turn HTML into equivalent Markdown-structured text.", author="Aaron Swartz", author_email="me@aaronsw.com", From 3f15e224e10a936f6cbdb9e8a96a093ef98557f1 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 18 Feb 2015 17:19:57 +0400 Subject: [PATCH 152/479] Fix version number --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6bb1230c..47919440 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def run(self): setup( name="html2text", - version="2015.02.18", + version="2015.2.18", description="Turn HTML into equivalent Markdown-structured text.", author="Aaron Swartz", author_email="me@aaronsw.com", From cf89ea8b5dd0c8f8b2c3b91eee78fbc338337743 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 18 Feb 2015 17:20:29 +0400 Subject: [PATCH 153/479] Update ChangeLog.rst --- ChangeLog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 753d40db..96b58c3d 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,4 +1,4 @@ -2015.02.18 +2015.2.18 ========== ---- From 2c79945cda9567991668577b7ded46a51574256a Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Fri, 27 Mar 2015 00:14:29 +0100 Subject: [PATCH 154/479] Treat '-' file parameter as stdin Adds compatibility for a common way to express reading from stdin. Unbreaks lesspipe.sh for example on Arch Linux. 
--- html2text/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/cli.py b/html2text/cli.py index fb951b12..47d3d941 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -118,7 +118,7 @@ def main(): # process input encoding = "utf-8" - if len(args) > 0: + if len(args) > 0 and args[0] != '-': file_ = args[0] if len(args) == 2: encoding = args[1] From 446a8eb0733835fa2b2e61b8e311a02f9325cf00 Mon Sep 17 00:00:00 2001 From: Scott Blackburn Date: Fri, 10 Apr 2015 13:25:39 -0700 Subject: [PATCH 155/479] Retain escaping of html except within code or pre tags. --- html2text/__init__.py | 14 +++++++++++--- test/html-escaping.html | 3 +++ test/html-escaping.md | 8 ++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 test/html-escaping.html create mode 100644 test/html-escaping.md diff --git a/html2text/__init__.py b/html2text/__init__.py index 2ee872fc..48d44542 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -3,6 +3,7 @@ """html2text: Turn HTML into equivalent Markdown-structured text.""" from __future__ import division import re +import cgi try: from textwrap import wrap @@ -160,10 +161,16 @@ def close(self): return outtext def handle_charref(self, c): - self.o(self.charref(c), 1) + charref = self.charref(c) + if not self.code and not self.pre: + charref = cgi.escape(charref) + self.o(charref, 1) def handle_entityref(self, c): - self.o(self.entityref(c), 1) + entityref = self.entityref(c) + if not self.code and not self.pre and entityref != ' _place_holder;': + entityref = cgi.escape(entityref) + self.o(entityref, 1) def handle_starttag(self, tag, attrs): self.handle_tag(tag, attrs, 1) @@ -351,6 +358,7 @@ def handle_tag(self, tag, attrs, start): if tag in ["code", "tt"] and not self.pre: self.o('`') # TODO: `` `this` `` + self.code = not self.code if tag == "abbr": if start: self.abbr_title = None @@ -416,7 +424,7 @@ def handle_tag(self, tag, attrs, start): else: self.o("[") 
self.maybe_automatic_link = None - self.empty_link = False + self.empty_link = False # If we have images_to_alt, we discard the image itself, # considering only the alt text. diff --git a/test/html-escaping.html b/test/html-escaping.html new file mode 100644 index 00000000..b6f1da74 --- /dev/null +++ b/test/html-escaping.html @@ -0,0 +1,3 @@ +

      Escaped HTML like <div> or & should remain escaped on output

      +
      ...unless that escaped HTML is in a <pre> tag
      +...or a <code> tag \ No newline at end of file diff --git a/test/html-escaping.md b/test/html-escaping.md new file mode 100644 index 00000000..19e91eeb --- /dev/null +++ b/test/html-escaping.md @@ -0,0 +1,8 @@ +Escaped HTML like <div> or & should remain escaped on output + + + + ...unless that escaped HTML is in a
       tag
      +
      +`...or a  tag`
      +
      
      From 27a62b01b705e09440eefc6006b5582ed5ae30df Mon Sep 17 00:00:00 2001
      From: Alireza Savand 
      Date: Mon, 13 Apr 2015 19:51:01 +0400
      Subject: [PATCH 156/479] Bump version numbers for new release
      
      ---
       html2text/__init__.py | 2 +-
       setup.py              | 2 +-
       2 files changed, 2 insertions(+), 2 deletions(-)
      
      diff --git a/html2text/__init__.py b/html2text/__init__.py
      index 48d44542..4e456ace 100644
      --- a/html2text/__init__.py
      +++ b/html2text/__init__.py
      @@ -29,7 +29,7 @@
           skipwrap
       )
       
      -__version__ = "2014.12.29"
      +__version__ = "2015.4.13"
       
       
       # TODO:
      diff --git a/setup.py b/setup.py
      index 47919440..5493c094 100644
      --- a/setup.py
      +++ b/setup.py
      @@ -34,7 +34,7 @@ def run(self):
       
       setup(
           name="html2text",
      -    version="2015.2.18",
      +    version="2015.4.13",
           description="Turn HTML into equivalent Markdown-structured text.",
           author="Aaron Swartz",
           author_email="me@aaronsw.com",
      
      From 234071841bbb6dd0fca0b53fb4ed56229df0caa8 Mon Sep 17 00:00:00 2001
      From: Alireza Savand 
      Date: Mon, 13 Apr 2015 19:52:26 +0400
      Subject: [PATCH 157/479] Add Scott Blackburn to Authors list
      
      Thanks @smblackburn
      ---
       AUTHORS.rst | 1 +
       1 file changed, 1 insertion(+)
      
      diff --git a/AUTHORS.rst b/AUTHORS.rst
      index 662e9ec4..dc53c805 100644
      --- a/AUTHORS.rst
      +++ b/AUTHORS.rst
      @@ -12,6 +12,7 @@ The AUTHORS/Contributors are (and/or have been):
       * Jocelyn Delalande 
       * Matt Dorn 
       * Miguel Tavares 
      +* Scott Blackburn 
       
       Maintainer:
       
      
      From bf1e587004e9740769fc9b81d2f4307e7f52d197 Mon Sep 17 00:00:00 2001
      From: Alireza Savand 
      Date: Mon, 13 Apr 2015 19:52:46 +0400
      Subject: [PATCH 158/479] Add Peter Wu to Authors list
      
      Thanks @Lekensteyn
      ---
       AUTHORS.rst | 2 ++
       1 file changed, 2 insertions(+)
      
      diff --git a/AUTHORS.rst b/AUTHORS.rst
      index dc53c805..7e05a004 100644
      --- a/AUTHORS.rst
      +++ b/AUTHORS.rst
      @@ -13,6 +13,8 @@ The AUTHORS/Contributors are (and/or have been):
       * Matt Dorn 
       * Miguel Tavares 
       * Scott Blackburn 
      +* Peter Wu 
      +
       
       Maintainer:
       
      
      From 393a9f1550afce33a8a0bcbb32ba20a19e39be99 Mon Sep 17 00:00:00 2001
      From: Alireza Savand 
      Date: Mon, 13 Apr 2015 19:53:00 +0400
      Subject: [PATCH 159/479] Update changelog ;)
      
      ---
       ChangeLog.rst | 9 +++++++++
       1 file changed, 9 insertions(+)
      
      diff --git a/ChangeLog.rst b/ChangeLog.rst
      index 96b58c3d..d25db5a6 100644
      --- a/ChangeLog.rst
      +++ b/ChangeLog.rst
      @@ -1,3 +1,12 @@
      +2015.4.13
      +=========
      +----
      +
      +
      +* Feature #56: Treat '-' file parameter as stdin.
      +* Feature #57: Retain escaping of html except within code or pre tags.
      +
      +
       2015.2.18
       ==========
       ----
      
      From 9278557d5e5a11a538299a81b562ea3d6a34dc60 Mon Sep 17 00:00:00 2001
      From: Scott Blackburn 
      Date: Mon, 13 Apr 2015 23:43:39 -0700
      Subject: [PATCH 160/479] Support for image sizing using raw html
      
      ---
       html2text/__init__.py      | 15 +++++++++++++++
       html2text/cli.py           |  8 ++++++++
       html2text/config.py        |  1 +
       test/images_with_size.html |  7 +++++++
       test/images_with_size.md   |  6 ++++++
       test/test_html2text.py     |  4 ++++
       6 files changed, 41 insertions(+)
       create mode 100644 test/images_with_size.html
       create mode 100644 test/images_with_size.md
      
      diff --git a/html2text/__init__.py b/html2text/__init__.py
      index 4e456ace..62a55f5a 100644
      --- a/html2text/__init__.py
      +++ b/html2text/__init__.py
      @@ -61,6 +61,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH):
               self.ignore_links = config.IGNORE_ANCHORS
               self.ignore_images = config.IGNORE_IMAGES
               self.images_to_alt = config.IMAGES_TO_ALT
      +        self.images_with_size = config.IMAGES_WITH_SIZE
               self.ignore_emphasis = config.IGNORE_EMPHASIS
               self.bypass_tables = config.BYPASS_TABLES
               self.google_doc = False
      @@ -413,6 +414,20 @@ def handle_tag(self, tag, attrs, start):
                           attrs['href'] = attrs['src']
                       alt = attrs.get('alt') or ''
       
      +                # If we have images_with_size, write raw html including width,
      +                # height, and alt attributes
      +                if self.images_with_size and \
      +                        ("width" in attrs or "height" in attrs):
      +                    self.o("")
      +                    return
      +
                       # If we have a link to create, output the start
                       if not self.maybe_automatic_link is None:
                           href = self.maybe_automatic_link
      diff --git a/html2text/cli.py b/html2text/cli.py
      index 47d3d941..00736948 100644
      --- a/html2text/cli.py
      +++ b/html2text/cli.py
      @@ -44,6 +44,13 @@ def main():
               default=config.IMAGES_TO_ALT,
               help="Discard image data, only keep alt text"
           )
      +    p.add_option(
      +        "--images-with-size",
      +        dest="images_with_size",
      +        action="store_true",
      +        default=config.IMAGES_WITH_SIZE,
      +        help="Write image tags as raw html with height and width attrs"
      +    )
           p.add_option(
               "-g", "--google-doc",
               action="store_true",
      @@ -166,6 +173,7 @@ def main():
           h.protect_links = options.protect_links
           h.ignore_images = options.ignore_images
           h.images_to_alt = options.images_to_alt
      +    h.images_with_size = options.images_with_size
           h.google_doc = options.google_doc
           h.hide_strikethrough = options.hide_strikethrough
           h.escape_snob = options.escape_snob
      diff --git a/html2text/config.py b/html2text/config.py
      index 04a8a359..2eef864b 100644
      --- a/html2text/config.py
      +++ b/html2text/config.py
      @@ -30,6 +30,7 @@
       IGNORE_ANCHORS = False
       IGNORE_IMAGES = False
       IMAGES_TO_ALT = False
      +IMAGES_WITH_SIZE = False
       IGNORE_EMPHASIS = False
       
       # For checking space-only lines on line 771
      diff --git a/test/images_with_size.html b/test/images_with_size.html
      new file mode 100644
      index 00000000..bcf8e44c
      --- /dev/null
      +++ b/test/images_with_size.html
      @@ -0,0 +1,7 @@
      +An image without dimensions
      +
      +An image with a width attr
      +
      +An image with a height attr
      +
      +An image with width and height
      \ No newline at end of file
      diff --git a/test/images_with_size.md b/test/images_with_size.md
      new file mode 100644
      index 00000000..cf710736
      --- /dev/null
      +++ b/test/images_with_size.md
      @@ -0,0 +1,6 @@
      +![An image without dimensions](image_without_dimensions.jpg) An image with a width attr
      +An image with a height attr An
      +image with width and height
      +
      diff --git a/test/test_html2text.py b/test/test_html2text.py
      index 167fab73..0be96764 100644
      --- a/test/test_html2text.py
      +++ b/test/test_html2text.py
      @@ -135,6 +135,10 @@ def test_cmd(self):
               module_args['images_to_alt'] = True
               cmdline_args.append('--images-to-alt')
       
      +    if base_fn.startswith('images_with_size'):
      +        module_args['images_with_size'] = True
      +        cmdline_args.append('--images-with-size')
      +
           if base_fn.startswith('single_line_break'):
               module_args['body_width'] = 0
               cmdline_args.append('--body-width=0')
      
      From 84e77d7b07e2e1e23995860ed69cc8a0e8b86cee Mon Sep 17 00:00:00 2001
      From: Scott Blackburn 
      Date: Tue, 14 Apr 2015 08:51:44 -0700
      Subject: [PATCH 161/479] Documenting sized image feature in README and
       Changelog
      
      ---
       ChangeLog.rst    | 8 ++++++++
       README.md        | 9 +++++----
       html2text/cli.py | 3 ++-
       3 files changed, 15 insertions(+), 5 deletions(-)
      
      diff --git a/ChangeLog.rst b/ChangeLog.rst
      index d25db5a6..4d5fe984 100644
      --- a/ChangeLog.rst
      +++ b/ChangeLog.rst
      @@ -1,3 +1,11 @@
      +2015.4.14
      +=========
      +----
      +
      +
      +* Feature #59: Write image tags with height and width attrs as raw html to retain dimensions
      +
      +
       2015.4.13
       =========
       ----
      diff --git a/README.md b/README.md
      index 650ce2b4..8f08046a 100644
      --- a/README.md
      +++ b/README.md
      @@ -16,14 +16,15 @@ html2text is a Python script that converts a page of HTML into clean, easy-to-re
       Usage: `html2text [(filename|url) [encoding]]`
       
       
      -| Option                                                 | Description            
      +| Option                                                 | Description
       |--------------------------------------------------------|---------------------------------------------------
      -| `--version`                                            | Show program's version number and exit 
      -| `-h`, `--help`                                         | Show this help message and exit      
      +| `--version`                                            | Show program's version number and exit
      +| `-h`, `--help`                                         | Show this help message and exit
       | `--ignore-links`                                       | Don't include any formatting for links
       |`--protect-links`                                       | Protect links from line breaks surrounding them "+" with angle brackets
       |`--ignore-images`                                       | Don't include any formatting for images
       |`--images-to-alt`                                       | Discard image data, only keep alt text
      +|`--images-with-size`                                    | Write image tags with height and width attrs as raw html to retain dimensions
       |`-g`, `--google-doc`                                    | Convert an html-exported Google Document
       |`-d`, `--dash-unordered-list`                           | Use a dash rather than a star for unordered list items
       |`-b` `BODY_WIDTH`, `--body-width`=`BODY_WIDTH`          | Number of characters per output line, `0` for no wrap
      @@ -48,7 +49,7 @@ Or you can use it from within `Python`:
       Or with some configuration options:
       ```
       >>> import html2text
      ->>> 
      +>>>
       >>> h = html2text.HTML2Text()
       >>> # Ignore converting links from HTML
       >>> h.ignore_links = True
      diff --git a/html2text/cli.py b/html2text/cli.py
      index 00736948..bd3a2ce2 100644
      --- a/html2text/cli.py
      +++ b/html2text/cli.py
      @@ -49,7 +49,8 @@ def main():
               dest="images_with_size",
               action="store_true",
               default=config.IMAGES_WITH_SIZE,
      -        help="Write image tags as raw html with height and width attrs"
      +        help="Write image tags with height and width attrs as raw html to "
      +             "retain dimensions"
           )
           p.add_option(
               "-g", "--google-doc",
      
      From e902a7c7bb9adb77254c08863933ce6d8313f387 Mon Sep 17 00:00:00 2001
      From: Alireza Savand 
      Date: Tue, 14 Apr 2015 21:09:26 +0400
      Subject: [PATCH 162/479] Bump version number to latest.
      
      Going to release now!!!
      ---
       html2text/__init__.py | 2 +-
       setup.py              | 2 +-
       2 files changed, 2 insertions(+), 2 deletions(-)
      
      diff --git a/html2text/__init__.py b/html2text/__init__.py
      index 62a55f5a..1f3e803f 100644
      --- a/html2text/__init__.py
      +++ b/html2text/__init__.py
      @@ -29,7 +29,7 @@
           skipwrap
       )
       
      -__version__ = "2015.4.13"
      +__version__ = "2015.4.14"
       
       
       # TODO:
      diff --git a/setup.py b/setup.py
      index 5493c094..24c284d0 100644
      --- a/setup.py
      +++ b/setup.py
      @@ -34,7 +34,7 @@ def run(self):
       
       setup(
           name="html2text",
      -    version="2015.4.13",
      +    version="2015.4.14",
           description="Turn HTML into equivalent Markdown-structured text.",
           author="Aaron Swartz",
           author_email="me@aaronsw.com",
      
      From 945c50d2e8e99ce29818b91c1f64866cc0bc84ae Mon Sep 17 00:00:00 2001
      From: theSage21 
      Date: Sun, 31 May 2015 11:31:12 +0530
      Subject: [PATCH 163/479] Update gitignore to include virtualenv
      
      ---
       .gitignore | 1 +
       1 file changed, 1 insertion(+)
      
      diff --git a/.gitignore b/.gitignore
      index 0eaa5553..de204b56 100644
      --- a/.gitignore
      +++ b/.gitignore
      @@ -6,3 +6,4 @@ dist
       *.egg-info
       .idea
       .coverage
      +env/
      
      From 68acdc12247c0789c46b7e5a828b42a9d3b85994 Mon Sep 17 00:00:00 2001
      From: theSage21 
      Date: Sun, 31 May 2015 11:45:20 +0530
      Subject: [PATCH 164/479] Added failing test
      
      ---
       test/malformed_link_output.html | 8 ++++++++
       1 file changed, 8 insertions(+)
       create mode 100644 test/malformed_link_output.html
      
      diff --git a/test/malformed_link_output.html b/test/malformed_link_output.html
      new file mode 100644
      index 00000000..5e949a9d
      --- /dev/null
      +++ b/test/malformed_link_output.html
      @@ -0,0 +1,8 @@
      +
      +    
      +        http://www.test.com
      +        http://www.test.com
      +        test
      +        test
      +    
      +
      
      From 2fc2e20c4b93c60aff4e4b7207a537ab0faaecc6 Mon Sep 17 00:00:00 2001
      From: theSage21 
      Date: Sun, 31 May 2015 13:55:02 +0530
      Subject: [PATCH 165/479] Little more readable
      
      ---
       html2text/utils.py | 6 +++++-
       1 file changed, 5 insertions(+), 1 deletion(-)
      
      diff --git a/html2text/utils.py b/html2text/utils.py
      index 0946930f..1bc4102e 100644
      --- a/html2text/utils.py
      +++ b/html2text/utils.py
      @@ -5,6 +5,7 @@
       
       
       def name2cp(k):
      +    """Return sname to codepoint"""
           if k == 'apos':
               return ord("'")
           return htmlentitydefs.name2codepoint[k]
      @@ -32,7 +33,10 @@ def dumb_property_dict(style):
           """
           out = dict([(x.strip(), y.strip()) for x, y in
                       [z.split(':', 1) for z in
      -                 style.split(';') if ':' in z]])
      +                 style.split(';') if ':' in z
      +                 ]
      +                ]
      +               )
       
           return out
       
      
      From 13d6be2ac56daa3b84a3d40e460a75be6d3ef650 Mon Sep 17 00:00:00 2001
      From: theSage21 
      Date: Sun, 31 May 2015 14:12:21 +0530
      Subject: [PATCH 166/479] Started adding docs. Added docs for 	- utils.py 
       - compat.py 	- config.py 	- cli.py
      
      Started adding docs for __init__.py
      
      Created the index of documentation.
      ---
       docs/about.md        |  10 +++++
       docs/how_it_works.md | 101 +++++++++++++++++++++++++++++++++++++++++++
       docs/index.md        |  10 +++++
       3 files changed, 121 insertions(+)
       create mode 100644 docs/about.md
       create mode 100644 docs/how_it_works.md
       create mode 100644 docs/index.md
      
      diff --git a/docs/about.md b/docs/about.md
      new file mode 100644
      index 00000000..972572f9
      --- /dev/null
      +++ b/docs/about.md
      @@ -0,0 +1,10 @@
      +About
      +-----
      +
      +html2text is a python package which converts a page of HTML into clean,
      +easy-to-read plain ASCII text. Better yet, that ASCII also happens to be
      +valid Markdown (a text-to-HTML format).
      +
      +It was origionally written by Aaron Swartz.
      +
      +The code is under GPL v3.
      diff --git a/docs/how_it_works.md b/docs/how_it_works.md
      new file mode 100644
      index 00000000..45400ba1
      --- /dev/null
      +++ b/docs/how_it_works.md
      @@ -0,0 +1,101 @@
      +Introduction
      +============
      +
      +
      +There are 5 components to the code. They are kept as seperate files in the
      +html2text directory. This part of the documentation explains them bit by bit.
      +
      +
      +compat.py
      +---------
      +
      +This part exists only to test compatibility with the available python standard libraries. Python3 relocated some libraries and so this file makes sure that everything has a common interface.
      +
      +config.py
      +---------
      +
      +Used to provide various configuration settings to the converter. They are as follows:
      +    - UNICODE_SNOB for using unicode
      +    - ESCAPE_SNOB for escaping every special character
      +    - LINKS_EACH_PARAGRAPH for putting links after every paragraph
      +    - BODY_WIDTH for wrapping long lines
      +    - SKIP_INTERNAL_LINKS to skip #local-anchor things
      +    - INLNE_LINKS for formatting images and links
      +    - PROTECT_LINKS protect from line breaks
      +    - GOOGLE_LIST_INDENT no of pixels to indent nested lists
      +    - IGNORE_ANCHORS
      +    - IGNORE_IMAGES
      +    - IMAGES_TO_ALT
      +    - IMAGES_WITH_SIZE
      +    - IGNORE_EMPHASIS
      +    - BYPASS_TABLES
      +    - SINGLE_LINE_BREAK to use a single line break rather than two
      +    - UNIFIABLE is a dictionary which maps unicode abbrevations to ASCII
      +                values
      +    - RE_SPACE for finding space-only lines
      +    - RE_UNESCAPE for finding html entities like  
      +    - RE_ORDERED_LIST_MATCHER for matching ordered lists in MD
      +    - RE_UNORDERED_LIST_MATCHER for matching unordered list matcher in MD
      +    - RE_MD_CHARS_MATCHER for matching Md \,[,],( and )
      +    - RE_MD_CHARS_MATCHER_ALL for matching `,*,_,{,},[,],(,),#,!
      +    - RE_MD_DOT_MATCHER for matching lines starting with 1.
      +    - RE_MD_PLUS_MATCHER for matching lines starting with +
      +    - RE_MD_DASH_MATCHER for matching lines starting with (-)
      +    - RE_SLASH_CHARS a string of slash escapeable characters
      +    - RE_MD_BACKSLASH_MATCHER to match \char
      +
      +utils.py
      +--------
      +
      +Used to provide utility functions to html2text
      +Some functions are:
      +    
      +    - name2cp                   :name to code point
      +    - hn                        :headings
      +    - dumb_preperty_dict        :hash of css attrs
      +    - dumb_css_parser           :returns a hash of css selectors, each
      +                                 containing a hash of css attrs
      +    - element_style             :hash of final style of element
      +    - google_list_style         :find out ordered?unordered
      +    - google_has_height         :does element have height?
      +    - google_text_emphasis      :a list of all emphasis modifiers
      +    - google_fixed_width_font   :check for fixed width font
      +    - list_numbering_start      :extract numbering from list elem attrs
      +    - skipwrap                  :skip wrap for give para or not?
      +    - wrapwrite                 :write to buffer
      +    - wrap_read                 :read from buffer
      +    - escape_md                 :escape md sensitive within other md
      +    - escape_md_section         :escape md sensitive accrose whole doc
      +
      +
      +cli.py
      +------
      +
      +Command line interface for the code.
      +
      +
      +| Option                                                 | Description
      +|--------------------------------------------------------|---------------------------------------------------
      +| `--version`                                            | Show program version number and exit
      +| `-h`, `--help`                                         | Show this help message and exit
      +| `--ignore-links`                                       | Do not include any formatting for links
      +|`--protect-links`                                       | Protect links from line breaks surrounding them "+" with angle brackets
      +|`--ignore-images`                                       | Do not include any formatting for images
      +|`--images-to-alt`                                       | Discard image data, only keep alt text
      +|`--images-with-size`                                    | Write image tags with height and width attrs as raw html to retain dimensions
      +|`-g`, `--google-doc`                                    | Convert an html-exported Google Document
      +|`-d`, `--dash-unordered-list`                           | Use a dash rather than a star for unordered list items
      +|`-b` `BODY_WIDTH`, `--body-width`=`BODY_WIDTH`          | Number of characters per output line, `0` for no wrap
      +|`-i` `LIST_INDENT`, `--google-list-indent`=`LIST_INDENT`| Number of pixels Google indents nested lists
      +|`-s`, `--hide-strikethrough`                            | Hide strike-through text. only relevent when `-g` is specified as well
      +|`--escape-all`                                          | Escape all special characters.  Output is less readable, but avoids corner case formatting issues.
      +| `--bypass-tables`                                      | Format tables in HTML rather than Markdown syntax.
      +| `--single-line-break`                                  | Use a single line break after a block element rather than two.
      +
      +
      +
      +__init__.py
      +-----------
      +
      +This is where everything comes together. This is the glue for all the
      +things we have described above.
      diff --git a/docs/index.md b/docs/index.md
      new file mode 100644
      index 00000000..5f9b23e7
      --- /dev/null
      +++ b/docs/index.md
      @@ -0,0 +1,10 @@
      +Html2Text
      +=========
      +
      +1. About
      +2. Authors
      +3. What is markdown
      +4. How it works
      +5. Usage
      +6. Contributing
      +7. Tests
      
      From 046aa5559f2f08cc48af301bbabf156e9ac8eb13 Mon Sep 17 00:00:00 2001
      From: theSage21 
      Date: Sun, 31 May 2015 14:35:00 +0530
      Subject: [PATCH 167/479] Added method list for HTML2Text class
      
      ---
       docs/how_it_works.md | 35 +++++++++++++++++++++++++++++++++++
       1 file changed, 35 insertions(+)
      
      diff --git a/docs/how_it_works.md b/docs/how_it_works.md
      index 45400ba1..e8bff9aa 100644
      --- a/docs/how_it_works.md
      +++ b/docs/how_it_works.md
      @@ -99,3 +99,38 @@ __init__.py
       
       This is where everything comes together. This is the glue for all the
       things we have described above.
      +
      +This file describes a single HTML2Text class which is itself a subclass of the HTMLParser in python
      +
      +Upon initialization it sets various config variables.
      +The class defines methods:
      +    - feed
      +    - handle
      +    - outtextf
      +    - close
      +    - handle_charref
      +    - handle_entityref
      +    - handle_starttag
      +    - handle_endtag
      +    - previousIndex
      +    - handle_emphasis
      +    - handle_tag
      +    - pbr
      +    - p
      +    - soft_br
      +    - o
      +    - handle_data
      +    - unknown_decl
      +    - charref
      +    - entityref
      +    - replaceEntities
      +    - unescape
      +    - google_nest_count
      +    - optwrap
      +
      +Besides this there are 2 more methods defined:
      +
      +    - html2text     :calls the HTML2Text class with .handle() method
      +    - unescape      :calls the HTML2Text class with .unescape() method
      +
      +The workflow of the application is simple.
      
      From 1ef63bb6863f6b4130547c14c5fc4af847a603dd Mon Sep 17 00:00:00 2001
      From: theSage21 
      Date: Sun, 31 May 2015 14:35:36 +0530
      Subject: [PATCH 168/479] Added todo in the html2text __init__.py file
      
      ---
       html2text/__init__.py | 2 ++
       1 file changed, 2 insertions(+)
      
      diff --git a/html2text/__init__.py b/html2text/__init__.py
      index 1f3e803f..2c118b38 100644
      --- a/html2text/__init__.py
      +++ b/html2text/__init__.py
      @@ -555,6 +555,7 @@ def handle_tag(self, tag, attrs, start):
                       self.pre = 0
                   self.p()
       
      +    # TODO: Add docstring for these one letter functions
           def pbr(self):
               if self.p_p == 0:
                   self.p_p = 1
      @@ -691,6 +692,7 @@ def handle_data(self, data):
               self.o(data, 1)
       
           def unknown_decl(self, data):
      +        # TODO: what is this doing here?
               pass
       
           def charref(self, name):
      
      From 6bceeadf19403dfa1409049c688d9acbaff6513c Mon Sep 17 00:00:00 2001
      From: theSage21 
      Date: Sun, 31 May 2015 15:34:57 +0530
      Subject: [PATCH 169/479] Added Aaron's affinity for google=search in the docs.
       Had me confused for quite a while.
      
      ---
       docs/index.md | 5 +++++
       1 file changed, 5 insertions(+)
      
      diff --git a/docs/index.md b/docs/index.md
      index 5f9b23e7..10825b2a 100644
      --- a/docs/index.md
      +++ b/docs/index.md
      @@ -8,3 +8,8 @@ Html2Text
       5. Usage
       6. Contributing
       7. Tests
      +
      +Note: Aaron regularly used the word 'google' in place of 'search' which
      +can be found throughout the code. Non native English speakers are 
      +advised to mentally make the substitution if that makes the code easier
      +to read.
      
      From f078122088322b8207b7fc427d63e17f928d29a0 Mon Sep 17 00:00:00 2001
      From: theSage21 
      Date: Sun, 31 May 2015 15:46:28 +0530
      Subject: [PATCH 170/479] Basic Docs completed. File layout and description of
       code is there. Function descriptions are not yet there.
      
      ---
       docs/authors.md      | 1 +
       docs/how_it_works.md | 6 +++---
       docs/index.md        | 8 ++++----
       3 files changed, 8 insertions(+), 7 deletions(-)
       create mode 120000 docs/authors.md
      
      diff --git a/docs/authors.md b/docs/authors.md
      new file mode 120000
      index 00000000..5b54fb58
      --- /dev/null
      +++ b/docs/authors.md
      @@ -0,0 +1 @@
      +../AUTHORS.rst
      \ No newline at end of file
      diff --git a/docs/how_it_works.md b/docs/how_it_works.md
      index e8bff9aa..f46f1e1f 100644
      --- a/docs/how_it_works.md
      +++ b/docs/how_it_works.md
      @@ -102,7 +102,9 @@ things we have described above.
       
       This file describes a single HTML2Text class which is itself a subclass of the HTMLParser in python
       
      -Upon initialization it sets various config variables.
      +Upon initialization it sets various config variables necessary for
      +processing the given html in a certain manner necessary to create valid
      +markdown text.
       The class defines methods:
           - feed
           - handle
      @@ -132,5 +134,3 @@ Besides this there are 2 more methods defined:
       
           - html2text     :calls the HTML2Text class with .handle() method
           - unescape      :calls the HTML2Text class with .unescape() method
      -
      -The workflow of the application is simple.
      diff --git a/docs/index.md b/docs/index.md
      index 10825b2a..17cf314c 100644
      --- a/docs/index.md
      +++ b/docs/index.md
      @@ -1,10 +1,10 @@
       Html2Text
       =========
       
      -1. About
      -2. Authors
      -3. What is markdown
      -4. How it works
      +1. [About](about.md)
      +2. [Authors](authors.md)
      +3. [What is markdown](http://daringfireball.net/projects/markdown/)
      +4. [How it works](how_it_works.md)
       5. Usage
       6. Contributing
       7. Tests
      
      From 897a207fb49b569acdbde27f0341a412d1c007c6 Mon Sep 17 00:00:00 2001
      From: Ali Mohammad 
      Date: Mon, 1 Jun 2015 17:11:31 -0400
      Subject: [PATCH 171/479] Proper handling of anchors with content that starts
       with tags that produce markdown (such as bold or code tags, etc.).
      
      ---
       html2text/__init__.py | 7 +++++++
       test/anchors.html     | 7 +++++++
       test/anchors.md       | 8 ++++++++
       3 files changed, 22 insertions(+)
       create mode 100644 test/anchors.html
       create mode 100644 test/anchors.md
      
      diff --git a/html2text/__init__.py b/html2text/__init__.py
      index 1f3e803f..2be3f7c2 100644
      --- a/html2text/__init__.py
      +++ b/html2text/__init__.py
      @@ -275,6 +275,13 @@ def handle_tag(self, tag, attrs, start):
               else:
                   attrs = dict(attrs)
       
      +        # first thing inside the anchor tag is another tag that produces some output
      +        if start and not self.maybe_automatic_link is None and \
      +             tag not in ['p', 'div', 'style', 'dl', 'dt'] and (tag != "img" or self.ignore_images):
      +            self.o("[")
      +            self.maybe_automatic_link = None
      +            self.empty_link = False
      +
               if self.google_doc:
                   # the attrs parameter is empty for a closing tag. in addition, we
                   # need the attributes of the parent nodes in order to get a
      diff --git a/test/anchors.html b/test/anchors.html
      new file mode 100644
      index 00000000..2a00d637
      --- /dev/null
      +++ b/test/anchors.html
      @@ -0,0 +1,7 @@
      +

      Processing hyperlinks

      + +

      Additional hyperlink tests!

      + +Bold Link +filename.py +The source code is called magic.py diff --git a/test/anchors.md b/test/anchors.md new file mode 100644 index 00000000..bd9fda5c --- /dev/null +++ b/test/anchors.md @@ -0,0 +1,8 @@ +# Processing hyperlinks + +Additional hyperlink tests! + +[**Bold Link**](http://some.link) +[`filename.py`](http://some.link/filename.py) [The source code is called +`magic.py`](http://some.link/magicsources.py) + From 3c5b7e641476d535fb4aad3581bb95cd3fdf963e Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 3 Jun 2015 09:35:09 +0530 Subject: [PATCH 172/479] Fix for issue #61 Added option to opt out of automatic links --- html2text/__init__.py | 3 ++- html2text/config.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 1f3e803f..4011de3f 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -69,6 +69,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.emphasis_mark = '_' self.strong_mark = '**' self.single_line_break = config.SINGLE_LINE_BREAK + self.use_automatic_links = config.USE_AUTOMATIC_LINKS if out is None: self.out = self.outtextf @@ -677,7 +678,7 @@ def handle_data(self, data): if not self.maybe_automatic_link is None: href = self.maybe_automatic_link - if href == data and self.absolute_url_matcher.match(href): + if href == data and self.absolute_url_matcher.match(href) and self.use_automatic_links: self.o("<" + data + ">") self.empty_link = False return diff --git a/html2text/config.py b/html2text/config.py index 2eef864b..07881fd9 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -33,6 +33,9 @@ IMAGES_WITH_SIZE = False IGNORE_EMPHASIS = False +# Convert links with same href and text to format if they are absolute links +USE_AUTOMATIC_LINKS = True + # For checking space-only lines on line 771 RE_SPACE = re.compile(r'\s\+') From dee3879773be78cf78cf08567d94890ad3ce429a Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 
3 Jun 2015 09:36:20 +0530 Subject: [PATCH 173/479] Added virtualenv to ignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 0eaa5553..de204b56 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ dist *.egg-info .idea .coverage +env/ From 4378be4837bd99ab2fb0a9c59723a396e32c618e Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 3 Jun 2015 09:43:59 +0530 Subject: [PATCH 174/479] Added Arjoonn Sharma to authors --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 7e05a004..0d90205f 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -14,6 +14,7 @@ The AUTHORS/Contributors are (and/or have been): * Miguel Tavares * Scott Blackburn * Peter Wu +* Arjoonn Sharma Maintainer: From 614bb1ec09c4f2682d63b1a9621f18c6d65bed29 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 3 Jun 2015 13:19:33 +0530 Subject: [PATCH 175/479] Added version one of the docs. Simple description of features and explanations --- docs/about.md | 3 ++ docs/contributing.md | 9 +++++ docs/how_it_works.md | 4 +++ docs/index.md | 11 ++---- docs/test.md | 13 +++++++ docs/usage.md | 83 ++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 115 insertions(+), 8 deletions(-) create mode 100644 docs/contributing.md create mode 100644 docs/test.md create mode 100644 docs/usage.md diff --git a/docs/about.md b/docs/about.md index 972572f9..72d50639 100644 --- a/docs/about.md +++ b/docs/about.md @@ -8,3 +8,6 @@ valid Markdown (a text-to-HTML format). It was origionally written by Aaron Swartz. The code is under GPL v3. + +The module is based on the html parser in the python standard library +and so any valid input for the parser is valid input for the library. diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 00000000..9507f58f --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,9 @@ +Pull requests are welcome. 
+ +The package is developed [here](https://github.com/Alir3z4/html2text) + +Pull guidelines +--------------- + +- Make the changes modular. The usual method is one change per commit. +- That is all diff --git a/docs/how_it_works.md b/docs/how_it_works.md index f46f1e1f..ba3cc894 100644 --- a/docs/how_it_works.md +++ b/docs/how_it_works.md @@ -43,6 +43,7 @@ Used to provide various configuration settings to the converter. They are as fol - RE_MD_DASH_MATCHER for matching lines starting with (-) - RE_SLASH_CHARS a string of slash escapeable characters - RE_MD_BACKSLASH_MATCHER to match \char + - USE_AUTOMATIC_LINKS to convert http://xyz to utils.py -------- @@ -134,3 +135,6 @@ Besides this there are 2 more methods defined: - html2text :calls the HTML2Text class with .handle() method - unescape :calls the HTML2Text class with .unescape() method + +What they do is provide methods to make the HTML parser in python +parse the HTML and convert to markdown. diff --git a/docs/index.md b/docs/index.md index 17cf314c..878b7903 100644 --- a/docs/index.md +++ b/docs/index.md @@ -5,11 +5,6 @@ Html2Text 2. [Authors](authors.md) 3. [What is markdown](http://daringfireball.net/projects/markdown/) 4. [How it works](how_it_works.md) -5. Usage -6. Contributing -7. Tests - -Note: Aaron regularly used the word 'google' in place of 'search' which -can be found throughout the code. Non native English speakers are -advised to mentally make the substitution if that makes the code easier -to read. +5. [Usage](usage.md) +6. [Contributing](contributing.md) +7. [Tests](test.md) diff --git a/docs/test.md b/docs/test.md new file mode 100644 index 00000000..2feb5eb0 --- /dev/null +++ b/docs/test.md @@ -0,0 +1,13 @@ +Tests +===== + +Testing is essential. + +Run the tests +------------- + +`PYTHONPATH=$PYTHONPATH:. coverage run --source=html2text setup.py test -v` + +New tests +--------- +New tests are always welcome see [contributing](contributing.md) for guidelines. 
diff --git a/docs/usage.md b/docs/usage.md new file mode 100644 index 00000000..d01c1147 --- /dev/null +++ b/docs/usage.md @@ -0,0 +1,83 @@ +Usage +===== + +The module is simple enough to use. This tutorial will get you started. + +Installing +---------- + +These are the methods you can get the module installed:- + 1. `pip install html2text` for those who have pip + 2. Clone the repository from `https://github.com/Alir3z4/html2text.git` + 1. `git clone https://github.com/Alir3z4/html2text` + 2. `python setup build` + 3. `python setup install` + + +Basic Usage +----------- + +Once installed the module can be used as follows. + +>import html2text +>html = function_to_get_some_html() +>text = html2text.html2text(html) +>print(text) + +This converts the provided html to text( Markdown text) with all the +options set to default. + +Using Options +-------------- + +To customize the options provided by the module the usage is as follows: + +>import html2text +>text_maker = html2text.HTML2Text() +>text_maker.ignore_links = True +>text_maker.bypass_tables = False +>html = function_to_get_some_html() +>text = text_maker.handle(html) +>print(text) + + +Available Options +----------------- + +All options exist in the config.py file. A list is provided here with +simple indications of their function. 
+ + + - UNICODE_SNOB for using unicode + - ESCAPE_SNOB for escaping every special character + - LINKS_EACH_PARAGRAPH for putting links after every paragraph + - BODY_WIDTH for wrapping long lines + - SKIP_INTERNAL_LINKS to skip #local-anchor things + - INLNE_LINKS for formatting images and links + - PROTECT_LINKS protect from line breaks + - GOOGLE_LIST_INDENT no of pixels to indent nested lists + - IGNORE_ANCHORS + - IGNORE_IMAGES + - IMAGES_TO_ALT + - IMAGES_WITH_SIZE + - IGNORE_EMPHASIS + - BYPASS_TABLES + - SINGLE_LINE_BREAK to use a single line break rather than two + - UNIFIABLE is a dictionary which maps unicode abbrevations to ASCII + values + - RE_SPACE for finding space-only lines + - RE_UNESCAPE for finding html entities like   + - RE_ORDERED_LIST_MATCHER for matching ordered lists in MD + - RE_UNORDERED_LIST_MATCHER for matching unordered list matcher in MD + - RE_MD_CHARS_MATCHER for matching Md \,[,],( and ) + - RE_MD_CHARS_MATCHER_ALL for matching `,*,_,{,},[,],(,),#,! + - RE_MD_DOT_MATCHER for matching lines starting with 1. + - RE_MD_PLUS_MATCHER for matching lines starting with + + - RE_MD_DASH_MATCHER for matching lines starting with (-) + - RE_SLASH_CHARS a string of slash escapeable characters + - RE_MD_BACKSLASH_MATCHER to match \char + - USE_AUTOMATIC_LINKS to convert http://xyz to + +To alter any option the procedure is to create a parser with +`parser = html2text.HTML2Text()` and to set the option on the parser. +example: `parser.unicode_snob = True` to set the UNICODE_SNOB option. From 54cf32492c0b9303753f28dc11aaa19d8c0ab739 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 3 Jun 2015 13:24:45 +0530 Subject: [PATCH 176/479] Updated readme to include docs --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 8f08046a..590c7dc2 100644 --- a/README.md +++ b/README.md @@ -83,3 +83,7 @@ $ pip install html2text ## How to run unit tests PYTHONPATH=$PYTHONPATH:. 
coverage run --source=html2text setup.py test -v + +## Documentation + +Documentation lives [here](docs/index.md) From f59e2762818882e07d42aed819608d937a73b4f9 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 3 Jun 2015 13:27:32 +0530 Subject: [PATCH 177/479] Fixed authors documentation --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 878b7903..98529239 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,7 +2,7 @@ Html2Text ========= 1. [About](about.md) -2. [Authors](authors.md) +2. [Authors](../AUTHORS.rst) 3. [What is markdown](http://daringfireball.net/projects/markdown/) 4. [How it works](how_it_works.md) 5. [Usage](usage.md) From ab5e5ea8b951dae783ec182dce87afe8b963412a Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 3 Jun 2015 13:29:33 +0530 Subject: [PATCH 178/479] Fixed list --- docs/how_it_works.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/how_it_works.md b/docs/how_it_works.md index ba3cc894..a045aa35 100644 --- a/docs/how_it_works.md +++ b/docs/how_it_works.md @@ -15,6 +15,7 @@ config.py --------- Used to provide various configuration settings to the converter. They are as follows: + - UNICODE_SNOB for using unicode - ESCAPE_SNOB for escaping every special character - LINKS_EACH_PARAGRAPH for putting links after every paragraph From 98b9a94e942d6119b7b12bb0ea0fad9c682d12e0 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 3 Jun 2015 13:30:21 +0530 Subject: [PATCH 179/479] Fixed another list --- docs/how_it_works.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/how_it_works.md b/docs/how_it_works.md index a045aa35..89b07e88 100644 --- a/docs/how_it_works.md +++ b/docs/how_it_works.md @@ -108,6 +108,7 @@ Upon initialization it sets various config variables necessary for processing the given html in a certain manner necessary to create valid markdown text. 
The class defines methods: + - feed - handle - outtextf From 04d5c269ced47aa536c74abf32bc7456193c7b54 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 3 Jun 2015 13:32:37 +0530 Subject: [PATCH 180/479] Fixed code documentation --- docs/usage.md | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index d01c1147..5e445630 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -7,6 +7,7 @@ Installing ---------- These are the methods you can get the module installed:- + 1. `pip install html2text` for those who have pip 2. Clone the repository from `https://github.com/Alir3z4/html2text.git` 1. `git clone https://github.com/Alir3z4/html2text` @@ -19,10 +20,10 @@ Basic Usage Once installed the module can be used as follows. ->import html2text ->html = function_to_get_some_html() ->text = html2text.html2text(html) ->print(text) + import html2text + html = function_to_get_some_html() + text = html2text.html2text(html) + print(text) This converts the provided html to text( Markdown text) with all the options set to default. 
@@ -32,13 +33,13 @@ Using Options To customize the options provided by the module the usage is as follows: ->import html2text ->text_maker = html2text.HTML2Text() ->text_maker.ignore_links = True ->text_maker.bypass_tables = False ->html = function_to_get_some_html() ->text = text_maker.handle(html) ->print(text) + import html2text + text_maker = html2text.HTML2Text() + text_maker.ignore_links = True + text_maker.bypass_tables = False + html = function_to_get_some_html() + text = text_maker.handle(html) + print(text) Available Options From a1177a5916486e58ad73b38b89257994ea5f1b57 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 3 Jun 2015 13:57:44 +0530 Subject: [PATCH 181/479] Push to add docs --- test/malformed_link_output.html | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 test/malformed_link_output.html diff --git a/test/malformed_link_output.html b/test/malformed_link_output.html deleted file mode 100644 index 5e949a9d..00000000 --- a/test/malformed_link_output.html +++ /dev/null @@ -1,8 +0,0 @@ - - - http://www.test.com - http://www.test.com - test - test - - From e7c3646601532bf0f5f90fa68d473a55179c6d8a Mon Sep 17 00:00:00 2001 From: Ali Mohammad Date: Wed, 3 Jun 2015 15:19:14 -0400 Subject: [PATCH 182/479] Update AUTHORS.rst --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 0d90205f..aa55be86 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -15,6 +15,7 @@ The AUTHORS/Contributors are (and/or have been): * Scott Blackburn * Peter Wu * Arjoonn Sharma +* Ali Mohammad Maintainer: From 7e5aab97a5a294e0ced09cedb634004fec873651 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Thu, 4 Jun 2015 09:11:54 +0530 Subject: [PATCH 183/479] Added some more tests --- test/GoogleDocMassDownload.html | 2 +- test/apos_element.html | 5 +++++ test/apos_element.md | 2 ++ test/header_tags.html | 17 +++++++++++++++++ test/header_tags.md | 20 ++++++++++++++++++++ test/horizontal_rule.html | 5 +++++ test/horizontal_rule.md 
| 2 ++ 7 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 test/apos_element.html create mode 100644 test/apos_element.md create mode 100644 test/header_tags.html create mode 100644 test/header_tags.md create mode 100644 test/horizontal_rule.html create mode 100644 test/horizontal_rule.md diff --git a/test/GoogleDocMassDownload.html b/test/GoogleDocMassDownload.html index f31e90e7..784cf142 100644 --- a/test/GoogleDocMassDownload.html +++ b/test/GoogleDocMassDownload.html @@ -6,7 +6,7 @@ @import url(https://themes.googleusercontent.com/fonts/css?kit=lhDjYqiy3mZ0x6ROQEUoUw); -

      +

      diff --git a/test/apos_element.html b/test/apos_element.html new file mode 100644 index 00000000..bce374e2 --- /dev/null +++ b/test/apos_element.html @@ -0,0 +1,5 @@ + + + ' + + diff --git a/test/apos_element.md b/test/apos_element.md new file mode 100644 index 00000000..12d94f4f --- /dev/null +++ b/test/apos_element.md @@ -0,0 +1,2 @@ +' + diff --git a/test/header_tags.html b/test/header_tags.html new file mode 100644 index 00000000..aad3e661 --- /dev/null +++ b/test/header_tags.html @@ -0,0 +1,17 @@ + + +

      H1

      +

      H1

      +

      H1

      +

      H1

      +
      H1
      +
      H1
      + H1 + H1 + H1 + H1 + H1 + H1 + NO number + + diff --git a/test/header_tags.md b/test/header_tags.md new file mode 100644 index 00000000..80a23023 --- /dev/null +++ b/test/header_tags.md @@ -0,0 +1,20 @@ +# H1 + +## H1 + +### H1 + +#### H1 + +##### H1 + +###### H1 + +####### H1 + +######## H1 + +######### H1 + +H1 H1 H1 NO number + diff --git a/test/horizontal_rule.html b/test/horizontal_rule.html new file mode 100644 index 00000000..f159f52c --- /dev/null +++ b/test/horizontal_rule.html @@ -0,0 +1,5 @@ + + +
      + + diff --git a/test/horizontal_rule.md b/test/horizontal_rule.md new file mode 100644 index 00000000..8ccef4e8 --- /dev/null +++ b/test/horizontal_rule.md @@ -0,0 +1,2 @@ +* * * + From b3f89cdc80b89608f21ba13dff37c0c106ef6696 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Thu, 4 Jun 2015 10:05:55 +0530 Subject: [PATCH 184/479] Added more tests --- test/abbr_tag.html | 1 + test/abbr_tag.md | 4 ++++ test/blockquote_example.html | 3 +++ test/blockquote_example.md | 2 ++ test/bold_inside_link.html | 2 ++ test/bold_inside_link.md | 2 ++ 6 files changed, 14 insertions(+) create mode 100644 test/abbr_tag.html create mode 100644 test/abbr_tag.md create mode 100644 test/blockquote_example.html create mode 100644 test/blockquote_example.md create mode 100644 test/bold_inside_link.html create mode 100644 test/bold_inside_link.md diff --git a/test/abbr_tag.html b/test/abbr_tag.html new file mode 100644 index 00000000..e0e4a33c --- /dev/null +++ b/test/abbr_tag.html @@ -0,0 +1 @@ +TLA diff --git a/test/abbr_tag.md b/test/abbr_tag.md new file mode 100644 index 00000000..0b030ff3 --- /dev/null +++ b/test/abbr_tag.md @@ -0,0 +1,4 @@ +TLA + + *[TLA]: Three Letter Acronym + diff --git a/test/blockquote_example.html b/test/blockquote_example.html new file mode 100644 index 00000000..1749fb09 --- /dev/null +++ b/test/blockquote_example.html @@ -0,0 +1,3 @@ +
      +The time has come, the Walrus said, to speak of many things. +
      diff --git a/test/blockquote_example.md b/test/blockquote_example.md new file mode 100644 index 00000000..69859650 --- /dev/null +++ b/test/blockquote_example.md @@ -0,0 +1,2 @@ +> The time has come, the Walrus said, to speak of many things. + diff --git a/test/bold_inside_link.html b/test/bold_inside_link.html new file mode 100644 index 00000000..278603e7 --- /dev/null +++ b/test/bold_inside_link.html @@ -0,0 +1,2 @@ +Text +sample diff --git a/test/bold_inside_link.md b/test/bold_inside_link.md new file mode 100644 index 00000000..d2738ccf --- /dev/null +++ b/test/bold_inside_link.md @@ -0,0 +1,2 @@ +[**Text**](link.htm) [**sample**](/nothing/) + From 9b6ba81e55eaad539fd82566b537b35ae519a56d Mon Sep 17 00:00:00 2001 From: theSage21 Date: Thu, 4 Jun 2015 10:06:24 +0530 Subject: [PATCH 185/479] Changed ordering to increase coverage --- html2text/compat.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/html2text/compat.py b/html2text/compat.py index 4bb4824d..b02fa17e 100644 --- a/html2text/compat.py +++ b/html2text/compat.py @@ -1,11 +1,11 @@ try: + import urllib.parse as urlparse + import html.entities as htmlentitydefs + import html.parser as HTMLParser +except ImportError: # Python2 import htmlentitydefs import urlparse import HTMLParser -except ImportError: # Python3 - import html.entities as htmlentitydefs - import urllib.parse as urlparse - import html.parser as HTMLParser try: # Python3 import urllib.request as urllib except ImportError: From 36fe88b52a4c4b82d0f861c093a4c2f1757a0c33 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Thu, 4 Jun 2015 10:27:53 +0530 Subject: [PATCH 186/479] Tested for reference style links --- html2text/cli.py | 8 ++++++++ test/no_inline_links_example.html | 1 + test/no_inline_links_example.md | 4 ++++ test/test_html2text.py | 8 ++++++-- 4 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 test/no_inline_links_example.html create mode 100644 test/no_inline_links_example.md diff --git 
a/html2text/cli.py b/html2text/cli.py index bd3a2ce2..a3eb7da9 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -17,6 +17,13 @@ def main(): default=config.IGNORE_EMPHASIS, help="don't include any formatting for emphasis" ) + p.add_option( + "--reference-links", + dest="inline_links", + action="store_false", + default=config.INLINE_LINKS, + help="use reference style links instead of inline links" + ) p.add_option( "--ignore-links", dest="ignore_links", @@ -180,5 +187,6 @@ def main(): h.escape_snob = options.escape_snob h.bypass_tables = options.bypass_tables h.single_line_break = options.single_line_break + h.inline_links = options.inline_links wrapwrite(h.handle(data)) diff --git a/test/no_inline_links_example.html b/test/no_inline_links_example.html new file mode 100644 index 00000000..982f54c5 --- /dev/null +++ b/test/no_inline_links_example.html @@ -0,0 +1 @@ +Googler diff --git a/test/no_inline_links_example.md b/test/no_inline_links_example.md new file mode 100644 index 00000000..2593475a --- /dev/null +++ b/test/no_inline_links_example.md @@ -0,0 +1,4 @@ +[Googler][1] + + [1]: http://google.com + diff --git a/test/test_html2text.py b/test/test_html2text.py index 0be96764..60cd0a11 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -93,7 +93,7 @@ def test_mod(self): def test_cmd(self): # Because there is no command-line option to control unicode_snob - if not 'unicode_snob' in module_args: + if 'unicode_snob' not in module_args: self.maxDiff = None result, actual = test_command(fn, *cmdline_args) self.assertEqual(result, actual) @@ -123,7 +123,7 @@ def test_cmd(self): cmdline_args.append('--bypass-tables') if base_fn.startswith('bodywidth'): - #module_args['unicode_snob'] = True + # module_args['unicode_snob'] = True module_args['body_width'] = 0 cmdline_args.append('--body-width=0') @@ -145,6 +145,10 @@ def test_cmd(self): module_args['single_line_break'] = True cmdline_args.append('--single-line-break') + if 
base_fn.startswith('no_inline_links'): + module_args['inline_links'] = False + cmdline_args.append('--reference-links') + return test_mod, test_cmd # Originally from http://stackoverflow.com/questions/32899/\ From d3d0eaabb3f5a57e9db4dc8f561d7ceef8be01fd Mon Sep 17 00:00:00 2001 From: theSage21 Date: Thu, 4 Jun 2015 10:37:14 +0530 Subject: [PATCH 187/479] More tests for image alt --- test/no_inline_links_images_to_alt.html | 7 +++++++ test/no_inline_links_images_to_alt.md | 8 ++++++++ test/no_inline_links_nested.html | 1 + test/no_inline_links_nested.md | 6 ++++++ 4 files changed, 22 insertions(+) create mode 100644 test/no_inline_links_images_to_alt.html create mode 100644 test/no_inline_links_images_to_alt.md create mode 100644 test/no_inline_links_nested.html create mode 100644 test/no_inline_links_nested.md diff --git a/test/no_inline_links_images_to_alt.html b/test/no_inline_links_images_to_alt.html new file mode 100644 index 00000000..83c395d9 --- /dev/null +++ b/test/no_inline_links_images_to_alt.html @@ -0,0 +1,7 @@ + +ALT TEXT + +
      +ALT TEXT +
      +http://example.com diff --git a/test/no_inline_links_images_to_alt.md b/test/no_inline_links_images_to_alt.md new file mode 100644 index 00000000..83266dfe --- /dev/null +++ b/test/no_inline_links_images_to_alt.md @@ -0,0 +1,8 @@ +[ ![ALT TEXT][1] ][2] +[![ALT TEXT][1]][2] +[![http://example.com][1]][2] + + [1]: http://example.com/img.png + + [2]: http://example.com + diff --git a/test/no_inline_links_nested.html b/test/no_inline_links_nested.html new file mode 100644 index 00000000..2dc9b075 --- /dev/null +++ b/test/no_inline_links_nested.html @@ -0,0 +1 @@ +thisthat diff --git a/test/no_inline_links_nested.md b/test/no_inline_links_nested.md new file mode 100644 index 00000000..40d5abb2 --- /dev/null +++ b/test/no_inline_links_nested.md @@ -0,0 +1,6 @@ +[[this][1]that][2] + + [1]: /test2/ + + [2]: http://google.com + From c1034aa9d4412d13569767f0193b810f2f4b0373 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Thu, 4 Jun 2015 10:54:57 +0530 Subject: [PATCH 188/479] Description list tests --- test/dl_tag_example.html | 22 ++++++++++++++++++++++ test/dl_tag_example.md | 27 +++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 test/dl_tag_example.html create mode 100644 test/dl_tag_example.md diff --git a/test/dl_tag_example.html b/test/dl_tag_example.html new file mode 100644 index 00000000..08e629a6 --- /dev/null +++ b/test/dl_tag_example.html @@ -0,0 +1,22 @@ +
      +
      Definition List
      +
      A list of terms and their definitions/descriptions.
      +
      Ordered List
      +
      A numbered list.
      +
      Unordered List
      +
      An unnumbered list.
      +
      + +

      Example 2

      +
      +
      Vocals
      +
      Bruce Dickinson
      +
      Guitar
      +
      Adrian Smith
      +
      Dave Murray
      +
      Janick Gers
      +
      Bass
      +
      Steve Harris
      +
      Drums
      +
      Nicko McBrain
      +
      diff --git a/test/dl_tag_example.md b/test/dl_tag_example.md new file mode 100644 index 00000000..b78dcabe --- /dev/null +++ b/test/dl_tag_example.md @@ -0,0 +1,27 @@ +Definition List + + A list of terms and their definitions/descriptions. +Ordered List + + A numbered list. +Unordered List + + An unnumbered list. + +#### Example 2 + +Vocals + + Bruce Dickinson +Guitar + + Adrian Smith + Dave Murray + Janick Gers +Bass + + Steve Harris +Drums + + Nicko McBrain + From d7cf8450c8b79b4b75309e0b980dcb1c20d1eeea Mon Sep 17 00:00:00 2001 From: theSage21 Date: Thu, 4 Jun 2015 21:40:29 +0530 Subject: [PATCH 189/479] Added docs for --reference-links flag in command line --- README.md | 2 ++ docs/how_it_works.md | 1 + 2 files changed, 3 insertions(+) diff --git a/README.md b/README.md index 8f08046a..c16a0e8b 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,8 @@ Usage: `html2text [(filename|url) [encoding]]` |`--escape-all` | Escape all special characters. Output is less readable, but avoids corner case formatting issues. | `--bypass-tables` | Format tables in HTML rather than Markdown syntax. | `--single-line-break` | Use a single line break after a block element rather than two. +| `--reference-links` | Use reference links instead of links to create markdown + Or you can use it from within `Python`: diff --git a/docs/how_it_works.md b/docs/how_it_works.md index ba3cc894..6c222a5f 100644 --- a/docs/how_it_works.md +++ b/docs/how_it_works.md @@ -92,6 +92,7 @@ Command line interface for the code. |`--escape-all` | Escape all special characters. Output is less readable, but avoids corner case formatting issues. | `--bypass-tables` | Format tables in HTML rather than Markdown syntax. | `--single-line-break` | Use a single line break after a block element rather than two. 
+| `--reference-links` | Use reference links instead of inline links to create markdown From 88759f3dd14013f341760a64523156217c348861 Mon Sep 17 00:00:00 2001 From: nikolas Date: Thu, 4 Jun 2015 16:41:20 -0400 Subject: [PATCH 190/479] Fix typo in config doc --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 5e445630..2605f518 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -54,7 +54,7 @@ simple indications of their function. - LINKS_EACH_PARAGRAPH for putting links after every paragraph - BODY_WIDTH for wrapping long lines - SKIP_INTERNAL_LINKS to skip #local-anchor things - - INLNE_LINKS for formatting images and links + - INLINE_LINKS for formatting images and links - PROTECT_LINKS protect from line breaks - GOOGLE_LIST_INDENT no of pixels to indent nested lists - IGNORE_ANCHORS From 1885442c98f03c7d2a28ae6f27f5b22d68743506 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 5 Jun 2015 10:59:50 +0400 Subject: [PATCH 191/479] Remove extra char to fix messy syntax errros --- ChangeLog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 4d5fe984..cdb25d5e 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -16,7 +16,7 @@ 2015.2.18 -========== +========= ---- * Fix #38: Anchor tags with empty text or with `` tags inside are no longer stripped. From 421bdb1409e2d908a8101e6e8e4e435160e5e359 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 5 Jun 2015 11:14:27 +0400 Subject: [PATCH 192/479] Update changelog with the latest changes Making it ready for release. --- ChangeLog.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index cdb25d5e..11aa216c 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,16 @@ +2015.6.5 +======== + +* Fix #24: ``3.200.3`` vs ``2014.7.3`` output quirks. +* Fix #61. Malformed links in markdown output. +* Fix #63: Nested code, anchor bug. 
+* Fix #64: Proper handling of anchors with content that starts with tags. +* Feature #67: Documentation all over the module. +that produce markdown produce markdown. +* Feature #70: Adding tests for the module. +* Fix #73: Typo in config documentation. + + 2015.4.14 ========= ---- From 0d32483d3a6d5eeaeab1daf09f83949c2bc8059d Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 5 Jun 2015 11:16:27 +0400 Subject: [PATCH 193/479] Better way of getting version number from module Fixes #62 --- ChangeLog.rst | 1 + html2text/__init__.py | 2 +- html2text/cli.py | 6 ++++-- setup.py | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 11aa216c..b6cf059d 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -3,6 +3,7 @@ * Fix #24: ``3.200.3`` vs ``2014.7.3`` output quirks. * Fix #61. Malformed links in markdown output. +* Feature #62: Automatic version number. * Fix #63: Nested code, anchor bug. * Fix #64: Proper handling of anchors with content that starts with tags. * Feature #67: Documentation all over the module. 
diff --git a/html2text/__init__.py b/html2text/__init__.py index 6f918936..c45236b3 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -29,7 +29,7 @@ skipwrap ) -__version__ = "2015.4.14" +__version__ = (2015, 6, 5) # TODO: diff --git a/html2text/cli.py b/html2text/cli.py index a3eb7da9..a2de1d20 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -8,8 +8,10 @@ def main(): baseurl = '' - p = optparse.OptionParser('%prog [(filename|url) [encoding]]', - version='%prog ' + __version__) + p = optparse.OptionParser( + '%prog [(filename|url) [encoding]]', + version='%prog ' + ".".join(map(str, __version__)) + ) p.add_option( "--ignore-emphasis", dest="ignore_emphasis", diff --git a/setup.py b/setup.py index 24c284d0..4dbbdf41 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def run(self): setup( name="html2text", - version="2015.4.14", + version=".".join(map(str, __import__('html2text').__version__)), description="Turn HTML into equivalent Markdown-structured text.", author="Aaron Swartz", author_email="me@aaronsw.com", From 1e0de14cd7ec0dc43dce1fb7040f58b0df1384f2 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 5 Jun 2015 11:35:39 +0400 Subject: [PATCH 194/479] Remove the typo from the ChangeLog.rst --- ChangeLog.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index b6cf059d..564ff46a 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -7,7 +7,6 @@ * Fix #63: Nested code, anchor bug. * Fix #64: Proper handling of anchors with content that starts with tags. * Feature #67: Documentation all over the module. -that produce markdown produce markdown. * Feature #70: Adding tests for the module. * Fix #73: Typo in config documentation. 
From 894269c93dfccbcb3aac72aeff0c181cfc5d671e Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 5 Jun 2015 11:38:28 +0400 Subject: [PATCH 195/479] Fix verion number typo --- ChangeLog.rst | 2 +- html2text/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 564ff46a..cee71f91 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,4 +1,4 @@ -2015.6.5 +2015.6.6 ======== * Fix #24: ``3.200.3`` vs ``2014.7.3`` output quirks. diff --git a/html2text/__init__.py b/html2text/__init__.py index c45236b3..b4f1530e 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -29,7 +29,7 @@ skipwrap ) -__version__ = (2015, 6, 5) +__version__ = (2015, 6, 6) # TODO: From 1d8f553be95ca4db8327a795d4d97db395841a4a Mon Sep 17 00:00:00 2001 From: theSage21 Date: Tue, 9 Jun 2015 16:12:46 +0530 Subject: [PATCH 196/479] Coverage now detects the command line tests which were run in subprocesses. --- .coveragerc | 5 +++++ README.md | 7 +++++++ docs/test.md | 7 +++++++ sitecustomize.py | 6 ++++++ 4 files changed, 25 insertions(+) create mode 100644 .coveragerc create mode 100644 sitecustomize.py diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..c03d93f3 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,5 @@ +[run] +branch=True +parallel=True +cover_pylib=False +source=./html2text diff --git a/README.md b/README.md index 0385446e..3ea44939 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,13 @@ $ pip install html2text PYTHONPATH=$PYTHONPATH:. coverage run --source=html2text setup.py test -v + To see the coverage results: + + `coverage combine` + `coverage html` + + then open the `./htmlcov/index.html` file in your browser. + ## Documentation Documentation lives [here](docs/index.md) diff --git a/docs/test.md b/docs/test.md index 2feb5eb0..89ae8b03 100644 --- a/docs/test.md +++ b/docs/test.md @@ -8,6 +8,13 @@ Run the tests `PYTHONPATH=$PYTHONPATH:. 
coverage run --source=html2text setup.py test -v` +Coverage results can be seen with + +`coverage combine` +`coverage html` + +and then opening the `./htmlcov/index.html` file with your browser. + New tests --------- New tests are always welcome see [contributing](contributing.md) for guidelines. diff --git a/sitecustomize.py b/sitecustomize.py new file mode 100644 index 00000000..8a045a92 --- /dev/null +++ b/sitecustomize.py @@ -0,0 +1,6 @@ +import os +import coverage +here = os.getcwd() +config_file = os.path.join(here, '.coveragerc') +os.environ['COVERAGE_PROCESS_START'] = config_file +coverage.process_startup() From 5de4bb2d801053c3846910c476ac1ee6b9c8628e Mon Sep 17 00:00:00 2001 From: theSage21 Date: Tue, 9 Jun 2015 16:18:08 +0530 Subject: [PATCH 197/479] README beauty fix --- README.md | 8 ++++---- docs/test.md | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 3ea44939..7b460aa2 100644 --- a/README.md +++ b/README.md @@ -86,12 +86,12 @@ $ pip install html2text PYTHONPATH=$PYTHONPATH:. coverage run --source=html2text setup.py test -v - To see the coverage results: +To see the coverage results: - `coverage combine` - `coverage html` + coverage combine + coverage html - then open the `./htmlcov/index.html` file in your browser. +then open the `./htmlcov/index.html` file in your browser. ## Documentation diff --git a/docs/test.md b/docs/test.md index 89ae8b03..1ff19d91 100644 --- a/docs/test.md +++ b/docs/test.md @@ -11,6 +11,7 @@ Run the tests Coverage results can be seen with `coverage combine` + `coverage html` and then opening the `./htmlcov/index.html` file with your browser. 
From 06c8be3bfb6997a5202caf4ed1ce4a413f2369e3 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Tue, 9 Jun 2015 22:06:40 +0530 Subject: [PATCH 198/479] Coveralls fix --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 66b5fe40..27348e27 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,4 +13,5 @@ before_script: script: PYTHONPATH=$PYTHONPATH:. coverage run --source=html2text setup.py test -v after_success: + coverage combine coveralls From cece37f4a80485d380ac01af90f3063175c991c7 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Tue, 9 Jun 2015 22:40:47 +0530 Subject: [PATCH 199/479] Travis fix --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 27348e27..fc1460ea 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,5 +13,5 @@ before_script: script: PYTHONPATH=$PYTHONPATH:. coverage run --source=html2text setup.py test -v after_success: - coverage combine - coveralls + -coverage combine + -coveralls From e136803138583a7567c38d97b848a49ef7dd8a51 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Tue, 9 Jun 2015 22:55:54 +0530 Subject: [PATCH 200/479] Last travis fix attempt for the night --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index fc1460ea..38e0d164 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ install: before_script: - '[ "${TRAVIS_PYTHON_VERSION}" = "2.6" ] && pip install --use-mirrors unittest2 || /bin/true' script: - PYTHONPATH=$PYTHONPATH:. coverage run --source=html2text setup.py test -v + -PYTHONPATH=$PYTHONPATH:. 
coverage run --source=html2text setup.py test -v + -coverage combine after_success: - -coverage combine - -coveralls + coveralls From 8ceaa35f96fc9f01fe8ea71cfc635731f925f8c6 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Tue, 9 Jun 2015 23:00:20 +0530 Subject: [PATCH 201/479] travisymp fix again --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 38e0d164..60a6ea7e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ install: before_script: - '[ "${TRAVIS_PYTHON_VERSION}" = "2.6" ] && pip install --use-mirrors unittest2 || /bin/true' script: - -PYTHONPATH=$PYTHONPATH:. coverage run --source=html2text setup.py test -v - -coverage combine + - PYTHONPATH=$PYTHONPATH:. coverage run --source=html2text setup.py test -v + - coverage combine after_success: coveralls From 3e5b0d71d2461143264d59499e6576f69319d6f0 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 10 Jun 2015 08:18:42 +0530 Subject: [PATCH 202/479] Travis yml fix --- .travis.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 66b5fe40..97b2eb5a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,7 +10,10 @@ install: - pip install coveralls before_script: - '[ "${TRAVIS_PYTHON_VERSION}" = "2.6" ] && pip install --use-mirrors unittest2 || /bin/true' + - export COVERAGE_PROCESS_START=$PWD/.coveragerc script: - PYTHONPATH=$PYTHONPATH:. coverage run --source=html2text setup.py test -v + - PYTHONPATH=$PYTHONPATH:. coverage run --source=html2text --rcfile=.coveragerc setup.py test -v + - coverage combine + - coverage report after_success: coveralls From 78d7534765084522ce6b4655ed6e02ffe6fa596b Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 10 Jun 2015 18:12:50 +0530 Subject: [PATCH 203/479] Another travis fix attempt. 
--- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 97b2eb5a..a744223e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,4 +16,4 @@ script: - coverage combine - coverage report after_success: - coveralls + coveralls --rcfile=.coveragerc From 0f81ccf034eb00ded1ed93a09bc9da1f4f9399bc Mon Sep 17 00:00:00 2001 From: Albert Berger Date: Thu, 11 Jun 2015 02:50:07 +0300 Subject: [PATCH 204/479] Added --mark-code option. --- README.md | 1 + html2text/__init__.py | 6 ++++++ html2text/cli.py | 8 ++++++++ html2text/config.py | 1 + 4 files changed, 16 insertions(+) diff --git a/README.md b/README.md index 0385446e..6226bf2f 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ Usage: `html2text [(filename|url) [encoding]]` | `--bypass-tables` | Format tables in HTML rather than Markdown syntax. | `--single-line-break` | Use a single line break after a block element rather than two. | `--reference-links` | Use reference links instead of links to create markdown +| `--mark-code` | Mark preformatted and code blocks with [code]...[/code] diff --git a/html2text/__init__.py b/html2text/__init__.py index b4f1530e..bd47666f 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -70,6 +70,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.strong_mark = '**' self.single_line_break = config.SINGLE_LINE_BREAK self.use_automatic_links = config.USE_AUTOMATIC_LINKS + self.mark_code = config.MARK_CODE if out is None: self.out = self.outtextf @@ -561,6 +562,8 @@ def handle_tag(self, tag, attrs, start): self.pre = 1 else: self.pre = 0 + if self.mark_code: + self.out("\n[/code]") self.p() # TODO: Add docstring for these one letter functions @@ -607,6 +610,9 @@ def o(self, data, puredata=0, force=0): #self.out(" :") #TODO: not output when already one there if not data.startswith("\n"): #
      stuff...
                           data = "\n" + data
      +                if self.mark_code:
      +                    self.out("\n[code]")
      +                    self.p_p = 0
       
                   bq = (">" * self.blockquote)
                   if not (force and data and data[0] == ">") and self.blockquote:
      diff --git a/html2text/cli.py b/html2text/cli.py
      index a2de1d20..b7239c9c 100644
      --- a/html2text/cli.py
      +++ b/html2text/cli.py
      @@ -131,6 +131,13 @@ def main():
                   "line breaks. NOTE: Requires --body-width=0"
               )
           )
      +    p.add_option(
      +        "--mark-code",
      +        action="store_true",
      +        dest="mark_code",
      +        default=config.MARK_CODE,
      +        help="Mark program code blocks with [code]...[/code]"
      +    )    
           (options, args) = p.parse_args()
       
           # process input
      @@ -190,5 +197,6 @@ def main():
           h.bypass_tables = options.bypass_tables
           h.single_line_break = options.single_line_break
           h.inline_links = options.inline_links
      +    h.mark_code = options.mark_code
       
           wrapwrite(h.handle(data))
      diff --git a/html2text/config.py b/html2text/config.py
      index 07881fd9..e5f5c245 100644
      --- a/html2text/config.py
      +++ b/html2text/config.py
      @@ -32,6 +32,7 @@
       IMAGES_TO_ALT = False
       IMAGES_WITH_SIZE = False
       IGNORE_EMPHASIS = False
      +MARK_CODE = False
       
       # Convert links with same href and text to  format if they are absolute links
       USE_AUTOMATIC_LINKS = True
      
      From c08f281d1fe89f24b39b2378ca2454130dab0479 Mon Sep 17 00:00:00 2001
      From: Albert Berger 
      Date: Thu, 11 Jun 2015 12:16:24 +0300
      Subject: [PATCH 205/479] Added tests and doc for --mark-code
      
      ---
       docs/usage.md          |  1 +
       test/mark_code.html    | 12 ++++++++++++
       test/mark_code.md      | 13 +++++++++++++
       test/test_html2text.py |  4 ++++
       4 files changed, 30 insertions(+)
       create mode 100644 test/mark_code.html
       create mode 100644 test/mark_code.md
      
      diff --git a/docs/usage.md b/docs/usage.md
      index 2605f518..324ee7f4 100644
      --- a/docs/usage.md
      +++ b/docs/usage.md
      @@ -78,6 +78,7 @@ simple indications of their function.
           - RE_SLASH_CHARS a string of slash escapeable characters
           - RE_MD_BACKSLASH_MATCHER to match \char
           - USE_AUTOMATIC_LINKS to convert http://xyz to 
      +    - MARK_CODE to wrap 'pre' blocks with [code]...[/code] tags
       
       To alter any option the procedure is to create a parser with
       `parser = html2text.HTML2Text()` and to set the option on the parser.
      diff --git a/test/mark_code.html b/test/mark_code.html
      new file mode 100644
      index 00000000..eed53c5b
      --- /dev/null
      +++ b/test/mark_code.html
      @@ -0,0 +1,12 @@
      +
      +  
      +

      Normal text with 'pre' code block.

      +
      +import os
      +
      +def function():
      +    a = 1
      +
      +

      Normal text continues.

      + + diff --git a/test/mark_code.md b/test/mark_code.md new file mode 100644 index 00000000..40ca81e1 --- /dev/null +++ b/test/mark_code.md @@ -0,0 +1,13 @@ +Normal text with 'pre' code block. + +[code] + + import os + + def function(): + a = 1 + +[/code] + +Normal text continues. + diff --git a/test/test_html2text.py b/test/test_html2text.py index 60cd0a11..f6e45abf 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -149,6 +149,10 @@ def test_cmd(self): module_args['inline_links'] = False cmdline_args.append('--reference-links') + if base_fn.startswith('mark_code'): + module_args['mark_code'] = True + cmdline_args.append('--mark-code') + return test_mod, test_cmd # Originally from http://stackoverflow.com/questions/32899/\ From 9440f588aa011cd9d99351d60bd9009edc33559d Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 12 Jun 2015 10:59:26 +0400 Subject: [PATCH 206/479] Add Albert Berger to Authors Thanks Albert Berger @nbdsp for contribution --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index aa55be86..7f5fbb8b 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -16,6 +16,7 @@ The AUTHORS/Contributors are (and/or have been): * Peter Wu * Arjoonn Sharma * Ali Mohammad +* Albert Berger Maintainer: From ff0236cc6133d7ebc1622905b29180c0dfc16be6 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 12 Jun 2015 11:01:53 +0400 Subject: [PATCH 207/479] Update changelog with latest changes Ready to release ;) --- ChangeLog.rst | 7 +++++++ html2text/__init__.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index cee71f91..5c7e81e6 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,10 @@ +2015.6.12 +========= + +* Feature #76: Making ``pre`` blocks clearer for further automatic formatting. 
+* Fix #71: Coverage detects tests carried out in ``subprocesses`` + + 2015.6.6 ======== diff --git a/html2text/__init__.py b/html2text/__init__.py index bd47666f..e4824caf 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -29,7 +29,7 @@ skipwrap ) -__version__ = (2015, 6, 6) +__version__ = (2015, 6, 12) # TODO: From dc42444f71727898c3208aa873297df7df730ca0 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 12 Jun 2015 11:05:08 +0400 Subject: [PATCH 208/479] Add hr lines to changelog --- ChangeLog.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 5c7e81e6..8cf59694 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,5 +1,6 @@ 2015.6.12 ========= +---- * Feature #76: Making ``pre`` blocks clearer for further automatic formatting. * Fix #71: Coverage detects tests carried out in ``subprocesses`` @@ -7,6 +8,7 @@ 2015.6.6 ======== +---- * Fix #24: ``3.200.3`` vs ``2014.7.3`` output quirks. * Fix #61. Malformed links in markdown output. From 6ad25079f206827835fef3217d239c59390a951e Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 12 Jun 2015 11:07:03 +0400 Subject: [PATCH 209/479] Update Pull guidelines --- docs/contributing.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/contributing.md b/docs/contributing.md index 9507f58f..41a2ce07 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -6,4 +6,7 @@ Pull guidelines --------------- - Make the changes modular. The usual method is one change per commit. +- Add tests. (We love tests). +- Update the ``ChangeLog.rst``. +- Add yourself to ``AUTHORS.rst`` if you're not listed. - That is all From 1fea72e4c4b9cbeedc3660bbff9c5a6f3142ec63 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Sun, 14 Jun 2015 10:35:00 +0530 Subject: [PATCH 210/479] HTML entities no longer get out of brackets. #31 Changed call order of handle_charref and handle_entityref. 
Error was caused due to first element of link title being charref and thus calling handle_charref instead of handle_data where the '[' is inserted. --- html2text/__init__.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 6f918936..50fb0a56 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -166,13 +166,13 @@ def handle_charref(self, c): charref = self.charref(c) if not self.code and not self.pre: charref = cgi.escape(charref) - self.o(charref, 1) + self.handle_data(charref, True) def handle_entityref(self, c): entityref = self.entityref(c) if not self.code and not self.pre and entityref != ' _place_holder;': entityref = cgi.escape(entityref) - self.o(entityref, 1) + self.handle_data(entityref, True) def handle_starttag(self, tag, attrs): self.handle_tag(tag, attrs, 1) @@ -565,13 +565,16 @@ def handle_tag(self, tag, attrs, start): # TODO: Add docstring for these one letter functions def pbr(self): + "Pretty print has a line break" if self.p_p == 0: self.p_p = 1 def p(self): + "Set pretty print to 1 or 2 lines" self.p_p = 1 if self.single_line_break else 2 def soft_br(self): + "Soft breaks" self.pbr() self.br_toggle = ' ' @@ -677,7 +680,7 @@ def o(self, data, puredata=0, force=0): self.out(data) self.outcount += 1 - def handle_data(self, data): + def handle_data(self, data, entity_char=False): if r'\/script>' in data: self.quiet -= 1 @@ -695,7 +698,7 @@ def handle_data(self, data): self.maybe_automatic_link = None self.empty_link = False - if not self.code and not self.pre: + if not self.code and not self.pre and not entity_char: data = escape_md_section(data, snob=self.escape_snob) self.o(data, 1) From 9bcd381f1121853962e66421f64bc21a08f61427 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Sun, 14 Jun 2015 10:49:11 +0530 Subject: [PATCH 211/479] - Added tests for last fix. - Updated coveragerc. 
--- .coveragerc | 18 ++++++++++++++++++ test/html_entities_out_of_text.html | 1 + test/html_entities_out_of_text.md | 2 ++ 3 files changed, 21 insertions(+) create mode 100644 test/html_entities_out_of_text.html create mode 100644 test/html_entities_out_of_text.md diff --git a/.coveragerc b/.coveragerc index c03d93f3..11441c9c 100644 --- a/.coveragerc +++ b/.coveragerc @@ -3,3 +3,21 @@ branch=True parallel=True cover_pylib=False source=./html2text + +[report] +# Regexes for lines to exclude from consideration +exclude_lines = + # Have to re-enable the standard pragma + pragma: no cover + + # Don't complain about missing debug-only code: + def __repr__ + if self\.debug + + # Don't complain if tests don't hit defensive assertion code: + raise AssertionError + raise NotImplementedError + + # Don't complain if non-runnable code isn't run: + if 0: + if __name__ == .__main__.: diff --git a/test/html_entities_out_of_text.html b/test/html_entities_out_of_text.html new file mode 100644 index 00000000..1b062c9e --- /dev/null +++ b/test/html_entities_out_of_text.html @@ -0,0 +1 @@ +állás: Country Manager diff --git a/test/html_entities_out_of_text.md b/test/html_entities_out_of_text.md new file mode 100644 index 00000000..ee4b0a79 --- /dev/null +++ b/test/html_entities_out_of_text.md @@ -0,0 +1,2 @@ +[allas: Country Manager](http://thth) + From e56982b7c0a4bcdaf3e861ddbd507c2f897e8e1e Mon Sep 17 00:00:00 2001 From: theSage21 Date: Sun, 14 Jun 2015 10:55:19 +0530 Subject: [PATCH 212/479] Removed testing of import statements --- html2text/compat.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/html2text/compat.py b/html2text/compat.py index b02fa17e..166c0e73 100644 --- a/html2text/compat.py +++ b/html2text/compat.py @@ -1,12 +1,12 @@ -try: +try: # pragma: no cover import urllib.parse as urlparse import html.entities as htmlentitydefs import html.parser as HTMLParser -except ImportError: # Python2 +except ImportError: # pragma: no cover import 
htmlentitydefs import urlparse import HTMLParser -try: # Python3 +try: # pragma: no cover import urllib.request as urllib -except ImportError: +except ImportError: # pragma: no cover import urllib From 9e7d55ae4edada802ca8b0bc756ae6c59918440c Mon Sep 17 00:00:00 2001 From: theSage21 Date: Sun, 14 Jun 2015 12:44:02 +0530 Subject: [PATCH 213/479] Changelog update --- ChangeLog.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 4d5fe984..2e0911e5 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,12 @@ +0000.0.00 +========= +---- + +* Fix #31: HTML entities stay inside link +* Fix #71: Coverage detects command line tests +* Fix #39: Documentation update +* Fix #61: Functionality added for optional use of automatic links + 2015.4.14 ========= ---- From 098c63ec00a5a3b8ed62367b00fc141bc8268ab3 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Sun, 14 Jun 2015 13:25:47 +0530 Subject: [PATCH 214/479] Merge conflict fix. --- ChangeLog.rst | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 2e0911e5..94876a04 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,12 +1,3 @@ -0000.0.00 -========= ----- - -* Fix #31: HTML entities stay inside link -* Fix #71: Coverage detects command line tests -* Fix #39: Documentation update -* Fix #61: Functionality added for optional use of automatic links - 2015.4.14 ========= ---- @@ -110,3 +101,13 @@ * Fix #1: Add ``ChangeLog.rst`` file. * Fix #2: Add ``AUTHORS.rst`` file. + +0000.0.00 +========= +---- + +* Fix #31: HTML entities stay inside link +* Fix #71: Coverage detects command line tests +* Fix #39: Documentation update +* Fix #61: Functionality added for optional use of automatic links + From 0bfa8197cdb76a619c90425b99242c0a6666c6b5 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 17 Jun 2015 09:21:52 +0530 Subject: [PATCH 215/479] Fix for #74. 
Invlid unicode ignored --- html2text/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/html2text/__init__.py b/html2text/__init__.py index 50fb0a56..6b940486 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -719,6 +719,8 @@ def charref(self, name): return unichr(c) except NameError: # Python3 return chr(c) + except ValueError: # invalid unicode + return '' def entityref(self, c): if not self.unicode_snob and c in config.UNIFIABLE.keys(): From e38df3b4787ba6138f27c312d3d3b5bf747b251a Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 17 Jun 2015 09:29:11 +0530 Subject: [PATCH 216/479] Tests for invalid unicode entities --- test/invalid_unicode.html | 1 + test/invalid_unicode.md | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 test/invalid_unicode.html create mode 100644 test/invalid_unicode.md diff --git a/test/invalid_unicode.html b/test/invalid_unicode.html new file mode 100644 index 00000000..3dd8b18a --- /dev/null +++ b/test/invalid_unicode.html @@ -0,0 +1 @@ +B�r diff --git a/test/invalid_unicode.md b/test/invalid_unicode.md new file mode 100644 index 00000000..865cc856 --- /dev/null +++ b/test/invalid_unicode.md @@ -0,0 +1,2 @@ +Br + From 2dce0b27a321f9c1a1ae88ea5fbe3fad50d02eb7 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 17 Jun 2015 10:06:38 +0530 Subject: [PATCH 217/479] Added appropriate ignore lines in cli.py --- html2text/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/cli.py b/html2text/cli.py index a3eb7da9..5cf5e940 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -133,7 +133,7 @@ def main(): # process input encoding = "utf-8" - if len(args) > 0 and args[0] != '-': + if len(args) > 0 and args[0] != '-': # pragma: no cover file_ = args[0] if len(args) == 2: encoding = args[1] From aa7b8981fe23e5d8a1c074d5435b1cd44ee7a68d Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 17 Jun 2015 11:25:17 +0530 Subject: [PATCH 218/479] Testd on various python versions. 
Retrying travis build --- html2text/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 6b940486..ca6a5d50 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -716,9 +716,10 @@ def charref(self, name): return unifiable_n[c] else: try: - return unichr(c) - except NameError: # Python3 - return chr(c) + try: + return unichr(c) + except NameError: # Python3 + return chr(c) except ValueError: # invalid unicode return '' From ad6a0591ed48c68cb3662ddb1526c2ed441a8199 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 17 Jun 2015 18:18:41 +0530 Subject: [PATCH 219/479] Added failing tests for link titles --- test/link_titles.html | 1 + test/link_titles.md | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 test/link_titles.html create mode 100644 test/link_titles.md diff --git a/test/link_titles.html b/test/link_titles.html new file mode 100644 index 00000000..46ddcde1 --- /dev/null +++ b/test/link_titles.html @@ -0,0 +1 @@ + an example inline link diff --git a/test/link_titles.md b/test/link_titles.md new file mode 100644 index 00000000..711077e8 --- /dev/null +++ b/test/link_titles.md @@ -0,0 +1,2 @@ +[ an example](http://example.com/ "MyTitle") + From a12aa5846511dc4e3cfaf6025623dd1f9b0b06ac Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 17 Jun 2015 18:32:52 +0530 Subject: [PATCH 220/479] Link titles are not lost in inline links. Earlier they only came up in reference style links. Now they are included regardless of link type. 
--- html2text/__init__.py | 9 ++++++++- test/link_titles.html | 4 +++- test/link_titles.md | 3 ++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index ca6a5d50..a9789599 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -200,6 +200,7 @@ def previousIndex(self, attrs): if ('title' in a) or ('title' in attrs): if (('title' in a) and ('title' in attrs) and a['title'] == attrs['title']): + print(a['title'], attrs['title']) match = True else: match = True @@ -404,7 +405,13 @@ def handle_tag(self, tag, attrs, start): self.empty_link = False self.maybe_automatic_link = None if self.inline_links: - self.o("](" + escape_md(a['href']) + ")") + try: + title = escape_md(a['title']) + except KeyError: + self.o("](" + escape_md(a['href']) + ")") + else: + self.o("](" + escape_md(a['href']) + + ' "' + title + '" )') else: i = self.previousIndex(a) if i is not None: diff --git a/test/link_titles.html b/test/link_titles.html index 46ddcde1..ceba0d52 100644 --- a/test/link_titles.html +++ b/test/link_titles.html @@ -1 +1,3 @@ - an example inline link + first example +
      + second example diff --git a/test/link_titles.md b/test/link_titles.md index 711077e8..d68fb7e8 100644 --- a/test/link_titles.md +++ b/test/link_titles.md @@ -1,2 +1,3 @@ -[ an example](http://example.com/ "MyTitle") +[ first example](http://example.com "MyTitle" ) +[ second example](http://example.com) From e9824cb044a5c31c10c896170f454b29b99c11f8 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Wed, 17 Jun 2015 18:46:34 +0530 Subject: [PATCH 221/479] Removed a debug statement. --- html2text/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index a9789599..fcf06b6b 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -200,7 +200,6 @@ def previousIndex(self, attrs): if ('title' in a) or ('title' in attrs): if (('title' in a) and ('title' in attrs) and a['title'] == attrs['title']): - print(a['title'], attrs['title']) match = True else: match = True From 0aadbd4f30e80956e19b56a4025fb05e4bcb8be2 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Thu, 18 Jun 2015 15:06:49 +0530 Subject: [PATCH 222/479] Changelog update --- ChangeLog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 94876a04..248ab9d2 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -110,4 +110,4 @@ * Fix #71: Coverage detects command line tests * Fix #39: Documentation update * Fix #61: Functionality added for optional use of automatic links - +* Feature #80: 'title' attribute is preserved in both inline and reference links From c3ab0ba7edb2b0e6fbf09c356c6740a29aff32a7 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Fri, 19 Jun 2015 09:18:17 +0530 Subject: [PATCH 223/479] Broke down the main function which was too complex as per pep8. process_input is still too complex but can be simplified. 
--- html2text/cli.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/html2text/cli.py b/html2text/cli.py index 5cf5e940..ca91a39c 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -5,11 +5,7 @@ from html2text.utils import wrapwrite, wrap_read -def main(): - baseurl = '' - - p = optparse.OptionParser('%prog [(filename|url) [encoding]]', - version='%prog ' + __version__) +def add_options(p): p.add_option( "--ignore-emphasis", dest="ignore_emphasis", @@ -129,11 +125,11 @@ def main(): "line breaks. NOTE: Requires --body-width=0" ) ) - (options, args) = p.parse_args() + return p - # process input +def process_input(options, args, p, baseurl): # pragma: no cover encoding = "utf-8" - if len(args) > 0 and args[0] != '-': # pragma: no cover + if len(args) > 0 and args[0] != '-': file_ = args[0] if len(args) == 2: encoding = args[1] @@ -165,7 +161,9 @@ def main(): if hasattr(data, 'decode'): data = data.decode(encoding) + handle_options(baseurl, options, data) +def handle_options(baseurl, options, data): h = HTML2Text(baseurl=baseurl) # handle options if options.ul_style_dash: @@ -190,3 +188,11 @@ def main(): h.inline_links = options.inline_links wrapwrite(h.handle(data)) + +def main(): + baseurl = '' + p = optparse.OptionParser('%prog [(filename|url) [encoding]]', + version='%prog ' + __version__) + p = add_options(p) + (options, args) = p.parse_args() + process_input(options, args, p, baseurl) From 558ad492b546a545d7c8586d115b4c63914891eb Mon Sep 17 00:00:00 2001 From: theSage21 Date: Fri, 19 Jun 2015 09:49:23 +0530 Subject: [PATCH 224/479] Added all options as command line flags. 
--- html2text/__init__.py | 37 +++++++++++++++++++------------------ html2text/cli.py | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 19 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index fcf06b6b..4cc89773 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -50,26 +50,27 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.split_next_td = False self.td_count = 0 self.table_start = False - self.unicode_snob = config.UNICODE_SNOB - self.escape_snob = config.ESCAPE_SNOB + self.unicode_snob = config.UNICODE_SNOB # covered in cli + self.escape_snob = config.ESCAPE_SNOB # covered in cli self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH - self.body_width = bodywidth - self.skip_internal_links = config.SKIP_INTERNAL_LINKS - self.inline_links = config.INLINE_LINKS - self.protect_links = config.PROTECT_LINKS - self.google_list_indent = config.GOOGLE_LIST_INDENT - self.ignore_links = config.IGNORE_ANCHORS - self.ignore_images = config.IGNORE_IMAGES - self.images_to_alt = config.IMAGES_TO_ALT - self.images_with_size = config.IMAGES_WITH_SIZE - self.ignore_emphasis = config.IGNORE_EMPHASIS - self.bypass_tables = config.BYPASS_TABLES - self.google_doc = False - self.ul_item_mark = '*' - self.emphasis_mark = '_' + self.body_width = bodywidth # covered in cli + self.skip_internal_links = config.SKIP_INTERNAL_LINKS # covered in cli + self.inline_links = config.INLINE_LINKS # covered in cli + self.protect_links = config.PROTECT_LINKS # covered in cli + self.google_list_indent = config.GOOGLE_LIST_INDENT # covered in cli + self.ignore_links = config.IGNORE_ANCHORS # covered in cli + self.ignore_images = config.IGNORE_IMAGES # covered in cli + self.images_to_alt = config.IMAGES_TO_ALT # covered in cli + self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli + self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli + self.bypass_tables = config.BYPASS_TABLES # covered 
in cli + self.google_doc = False # covered in cli + self.ul_item_mark = '*' # covered in cli + self.emphasis_mark = '_' # covered in cli self.strong_mark = '**' - self.single_line_break = config.SINGLE_LINE_BREAK - self.use_automatic_links = config.USE_AUTOMATIC_LINKS + self.single_line_break = config.SINGLE_LINE_BREAK # covered in cli + self.use_automatic_links = config.USE_AUTOMATIC_LINKS # covered in cli + self.hide_strikethrough = False # covered in cli if out is None: self.out = self.outtextf diff --git a/html2text/cli.py b/html2text/cli.py index ca91a39c..11e441a9 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -125,6 +125,34 @@ def add_options(p): "line breaks. NOTE: Requires --body-width=0" ) ) + p.add_option( + "--unicode-snob", + action="store_true", + dest="unicode_snob", + default=config.UNICODE_SNOB, + help="Use unicode throughout document" + ) + p.add_option( + "--no-automatic-links", + action="store_false", + dest="use_automatic_links", + default=config.USE_AUTOMATIC_LINKS, + help="Do not use automatic links wherever applicable" + ) + p.add_option( + "--no-skip-internal-links", + action="store_false", + dest="skip_internal_links", + default=config.SKIP_INTERNAL_LINKS, + help="Do not skip internal links" + ) + p.add_option( + "--links-after-para", + action="store_true", + dest="links_each_paragraph", + default=config.LINKS_EACH_PARAGRAPH, + help="Put links after each paragraph instead of document" + ) return p def process_input(options, args, p, baseurl): # pragma: no cover @@ -173,7 +201,7 @@ def handle_options(baseurl, options, data): h.strong_mark = '__' h.body_width = options.body_width - h.list_indent = options.list_indent + h.google_list_indent = options.list_indent h.ignore_emphasis = options.ignore_emphasis h.ignore_links = options.ignore_links h.protect_links = options.protect_links @@ -186,6 +214,10 @@ def handle_options(baseurl, options, data): h.bypass_tables = options.bypass_tables h.single_line_break = options.single_line_break 
h.inline_links = options.inline_links + h.unicode_snob = options.unicode_snob + h.use_automatic_links = options.use_automatic_links + h.skip_internal_links = options.skip_internal_links + h.links_each_paragraph = options.links_each_paragraph wrapwrite(h.handle(data)) From 0a5b976aa650486f05a6580769c7f7f48b8ab309 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Fri, 19 Jun 2015 10:59:35 +0530 Subject: [PATCH 225/479] Some more testing --- test/GoogleDocSaved.html | 2 +- test/header_tags.html | 22 ++++++++++----------- test/header_tags.md | 18 ++++++++--------- test/list_tags_example.html | 39 +++++++++++++++++++++++++++++++++++++ test/list_tags_example.md | 36 ++++++++++++++++++++++++++++++++++ 5 files changed, 96 insertions(+), 21 deletions(-) create mode 100644 test/list_tags_example.html create mode 100644 test/list_tags_example.md diff --git a/test/GoogleDocSaved.html b/test/GoogleDocSaved.html index a0cd19c9..10d80cfb 100644 --- a/test/GoogleDocSaved.html +++ b/test/GoogleDocSaved.html @@ -3,7 +3,7 @@ Sandbox + @import url('https://themes.googleusercontent.com/fonts/css?kit=lhDjYqiy3mZ0x6ROQEUoUw');ol{margin:0;padding:0}p{margin:0}.c12{list-style-type:disc;margin:0;padding:0;text-decoration:none;}.c8{width:468pt;background-color:#ffffff;padding:72pt 72pt 72pt 72pt}.c2{padding-left:0pt;direction:ltr;margin-left:36pt}.c11{list-style-type:lower-latin;margin:0;padding:0}.c4{list-style-type:circle;margin:0;padding:0}.c1{padding-left:0pt;direction:ltr;margin-left:72pt}.c7{;margin:0;padding:0}.c3{font-style:italic;font-family:Courier New}.c0{height:11pt;direction:ltr}.c5{font-weight:bold}.c9{font-family:Consolas}.c13{font-family:Courier 
New}.c6{direction:ltr}.c10{font-style:italic}body{color:#000000;font-size:11pt;font-family:Arial}h1{padding-top:24pt;color:#000000;font-size:24pt;font-family:Arial;font-weight:bold;padding-bottom:6pt}h2{padding-top:18pt;color:#000000;font-size:18pt;font-family:Arial;font-weight:bold;padding-bottom:4pt}h3{padding-top:14pt;color:#000000;font-size:14pt;font-family:Arial;font-weight:bold;padding-bottom:4pt}h4{padding-top:12pt;color:#000000;font-size:12pt;font-family:Arial;font-weight:bold;padding-bottom:2pt}h5{padding-top:11pt;color:#000000;font-size:11pt;font-family:Arial;font-weight:bold;padding-bottom:2pt}h6{padding-top:10pt;color:#000000;font-size:10pt;font-family:Arial;font-weight:bold;padding-bottom:2pt}

      diff --git a/test/header_tags.html b/test/header_tags.html index aad3e661..0e257936 100644 --- a/test/header_tags.html +++ b/test/header_tags.html @@ -1,17 +1,17 @@

      H1

      -

      H1

      -

      H1

      -

      H1

      -
      H1
      -
      H1
      - H1 - H1 - H1 - H1 - H1 - H1 +

      H2

      +

      H3

      +

      H4

      +
      H5
      +
      H6
      + H7 + H8 + H9 + H10 + H11 + H12 NO number diff --git a/test/header_tags.md b/test/header_tags.md index 80a23023..cb1fd31e 100644 --- a/test/header_tags.md +++ b/test/header_tags.md @@ -1,20 +1,20 @@ # H1 -## H1 +## H2 -### H1 +### H3 -#### H1 +#### H4 -##### H1 +##### H5 -###### H1 +###### H6 -####### H1 +####### H7 -######## H1 +######## H8 -######### H1 +######### H9 -H1 H1 H1 NO number +H10 H11 H12 NO number diff --git a/test/list_tags_example.html b/test/list_tags_example.html new file mode 100644 index 00000000..d0a8be2d --- /dev/null +++ b/test/list_tags_example.html @@ -0,0 +1,39 @@ +
      +
      Definition List
      +
      A list of terms and their definitions/descriptions.
      +
      Ordered List
      +
      A numbered list.
      +
      Unordered List
      +
      An unnumbered list.
      +
      + +

      Example 2

      +
      +
      Vocals
      +
      Bruce Dickinson
      +
      Guitar
      +
      Adrian Smith
      +
      Dave Murray
      +
      Janick Gers
      +
      Bass
      +
      Steve Harris
      +
      Drums
      +
      Nicko McBrain
      +
      + +
        +
      • some item
      • +
      • Some other item
      • +
      • some item
      • +
      + +
        +
      1. Some other item
      2. +
      3. some item
      4. +
      5. some item
      6. +
      + +
        +
      • somthing else here
      • +
      • some item
      • +
      diff --git a/test/list_tags_example.md b/test/list_tags_example.md new file mode 100644 index 00000000..b9723699 --- /dev/null +++ b/test/list_tags_example.md @@ -0,0 +1,36 @@ +Definition List + + A list of terms and their definitions/descriptions. +Ordered List + + A numbered list. +Unordered List + + An unnumbered list. + +#### Example 2 + +Vocals + + Bruce Dickinson +Guitar + + Adrian Smith + Dave Murray + Janick Gers +Bass + + Steve Harris +Drums + + Nicko McBrain + + * some item + * Some other item + * some item + 1. Some other item + 2. some item + 3. some item + * somthing else here + * some item + From 9f42c658140595ec7f6abfb6ee4067668efdc4d4 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Fri, 19 Jun 2015 13:01:32 +0530 Subject: [PATCH 226/479] More tests and coverage things --- html2text/__init__.py | 11 +-- html2text/utils.py | 6 +- test/GoogleDocSaved_two.html | 147 ++++++++++++++++++++++++++++++ test/GoogleDocSaved_two.md | 0 test/abbr_tag.html | 1 + test/abbr_tag.md | 2 +- test/anchor-undefined-href.html | 5 - test/anchor-undefined-href.md | 2 - test/decript_tage.html | 3 + test/decript_tage.md | 2 + test/dl_tag_example.html | 22 ----- test/dl_tag_example.md | 27 ------ test/flip_emphasis.html | 2 + test/flip_emphasis.md | 2 + test/images_with_size.html | 5 +- test/images_with_size.md | 3 +- test/no_inline_links_example.html | 8 ++ test/no_inline_links_example.md | 7 +- 18 files changed, 186 insertions(+), 69 deletions(-) create mode 100644 test/GoogleDocSaved_two.html create mode 100644 test/GoogleDocSaved_two.md delete mode 100644 test/anchor-undefined-href.html delete mode 100644 test/anchor-undefined-href.md create mode 100644 test/decript_tage.html create mode 100644 test/decript_tage.md delete mode 100644 test/dl_tag_example.html delete mode 100644 test/dl_tag_example.md create mode 100644 test/flip_emphasis.html create mode 100644 test/flip_emphasis.md diff --git a/html2text/__init__.py b/html2text/__init__.py index 4cc89773..64fa6b54 
100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -7,7 +7,7 @@ try: from textwrap import wrap -except ImportError: +except ImportError: # pragma: no cover pass from html2text.compat import urlparse, HTMLParser @@ -72,9 +72,9 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.use_automatic_links = config.USE_AUTOMATIC_LINKS # covered in cli self.hide_strikethrough = False # covered in cli - if out is None: + if out is None: # pragma: no cover self.out = self.outtextf - else: + else: # pragma: no cover self.out = out # empty list to store output characters before they are "joined" @@ -189,9 +189,8 @@ def previousIndex(self, attrs): self.a list. If the set of attributes is not found, returns None :rtype: int """ - if 'href' not in attrs: + if 'href' not in attrs: # pragma: no cover return None - i = -1 for a in self.a: i += 1 @@ -709,7 +708,7 @@ def handle_data(self, data, entity_char=False): data = escape_md_section(data, snob=self.escape_snob) self.o(data, 1) - def unknown_decl(self, data): + def unknown_decl(self, data): # pragma: no cover # TODO: what is this doing here? 
pass diff --git a/html2text/utils.py b/html2text/utils.py index 1bc4102e..04364820 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -21,7 +21,7 @@ def hn(tag): if tag[0] == 'h' and len(tag) == 2: try: n = int(tag[1]) - if n in range(1, 10): + if n in range(1, 10): # pragma: no branch return n except ValueError: return 0 @@ -62,7 +62,7 @@ def dumb_css_parser(data): try: elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements]) - except ValueError: + except ValueError: # pragma: no cover elements = {} # not that important return elements @@ -208,7 +208,7 @@ def wrapwrite(text): sys.stdout.write(text) -def wrap_read(): +def wrap_read(): # pragma: no cover """ :rtype: str """ diff --git a/test/GoogleDocSaved_two.html b/test/GoogleDocSaved_two.html new file mode 100644 index 00000000..4b30f492 --- /dev/null +++ b/test/GoogleDocSaved_two.html @@ -0,0 +1,147 @@ + + + + Sandbox + + + +

      + + + + test doc +

      +

      + + first issue +

      +

      + + +

      +
        +
      1. + + bit +
      2. +
      3. + + bold italic +
      4. +
      +
        +
      1. + + orange +
      2. +
      3. + + apple +
      4. +
      +
        +
      1. + + final +
      2. +
      +

      + + +

      +

      + + text to separate lists +

      +

      + + +

      +
        +
      1. + + now with numbers +
      2. +
      3. + + the prisoner +
      4. +
      +
        +
      1. + + not an + + italic number +
      2. +
      3. + + a + + bold human + +  being +
      4. +
      +
        +
      1. + + end +
      2. +
      +

      + + +

      +

      + + bold +

      +

      + + italic +

      +

      + + +

      +

      + + def func(x): +

      +

      + +   if x < 1: +

      +

      + +     return 'a' +

      +

      + +   return 'b' +

      +

      + + +

      +

      + + Some + + fixed width text + +  here +

      +

      + + italic fixed width text +

      +

      + + +

      + + + diff --git a/test/GoogleDocSaved_two.md b/test/GoogleDocSaved_two.md new file mode 100644 index 00000000..e69de29b diff --git a/test/abbr_tag.html b/test/abbr_tag.html index e0e4a33c..b523d6bc 100644 --- a/test/abbr_tag.html +++ b/test/abbr_tag.html @@ -1 +1,2 @@ TLA +xyz diff --git a/test/abbr_tag.md b/test/abbr_tag.md index 0b030ff3..bffbcf21 100644 --- a/test/abbr_tag.md +++ b/test/abbr_tag.md @@ -1,4 +1,4 @@ -TLA +TLA xyz *[TLA]: Three Letter Acronym diff --git a/test/anchor-undefined-href.html b/test/anchor-undefined-href.html deleted file mode 100644 index 20197513..00000000 --- a/test/anchor-undefined-href.html +++ /dev/null @@ -1,5 +0,0 @@ - - - anchor - - diff --git a/test/anchor-undefined-href.md b/test/anchor-undefined-href.md deleted file mode 100644 index 12e6c627..00000000 --- a/test/anchor-undefined-href.md +++ /dev/null @@ -1,2 +0,0 @@ -anchor - diff --git a/test/decript_tage.html b/test/decript_tage.html new file mode 100644 index 00000000..c8826d40 --- /dev/null +++ b/test/decript_tage.html @@ -0,0 +1,3 @@ +something +something +something diff --git a/test/decript_tage.md b/test/decript_tage.md new file mode 100644 index 00000000..db18500d --- /dev/null +++ b/test/decript_tage.md @@ -0,0 +1,2 @@ +something something something + diff --git a/test/dl_tag_example.html b/test/dl_tag_example.html deleted file mode 100644 index 08e629a6..00000000 --- a/test/dl_tag_example.html +++ /dev/null @@ -1,22 +0,0 @@ -
      -
      Definition List
      -
      A list of terms and their definitions/descriptions.
      -
      Ordered List
      -
      A numbered list.
      -
      Unordered List
      -
      An unnumbered list.
      -
      - -

      Example 2

      -
      -
      Vocals
      -
      Bruce Dickinson
      -
      Guitar
      -
      Adrian Smith
      -
      Dave Murray
      -
      Janick Gers
      -
      Bass
      -
      Steve Harris
      -
      Drums
      -
      Nicko McBrain
      -
      diff --git a/test/dl_tag_example.md b/test/dl_tag_example.md deleted file mode 100644 index b78dcabe..00000000 --- a/test/dl_tag_example.md +++ /dev/null @@ -1,27 +0,0 @@ -Definition List - - A list of terms and their definitions/descriptions. -Ordered List - - A numbered list. -Unordered List - - An unnumbered list. - -#### Example 2 - -Vocals - - Bruce Dickinson -Guitar - - Adrian Smith - Dave Murray - Janick Gers -Bass - - Steve Harris -Drums - - Nicko McBrain - diff --git a/test/flip_emphasis.html b/test/flip_emphasis.html new file mode 100644 index 00000000..9152f43a --- /dev/null +++ b/test/flip_emphasis.html @@ -0,0 +1,2 @@ +Something +else diff --git a/test/flip_emphasis.md b/test/flip_emphasis.md new file mode 100644 index 00000000..9c436c53 --- /dev/null +++ b/test/flip_emphasis.md @@ -0,0 +1,2 @@ +*Something* __else__ + diff --git a/test/images_with_size.html b/test/images_with_size.html index bcf8e44c..fcda9b67 100644 --- a/test/images_with_size.html +++ b/test/images_with_size.html @@ -4,4 +4,7 @@ An image with a height attr -An image with width and height \ No newline at end of file +An image with width and height + + + diff --git a/test/images_with_size.md b/test/images_with_size.md index cf710736..8cf35a8b 100644 --- a/test/images_with_size.md +++ b/test/images_with_size.md @@ -2,5 +2,6 @@ src='image_with_width.jpg' width='300' alt='An image with a width attr' /> An image with a height attr An
-image with width and height +image with width and height' /> ![](image_with_width_and_height.jpg) diff --git a/test/no_inline_links_example.html b/test/no_inline_links_example.html index 982f54c5..5e4c45c8 100644 --- a/test/no_inline_links_example.html +++ b/test/no_inline_links_example.html @@ -1 +1,9 @@ Googler + No href + No href but title available + Example + + + +link text + diff --git a/test/no_inline_links_example.md b/test/no_inline_links_example.md index 2593475a..dedea641 100644 --- a/test/no_inline_links_example.md +++ b/test/no_inline_links_example.md @@ -1,4 +1,9 @@ -[Googler][1] +[Googler][1] No href No href but title available [ Example][2] [ [ [ link text +][3]][3]][3] [1]: http://google.com + [2]: http://example.com (Example title) + + [3]: http://example.com (abc) + From fa2a17775c3e244006cc15e8052e14ee29bfee48 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Fri, 19 Jun 2015 13:29:45 +0530 Subject: [PATCH 227/479] Attempt to fix merge conflict --- html2text/cli.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/html2text/cli.py b/html2text/cli.py index 11e441a9..ba5c22e7 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -5,7 +5,13 @@ from html2text.utils import wrapwrite, wrap_read -def add_options(p): +def main(): + baseurl = '' + + p = optparse.OptionParser( + '%prog [(filename|url) [encoding]]', + version='%prog ' + ".".join(map(str, __version__)) + ) p.add_option( "--ignore-emphasis", dest="ignore_emphasis", @@ -152,12 +158,12 @@ def add_options(p): dest="links_each_paragraph", default=config.LINKS_EACH_PARAGRAPH, help="Put links after each paragraph instead of document" - ) - return p + ) + (options, args) = p.parse_args() -def process_input(options, args, p, baseurl): # pragma: no cover + # process input encoding = "utf-8" - if len(args) > 0 and args[0] != '-': + if len(args) > 0 and args[0] != '-': # pragma: no cover file_ = args[0] if len(args) == 2: encoding = args[1] @@ -189,9 +195,7 
@@ def process_input(options, args, p, baseurl): # pragma: no cover if hasattr(data, 'decode'): data = data.decode(encoding) - handle_options(baseurl, options, data) -def handle_options(baseurl, options, data): h = HTML2Text(baseurl=baseurl) # handle options if options.ul_style_dash: @@ -220,11 +224,3 @@ def handle_options(baseurl, options, data): h.links_each_paragraph = options.links_each_paragraph wrapwrite(h.handle(data)) - -def main(): - baseurl = '' - p = optparse.OptionParser('%prog [(filename|url) [encoding]]', - version='%prog ' + __version__) - p = add_options(p) - (options, args) = p.parse_args() - process_input(options, args, p, baseurl) From ef8af12cda330923a7eebfa6782c6be400f352ca Mon Sep 17 00:00:00 2001 From: theSage21 Date: Fri, 19 Jun 2015 14:12:36 +0530 Subject: [PATCH 228/479] - Updated docs. - Update changelog --- ChangeLog.rst | 1 + docs/how_it_works.md | 2 +- docs/usage.md | 31 +++++++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 594a8c2e..6a58d72c 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -133,3 +133,4 @@ * Fix #39: Documentation update * Fix #61: Functionality added for optional use of automatic links * Feature #80: 'title' attribute is preserved in both inline and reference links +* Feature #82: More command line options. See docs. diff --git a/docs/how_it_works.md b/docs/how_it_works.md index 8d7df933..c06430b3 100644 --- a/docs/how_it_works.md +++ b/docs/how_it_works.md @@ -95,7 +95,7 @@ Command line interface for the code. | `--single-line-break` | Use a single line break after a block element rather than two. | `--reference-links` | Use reference links instead of inline links to create markdown - +*A complete list is available [here](usage.md)* __init__.py ----------- diff --git a/docs/usage.md b/docs/usage.md index 324ee7f4..02aa81a5 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -83,3 +83,34 @@ simple indications of their function. 
To alter any option the procedure is to create a parser with `parser = html2text.HTML2Text()` and to set the option on the parser. example: `parser.unicode_snob = True` to set the UNICODE_SNOB option. + + +Command line options +-------------------- + + +| Option | Description +|--------------------------------------------------------|--------------------------------------------------- +| `--version` | Show program version number and exit +| `-h`, `--help` | Show this help message and exit +| `--ignore-links` | Do not include any formatting for links +|`--protect-links` | Protect links from line breaks surrounding them "+" with angle brackets +|`--ignore-images` | Do not include any formatting for images +|`--images-to-alt` | Discard image data, only keep alt text +|`--images-with-size` | Write image tags with height and width attrs as raw html to retain dimensions +|`-g`, `--google-doc` | Convert an html-exported Google Document +|`-d`, `--dash-unordered-list` | Use a dash rather than a star for unordered list items +|`-b` `BODY_WIDTH`, `--body-width`=`BODY_WIDTH` | Number of characters per output line, `0` for no wrap +|`-i` `LIST_INDENT`, `--google-list-indent`=`LIST_INDENT`| Number of pixels Google indents nested lists +|`-s`, `--hide-strikethrough` | Hide strike-through text. only relevent when `-g` is specified as well +|`--escape-all` | Escape all special characters. Output is less readable, but avoids corner case formatting issues. +| `--bypass-tables` | Format tables in HTML rather than Markdown syntax. +| `--single-line-break` | Use a single line break after a block element rather than two. +| `--reference-links` | Use reference links instead of inline links to create markdown +| `--ignore-emphasis` | Ignore all emphasis formatting in the html. 
+| `-e`, `--asterisk-emphasis` | Use asterisk rather than underscore to emphasize text +| `--unicode-snob` | Use unicode throughout instead of ASCII +| `--no-automatic-links` | Do not use automatic links like +| `--no-skip-internal-links` | Turn off skipping of internal links +| `--links-after-para` | Put the links after the paragraph and not at end of document +| `--mark-code` | Mark code with [code]...[/code] blocks From 79c2c7f3dd8531b6db5ddd577550484e241e487a Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 21 Jun 2015 18:23:56 +0400 Subject: [PATCH 229/479] Add .coverage.* to be ignored --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index de204b56..b1d87fe3 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ dist *.egg-info .idea .coverage +.coverage.* env/ From 29e84aa035a5c40b349113ef90b6a436de729c55 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 21 Jun 2015 18:24:45 +0400 Subject: [PATCH 230/479] Changelog entry should be in the top --- ChangeLog.rst | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 6a58d72c..140fe8e2 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,15 @@ +0000.0.00 +========= +---- + +* Fix #31: HTML entities stay inside link +* Fix #71: Coverage detects command line tests +* Fix #39: Documentation update +* Fix #61: Functionality added for optional use of automatic links +* Feature #80: 'title' attribute is preserved in both inline and reference links +* Feature #82: More command line options. See docs. + + 2015.6.12 ========= ---- @@ -123,14 +135,3 @@ * Fix #1: Add ``ChangeLog.rst`` file. * Fix #2: Add ``AUTHORS.rst`` file. 
- -0000.0.00 -========= ----- - -* Fix #31: HTML entities stay inside link -* Fix #71: Coverage detects command line tests -* Fix #39: Documentation update -* Fix #61: Functionality added for optional use of automatic links -* Feature #80: 'title' attribute is preserved in both inline and reference links -* Feature #82: More command line options. See docs. From 3b8904a7d97d6bc2745abbc3149bf4daf7e2a50d Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 21 Jun 2015 18:25:32 +0400 Subject: [PATCH 231/479] Fix formatting for changelog --- ChangeLog.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 140fe8e2..72d7d0fa 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -2,11 +2,11 @@ ========= ---- -* Fix #31: HTML entities stay inside link -* Fix #71: Coverage detects command line tests -* Fix #39: Documentation update -* Fix #61: Functionality added for optional use of automatic links -* Feature #80: 'title' attribute is preserved in both inline and reference links +* Fix #31: HTML entities stay inside link. +* Fix #71: Coverage detects command line tests. +* Fix #39: Documentation update. +* Fix #61: Functionality added for optional use of automatic links. +* Feature #80: ``title`` attribute is preserved in both inline and reference links. * Feature #82: More command line options. See docs. 
From 750f905937b85ea00d770a9bb6265203373946d0 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 21 Jun 2015 18:28:42 +0400 Subject: [PATCH 232/479] More than 79 is not acceptable in the universe --- html2text/__init__.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 3ca49050..699a7c97 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -172,7 +172,8 @@ def handle_charref(self, c): def handle_entityref(self, c): entityref = self.entityref(c) - if not self.code and not self.pre and entityref != ' _place_holder;': + if (not self.code and not self.pre + and entityref != ' _place_holder;'): entityref = cgi.escape(entityref) self.handle_data(entityref, True) @@ -278,8 +279,9 @@ def handle_tag(self, tag, attrs, start): attrs = dict(attrs) # first thing inside the anchor tag is another tag that produces some output - if start and not self.maybe_automatic_link is None and \ - tag not in ['p', 'div', 'style', 'dl', 'dt'] and (tag != "img" or self.ignore_images): + if (start and not self.maybe_automatic_link is None + and tag not in ['p', 'div', 'style', 'dl', 'dt'] + and (tag != "img" or self.ignore_images)): self.o("[") self.maybe_automatic_link = None self.empty_link = False @@ -701,7 +703,8 @@ def handle_data(self, data, entity_char=False): if not self.maybe_automatic_link is None: href = self.maybe_automatic_link - if href == data and self.absolute_url_matcher.match(href) and self.use_automatic_links: + if (href == data and self.absolute_url_matcher.match(href) + and self.use_automatic_links): self.o("<" + data + ">") self.empty_link = False return From d3ba69f0fb5dd74670d0963f90fc7a79213a5735 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 21 Jun 2015 18:29:34 +0400 Subject: [PATCH 233/479] Style is important --- sitecustomize.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sitecustomize.py b/sitecustomize.py index 8a045a92..973a2655 100644 --- 
a/sitecustomize.py +++ b/sitecustomize.py @@ -1,5 +1,8 @@ import os + import coverage + + here = os.getcwd() config_file = os.path.join(here, '.coveragerc') os.environ['COVERAGE_PROCESS_START'] = config_file From bb20b071dc5e37846e1d44e5b126adea334fc673 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 21 Jun 2015 18:31:31 +0400 Subject: [PATCH 234/479] Pretty is important --- html2text/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/utils.py b/html2text/utils.py index 04364820..f7a1d1e4 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -1,6 +1,6 @@ import sys -from html2text import config +from html2text import config from html2text.compat import htmlentitydefs From 9b07bb347f9eba8897970bcd1a8482a4c62f99be Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 21 Jun 2015 18:34:27 +0400 Subject: [PATCH 235/479] try except block is slow --- html2text/compat.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/html2text/compat.py b/html2text/compat.py index 166c0e73..2120a41b 100644 --- a/html2text/compat.py +++ b/html2text/compat.py @@ -1,12 +1,13 @@ -try: # pragma: no cover - import urllib.parse as urlparse - import html.entities as htmlentitydefs - import html.parser as HTMLParser -except ImportError: # pragma: no cover +import sys + + +if sys.version_info[0] == 2: import htmlentitydefs import urlparse import HTMLParser -try: # pragma: no cover - import urllib.request as urllib -except ImportError: # pragma: no cover import urllib +else: + import urllib.parse as urlparse + import html.entities as htmlentitydefs + import html.parser as HTMLParser + import urllib.request as urllib From b8415f83e7f26a907309e78a2dd343a7b9063199 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 21 Jun 2015 18:42:38 +0400 Subject: [PATCH 236/479] Bump version to make it ready to deploy release is on the way --- ChangeLog.rst | 2 +- html2text/__init__.py | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 72d7d0fa..305f66bc 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,4 +1,4 @@ -0000.0.00 +2015.6.21 ========= ---- diff --git a/html2text/__init__.py b/html2text/__init__.py index 699a7c97..9d76f089 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -29,7 +29,7 @@ skipwrap ) -__version__ = (2015, 6, 12) +__version__ = (2015, 6, 21) # TODO: From 4a936568516a6274f912c275827fe8d7d6bdb2d6 Mon Sep 17 00:00:00 2001 From: Etienne Millon Date: Mon, 29 Jun 2015 23:25:44 +0200 Subject: [PATCH 237/479] Make bodywidth kwarg overridable using config Default values for arguments are evaluated at import time. This makes a problem when using html2text the following way: import html2text # The default value is evaluated here html2text.config.BODY_WIDTH = 0 text = html2text.html2text(html) It is thus necessary to wait until the function runs to fetch the actual configuration value. --- AUTHORS.rst | 1 + ChangeLog.rst | 1 + html2text/__init__.py | 4 +++- test/test_html2text.py | 24 +++++++++++++++++++++--- 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 7f5fbb8b..2c2b310a 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -17,6 +17,7 @@ The AUTHORS/Contributors are (and/or have been): * Arjoonn Sharma * Ali Mohammad * Albert Berger +* Etienne Millon Maintainer: diff --git a/ChangeLog.rst b/ChangeLog.rst index 305f66bc..17a7889d 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -8,6 +8,7 @@ * Fix #61: Functionality added for optional use of automatic links. * Feature #80: ``title`` attribute is preserved in both inline and reference links. * Feature #82: More command line options. See docs. +* Fix #84: Make bodywidth kwarg overridable using config. 
2015.6.12 diff --git a/html2text/__init__.py b/html2text/__init__.py index 9d76f089..9898224f 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -819,7 +819,9 @@ def optwrap(self, text): return result -def html2text(html, baseurl='', bodywidth=config.BODY_WIDTH): +def html2text(html, baseurl='', bodywidth=None): + if bodywidth is None: + bodywidth = config.BODY_WIDTH h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth) return h.handle(html) diff --git a/test/test_html2text.py b/test/test_html2text.py index f6e45abf..414ce3bf 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -65,6 +65,13 @@ def test_command(fn, *args): return result, actual +def test_function(fn, **kwargs): + with open(fn) as inf: + actual = html2text.html2text(inf.read(), **kwargs) + result = get_baseline(fn) + return result, actual + + def get_dump_name(fn, suffix): return '%s-%s_output.md' % (os.path.splitext(fn)[0], suffix) @@ -98,8 +105,13 @@ def test_cmd(self): result, actual = test_command(fn, *cmdline_args) self.assertEqual(result, actual) + def test_func(self): + result, actual = test_function(fn, **func_args) + self.assertEqual(result, actual) + module_args = {} cmdline_args = [] + func_args = {} base_fn = os.path.basename(fn).lower() if base_fn.startswith('google'): @@ -126,6 +138,7 @@ def test_cmd(self): # module_args['unicode_snob'] = True module_args['body_width'] = 0 cmdline_args.append('--body-width=0') + func_args['bodywidth'] = 0 if base_fn.startswith('protect_links'): module_args['protect_links'] = True @@ -152,18 +165,23 @@ def test_cmd(self): if base_fn.startswith('mark_code'): module_args['mark_code'] = True cmdline_args.append('--mark-code') - - return test_mod, test_cmd + + if base_fn not in ['bodywidth_newline.html', 'abbr_tag.html']: + test_func = None + + return test_mod, test_cmd, test_func # Originally from http://stackoverflow.com/questions/32899/\ # how-to-generate-dynamic-parametrized-unit-tests-in-python test_dir_name = 
os.path.dirname(os.path.realpath(__file__)) for fn in glob.glob("%s/*.html" % test_dir_name): test_name = 'test_%s' % os.path.splitext(os.path.basename(fn))[0].lower() - test_m, test_c = generate_test(fn) + test_m, test_c, test_func = generate_test(fn) setattr(TestHTML2Text, test_name + "_mod", test_m) if test_c: setattr(TestHTML2Text, test_name + "_cmd", test_c) + if test_func: + setattr(TestHTML2Text, test_name + "_func", test_func) if __name__ == "__main__": unittest.main() From 558db3755cf2509043ed3cc25645e0e9d55369b5 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Sat, 4 Jul 2015 13:12:47 +0530 Subject: [PATCH 238/479] Added failing links --- test/long_link.html | 1 + test/long_link.md | 1 + 2 files changed, 2 insertions(+) create mode 100644 test/long_link.html create mode 100644 test/long_link.md diff --git a/test/long_link.html b/test/long_link.html new file mode 100644 index 00000000..650ad814 --- /dev/null +++ b/test/long_link.html @@ -0,0 +1 @@ +And here is a long link I had at hand.

      diff --git a/test/long_link.md b/test/long_link.md new file mode 100644 index 00000000..0917c36e --- /dev/null +++ b/test/long_link.md @@ -0,0 +1 @@ +And [here](http://bugs.debian.org/cgi-bin/pkgreport.cgi?tag=multiarch;users=debian-dpkg@lists.debian.org) is a long link I had at hand. From d20cbc25898f8f277046ae31536587623deb508b Mon Sep 17 00:00:00 2001 From: theSage21 Date: Sat, 4 Jul 2015 13:56:55 +0530 Subject: [PATCH 239/479] Test passing for long wrap links. New option --no-wrap-links Basic link ignore added. --- html2text/__init__.py | 3 ++- html2text/cli.py | 8 ++++++++ html2text/config.py | 2 ++ html2text/utils.py | 8 +++++++- test/{long_link.html => no_wrap_links.html} | 0 test/{long_link.md => no_wrap_links.md} | 1 + test/test_html2text.py | 4 ++++ 7 files changed, 24 insertions(+), 2 deletions(-) rename test/{long_link.html => no_wrap_links.html} (100%) rename test/{long_link.md => no_wrap_links.md} (99%) diff --git a/html2text/__init__.py b/html2text/__init__.py index 6f918936..63262d45 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -70,6 +70,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.strong_mark = '**' self.single_line_break = config.SINGLE_LINE_BREAK self.use_automatic_links = config.USE_AUTOMATIC_LINKS + self.wrap_links = config.WRAP_LINKS if out is None: self.out = self.outtextf @@ -775,7 +776,7 @@ def optwrap(self, text): newlines = 0 for para in text.split("\n"): if len(para) > 0: - if not skipwrap(para): + if not skipwrap(para, self.wrap_links): result += "\n".join(wrap(para, self.body_width)) if para.endswith(' '): result += " \n" diff --git a/html2text/cli.py b/html2text/cli.py index a3eb7da9..c1696560 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -10,6 +10,13 @@ def main(): p = optparse.OptionParser('%prog [(filename|url) [encoding]]', version='%prog ' + __version__) + p.add_option( + "--no-wrap-links", + dest="wrap_links", + action="store_false", + 
default=config.WRAP_LINKS, + help="wrap links during conversion" + ) p.add_option( "--ignore-emphasis", dest="ignore_emphasis", @@ -188,5 +195,6 @@ def main(): h.bypass_tables = options.bypass_tables h.single_line_break = options.single_line_break h.inline_links = options.inline_links + h.wrap_links = options.wrap_links wrapwrite(h.handle(data)) diff --git a/html2text/config.py b/html2text/config.py index 07881fd9..8b3574ef 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -23,6 +23,8 @@ # Protect links from line breaks surrounding them with angle brackets (in # addition to their square brackets) PROTECT_LINKS = False +# WRAP_LINKS = True +WRAP_LINKS = True # Number of pixels Google indents nested lists GOOGLE_LIST_INDENT = 36 diff --git a/html2text/utils.py b/html2text/utils.py index 1bc4102e..419c7843 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -172,7 +172,13 @@ def list_numbering_start(attrs): return 0 -def skipwrap(para): +def skipwrap(para, wrap_links=None): + if wrap_links is None: + wrap_links = config.WRAP_LINKS + # If it appears to contain a link + # don't wrap + if ('(' and '[' and ']' and ')' in para) and not wrap_links: + return True # If the text begins with four spaces or one tab, it's a code block; # don't wrap if para[0:4] == ' ' or para[0] == '\t': diff --git a/test/long_link.html b/test/no_wrap_links.html similarity index 100% rename from test/long_link.html rename to test/no_wrap_links.html diff --git a/test/long_link.md b/test/no_wrap_links.md similarity index 99% rename from test/long_link.md rename to test/no_wrap_links.md index 0917c36e..34fbfbca 100644 --- a/test/long_link.md +++ b/test/no_wrap_links.md @@ -1 +1,2 @@ And [here](http://bugs.debian.org/cgi-bin/pkgreport.cgi?tag=multiarch;users=debian-dpkg@lists.debian.org) is a long link I had at hand. 
+ diff --git a/test/test_html2text.py b/test/test_html2text.py index 60cd0a11..3f0369ed 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -148,6 +148,10 @@ def test_cmd(self): if base_fn.startswith('no_inline_links'): module_args['inline_links'] = False cmdline_args.append('--reference-links') + + if base_fn.startswith('no_wrap_links'): + module_args['wrap_links'] = False + cmdline_args.append('--no-wrap-links') return test_mod, test_cmd From 7becaef1c3486e64f26c4989fab19ca5d3513fcb Mon Sep 17 00:00:00 2001 From: theSage21 Date: Sat, 4 Jul 2015 14:10:14 +0530 Subject: [PATCH 240/479] Switched to simple regex link checking. Reference links are not included for now. Updated docs. --- docs/usage.md | 1 + html2text/config.py | 1 + html2text/utils.py | 6 ++---- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 5e445630..feebc098 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -78,6 +78,7 @@ simple indications of their function. - RE_SLASH_CHARS a string of slash escapeable characters - RE_MD_BACKSLASH_MATCHER to match \char - USE_AUTOMATIC_LINKS to convert http://xyz to + - WRAP_LINKS to decide if links have to be wrapped during text wrapping To alter any option the procedure is to create a parser with `parser = html2text.HTML2Text()` and to set the option on the parser. 
diff --git a/html2text/config.py b/html2text/config.py index 8b3574ef..541b2176 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -46,6 +46,7 @@ RE_UNORDERED_LIST_MATCHER = re.compile(r'[-\*\+]\s') RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])") RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])") +RE_LINK = re.compile(r"\[.*?\] ?\(.*?\)") # to find links in the text RE_MD_DOT_MATCHER = re.compile(r""" ^ # start of line (\s*\d+) # optional whitespace and a number diff --git a/html2text/utils.py b/html2text/utils.py index 419c7843..43156dd1 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -172,12 +172,10 @@ def list_numbering_start(attrs): return 0 -def skipwrap(para, wrap_links=None): - if wrap_links is None: - wrap_links = config.WRAP_LINKS +def skipwrap(para, wrap_links): # If it appears to contain a link # don't wrap - if ('(' and '[' and ']' and ')' in para) and not wrap_links: + if (len(config.RE_LINK.findall(para)) > 0) and not wrap_links: return True # If the text begins with four spaces or one tab, it's a code block; # don't wrap From 59394a26e960b560bd5bf042616d4ff72e164c5e Mon Sep 17 00:00:00 2001 From: theSage21 Date: Sat, 4 Jul 2015 14:23:19 +0530 Subject: [PATCH 241/479] Added support for reference links. 
--- html2text/config.py | 2 +- test/no_wrap_links_no_inline_links.html | 1 + test/no_wrap_links_no_inline_links.md | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 test/no_wrap_links_no_inline_links.html create mode 100644 test/no_wrap_links_no_inline_links.md diff --git a/html2text/config.py b/html2text/config.py index 541b2176..b4e2133c 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -46,7 +46,7 @@ RE_UNORDERED_LIST_MATCHER = re.compile(r'[-\*\+]\s') RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])") RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])") -RE_LINK = re.compile(r"\[.*?\] ?\(.*?\)") # to find links in the text +RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)") # to find links in the text RE_MD_DOT_MATCHER = re.compile(r""" ^ # start of line (\s*\d+) # optional whitespace and a number diff --git a/test/no_wrap_links_no_inline_links.html b/test/no_wrap_links_no_inline_links.html new file mode 100644 index 00000000..c0c44eac --- /dev/null +++ b/test/no_wrap_links_no_inline_links.html @@ -0,0 +1 @@ +And here is a long link I had at hand. diff --git a/test/no_wrap_links_no_inline_links.md b/test/no_wrap_links_no_inline_links.md new file mode 100644 index 00000000..2e7f3e17 --- /dev/null +++ b/test/no_wrap_links_no_inline_links.md @@ -0,0 +1,2 @@ +And [here](http://bugs.debian.org/cgi-bin/pkgreport.cgi?tag=multiarch;users=debian-dpkg@lists.debian.org) is a long link I had at hand. + From f069b7074952814c9d34662f4fc2417275120c57 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Sun, 5 Jul 2015 10:29:01 +0530 Subject: [PATCH 242/479] `--no-wrap-links` implies `--reference-links` --- docs/usage.md | 2 +- html2text/__init__.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index feebc098..b6cf786c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -78,7 +78,7 @@ simple indications of their function. 
- RE_SLASH_CHARS a string of slash escapeable characters - RE_MD_BACKSLASH_MATCHER to match \char - USE_AUTOMATIC_LINKS to convert http://xyz to - - WRAP_LINKS to decide if links have to be wrapped during text wrapping + - WRAP_LINKS to decide if links have to be wrapped during text wrapping( implies INLINE_LINKS = False) To alter any option the procedure is to create a parser with `parser = html2text.HTML2Text()` and to set the option on the parser. diff --git a/html2text/__init__.py b/html2text/__init__.py index 63262d45..2d8c0e30 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -774,6 +774,11 @@ def optwrap(self, text): assert wrap, "Requires Python 2.3." result = '' newlines = 0 + # I cannot think of a better solution for now. + # To avoid the non-wrap behaviour for entire paras + # because of the presence of a link in it + if not self.wrap_links: + self.inline_links = False for para in text.split("\n"): if len(para) > 0: if not skipwrap(para, self.wrap_links): From 8c58949e8de06e9e07a145844886dc98ac401d2b Mon Sep 17 00:00:00 2001 From: theSage21 Date: Mon, 6 Jul 2015 07:06:34 +0530 Subject: [PATCH 243/479] README update. 
Docs update --- README.md | 13 +------------ docs/usage.md | 1 + 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index a4d79f2f..2c7254d0 100644 --- a/README.md +++ b/README.md @@ -15,27 +15,16 @@ html2text is a Python script that converts a page of HTML into clean, easy-to-re Usage: `html2text [(filename|url) [encoding]]` - | Option | Description |--------------------------------------------------------|--------------------------------------------------- | `--version` | Show program's version number and exit | `-h`, `--help` | Show this help message and exit | `--ignore-links` | Don't include any formatting for links -|`--protect-links` | Protect links from line breaks surrounding them "+" with angle brackets -|`--ignore-images` | Don't include any formatting for images -|`--images-to-alt` | Discard image data, only keep alt text -|`--images-with-size` | Write image tags with height and width attrs as raw html to retain dimensions -|`-g`, `--google-doc` | Convert an html-exported Google Document -|`-d`, `--dash-unordered-list` | Use a dash rather than a star for unordered list items -|`-b` `BODY_WIDTH`, `--body-width`=`BODY_WIDTH` | Number of characters per output line, `0` for no wrap -|`-i` `LIST_INDENT`, `--google-list-indent`=`LIST_INDENT`| Number of pixels Google indents nested lists -|`-s`, `--hide-strikethrough` | Hide strike-through text. only relevent when `-g` is specified as well |`--escape-all` | Escape all special characters. Output is less readable, but avoids corner case formatting issues. -| `--bypass-tables` | Format tables in HTML rather than Markdown syntax. -| `--single-line-break` | Use a single line break after a block element rather than two. 
| `--reference-links` | Use reference links instead of links to create markdown | `--mark-code` | Mark preformatted and code blocks with [code]...[/code] +For a complete list of options see the [docs](docs/usage.md) Or you can use it from within `Python`: diff --git a/docs/usage.md b/docs/usage.md index 15349c2f..a6069c41 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -115,3 +115,4 @@ Command line options | `--no-skip-internal-links` | Turn off skipping of internal links | `--links-after-para` | Put the links after the paragraph and not at end of document | `--mark-code` | Mark code with [code]...[/code] blocks +| `--no-wrap-links` | Do not wrap links during text wrapping. Implies `--reference-links` From e4af89e8b09b80d1d6fd5ca677d466d77d3e918d Mon Sep 17 00:00:00 2001 From: theSage21 Date: Mon, 6 Jul 2015 22:36:45 +0530 Subject: [PATCH 244/479] Added option via command line to control decode errors. --- html2text/cli.py | 10 +++++++++- html2text/config.py | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/html2text/cli.py b/html2text/cli.py index 78b22da4..b8636367 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -173,6 +173,14 @@ def main(): default=config.MARK_CODE, help="Mark program code blocks with [code]...[/code]" ) + p.add_option( + "--decode-errors", + dest="decode_errors", + action="store", + type="str", + default=config.DECODE_ERRORS, + help="What to do in case of decode errors." 
+ ) (options, args) = p.parse_args() # process input @@ -208,7 +216,7 @@ def main(): data = wrap_read() if hasattr(data, 'decode'): - data = data.decode(encoding) + data = data.decode(encoding, errors=options.decode_errors) h = HTML2Text(baseurl=baseurl) # handle options diff --git a/html2text/config.py b/html2text/config.py index 1c9a659a..dcd3f477 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -35,6 +35,7 @@ IMAGES_WITH_SIZE = False IGNORE_EMPHASIS = False MARK_CODE = False +DECODE_ERRORS = 'replace' # Convert links with same href and text to format if they are absolute links USE_AUTOMATIC_LINKS = True From c5df5384a3565274a8817a410ccc7abce5f352e6 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Mon, 6 Jul 2015 22:37:07 +0530 Subject: [PATCH 245/479] updated docs --- docs/usage.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 15349c2f..8a08c432 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -115,3 +115,5 @@ Command line options | `--no-skip-internal-links` | Turn off skipping of internal links | `--links-after-para` | Put the links after the paragraph and not at end of document | `--mark-code` | Mark code with [code]...[/code] blocks +| `--no-wrap-links` | Do not wrap links. implies `--reference-links` +| `--decode_errors`=`HANDLER` | What to do in case an error is encountered. `ignore`, `strict`, `replace` etc. From 4ba74bdc33454aa558958e41bebb79a584589f71 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Mon, 6 Jul 2015 22:37:21 +0530 Subject: [PATCH 246/479] Updated changelog Reverted default of `--decode-errors` to 'strict' to maintain backward compatibility. --- ChangeLog.rst | 1 + html2text/config.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index bef5fafe..c8066ded 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -4,6 +4,7 @@ * Fix #38: Long links wrapping controlled by `--no-wrap-links`. 
* Note: `--no-wrap-links` implies `--reference-links` +* Fix #87: Decode errors can be handled via command line. 2015.6.21 ========= diff --git a/html2text/config.py b/html2text/config.py index dcd3f477..85bf47dc 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -35,7 +35,7 @@ IMAGES_WITH_SIZE = False IGNORE_EMPHASIS = False MARK_CODE = False -DECODE_ERRORS = 'replace' +DECODE_ERRORS = 'strict' # Convert links with same href and text to format if they are absolute links USE_AUTOMATIC_LINKS = True From 5d3546b152323fb79f8f620a6348b6df388edbda Mon Sep 17 00:00:00 2001 From: theSage21 Date: Tue, 7 Jul 2015 07:28:16 +0530 Subject: [PATCH 247/479] Fix for python2.6 as it does not support keyword arguments for 'decode' --- html2text/cli.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/html2text/cli.py b/html2text/cli.py index 609ce309..0f0be6b3 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -216,7 +216,11 @@ def main(): data = wrap_read() if hasattr(data, 'decode'): - data = data.decode(encoding, errors=options.decode_errors) + try: + data = data.decode(encoding, errors=options.decode_errors) + except TypeError: + # python 2.6.x does not have the errors option + data = data.decode(encoding) h = HTML2Text(baseurl=baseurl) # handle options From a5f8e85ce3ff91abba8f73ac792310e34a0f0cef Mon Sep 17 00:00:00 2001 From: theSage21 Date: Tue, 7 Jul 2015 08:43:58 +0530 Subject: [PATCH 248/479] Added warning in UnicodeDecodeErrors and notified of --deocde-errors flag. 
--- html2text/cli.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/html2text/cli.py b/html2text/cli.py index 0f0be6b3..d2d7884e 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -8,6 +8,16 @@ def main(): baseurl = '' + class bcolors: + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + p = optparse.OptionParser( '%prog [(filename|url) [encoding]]', version='%prog ' + ".".join(map(str, __version__)) @@ -217,10 +227,17 @@ def main(): if hasattr(data, 'decode'): try: - data = data.decode(encoding, errors=options.decode_errors) - except TypeError: - # python 2.6.x does not have the errors option - data = data.decode(encoding) + try: + data = data.decode(encoding, errors=options.decode_errors) + except TypeError: + # python 2.6.x does not have the errors option + data = data.decode(encoding) + except UnicodeDecodeError as err: + warning = bcolors.WARNING + "Warning:" + bcolors.ENDC + warning += ' Use the ' + bcolors.OKGREEN + warning += '--decode-errors=ignore' + bcolors.ENDC + 'flag.' + print(warning) + raise err h = HTML2Text(baseurl=baseurl) # handle options From 919d12b7b0866418fae152419c6fb571b9a5d1a1 Mon Sep 17 00:00:00 2001 From: theSage21 Date: Tue, 7 Jul 2015 09:00:32 +0530 Subject: [PATCH 249/479] Coverage hack. Will write tests later on. 
--- html2text/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/cli.py b/html2text/cli.py index d2d7884e..d765f59c 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -8,7 +8,7 @@ def main(): baseurl = '' - class bcolors: + class bcolors: # pragma: no cover HEADER = '\033[95m' OKBLUE = '\033[94m' OKGREEN = '\033[92m' From 99e4a39982c8d04984f8d7bc90e6b404942bd519 Mon Sep 17 00:00:00 2001 From: arjoonn sharma Date: Tue, 13 Oct 2015 11:12:44 +0530 Subject: [PATCH 250/479] deocde errors spelling mistake --decode_errors to --decode-errors --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 00c70e04..675b3b6a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -117,4 +117,4 @@ Command line options | `--links-after-para` | Put the links after the paragraph and not at end of document | `--mark-code` | Mark code with [code]...[/code] blocks | `--no-wrap-links` | Do not wrap links during text wrapping. Implies `--reference-links` -| `--decode_errors`=`HANDLER` | What to do in case an error is encountered. `ignore`, `strict`, `replace` etc. +| `--decode-errors`=`HANDLER` | What to do in case an error is encountered. `ignore`, `strict`, `replace` etc. From 4df5923c538a67348f28810f0aea0c4166b716bc Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sun, 18 Oct 2015 00:26:08 +0530 Subject: [PATCH 251/479] Remove the extra spaces on
      encounter as per #91 --- html2text/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index a2dc5375..79e210c7 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -324,7 +324,7 @@ def handle_tag(self, tag, attrs, start): self.p() if tag == "br" and start: - self.o(" \n") + self.o("\n") if tag == "hr" and start: self.p() From f67b4d2fefc864c010eaabbe3156ce789181a393 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sun, 1 Nov 2015 21:03:52 +0530 Subject: [PATCH 252/479] Possible fix for coveralls --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a744223e..c599f7fb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ python: - "3.3" - "3.4" install: - - pip install coveralls + - pip install coveralls==0.5 before_script: - '[ "${TRAVIS_PYTHON_VERSION}" = "2.6" ] && pip install --use-mirrors unittest2 || /bin/true' - export COVERAGE_PROCESS_START=$PWD/.coveragerc From 84e4bf2dd35e5ed14f5961b4d028a7af564c40d2 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sun, 1 Nov 2015 21:08:56 +0530 Subject: [PATCH 253/479] possible fix for coveralls build --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a744223e..c599f7fb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ python: - "3.3" - "3.4" install: - - pip install coveralls + - pip install coveralls==0.5 before_script: - '[ "${TRAVIS_PYTHON_VERSION}" = "2.6" ] && pip install --use-mirrors unittest2 || /bin/true' - export COVERAGE_PROCESS_START=$PWD/.coveragerc From 10206ee2e89a4e092fa82bd209528cd520ebb703 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sun, 1 Nov 2015 21:13:04 +0530 Subject: [PATCH 254/479] Rollback the br tag changes mistakenly inserted --- html2text/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/__init__.py 
b/html2text/__init__.py index 79e210c7..a2dc5375 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -324,7 +324,7 @@ def handle_tag(self, tag, attrs, start): self.p() if tag == "br" and start: - self.o("\n") + self.o(" \n") if tag == "hr" and start: self.p() From 02378cb6434b5e300bd547ed5c676785adc0cf22 Mon Sep 17 00:00:00 2001 From: critiqjo Date: Wed, 30 Sep 2015 15:41:34 +0530 Subject: [PATCH 255/479] Add callback --- AUTHORS.rst | 1 + ChangeLog.rst | 7 +++++++ html2text/__init__.py | 5 +++++ 3 files changed, 13 insertions(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 2c2b310a..39f7a983 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -18,6 +18,7 @@ The AUTHORS/Contributors are (and/or have been): * Ali Mohammad * Albert Berger * Etienne Millon +* John C F Maintainer: diff --git a/ChangeLog.rst b/ChangeLog.rst index ac195c54..ce22a06d 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -6,6 +6,13 @@ * Note: `--no-wrap-links` implies `--reference-links` * Fix #87: Decode errors can be handled via command line. +2015.10.27 +========= +---- + +* Feature #83: Add callback-on-tag. 
+ + 2015.6.21 ========= ---- diff --git a/html2text/__init__.py b/html2text/__init__.py index d4f1bc93..af71ccd3 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -75,6 +75,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.single_line_break = config.SINGLE_LINE_BREAK self.use_automatic_links = config.USE_AUTOMATIC_LINKS self.wrap_links = config.WRAP_LINKS # covered in cli + self.tag_callback = None if out is None: # pragma: no cover self.out = self.outtextf @@ -281,6 +282,10 @@ def handle_tag(self, tag, attrs, start): else: attrs = dict(attrs) + if self.tag_callback is not None: + if self.tag_callback(self, tag, attrs, start) is True: + return + # first thing inside the anchor tag is another tag that produces some output if (start and not self.maybe_automatic_link is None and tag not in ['p', 'div', 'style', 'dl', 'dt'] From 7df6caf1de06106c1d275bea015f360b03e2aece Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 2 Nov 2015 00:13:31 +0400 Subject: [PATCH 256/479] Fix changeLog typos --- ChangeLog.rst | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index ce22a06d..af6de862 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -5,11 +5,6 @@ * Fix #38: Long links wrapping controlled by `--no-wrap-links`. * Note: `--no-wrap-links` implies `--reference-links` * Fix #87: Decode errors can be handled via command line. - -2015.10.27 -========= ----- - * Feature #83: Add callback-on-tag. 
From 6e8926f4f5625bbd59ad30cd881740d618fa5b1b Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 4 Nov 2015 14:39:52 +0000 Subject: [PATCH 257/479] Add .c9 to get ignored --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index b1d87fe3..a7b8dd2f 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ dist .coverage .coverage.* env/ +.c9/ \ No newline at end of file From dd4263066380d7ae24b42540b0e48c76727f5a48 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 4 Nov 2015 14:45:18 +0000 Subject: [PATCH 258/479] Update the changelog with the latest changes --- ChangeLog.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index af6de862..e7d7bd3f 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -4,8 +4,10 @@ * Fix #38: Long links wrapping controlled by `--no-wrap-links`. * Note: `--no-wrap-links` implies `--reference-links` -* Fix #87: Decode errors can be handled via command line. * Feature #83: Add callback-on-tag. +* Fix #87: Decode errors can be handled via command line. +* Feature #95: Docs, decode errors spelling mistake. +* Fix #84: Make bodywidth kwarg overridable using config. 2015.6.21 @@ -18,7 +20,6 @@ * Fix #61: Functionality added for optional use of automatic links. * Feature #80: ``title`` attribute is preserved in both inline and reference links. * Feature #82: More command line options. See docs. -* Fix #84: Make bodywidth kwarg overridable using config. 
2015.6.12 From 36be267c2dfa7bcef56fc30947e210cd1f63cd49 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 4 Nov 2015 14:47:26 +0000 Subject: [PATCH 259/479] Add more python version to travis and setup.py classifiers --- .travis.yml | 1 + setup.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index c599f7fb..fec022bd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,7 @@ python: - "3.2" - "3.3" - "3.4" + - "3.5" install: - pip install coveralls==0.5 before_script: diff --git a/setup.py b/setup.py index 4dbbdf41..67661385 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,9 @@ def run(self): 'Programming Language :: Python :: 3.0', 'Programming Language :: Python :: 3.1', 'Programming Language :: Python :: 3.2', - 'Programming Language :: Python :: 3.3' + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', ], entry_points=""" [console_scripts] From d62e3e36fee59682a02acc67e406012fc4186db7 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 4 Nov 2015 14:49:03 +0000 Subject: [PATCH 260/479] Bump version number --- ChangeLog.rst | 2 +- html2text/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index e7d7bd3f..c85c8d00 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,4 +1,4 @@ -0000.0.00 +2015.11.4 ========= ---- diff --git a/html2text/__init__.py b/html2text/__init__.py index af71ccd3..80ea7159 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -29,7 +29,7 @@ skipwrap ) -__version__ = (2015, 6, 21) +__version__ = (2015, 11, 4) # TODO: From 9157181c09a7c630b633f41e0d108490aae8751f Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 4 Nov 2015 14:58:59 +0000 Subject: [PATCH 261/479] Remove python 3.5 support, due to #98 --- .travis.yml | 1 - setup.py | 1 - 2 files changed, 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 
fec022bd..c599f7fb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,7 +6,6 @@ python: - "3.2" - "3.3" - "3.4" - - "3.5" install: - pip install coveralls==0.5 before_script: diff --git a/setup.py b/setup.py index 67661385..d83ee613 100644 --- a/setup.py +++ b/setup.py @@ -60,7 +60,6 @@ def run(self): 'Programming Language :: Python :: 3.2', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', ], entry_points=""" [console_scripts] From f1b7a188d3e0d36dfc0c8379a01a7598319b047c Mon Sep 17 00:00:00 2001 From: Esperat Julian Date: Thu, 5 Nov 2015 11:32:06 +0100 Subject: [PATCH 262/479] Removed duplicated initialisation I kept the ones with the `# covered in cli` comment at the end. --- html2text/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 80ea7159..75afc2c3 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -72,8 +72,6 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.use_automatic_links = config.USE_AUTOMATIC_LINKS # covered in cli self.hide_strikethrough = False # covered in cli self.mark_code = config.MARK_CODE - self.single_line_break = config.SINGLE_LINE_BREAK - self.use_automatic_links = config.USE_AUTOMATIC_LINKS self.wrap_links = config.WRAP_LINKS # covered in cli self.tag_callback = None From 9ed7df736d92d8cb261c13e84bfff9f59978c876 Mon Sep 17 00:00:00 2001 From: Smite Chow Date: Fri, 6 Nov 2015 12:00:08 +0800 Subject: [PATCH 263/479] :bug: fix get element style key error --- html2text/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/utils.py b/html2text/utils.py index a2a2d8c1..418c89cb 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -80,7 +80,7 @@ def element_style(attrs, style_def, parent_style): style = parent_style.copy() if 'class' in attrs: for css_class in attrs['class'].split(): - css_style = style_def['.' 
+ css_class] + css_style = style_def.get('.' + css_class, {}) style.update(css_style) if 'style' in attrs: immediate_style = dumb_property_dict(attrs['style']) From f1ab1cedd87b2b1dbbfbc3ff742dd941ea87110b Mon Sep 17 00:00:00 2001 From: Smite Chow Date: Fri, 6 Nov 2015 14:04:41 +0800 Subject: [PATCH 264/479] :bug: fix error end tag pop exception --- html2text/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 80ea7159..9c6baf72 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -306,7 +306,7 @@ def handle_tag(self, tag, attrs, start): tag_style = element_style(attrs, self.style_def, parent_style) self.tag_stack.append((tag, attrs, tag_style)) else: - dummy, attrs, tag_style = self.tag_stack.pop() + dummy, attrs, tag_style = self.tag_stack.pop() if self.tag_stack else (None, {}, {}) if self.tag_stack: parent_style = self.tag_stack[-1][2] From b76fc98808f52109ec76b581005dbdac25c3e4f0 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Fri, 18 Dec 2015 10:56:26 +0530 Subject: [PATCH 265/479] Solved - spaces and "\xa0" are interchanged. - nbsp_unicode_md has "\xa0" --- test/nbsp_unicode.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/nbsp_unicode.md b/test/nbsp_unicode.md index aa100c2e..548b420d 100644 --- a/test/nbsp_unicode.md +++ b/test/nbsp_unicode.md @@ -3,12 +3,12 @@ In this test all NBSPs will be replaced with unicode non-breaking spaces (unicode_snob = True). -Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod -tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, -quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 
-Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore -eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt -in culpa qui officia deserunt mollit anim id est laborum. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore +eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt +in culpa qui officia deserunt mollit anim id est laborum. From 7a1aa4fbdb96821026fd4c0514c8c80d994ef305 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Thu, 31 Dec 2015 01:09:14 +0530 Subject: [PATCH 266/479] - Render as '~~text~~' in markdown - Rendering them as tags did not make sense. --- html2text/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 00eae5b4..a93d0a37 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -365,9 +365,9 @@ def handle_tag(self, tag, attrs, start): self.o(self.strong_mark) if tag in ['del', 'strike', 's']: if start: - self.o("<" + tag + ">") + self.o("~~") else: - self.o("") + self.o("~~") if self.google_doc: if not self.inheader: From 6a82fdf021e2e48064e12d9e94024c18647fc6f1 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Thu, 31 Dec 2015 01:11:33 +0530 Subject: [PATCH 267/479] - Update changelog about new feature --- ChangeLog.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index c85c8d00..9c34f10b 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,9 @@ +0000.00.00 +========= +---- + +* , , now rendered as ~~text~~ + 2015.11.4 ========= ---- From 0c2618d63d0dfa67b5621185a9011f0d501bc17e Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Thu, 31 Dec 2015 11:37:07 +0530 Subject: [PATCH 268/479] As per #103 - now rendered as ~~text~~ - Updated the Changelog - Modified test --- ChangeLog.rst | 6 ++++++ html2text/__init__.py | 4 ++-- test/decript_tage.md | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff 
--git a/ChangeLog.rst b/ChangeLog.rst index c85c8d00..e47fc1f0 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,9 @@ +0000.00.00 +========= +---- + +* , , rendered as '~~text~~' + 2015.11.4 ========= ---- diff --git a/html2text/__init__.py b/html2text/__init__.py index 00eae5b4..89dd1fdd 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -365,9 +365,9 @@ def handle_tag(self, tag, attrs, start): self.o(self.strong_mark) if tag in ['del', 'strike', 's']: if start: - self.o("<" + tag + ">") + self.o('~~') else: - self.o("") + self.o('~~') if self.google_doc: if not self.inheader: diff --git a/test/decript_tage.md b/test/decript_tage.md index db18500d..eba0adac 100644 --- a/test/decript_tage.md +++ b/test/decript_tage.md @@ -1,2 +1,2 @@ -something something something +~~something~~ ~~something~~ ~~something~~ From f6803a0e1776f441b69918923a4237ae75ccc3de Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Thu, 31 Dec 2015 11:45:00 +0530 Subject: [PATCH 269/479] Merger from Alir3z4 --- ChangeLog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index e47fc1f0..9c34f10b 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -2,7 +2,7 @@ ========= ---- -* , , rendered as '~~text~~' +* , , now rendered as ~~text~~ 2015.11.4 ========= From 46079ef0e850a073c3940d454d7018bebed35480 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Thu, 31 Dec 2015 11:49:05 +0530 Subject: [PATCH 270/479] - fixed my mistake about the ' ' thing. - Refer to -   is converted to '/xa0' when unicode snob is on --- test/nbsp_unicode.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/nbsp_unicode.md b/test/nbsp_unicode.md index 548b420d..aa100c2e 100644 --- a/test/nbsp_unicode.md +++ b/test/nbsp_unicode.md @@ -3,12 +3,12 @@ In this test all NBSPs will be replaced with unicode non-breaking spaces (unicode_snob = True). 
-Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod -tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, -quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. -Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore -eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt -in culpa qui officia deserunt mollit anim id est laborum. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore +eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt +in culpa qui officia deserunt mollit anim id est laborum. From 62be883fe67bff92dccce21fc8259e1f12148898 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 8 Jan 2016 16:36:11 +0000 Subject: [PATCH 271/479] Update change log with latest changes --- ChangeLog.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 9c34f10b..7ed5c52b 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,8 +1,12 @@ -0000.00.00 +2016.1.8 ========= ---- -* , , now rendered as ~~text~~ +* Feature #99: Removed duplicated initialisation. +* Fix #100: Get element style key error. +* Fix #101: Fix error end tag pop exception +* , , now rendered as ~~text~~. 
+ 2015.11.4 ========= From 0c70c8f06354f9ed4d3fa4c316e2953ca847f2fa Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 8 Jan 2016 16:39:49 +0000 Subject: [PATCH 272/479] bump version --- html2text/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 89dd1fdd..97f5d302 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -29,7 +29,7 @@ skipwrap ) -__version__ = (2015, 11, 4) +__version__ = (2016, 1, 8) # TODO: From 4c1b81ef20a6dfe94b1149f512de84b2b4d78c23 Mon Sep 17 00:00:00 2001 From: Stefano Rivera Date: Wed, 13 Jan 2016 16:14:10 -0800 Subject: [PATCH 273/479] Support Python 3.5 (convert_charrefs=False) We can't use the new default, until we stop supporting all versions of Python that don't accept this argument (it appeared in 3.4) Fixes: #89 --- .travis.yml | 1 + html2text/__init__.py | 6 +++++- setup.py | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index c599f7fb..fec022bd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,7 @@ python: - "3.2" - "3.3" - "3.4" + - "3.5" install: - pip install coveralls==0.5 before_script: diff --git a/html2text/__init__.py b/html2text/__init__.py index 97f5d302..4d856b54 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -3,6 +3,7 @@ """html2text: Turn HTML into equivalent Markdown-structured text.""" from __future__ import division import re +import sys import cgi try: @@ -44,7 +45,10 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): appends lines of text). 
baseurl: base URL of the document we process """ - HTMLParser.HTMLParser.__init__(self) + kwargs = {} + if sys.version_info >= (3, 4): + kwargs['convert_charrefs'] = False + HTMLParser.HTMLParser.__init__(self, **kwargs) # Config options self.split_next_td = False diff --git a/setup.py b/setup.py index d83ee613..67661385 100644 --- a/setup.py +++ b/setup.py @@ -60,6 +60,7 @@ def run(self): 'Programming Language :: Python :: 3.2', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', ], entry_points=""" [console_scripts] From 43b2677fce052c7e4054f4f3f982961be65c23cf Mon Sep 17 00:00:00 2001 From: Stefano Rivera Date: Wed, 13 Jan 2016 17:28:43 -0800 Subject: [PATCH 274/479] ChangeLog entry --- ChangeLog.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 7ed5c52b..9fea6900 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,10 @@ +2016.1.13 +========= +---- + +* Fix #89: Python 3.5 support. + + 2016.1.8 ========= ---- From f5c289b8eec5b1efed9faeba3f3422eddcf313d9 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Thu, 14 Jan 2016 14:09:21 +0100 Subject: [PATCH 275/479] Fix typos in docs --- docs/about.md | 2 +- docs/how_it_works.md | 8 ++++---- docs/usage.md | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/about.md b/docs/about.md index 72d50639..04092dfb 100644 --- a/docs/about.md +++ b/docs/about.md @@ -5,7 +5,7 @@ html2text is a python package which converts a page of HTML into clean, easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). -It was origionally written by Aaron Swartz. +It was originally written by Aaron Swartz. The code is under GPL v3. 
diff --git a/docs/how_it_works.md b/docs/how_it_works.md index c06430b3..4e6891ed 100644 --- a/docs/how_it_works.md +++ b/docs/how_it_works.md @@ -2,7 +2,7 @@ Introduction ============ -There are 5 components to the code. They are kept as seperate files in the +There are 5 components to the code. They are kept as separate files in the html2text directory. This part of the documentation explains them bit by bit. @@ -31,7 +31,7 @@ Used to provide various configuration settings to the converter. They are as fol - IGNORE_EMPHASIS - BYPASS_TABLES - SINGLE_LINE_BREAK to use a single line break rather than two - - UNIFIABLE is a dictionary which maps unicode abbrevations to ASCII + - UNIFIABLE is a dictionary which maps unicode abbreviations to ASCII values - RE_SPACE for finding space-only lines - RE_UNESCAPE for finding html entities like   @@ -67,7 +67,7 @@ Some functions are: - wrapwrite :write to buffer - wrap_read :read from buffer - escape_md :escape md sensitive within other md - - escape_md_section :escape md sensitive accrose whole doc + - escape_md_section :escape md sensitive across whole doc cli.py @@ -89,7 +89,7 @@ Command line interface for the code. |`-d`, `--dash-unordered-list` | Use a dash rather than a star for unordered list items |`-b` `BODY_WIDTH`, `--body-width`=`BODY_WIDTH` | Number of characters per output line, `0` for no wrap |`-i` `LIST_INDENT`, `--google-list-indent`=`LIST_INDENT`| Number of pixels Google indents nested lists -|`-s`, `--hide-strikethrough` | Hide strike-through text. only relevent when `-g` is specified as well +|`-s`, `--hide-strikethrough` | Hide strike-through text. only relevant when `-g` is specified as well |`--escape-all` | Escape all special characters. Output is less readable, but avoids corner case formatting issues. | `--bypass-tables` | Format tables in HTML rather than Markdown syntax. | `--single-line-break` | Use a single line break after a block element rather than two. 
diff --git a/docs/usage.md b/docs/usage.md index 675b3b6a..63112b2c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -64,7 +64,7 @@ simple indications of their function. - IGNORE_EMPHASIS - BYPASS_TABLES - SINGLE_LINE_BREAK to use a single line break rather than two - - UNIFIABLE is a dictionary which maps unicode abbrevations to ASCII + - UNIFIABLE is a dictionary which maps unicode abbreviations to ASCII values - RE_SPACE for finding space-only lines - RE_UNESCAPE for finding html entities like   @@ -104,7 +104,7 @@ Command line options |`-d`, `--dash-unordered-list` | Use a dash rather than a star for unordered list items |`-b` `BODY_WIDTH`, `--body-width`=`BODY_WIDTH` | Number of characters per output line, `0` for no wrap |`-i` `LIST_INDENT`, `--google-list-indent`=`LIST_INDENT`| Number of pixels Google indents nested lists -|`-s`, `--hide-strikethrough` | Hide strike-through text. only relevent when `-g` is specified as well +|`-s`, `--hide-strikethrough` | Hide strike-through text. only relevant when `-g` is specified as well |`--escape-all` | Escape all special characters. Output is less readable, but avoids corner case formatting issues. | `--bypass-tables` | Format tables in HTML rather than Markdown syntax. | `--single-line-break` | Use a single line break after a block element rather than two. @@ -112,7 +112,7 @@ Command line options | `--ignore-emphasis` | Ignore all emphasis formatting in the html. 
| `-e`, `--asterisk-emphasis` | Use asterisk rather than underscore to emphasize text | `--unicode-snob` | Use unicode throughout instead of ASCII -| `--no-automatic-links` | Do not use automatic links like +| `--no-automatic-links` | Do not use automatic links like | `--no-skip-internal-links` | Turn off skipping of internal links | `--links-after-para` | Put the links after the paragraph and not at end of document | `--mark-code` | Mark code with [code]...[/code] blocks From 6ff7d28206778b922dc9384d0f0c4657d6b8a8d5 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 14 Jan 2016 18:32:08 +0400 Subject: [PATCH 276/479] Make the next version vanilla until its time comes --- ChangeLog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 9fea6900..3e90c1bd 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,4 +1,4 @@ -2016.1.13 +0000.0.0 ========= ---- From 2116e83262c590aafb28ac1c73c4a4f2012cd909 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 15 Jan 2016 08:11:35 +0400 Subject: [PATCH 277/479] Some tiny reformatting on Usage.md * Faster git clone * Fix setup.py typo --- docs/usage.md | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 63112b2c..8e34d409 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -8,11 +8,24 @@ Installing These are the methods you can get the module installed:- - 1. `pip install html2text` for those who have pip - 2. Clone the repository from `https://github.com/Alir3z4/html2text.git` - 1. `git clone https://github.com/Alir3z4/html2text` - 2. `python setup build` - 3. `python setup install` +### PIP + +For those who have pip, we got your back. 
+ +``` +$ pip install html2text +``` + +### Clone from Git Repository + +Clone the repository from https://github.com/Alir3z4/html2text + +``` +$ git clone --depth 1 https://github.com/Alir3z4/html2text.git +$ python setup.py build +$ python setup.py install +``` + Basic Usage From 11b026ce4bfbb88488b7e2f74f660fe8dd195344 Mon Sep 17 00:00:00 2001 From: Mikhail Melnik Date: Thu, 28 Jan 2016 16:59:31 +0300 Subject: [PATCH 278/479] add baseurl functionality to inplace urls --- html2text/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 4d856b54..a1d2998c 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -420,9 +420,9 @@ def handle_tag(self, tag, attrs, start): try: title = escape_md(a['title']) except KeyError: - self.o("](" + escape_md(a['href']) + ")") + self.o("](" + escape_md(urlparse.urljoin(self.baseurl, a['href'])) + ")") else: - self.o("](" + escape_md(a['href']) + self.o("](" + escape_md(urlparse.urljoin(self.baseurl, a['href'])) + ' "' + title + '" )') else: i = self.previousIndex(a) From e2ed656a820343be508c5983bb6b43e8763b42cd Mon Sep 17 00:00:00 2001 From: Mikhail Melnik Date: Sun, 31 Jan 2016 14:34:00 +0300 Subject: [PATCH 279/479] add baseurl functionality to inplace urls to tags also --- html2text/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index a1d2998c..648b8eac 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -476,7 +476,7 @@ def handle_tag(self, tag, attrs, start): self.o("![" + escape_md(alt) + "]") if self.inline_links: href = attrs.get('href') or '' - self.o("(" + escape_md(href) + ")") + self.o("(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")") else: i = self.previousIndex(attrs) if i is not None: From e001365018f2520200dab269cef121ef457eb908 Mon Sep 17 00:00:00 2001 From: Mikhail Melnik Date: Sun, 31 Jan 2016 14:38:04 +0300 Subject: [PATCH 
280/479] add inplace baseurl substitution test for tag --- test/inplace_baseurl_substitution.html | 8 ++++++++ test/inplace_baseurl_substitution.md | 1 + test/test_html2text.py | 6 ++++++ 3 files changed, 15 insertions(+) create mode 100644 test/inplace_baseurl_substitution.html create mode 100644 test/inplace_baseurl_substitution.md diff --git a/test/inplace_baseurl_substitution.html b/test/inplace_baseurl_substitution.html new file mode 100644 index 00000000..c47a7ba4 --- /dev/null +++ b/test/inplace_baseurl_substitution.html @@ -0,0 +1,8 @@ + + + +

      +read2text header image +

      + + diff --git a/test/inplace_baseurl_substitution.md b/test/inplace_baseurl_substitution.md new file mode 100644 index 00000000..98f06b79 --- /dev/null +++ b/test/inplace_baseurl_substitution.md @@ -0,0 +1 @@ +![read2text header image](http://brettterpstra.com/uploads/2012/01/read2textheader.jpg) diff --git a/test/test_html2text.py b/test/test_html2text.py index 052a0694..2d678ee0 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -173,6 +173,12 @@ def test_func(self): if base_fn not in ['bodywidth_newline.html', 'abbr_tag.html']: test_func = None + if base_fn == 'inplace_baseurl_substitution.html': + module_args['baseurl'] = 'http://brettterpstra.com' + module_args['body_width'] = 0 + # there is no way to specify baseurl in cli :( + test_cmd = None + return test_mod, test_cmd, test_func # Originally from http://stackoverflow.com/questions/32899/\ From 2ed52f3a1eec0635fa6d29415374b7beb81a7253 Mon Sep 17 00:00:00 2001 From: Mikhail Melnik Date: Sun, 31 Jan 2016 14:42:51 +0300 Subject: [PATCH 281/479] add inplace baseurl substitution test for tag --- test/inplace_baseurl_substitution.html | 3 +++ test/inplace_baseurl_substitution.md | 2 ++ 2 files changed, 5 insertions(+) diff --git a/test/inplace_baseurl_substitution.html b/test/inplace_baseurl_substitution.html index c47a7ba4..cb55345d 100644 --- a/test/inplace_baseurl_substitution.html +++ b/test/inplace_baseurl_substitution.html @@ -4,5 +4,8 @@

      read2text header image

      +

      +BrettTerpstra.com +

      diff --git a/test/inplace_baseurl_substitution.md b/test/inplace_baseurl_substitution.md index 98f06b79..bc73c977 100644 --- a/test/inplace_baseurl_substitution.md +++ b/test/inplace_baseurl_substitution.md @@ -1 +1,3 @@ ![read2text header image](http://brettterpstra.com/uploads/2012/01/read2textheader.jpg) + +[BrettTerpstra.com](http://brettterpstra.com/) From fbe1b769a09773eb1c7a44be3470350ddd69d759 Mon Sep 17 00:00:00 2001 From: Mikhail Melnik Date: Sun, 31 Jan 2016 14:50:39 +0300 Subject: [PATCH 282/479] update changelog.rst and authors.rst --- AUTHORS.rst | 1 + ChangeLog.rst | 1 + 2 files changed, 2 insertions(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 39f7a983..93c19380 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -19,6 +19,7 @@ The AUTHORS/Contributors are (and/or have been): * Albert Berger * Etienne Millon * John C F +* Mikhail Melnik Maintainer: diff --git a/ChangeLog.rst b/ChangeLog.rst index 3e90c1bd..06eccb0f 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -3,6 +3,7 @@ ---- * Fix #89: Python 3.5 support. 
+* Fix #113: inplace baseurl substitution for and tags 2016.1.8 From 7cabac988d830de769fc234bcd933cb9608713f1 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 12 Feb 2016 23:41:16 +0400 Subject: [PATCH 283/479] Update the badges to badge.kloud51.com Thanks @SavandBros --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 2c7254d0..c5b90e71 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,12 @@ [![Build Status](https://secure.travis-ci.org/Alir3z4/html2text.png)](http://travis-ci.org/Alir3z4/html2text) [![Coverage Status](https://coveralls.io/repos/Alir3z4/html2text/badge.png)](https://coveralls.io/r/Alir3z4/html2text) -[![Downloads](https://pypip.in/d/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) -[![Version](https://pypip.in/v/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) -[![Egg?](https://pypip.in/egg/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) -[![Wheel?](https://pypip.in/wheel/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) -[![Format](https://pypip.in/format/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) -[![License](https://pypip.in/license/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) +[![Downloads](http://badge.kloud51.com/d/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) +[![Version](http://badge.kloud51.com/v/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) +[![Egg?](http://badge.kloud51.com/egg/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) +[![Wheel?](http://badge.kloud51.com/wheel/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) +[![Format](http://badge.kloud51.com/format/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) +[![License](http://badge.kloud51.com/license/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) html2text is a Python script that converts a page of HTML into clean, 
easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). From cc04bddfd4829ca164282341716da09a401ecd4f Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 14 Feb 2016 21:14:28 +0400 Subject: [PATCH 284/479] Fix with the latest changes on the endpoint --- README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index c5b90e71..2bc93445 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,11 @@ [![Build Status](https://secure.travis-ci.org/Alir3z4/html2text.png)](http://travis-ci.org/Alir3z4/html2text) [![Coverage Status](https://coveralls.io/repos/Alir3z4/html2text/badge.png)](https://coveralls.io/r/Alir3z4/html2text) -[![Downloads](http://badge.kloud51.com/d/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) -[![Version](http://badge.kloud51.com/v/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) -[![Egg?](http://badge.kloud51.com/egg/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) -[![Wheel?](http://badge.kloud51.com/wheel/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) -[![Format](http://badge.kloud51.com/format/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) -[![License](http://badge.kloud51.com/license/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) +[![Downloads](http://badge.kloud51.com/pypi/d/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) +[![Version](http://badge.kloud51.com/pypi/v/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) +[![Wheel?](http://badge.kloud51.com/pypi/wheel/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) +[![Format](http://badge.kloud51.com/pypi/format/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) +[![License](http://badge.kloud51.com/pypi/license/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) html2text is a Python script that converts a page of HTML into clean, 
easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). From 8bdd5a4f6e512458aa47b0132aac232cfa3c4729 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 14 Feb 2016 21:15:50 +0400 Subject: [PATCH 285/479] Update the changelog with badges updates --- ChangeLog.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 06eccb0f..462101de 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -3,7 +3,8 @@ ---- * Fix #89: Python 3.5 support. -* Fix #113: inplace baseurl substitution for and tags +* Fix #113: inplace baseurl substitution for and tags. +* Feature #118: Update the badges to badge.kloud51.com 2016.1.8 From 1b46d03babfe921405cbba262420207c0f3d9863 Mon Sep 17 00:00:00 2001 From: arjoonn sharma Date: Thu, 18 Feb 2016 08:42:17 +0530 Subject: [PATCH 286/479] Create ISSUE_TEMPLATE --- ISSUE_TEMPLATE | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 ISSUE_TEMPLATE diff --git a/ISSUE_TEMPLATE b/ISSUE_TEMPLATE new file mode 100644 index 00000000..252e4274 --- /dev/null +++ b/ISSUE_TEMPLATE @@ -0,0 +1,3 @@ + +- Version by `html2text --version` +- Test script From 0711f0c3caf83790b8a029fae2a7bedbebac84be Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Thu, 25 Feb 2016 11:05:14 +0530 Subject: [PATCH 287/479] - Added failing tests --- test/text_after_list.html | 2 ++ test/text_after_list.md | 4 ++++ 2 files changed, 6 insertions(+) create mode 100644 test/text_after_list.html create mode 100644 test/text_after_list.md diff --git a/test/text_after_list.html b/test/text_after_list.html new file mode 100644 index 00000000..8691b4ff --- /dev/null +++ b/test/text_after_list.html @@ -0,0 +1,2 @@ +
      • item
      +text diff --git a/test/text_after_list.md b/test/text_after_list.md new file mode 100644 index 00000000..66d9d984 --- /dev/null +++ b/test/text_after_list.md @@ -0,0 +1,4 @@ + * item + +text + From 61f627919fa29856c507ffbcbb8433248380dfa7 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Thu, 25 Feb 2016 12:45:51 +0530 Subject: [PATCH 288/479] - Added fix for #119 - Updated tests where failure was expected --- html2text/__init__.py | 2 ++ test/list_tags_example.md | 2 ++ test/url-escaping.html | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 648b8eac..75e0a189 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -514,6 +514,8 @@ def handle_tag(self, tag, attrs, start): else: if self.list: self.list.pop() + if (not self.google_doc) and (not self.list): + self.o('\n') self.lastWasList = True else: self.lastWasList = False diff --git a/test/list_tags_example.md b/test/list_tags_example.md index b9723699..151249d6 100644 --- a/test/list_tags_example.md +++ b/test/list_tags_example.md @@ -28,9 +28,11 @@ Drums * some item * Some other item * some item + 1. Some other item 2. some item 3. some item + * somthing else here * some item diff --git a/test/url-escaping.html b/test/url-escaping.html index c4f68b9c..5b03e697 100644 --- a/test/url-escaping.html +++ b/test/url-escaping.html @@ -6,8 +6,8 @@

      Markdown-sensible characters processing

    2. Some MSDN link using parenthesis
    3. Google search result URL with unescaped brackets
    4. Yet another test for [brackets], {curly braces} and (parenthesis) processing inside the anchor
    5. -
    6. Use automatic links like http://example.com/ when the URL is the label -
    7. Exempt non-absolute_URIs from automatic link detection +
    8. Use automatic links like http://example.com/ when the URL is the label
    9. +
    10. Exempt non-absolute_URIs from automatic link detection
    11. And here are images with tricky attribute values:

      From ca4fcbe7628bfb2cc2281bfaa5c2e3521ca4381d Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Thu, 25 Feb 2016 12:48:15 +0530 Subject: [PATCH 289/479] Update changelog --- ChangeLog.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 462101de..db4e491f 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -5,6 +5,7 @@ * Fix #89: Python 3.5 support. * Fix #113: inplace baseurl substitution for and tags. * Feature #118: Update the badges to badge.kloud51.com +* Fix #119: new-line after a list is inserted 2016.1.8 From 9659037f230c25ad51c80feb7412406ef11672e5 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Thu, 25 Feb 2016 13:08:08 +0530 Subject: [PATCH 290/479] - Added pypandoc to convert MD to rst while pushing to pypi. - added a requirements file for development --- requirements-dev.txt | 4 ++++ setup.py | 7 +++++++ 2 files changed, 11 insertions(+) create mode 100644 requirements-dev.txt diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 00000000..a5b00a3f --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,4 @@ +coverage==4.0.3 +py==1.4.31 +pypandoc==1.1.3 +wheel==0.24.0 diff --git a/setup.py b/setup.py index 67661385..d38caa51 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,12 @@ # coding: utf-8 import sys from setuptools import setup, Command, find_packages +try: + from pypandoc import convert + read_md = lambda f: convert(f, 'rst') +except ImportError: + print("warning: pypandoc module not found, could not convert Markdown to RST") + read_md = lambda f: open(f, 'r').read() requires_list = [] try: @@ -36,6 +42,7 @@ def run(self): name="html2text", version=".".join(map(str, __import__('html2text').__version__)), description="Turn HTML into equivalent Markdown-structured text.", + long_description=read_md('README.md'), author="Aaron Swartz", author_email="me@aaronsw.com", maintainer='Alireza Savand', From d3f91983b4e9e3bd59b4478f6110b1190533e859 Mon Sep 17 00:00:00 2001 From: Arjoonn 
Sharma Date: Fri, 26 Feb 2016 14:59:28 +0530 Subject: [PATCH 291/479] - Removed print from setup.py --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index d38caa51..f79e45d8 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,6 @@ from pypandoc import convert read_md = lambda f: convert(f, 'rst') except ImportError: - print("warning: pypandoc module not found, could not convert Markdown to RST") read_md = lambda f: open(f, 'r').read() requires_list = [] From 1a72117222ae5df40c4249d0551b20c04ff6a12d Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sat, 27 Feb 2016 00:01:20 +0400 Subject: [PATCH 292/479] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 2bc93445..20967f47 100644 --- a/README.md +++ b/README.md @@ -2,11 +2,11 @@ [![Build Status](https://secure.travis-ci.org/Alir3z4/html2text.png)](http://travis-ci.org/Alir3z4/html2text) [![Coverage Status](https://coveralls.io/repos/Alir3z4/html2text/badge.png)](https://coveralls.io/r/Alir3z4/html2text) -[![Downloads](http://badge.kloud51.com/pypi/d/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) -[![Version](http://badge.kloud51.com/pypi/v/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) -[![Wheel?](http://badge.kloud51.com/pypi/wheel/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) -[![Format](http://badge.kloud51.com/pypi/format/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) -[![License](http://badge.kloud51.com/pypi/license/html2text/badge.png)](https://pypi.python.org/pypi/html2text/) +[![Downloads](http://badge.kloud51.com/pypi/d/html2text.png)](https://pypi.python.org/pypi/html2text/) +[![Version](http://badge.kloud51.com/pypi/v/html2text.png)](https://pypi.python.org/pypi/html2text/) +[![Wheel?](http://badge.kloud51.com/pypi/wheel/html2text.png)](https://pypi.python.org/pypi/html2text/) 
+[![Format](http://badge.kloud51.com/pypi/format/html2text.png)](https://pypi.python.org/pypi/html2text/) +[![License](http://badge.kloud51.com/pypi/license/html2text.png)](https://pypi.python.org/pypi/html2text/) html2text is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). From b7a78b87d926da8b1e1fdd74c1b6a1e9b5c7d428 Mon Sep 17 00:00:00 2001 From: arjoonn sharma Date: Mon, 7 Mar 2016 00:44:25 +0530 Subject: [PATCH 293/479] Update ISSUE_TEMPLATE Add python version to the template as mentioned in --- ISSUE_TEMPLATE | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ISSUE_TEMPLATE b/ISSUE_TEMPLATE index 252e4274..f2d46a02 100644 --- a/ISSUE_TEMPLATE +++ b/ISSUE_TEMPLATE @@ -1,3 +1,4 @@ - + - Version by `html2text --version` - Test script +- Python version `python --version` From 8274e9dc9400da7f02d29024ba3012fd94c5ba79 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Mon, 7 Mar 2016 10:26:27 +0530 Subject: [PATCH 294/479] - Fix for #106 - tests to follow --- html2text/cli.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/html2text/cli.py b/html2text/cli.py index d765f59c..d9339289 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -195,12 +195,13 @@ class bcolors: # pragma: no cover # process input encoding = "utf-8" + if len(args) == 2: + encoding = args[1] + elif len(args) > 2: + p.error('Too many arguments') + if len(args) > 0 and args[0] != '-': # pragma: no cover file_ = args[0] - if len(args) == 2: - encoding = args[1] - if len(args) > 2: - p.error('Too many arguments') if file_.startswith('http://') or file_.startswith('https://'): baseurl = file_ From 82aec92b018a9ebe493960b6e77e3f600fe1dc80 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Fri, 1 Apr 2016 22:48:18 +0530 Subject: [PATCH 295/479] Add to changelog --- ChangeLog.rst | 1 + 1 file changed, 1 insertion(+) diff --git 
a/ChangeLog.rst b/ChangeLog.rst index db4e491f..5bdb3e61 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -2,6 +2,7 @@ ========= ---- +* Fix #106: encoding by stdin * Fix #89: Python 3.5 support. * Fix #113: inplace baseurl substitution for and tags. * Feature #118: Update the badges to badge.kloud51.com From cff6d96389521c3d117895e694e881e5bc9b1672 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 1 Apr 2016 21:59:50 +0000 Subject: [PATCH 296/479] Bump version 2016.4.2 --- ChangeLog.rst | 2 +- html2text/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 5bdb3e61..2f8acb73 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,4 +1,4 @@ -0000.0.0 +2016.4.2 ========= ---- diff --git a/html2text/__init__.py b/html2text/__init__.py index 75e0a189..e7b88be3 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -30,7 +30,7 @@ skipwrap ) -__version__ = (2016, 1, 8) +__version__ = (2016, 4, 2) # TODO: From ebab5bc0204467059abe4babdbec7c464169c076 Mon Sep 17 00:00:00 2001 From: Amit Upadhyay Date: Fri, 27 May 2016 18:22:22 +0530 Subject: [PATCH 297/479] do not break words to wrap paragraph (#128) * do not break words to wrap paragraph python's textwrap.wrap() supports break_long_words keyword argument for this purpose. 
closes # 127 * not overshooting line length --- html2text/__init__.py | 4 +++- test/long_lines.html | 1 + test/long_lines.md | 14 ++++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 test/long_lines.html create mode 100644 test/long_lines.md diff --git a/html2text/__init__.py b/html2text/__init__.py index e7b88be3..a45f5011 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -814,7 +814,9 @@ def optwrap(self, text): for para in text.split("\n"): if len(para) > 0: if not skipwrap(para, self.wrap_links): - result += "\n".join(wrap(para, self.body_width)) + result += "\n".join( + wrap(para, self.body_width, break_long_words=False) + ) if para.endswith(' '): result += " \n" newlines = 1 diff --git a/test/long_lines.html b/test/long_lines.html new file mode 100644 index 00000000..7389de91 --- /dev/null +++ b/test/long_lines.html @@ -0,0 +1 @@ +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd diff --git a/test/long_lines.md b/test/long_lines.md new file mode 100644 index 00000000..65363ae7 --- /dev/null +++ b/test/long_lines.md @@ -0,0 +1,14 @@ +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd 
asd +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd +![](http://www.foooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo.com) +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd asd +asd asd asd asd asd + From 5672e2dd2c5bf918275acaf1142b36beba1b9ac5 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Fri, 27 May 2016 18:26:10 +0530 Subject: [PATCH 298/479] Deprecation warning for retrieval over internet --- html2text/cli.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/html2text/cli.py b/html2text/cli.py index d9339289..770f388d 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -1,4 +1,5 @@ import optparse +import warnings from html2text.compat import urllib from html2text import HTML2Text, config, __version__ @@ -204,6 +205,8 @@ class bcolors: # pragma: no cover file_ = args[0] if file_.startswith('http://') or file_.startswith('https://'): + warnings.warn("Support for retrieving html over network is set for deprecation by version (2017, 1, x)", + DeprecationWarning) baseurl = file_ j = urllib.urlopen(baseurl) data = j.read() From 7c8c5aecaba55d8667420486d41f2c807b7fddd6 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Fri, 27 May 2016 20:08:27 +0530 Subject: [PATCH 299/479] Added failing tests for #114 --- test/break_preserved_in_blockquote.html | 1 + test/break_preserved_in_blockquote.md | 3 +++ 2 files changed, 4 insertions(+) create mode 100644 
test/break_preserved_in_blockquote.html create mode 100644 test/break_preserved_in_blockquote.md diff --git a/test/break_preserved_in_blockquote.html b/test/break_preserved_in_blockquote.html new file mode 100644 index 00000000..2436ed0e --- /dev/null +++ b/test/break_preserved_in_blockquote.html @@ -0,0 +1 @@ +a
      b
      c
      diff --git a/test/break_preserved_in_blockquote.md b/test/break_preserved_in_blockquote.md new file mode 100644 index 00000000..f8a27d83 --- /dev/null +++ b/test/break_preserved_in_blockquote.md @@ -0,0 +1,3 @@ +a +> b +> c From 210869d2628bf1b4371f695e62f333f5c0dda3ee Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Fri, 27 May 2016 20:45:10 +0530 Subject: [PATCH 300/479] Fixed tests --- test/break_preserved_in_blockquote.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/break_preserved_in_blockquote.md b/test/break_preserved_in_blockquote.md index f8a27d83..f3b94c2a 100644 --- a/test/break_preserved_in_blockquote.md +++ b/test/break_preserved_in_blockquote.md @@ -1,3 +1,5 @@ a -> b + +> b > c + From 8ccd470751d8b1d221021163230bd20cdcb043ba Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Fri, 27 May 2016 20:46:02 +0530 Subject: [PATCH 301/479] - Pull out common try:except for unicode characters to avoid repetition - Breaks preserve blockquotes --- html2text/__init__.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index a45f5011..4ace76b0 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -142,23 +142,20 @@ def close(self): try: nochr = unicode('') + unicode_character = unichr except NameError: nochr = str('') + unicode_character = chr self.pbr() self.o('', 0, 'end') outtext = nochr.join(self.outtextlist) + if self.unicode_snob: - try: - nbsp = unichr(name2cp('nbsp')) - except NameError: - nbsp = chr(name2cp('nbsp')) + nbsp = unicode_character(name2cp('nbsp')) else: - try: - nbsp = unichr(32) - except NameError: - nbsp = chr(32) + nbsp = unicode_character(32) try: outtext = outtext.replace(unicode(' _place_holder;'), nbsp) except NameError: @@ -331,7 +328,10 @@ def handle_tag(self, tag, attrs, start): self.p() if tag == "br" and start: - self.o(" \n") + if self.blockquote > 0: + self.o(" \n> ") + else: + self.o(" \n") if tag 
== "hr" and start: self.p() From 39502775d29bf518ee079c9aa44cd390c0ccf855 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Fri, 27 May 2016 20:54:23 +0530 Subject: [PATCH 302/479] Changelog update --- ChangeLog.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 2f8acb73..fb9d99a0 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,10 @@ +xxxx.x.x +========= +---- + +* Fix #114: Break does not interrupt blockquotes +* Deprecation warnings for URL retreival + 2016.4.2 ========= ---- From 7569dfbdc67c37d78ff936fc31f39b2b64ecccee Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sat, 28 May 2016 13:45:49 +0530 Subject: [PATCH 303/479] Pad tables CLI option --- html2text/cli.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/html2text/cli.py b/html2text/cli.py index 770f388d..7ece36db 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -23,6 +23,13 @@ class bcolors: # pragma: no cover '%prog [(filename|url) [encoding]]', version='%prog ' + ".".join(map(str, __version__)) ) + p.add_option( + "--pad_tables", + dest="pad_tables", + action="store_true", + default=config.PAD_TABLES, + help="pad the cells to equal column width in tables" + ) p.add_option( "--no-wrap-links", dest="wrap_links", @@ -271,5 +278,6 @@ class bcolors: # pragma: no cover h.links_each_paragraph = options.links_each_paragraph h.mark_code = options.mark_code h.wrap_links = options.wrap_links + h.pad_tables = options.pad_tables wrapwrite(h.handle(data)) From e714f1f19dd6fe59c2deaffb0519b39e688c3e59 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sat, 28 May 2016 13:47:45 +0530 Subject: [PATCH 304/479] Pad tables option in parser --- html2text/__init__.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 4ace76b0..1ac6dbac 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -27,7 +27,8 @@ list_numbering_start, dumb_css_parser, 
escape_md_section, - skipwrap + skipwrap, + pad_tables_in_text ) __version__ = (2016, 4, 2) @@ -77,6 +78,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.hide_strikethrough = False # covered in cli self.mark_code = config.MARK_CODE self.wrap_links = config.WRAP_LINKS # covered in cli + self.pad_tables = config.PAD_TABLES # covered in cli self.tag_callback = None if out is None: # pragma: no cover @@ -130,7 +132,11 @@ def feed(self, data): def handle(self, data): self.feed(data) self.feed("") - return self.optwrap(self.close()) + markdown = self.optwrap(self.close()) + if self.pad_tables: + return pad_tables_in_text(markdown) + else: + return markdown def outtextf(self, s): self.outtextlist.append(s) @@ -556,8 +562,12 @@ def handle_tag(self, tag, attrs, start): self.o(''.format(tag)) else: - if tag == "table" and start: - self.table_start = True + if tag == "table": + if start: + self.table_start = True + else: + if self.pad_tables: + self.o(" \n") if tag in ["td", "th"] and start: if self.split_next_td: self.o("| ") From b68c9bbf63d46c543f5128cc8cdfab9a1ded1cb8 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sat, 28 May 2016 13:48:04 +0530 Subject: [PATCH 305/479] Pad table utility functions --- html2text/utils.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/html2text/utils.py b/html2text/utils.py index 418c89cb..726e5e8d 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -244,3 +244,51 @@ def escape_md_section(text, snob=False): text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text) return text + +def reformat_table(lines, right_margin): + """ + Given the lines of a table + padds the cells and returns the new lines + """ + # find the maximum width of the columns + max_width = [len(x.rstrip()) + right_margin for x in lines[0].split('|')] + for line in lines: + cols = [x.rstrip() for x in line.split('|')] + max_width = [max(len(x) + right_margin, old_len) for x, old_len 
in zip(cols, max_width)] + + # reformat + new_lines = [] + for line in lines: + cols = [x.rstrip() for x in line.split('|')] + if set(line.strip()) == {'-', '|'}: + filler = '-' + new_cols = [x.rstrip() + (filler * (M - len(x.rstrip()))) + for x, M in zip(cols, max_width)] + else: + filler = ' ' + new_cols = [x.rstrip() + (filler * (M - len(x.rstrip()))) + for x, M in zip(cols, max_width)] + new_lines.append('|'.join(new_cols)) + return new_lines + +def pad_tables_in_text(text, right_margin=1): + """ + Provide padding for tables in the text + """ + lines = text.split('\n') + table_buffer, altered_lines, table_widths, within_table = [], [], [], False + new_lines = [] + for line in lines: + if '|' in line: + if not within_table: + within_table = True + table_buffer.append(line) + else: + if within_table: + table = reformat_table(table_buffer, right_margin) + new_lines.extend(table) + table_buffer = [] + within_table = False + new_lines.append(line) + new_text = '\n'.join(new_lines) + return new_text From b4b7634732dc1346cffbc007a2692b0c0fac4e91 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sat, 28 May 2016 13:48:27 +0530 Subject: [PATCH 306/479] PAD_TABLES default is off --- html2text/config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/html2text/config.py b/html2text/config.py index 85bf47dc..979dd3b0 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -36,6 +36,7 @@ IGNORE_EMPHASIS = False MARK_CODE = False DECODE_ERRORS = 'strict' +PAD_TABLES = False # Convert links with same href and text to format if they are absolute links USE_AUTOMATIC_LINKS = True @@ -43,6 +44,7 @@ # For checking space-only lines on line 771 RE_SPACE = re.compile(r'\s\+') +RE_TABLE_HEADER_MARKER = re.compile(r'(.+\|?)\n(---\|?)+') RE_UNESCAPE = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));") RE_ORDERED_LIST_MATCHER = re.compile(r'\d+\.\s') RE_UNORDERED_LIST_MATCHER = re.compile(r'[-\*\+]\s') From 442c65ddf12926888998b4f1f3b3904587a3ded2 Mon Sep 17 00:00:00 2001 
From: Arjoonn Sharma Date: Sat, 28 May 2016 13:56:34 +0530 Subject: [PATCH 307/479] added tests --- test/pad_table.html | 24 ++++++++++++++++++++++++ test/pad_table.md | 26 ++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 test/pad_table.html create mode 100644 test/pad_table.md diff --git a/test/pad_table.html b/test/pad_table.html new file mode 100644 index 00000000..7be8f833 --- /dev/null +++ b/test/pad_table.html @@ -0,0 +1,24 @@ + + +

      This is a test document

      With some text, code, bolds and italics.

      This is second header

      Displaynone text

      + + + + +
      Header 1 Header 2 Header 3
      Content 1 Content 2 200 Image!
      Content 1 Content 2 200 Image!
      + + + + + +
      Header 1 Header 2 Header 3
      Content 1 Content 2 200 Image!
      Content 1 Content 2 200 Image!
      + +some content within the table
      + + + + +
      Header 1 Header 2 Header 3
      Content 1 Content 2 200 Image!
      Content 1 Content 2 200 Image!
      + +something else entirely + diff --git a/test/pad_table.md b/test/pad_table.md new file mode 100644 index 00000000..d19a1b11 --- /dev/null +++ b/test/pad_table.md @@ -0,0 +1,26 @@ +# This is a test document + +With some text, `code`, **bolds** and _italics_. + +## This is second header + +Displaynone text + +Header 1 | Header 2 | Header 3 +----------|-----------|---------------------------------------------- +Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! +Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! + +Header 1 | Header 2 | Header 3 +----------|-----------|---------------------------------------------- +Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! +Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! + +some content within the table +Header 1 | Header 2 | Header 3 +----------|-----------|---------------------------------------------- +Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! +Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! + +something else entirely + From 01cf8aec5d4f3c42e9f3cb1dd0e9bf99b3c1974e Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sat, 28 May 2016 14:01:48 +0530 Subject: [PATCH 308/479] Changelog update --- ChangeLog.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index fb9d99a0..41965c48 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -2,6 +2,7 @@ xxxx.x.x ========= ---- +* Fix #125: --pad_tables now pads table cells to make them look nice. 
* Fix #114: Break does not interrupt blockquotes * Deprecation warnings for URL retreival From a9141864c0179ba288e8125c952238682f1adebc Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sat, 28 May 2016 14:06:12 +0530 Subject: [PATCH 309/479] Fixed set definition for backward compatibility --- html2text/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/utils.py b/html2text/utils.py index 726e5e8d..5fcb7278 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -260,7 +260,7 @@ def reformat_table(lines, right_margin): new_lines = [] for line in lines: cols = [x.rstrip() for x in line.split('|')] - if set(line.strip()) == {'-', '|'}: + if set(line.strip()) == set('-|): filler = '-' new_cols = [x.rstrip() + (filler * (M - len(x.rstrip()))) for x, M in zip(cols, max_width)] From bbe01a77d9031ab7b6384fce422af0a8f94888f9 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sat, 28 May 2016 14:06:12 +0530 Subject: [PATCH 310/479] Fixed set definition for backward compatibility --- html2text/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/utils.py b/html2text/utils.py index 726e5e8d..e8b0113e 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -260,7 +260,7 @@ def reformat_table(lines, right_margin): new_lines = [] for line in lines: cols = [x.rstrip() for x in line.split('|')] - if set(line.strip()) == {'-', '|'}: + if set(line.strip()) == set('-|'): filler = '-' new_cols = [x.rstrip() + (filler * (M - len(x.rstrip()))) for x, M in zip(cols, max_width)] From 75030eaa8c89336ca321ef73c7870a277d6e514c Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sat, 28 May 2016 14:18:43 +0530 Subject: [PATCH 311/479] Add pad_table flag to test runner --- test/test_html2text.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_html2text.py b/test/test_html2text.py index 2d678ee0..ade0a693 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -170,6 +170,10 @@ def 
test_func(self): module_args['mark_code'] = True cmdline_args.append('--mark-code') + if base_fn.startswith('pad_table'): + module_args['pad_tables'] = True + cmdline_args.append('--pad_tables') + if base_fn not in ['bodywidth_newline.html', 'abbr_tag.html']: test_func = None From ff58b478ea05f457725cc042d17bf870b3fd4375 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sat, 28 May 2016 14:24:46 +0530 Subject: [PATCH 312/479] Remove table marker regex --- html2text/config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/html2text/config.py b/html2text/config.py index 979dd3b0..a29ed93c 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -44,7 +44,6 @@ # For checking space-only lines on line 771 RE_SPACE = re.compile(r'\s\+') -RE_TABLE_HEADER_MARKER = re.compile(r'(.+\|?)\n(---\|?)+') RE_UNESCAPE = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));") RE_ORDERED_LIST_MATCHER = re.compile(r'\d+\.\s') RE_UNORDERED_LIST_MATCHER = re.compile(r'[-\*\+]\s') From 0a7bdb5664244fd5064fced9330b9cc7daf44349 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sat, 28 May 2016 17:18:15 +0530 Subject: [PATCH 313/479] Test shows off the padding --- test/pad_table.html | 12 +++++++----- test/pad_table.md | 28 +++++++++++++++------------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/test/pad_table.html b/test/pad_table.html index 7be8f833..99d6e2c8 100644 --- a/test/pad_table.html +++ b/test/pad_table.html @@ -3,21 +3,23 @@

      This is a test document

      With some text, code, bolds and italics.

      This is second header

      Displaynone text

      - - + + + +
      Header 1 Header 2 Header 3
      Content 1 Content 2 200 Image!
      Content 1 Content 2 200 Image!
      Content 1 2 200 Image!
      Content 1 longer Content 2 blah
      Content Content 2 blah
      t Content 2 blah blah blah
      - +
      Header 1 Header 2 Header 3
      Header 1 longer Header 2 Header 3
      Content 1 Content 2 200 Image!
      Content 1 Content 2 200 Image!
      -some content within the table
      +some content between the tables
      - +
      Header 1 Header 2 Header 3
      Content 1 Content 2 200 Image!
      Content 1 Content 2 200 Image!
      Content 1 Content 2 longer 200 Image!
      something else entirely diff --git a/test/pad_table.md b/test/pad_table.md index d19a1b11..4a970568 100644 --- a/test/pad_table.md +++ b/test/pad_table.md @@ -6,21 +6,23 @@ With some text, `code`, **bolds** and _italics_. Displaynone text -Header 1 | Header 2 | Header 3 -----------|-----------|---------------------------------------------- -Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! -Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! +Header 1 | Header 2 | Header 3 +-----------------|-----------|---------------------------------------------- +Content 1 | 2 | ![200](http://lorempixel.com/200/200) Image! +Content 1 longer | Content 2 | blah +Content | Content 2 | blah +t | Content 2 | blah blah blah -Header 1 | Header 2 | Header 3 -----------|-----------|---------------------------------------------- -Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! -Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! +Header 1 longer | Header 2 | Header 3 +----------------|-----------|---------------------------------------------- +Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! +Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! -some content within the table -Header 1 | Header 2 | Header 3 -----------|-----------|---------------------------------------------- -Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! -Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! +some content between the tables +Header 1 | Header 2 | Header 3 +----------|------------------|---------------------------------------------- +Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! +Content 1 | Content 2 longer | ![200](http://lorempixel.com/200/200) Image! 
something else entirely From 5c57c26f36fef87cf282f5f75e10f325a99d4d64 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sat, 28 May 2016 18:43:25 +0530 Subject: [PATCH 314/479] More varied tests --- test/pad_table.html | 6 +++--- test/pad_table.md | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/pad_table.html b/test/pad_table.html index 99d6e2c8..d2966aae 100644 --- a/test/pad_table.html +++ b/test/pad_table.html @@ -10,9 +10,9 @@ - - - +
      Header 1 longer Header 2 Header 3
      Content 1 Content 2 200 Image!
      Content 1 Content 2 200 Image!
      + +
      H1 H2 H3
      C1 Content 2 x
      C123 Content 2 xyz
      some content between the tables
      diff --git a/test/pad_table.md b/test/pad_table.md index 4a970568..70ad82f4 100644 --- a/test/pad_table.md +++ b/test/pad_table.md @@ -12,17 +12,17 @@ Content 1 | 2 | ![200](http://lorempixel.com/200/200) Image! Content 1 longer | Content 2 | blah Content | Content 2 | blah t | Content 2 | blah blah blah - -Header 1 longer | Header 2 | Header 3 -----------------|-----------|---------------------------------------------- -Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! -Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! - + +H1 | H2 | H3 +-----|-----------|----- +C1 | Content 2 | x +C123 | Content 2 | xyz + some content between the tables Header 1 | Header 2 | Header 3 ----------|------------------|---------------------------------------------- Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! Content 1 | Content 2 longer | ![200](http://lorempixel.com/200/200) Image! - + something else entirely From 1b78d10969ae76a22fa3b39b26c26d92c72e0c9c Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sat, 28 May 2016 18:44:36 +0530 Subject: [PATCH 315/479] Tag based table detection --- html2text/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/html2text/__init__.py b/html2text/__init__.py index 1ac6dbac..08a5a5ee 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -565,8 +565,12 @@ def handle_tag(self, tag, attrs, start): if tag == "table": if start: self.table_start = True + if self.pad_tables: + self.o("<"+config.TABLE_MARKER_FOR_PAD+">") + self.o(" \n") else: if self.pad_tables: + self.o("") self.o(" \n") if tag in ["td", "th"] and start: if self.split_next_td: From f84b5913b8952626ba635e5f89a24f8da7f0698d Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sat, 28 May 2016 18:45:50 +0530 Subject: [PATCH 316/479] reformatting adds space after table. 
--- html2text/config.py | 2 ++ html2text/utils.py | 22 +++++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/html2text/config.py b/html2text/config.py index a29ed93c..7cd5ccfb 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -3,6 +3,8 @@ # Use Unicode characters instead of their ascii psuedo-replacements UNICODE_SNOB = 0 +# Marker to use for marking tables for padding post processing +TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding" # Escape all special characters. Output is less readable, but avoids # corner case formatting issues. ESCAPE_SNOB = 0 diff --git a/html2text/utils.py b/html2text/utils.py index e8b0113e..6ccb6008 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -254,7 +254,8 @@ def reformat_table(lines, right_margin): max_width = [len(x.rstrip()) + right_margin for x in lines[0].split('|')] for line in lines: cols = [x.rstrip() for x in line.split('|')] - max_width = [max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)] + max_width = [max(len(x) + right_margin, old_len) + for x, old_len in zip(cols, max_width)] # reformat new_lines = [] @@ -276,19 +277,22 @@ def pad_tables_in_text(text, right_margin=1): Provide padding for tables in the text """ lines = text.split('\n') - table_buffer, altered_lines, table_widths, within_table = [], [], [], False + table_buffer, altered_lines, table_widths, table_started = [], [], [], False new_lines = [] for line in lines: - if '|' in line: - if not within_table: - within_table = True - table_buffer.append(line) - else: - if within_table: + # Toogle table started + if (config.TABLE_MARKER_FOR_PAD in line): + table_started = not table_started + if not table_started: table = reformat_table(table_buffer, right_margin) new_lines.extend(table) table_buffer = [] - within_table = False + new_lines.append('') + continue + # Process lines + if table_started: + table_buffer.append(line) + else: new_lines.append(line) new_text = 
'\n'.join(new_lines) return new_text From e1872ba264ca053e117565899f55b214f0f7ddb1 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sun, 29 May 2016 15:58:45 +0530 Subject: [PATCH 317/479] - change pad_tables to pad-tables in the cli option - Add to usage --- docs/usage.md | 1 + html2text/cli.py | 2 +- test/test_html2text.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 8e34d409..22040bc7 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -131,3 +131,4 @@ Command line options | `--mark-code` | Mark code with [code]...[/code] blocks | `--no-wrap-links` | Do not wrap links during text wrapping. Implies `--reference-links` | `--decode-errors`=`HANDLER` | What to do in case an error is encountered. `ignore`, `strict`, `replace` etc. +| `--pad-tables` | Use padding to make tables look good. diff --git a/html2text/cli.py b/html2text/cli.py index 7ece36db..49bef762 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -24,7 +24,7 @@ class bcolors: # pragma: no cover version='%prog ' + ".".join(map(str, __version__)) ) p.add_option( - "--pad_tables", + "--pad-tables", dest="pad_tables", action="store_true", default=config.PAD_TABLES, diff --git a/test/test_html2text.py b/test/test_html2text.py index ade0a693..724d4d62 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -172,7 +172,7 @@ def test_func(self): if base_fn.startswith('pad_table'): module_args['pad_tables'] = True - cmdline_args.append('--pad_tables') + cmdline_args.append('--pad-tables') if base_fn not in ['bodywidth_newline.html', 'abbr_tag.html']: test_func = None From df1d723e30e3c4acad2e89778a3487b1e7e173da Mon Sep 17 00:00:00 2001 From: arjoonn sharma Date: Sun, 29 May 2016 17:16:50 +0530 Subject: [PATCH 318/479] Table padding option (#130) Add Table padding option --- ChangeLog.rst | 1 + docs/usage.md | 1 + html2text/__init__.py | 22 ++++++++++++++---- html2text/cli.py | 8 +++++++ html2text/config.py | 3 +++ html2text/utils.py | 
52 ++++++++++++++++++++++++++++++++++++++++++ test/pad_table.html | 26 +++++++++++++++++++++ test/pad_table.md | 28 +++++++++++++++++++++++ test/test_html2text.py | 4 ++++ 9 files changed, 141 insertions(+), 4 deletions(-) create mode 100644 test/pad_table.html create mode 100644 test/pad_table.md diff --git a/ChangeLog.rst b/ChangeLog.rst index fb9d99a0..41965c48 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -2,6 +2,7 @@ xxxx.x.x ========= ---- +* Fix #125: --pad_tables now pads table cells to make them look nice. * Fix #114: Break does not interrupt blockquotes * Deprecation warnings for URL retreival diff --git a/docs/usage.md b/docs/usage.md index 8e34d409..22040bc7 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -131,3 +131,4 @@ Command line options | `--mark-code` | Mark code with [code]...[/code] blocks | `--no-wrap-links` | Do not wrap links during text wrapping. Implies `--reference-links` | `--decode-errors`=`HANDLER` | What to do in case an error is encountered. `ignore`, `strict`, `replace` etc. +| `--pad-tables` | Use padding to make tables look good. 
diff --git a/html2text/__init__.py b/html2text/__init__.py index 4ace76b0..08a5a5ee 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -27,7 +27,8 @@ list_numbering_start, dumb_css_parser, escape_md_section, - skipwrap + skipwrap, + pad_tables_in_text ) __version__ = (2016, 4, 2) @@ -77,6 +78,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.hide_strikethrough = False # covered in cli self.mark_code = config.MARK_CODE self.wrap_links = config.WRAP_LINKS # covered in cli + self.pad_tables = config.PAD_TABLES # covered in cli self.tag_callback = None if out is None: # pragma: no cover @@ -130,7 +132,11 @@ def feed(self, data): def handle(self, data): self.feed(data) self.feed("") - return self.optwrap(self.close()) + markdown = self.optwrap(self.close()) + if self.pad_tables: + return pad_tables_in_text(markdown) + else: + return markdown def outtextf(self, s): self.outtextlist.append(s) @@ -556,8 +562,16 @@ def handle_tag(self, tag, attrs, start): self.o(''.format(tag)) else: - if tag == "table" and start: - self.table_start = True + if tag == "table": + if start: + self.table_start = True + if self.pad_tables: + self.o("<"+config.TABLE_MARKER_FOR_PAD+">") + self.o(" \n") + else: + if self.pad_tables: + self.o("") + self.o(" \n") if tag in ["td", "th"] and start: if self.split_next_td: self.o("| ") diff --git a/html2text/cli.py b/html2text/cli.py index 770f388d..49bef762 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -23,6 +23,13 @@ class bcolors: # pragma: no cover '%prog [(filename|url) [encoding]]', version='%prog ' + ".".join(map(str, __version__)) ) + p.add_option( + "--pad-tables", + dest="pad_tables", + action="store_true", + default=config.PAD_TABLES, + help="pad the cells to equal column width in tables" + ) p.add_option( "--no-wrap-links", dest="wrap_links", @@ -271,5 +278,6 @@ class bcolors: # pragma: no cover h.links_each_paragraph = options.links_each_paragraph h.mark_code = options.mark_code 
h.wrap_links = options.wrap_links + h.pad_tables = options.pad_tables wrapwrite(h.handle(data)) diff --git a/html2text/config.py b/html2text/config.py index 85bf47dc..7cd5ccfb 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -3,6 +3,8 @@ # Use Unicode characters instead of their ascii psuedo-replacements UNICODE_SNOB = 0 +# Marker to use for marking tables for padding post processing +TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding" # Escape all special characters. Output is less readable, but avoids # corner case formatting issues. ESCAPE_SNOB = 0 @@ -36,6 +38,7 @@ IGNORE_EMPHASIS = False MARK_CODE = False DECODE_ERRORS = 'strict' +PAD_TABLES = False # Convert links with same href and text to format if they are absolute links USE_AUTOMATIC_LINKS = True diff --git a/html2text/utils.py b/html2text/utils.py index 418c89cb..6ccb6008 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -244,3 +244,55 @@ def escape_md_section(text, snob=False): text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text) return text + +def reformat_table(lines, right_margin): + """ + Given the lines of a table + padds the cells and returns the new lines + """ + # find the maximum width of the columns + max_width = [len(x.rstrip()) + right_margin for x in lines[0].split('|')] + for line in lines: + cols = [x.rstrip() for x in line.split('|')] + max_width = [max(len(x) + right_margin, old_len) + for x, old_len in zip(cols, max_width)] + + # reformat + new_lines = [] + for line in lines: + cols = [x.rstrip() for x in line.split('|')] + if set(line.strip()) == set('-|'): + filler = '-' + new_cols = [x.rstrip() + (filler * (M - len(x.rstrip()))) + for x, M in zip(cols, max_width)] + else: + filler = ' ' + new_cols = [x.rstrip() + (filler * (M - len(x.rstrip()))) + for x, M in zip(cols, max_width)] + new_lines.append('|'.join(new_cols)) + return new_lines + +def pad_tables_in_text(text, right_margin=1): + """ + Provide padding for tables in the text + """ + lines = 
text.split('\n') + table_buffer, altered_lines, table_widths, table_started = [], [], [], False + new_lines = [] + for line in lines: + # Toogle table started + if (config.TABLE_MARKER_FOR_PAD in line): + table_started = not table_started + if not table_started: + table = reformat_table(table_buffer, right_margin) + new_lines.extend(table) + table_buffer = [] + new_lines.append('') + continue + # Process lines + if table_started: + table_buffer.append(line) + else: + new_lines.append(line) + new_text = '\n'.join(new_lines) + return new_text diff --git a/test/pad_table.html b/test/pad_table.html new file mode 100644 index 00000000..d2966aae --- /dev/null +++ b/test/pad_table.html @@ -0,0 +1,26 @@ + + +

      This is a test document

      With some text, code, bolds and italics.

      This is second header

      Displaynone text

      + + + + + + +
      Header 1 Header 2 Header 3
      Content 1 2 200 Image!
      Content 1 longer Content 2 blah
      Content Content 2 blah
      t Content 2 blah blah blah
      + + + + + +
      H1 H2 H3
      C1 Content 2 x
      C123 Content 2 xyz
      + +some content between the tables
      + + + + +
      Header 1 Header 2 Header 3
      Content 1 Content 2 200 Image!
      Content 1 Content 2 longer 200 Image!
      + +something else entirely + diff --git a/test/pad_table.md b/test/pad_table.md new file mode 100644 index 00000000..70ad82f4 --- /dev/null +++ b/test/pad_table.md @@ -0,0 +1,28 @@ +# This is a test document + +With some text, `code`, **bolds** and _italics_. + +## This is second header + +Displaynone text + +Header 1 | Header 2 | Header 3 +-----------------|-----------|---------------------------------------------- +Content 1 | 2 | ![200](http://lorempixel.com/200/200) Image! +Content 1 longer | Content 2 | blah +Content | Content 2 | blah +t | Content 2 | blah blah blah + +H1 | H2 | H3 +-----|-----------|----- +C1 | Content 2 | x +C123 | Content 2 | xyz + +some content between the tables +Header 1 | Header 2 | Header 3 +----------|------------------|---------------------------------------------- +Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! +Content 1 | Content 2 longer | ![200](http://lorempixel.com/200/200) Image! + +something else entirely + diff --git a/test/test_html2text.py b/test/test_html2text.py index 2d678ee0..724d4d62 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -170,6 +170,10 @@ def test_func(self): module_args['mark_code'] = True cmdline_args.append('--mark-code') + if base_fn.startswith('pad_table'): + module_args['pad_tables'] = True + cmdline_args.append('--pad-tables') + if base_fn not in ['bodywidth_newline.html', 'abbr_tag.html']: test_func = None From ffce225e5ec3ad9c8a97d8afb00aef5d452993e1 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 29 May 2016 20:13:08 +0400 Subject: [PATCH 319/479] Fix a typo on ChangeLog --- ChangeLog.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 41965c48..cefed9dc 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -4,7 +4,8 @@ xxxx.x.x * Fix #125: --pad_tables now pads table cells to make them look nice. 
* Fix #114: Break does not interrupt blockquotes -* Deprecation warnings for URL retreival +* Deprecation warnings for URL retrieval. + 2016.4.2 ========= From 7478af87bc561c43143ee6217a729ecb482e7871 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 29 May 2016 20:14:03 +0400 Subject: [PATCH 320/479] Remove Python 3.1 from the classifiers --- setup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index f79e45d8..f8c2d249 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,8 @@ # coding: utf-8 import sys + from setuptools import setup, Command, find_packages + try: from pypandoc import convert read_md = lambda f: convert(f, 'rst') @@ -18,7 +20,8 @@ class RunTests(Command): - """New setup.py command to run all tests for the package. + """ + New setup.py command to run all tests for the package. """ description = "run all tests for the package" @@ -62,7 +65,6 @@ def run(self): 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.0', - 'Programming Language :: Python :: 3.1', 'Programming Language :: Python :: 3.2', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', From 751d1ce5f1c755003488a4248664982248c3d253 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 29 May 2016 20:14:22 +0400 Subject: [PATCH 321/479] Bump the version to today for release --- ChangeLog.rst | 2 +- html2text/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index cefed9dc..b9b1dfcf 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,4 +1,4 @@ -xxxx.x.x +2016.5.29 ========= ---- diff --git a/html2text/__init__.py b/html2text/__init__.py index 08a5a5ee..7311d9bb 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -31,7 +31,7 @@ pad_tables_in_text ) -__version__ = (2016, 4, 2) +__version__ = (2016, 5, 29) # TODO: From 0e8caba8184646d254bb6c8fa910b7c86dfccde4 Mon Sep 17 
00:00:00 2001 From: Mac Date: Wed, 22 Jun 2016 00:11:25 -0500 Subject: [PATCH 322/479] CSS case-insensitive fix (#134) * Change CSS dict to set all strings to lowercase CSS is case insensitive (see http://reference.sitepoint.com/css/casesensitivity), therefore the dictionary of CSS elements should also be case insensitive. Casting all keys and values in the CSS dictionary to lowercase is a quick solution. This requires checks against CSS properties to be lower case, for example checking for `courier` instead of `Courier`. * Add tests for case-insensitive CSS properties Tests are just the same font-property string as all upper and all lowercase. --- html2text/utils.py | 4 ++-- test/google-like_font-properties.html | 15 +++++++++++++++ test/google-like_font-properties.md | 6 ++++++ 3 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 test/google-like_font-properties.html create mode 100644 test/google-like_font-properties.md diff --git a/html2text/utils.py b/html2text/utils.py index 6ccb6008..bd6fc634 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -31,7 +31,7 @@ def dumb_property_dict(style): """ :returns: A hash of css attributes """ - out = dict([(x.strip(), y.strip()) for x, y in + out = dict([(x.strip().lower(), y.strip().lower()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z ] @@ -149,7 +149,7 @@ def google_fixed_width_font(style): font_family = '' if 'font-family' in style: font_family = style['font-family'] - if 'Courier New' == font_family or 'Consolas' == font_family: + if 'courier new' == font_family or 'consolas' == font_family: return True return False diff --git a/test/google-like_font-properties.html b/test/google-like_font-properties.html new file mode 100644 index 00000000..cff3ebd6 --- /dev/null +++ b/test/google-like_font-properties.html @@ -0,0 +1,15 @@ + + + CAPS-LOCK TEST + + +

      font-weight: bold

      +

      FONT-WEIGHT: BOLD

      +

      font-style: italic

      +

      FONT-STYLE: ITALIC

      +

      + font-weight: bold;font-style: italic

      +

      + FONT-WEIGHT: BOLD;FONT-STYLE: ITALIC

      + + diff --git a/test/google-like_font-properties.md b/test/google-like_font-properties.md new file mode 100644 index 00000000..c7a0312d --- /dev/null +++ b/test/google-like_font-properties.md @@ -0,0 +1,6 @@ +**font-weight: bold** +**FONT-WEIGHT: BOLD** +_font-style: italic_ +_FONT-STYLE: ITALIC_ +_**font-weight: bold;font-style: italic**_ +_**FONT-WEIGHT: BOLD;FONT-STYLE: ITALIC**_ From b4c600a5b534851379cbb9b59ad3d142ba3fe4ef Mon Sep 17 00:00:00 2001 From: Nicolas Brisac Date: Wed, 29 Jun 2016 11:45:55 +0200 Subject: [PATCH 323/479] add option to ignore tables (#135) * add option to ignore tables * fixed test syntax --- docs/how_it_works.md | 6 ++++-- docs/usage.md | 4 +++- html2text/__init__.py | 12 +++++++++++- html2text/cli.py | 8 ++++++++ html2text/config.py | 4 ++++ test/table_ignore.html | 26 ++++++++++++++++++++++++++ test/table_ignore.md | 22 ++++++++++++++++++++++ test/test_html2text.py | 6 +++++- 8 files changed, 83 insertions(+), 5 deletions(-) create mode 100644 test/table_ignore.html create mode 100644 test/table_ignore.md diff --git a/docs/how_it_works.md b/docs/how_it_works.md index 4e6891ed..725e200b 100644 --- a/docs/how_it_works.md +++ b/docs/how_it_works.md @@ -29,7 +29,8 @@ Used to provide various configuration settings to the converter. They are as fol - IMAGES_TO_ALT - IMAGES_WITH_SIZE - IGNORE_EMPHASIS - - BYPASS_TABLES + - BYPASS_TABLES format tables in HTML rather than Markdown + - IGNORE_TABLES ignore table-related tags (table, th, td, tr) while keeping rows - SINGLE_LINE_BREAK to use a single line break rather than two - UNIFIABLE is a dictionary which maps unicode abbreviations to ASCII values @@ -51,7 +52,7 @@ utils.py Used to provide utility functions to html2text Some functions are: - + - name2cp :name to code point - hn :headings - dumb_preperty_dict :hash of css attrs @@ -92,6 +93,7 @@ Command line interface for the code. |`-s`, `--hide-strikethrough` | Hide strike-through text. 
only relevant when `-g` is specified as well |`--escape-all` | Escape all special characters. Output is less readable, but avoids corner case formatting issues. | `--bypass-tables` | Format tables in HTML rather than Markdown syntax. +| `--ignore-tables` | Ignore table-related tags (table, th, td, tr) while keeping rows. | `--single-line-break` | Use a single line break after a block element rather than two. | `--reference-links` | Use reference links instead of inline links to create markdown diff --git a/docs/usage.md b/docs/usage.md index 22040bc7..d7317f6a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -75,7 +75,8 @@ simple indications of their function. - IMAGES_TO_ALT - IMAGES_WITH_SIZE - IGNORE_EMPHASIS - - BYPASS_TABLES + - BYPASS_TABLES format tables in HTML rather than Markdown + - IGNORE_TABLES ignore table-related tags (table, th, td, tr) while keeping rows - SINGLE_LINE_BREAK to use a single line break rather than two - UNIFIABLE is a dictionary which maps unicode abbreviations to ASCII values @@ -120,6 +121,7 @@ Command line options |`-s`, `--hide-strikethrough` | Hide strike-through text. only relevant when `-g` is specified as well |`--escape-all` | Escape all special characters. Output is less readable, but avoids corner case formatting issues. | `--bypass-tables` | Format tables in HTML rather than Markdown syntax. +| `--ignore-tables` | Ignore table-related tags (table, th, td, tr) while keeping rows. | `--single-line-break` | Use a single line break after a block element rather than two. | `--reference-links` | Use reference links instead of inline links to create markdown | `--ignore-emphasis` | Ignore all emphasis formatting in the html. 
diff --git a/html2text/__init__.py b/html2text/__init__.py index 7311d9bb..bcf22a41 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -69,6 +69,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli self.bypass_tables = config.BYPASS_TABLES # covered in cli + self.ignore_tables = config.IGNORE_TABLES # covered in cli self.google_doc = False # covered in cli self.ul_item_mark = '*' # covered in cli self.emphasis_mark = '_' # covered in cli @@ -547,7 +548,16 @@ def handle_tag(self, tag, attrs, start): self.start = 1 if tag in ["table", "tr", "td", "th"]: - if self.bypass_tables: + if self.ignore_tables: + if tag == 'tr': + if start: + pass + else: + self.soft_br() + else: + pass + + elif self.bypass_tables: if start: self.soft_br() if tag in ["td", "th"]: diff --git a/html2text/cli.py b/html2text/cli.py index 49bef762..8fb5baa6 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -146,6 +146,13 @@ class bcolors: # pragma: no cover default=config.BYPASS_TABLES, help="Format tables in HTML rather than Markdown syntax." ) + p.add_option( + "--ignore-tables", + action="store_true", + dest="ignore_tables", + default=config.IGNORE_TABLES, + help="Ignore table-related tags (table, th, td, tr) while keeping rows." 
+ ) p.add_option( "--single-line-break", action="store_true", @@ -270,6 +277,7 @@ class bcolors: # pragma: no cover h.hide_strikethrough = options.hide_strikethrough h.escape_snob = options.escape_snob h.bypass_tables = options.bypass_tables + h.ignore_tables = options.ignore_tables h.single_line_break = options.single_line_break h.inline_links = options.inline_links h.unicode_snob = options.unicode_snob diff --git a/html2text/config.py b/html2text/config.py index 7cd5ccfb..75ee655a 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -119,7 +119,11 @@ 'rlm': '' } +# Format tables in HTML rather than Markdown syntax BYPASS_TABLES = False +# Ignore table-related tags (table, th, td, tr) while keeping rows +IGNORE_TABLES = False + # Use a single line break after a block element rather an two line breaks. # NOTE: Requires body width setting to be 0. diff --git a/test/table_ignore.html b/test/table_ignore.html new file mode 100644 index 00000000..d2966aae --- /dev/null +++ b/test/table_ignore.html @@ -0,0 +1,26 @@ + + +

      This is a test document

      With some text, code, bolds and italics.

      This is second header

      Displaynone text

      + + + + + + +
      Header 1 Header 2 Header 3
      Content 1 2 200 Image!
      Content 1 longer Content 2 blah
      Content Content 2 blah
      t Content 2 blah blah blah
      + + + + + +
      H1 H2 H3
      C1 Content 2 x
      C123 Content 2 xyz
      + +some content between the tables
      + + + + +
      Header 1 Header 2 Header 3
      Content 1 Content 2 200 Image!
      Content 1 Content 2 longer 200 Image!
      + +something else entirely + diff --git a/test/table_ignore.md b/test/table_ignore.md new file mode 100644 index 00000000..ce2dedac --- /dev/null +++ b/test/table_ignore.md @@ -0,0 +1,22 @@ +# This is a test document + +With some text, `code`, **bolds** and _italics_. + +## This is second header + +Displaynone text + +Header 1 Header 2 Header 3 +Content 1 2 ![200](http://lorempixel.com/200/200) Image! +Content 1 longer Content 2 blah +Content Content 2 blah +t Content 2 blah blah blah +H1 H2 H3 +C1 Content 2 x +C123 Content 2 xyz +some content between the tables +Header 1 Header 2 Header 3 +Content 1 Content 2 ![200](http://lorempixel.com/200/200) Image! +Content 1 Content 2 longer ![200](http://lorempixel.com/200/200) Image! +something else entirely + diff --git a/test/test_html2text.py b/test/test_html2text.py index 724d4d62..1af5e69a 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -134,6 +134,10 @@ def test_func(self): module_args['bypass_tables'] = True cmdline_args.append('--bypass-tables') + if base_fn.startswith('table_ignore'): + module_args['ignore_tables'] = True + cmdline_args.append('--ignore-tables') + if base_fn.startswith('bodywidth'): # module_args['unicode_snob'] = True module_args['body_width'] = 0 @@ -161,7 +165,7 @@ def test_func(self): if base_fn.startswith('no_inline_links'): module_args['inline_links'] = False cmdline_args.append('--reference-links') - + if base_fn.startswith('no_wrap_links'): module_args['wrap_links'] = False cmdline_args.append('--no-wrap-links') From 0add20636338f21a298242e5e380d69ec0a215f3 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sat, 9 Jul 2016 15:26:41 +0530 Subject: [PATCH 324/479] - Added tests for default image alt text - added config, cli, __init__ changes - the __init__ change uses short circuiting behaviour of `or` in python - cli adds an option to set this via command line --- html2text/__init__.py | 3 ++- html2text/cli.py | 7 +++++++ html2text/config.py | 1 + 
test/default_image_alt.html | 1 + test/default_image_alt.md | 2 ++ 5 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 test/default_image_alt.html create mode 100644 test/default_image_alt.md diff --git a/html2text/__init__.py b/html2text/__init__.py index 7311d9bb..14e964fa 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -79,6 +79,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.mark_code = config.MARK_CODE self.wrap_links = config.WRAP_LINKS # covered in cli self.pad_tables = config.PAD_TABLES # covered in cli + self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli self.tag_callback = None if out is None: # pragma: no cover @@ -445,7 +446,7 @@ def handle_tag(self, tag, attrs, start): if 'src' in attrs: if not self.images_to_alt: attrs['href'] = attrs['src'] - alt = attrs.get('alt') or '' + alt = attrs.get('alt') or self.default_image_alt or '' # If we have images_with_size, write raw html including width, # height, and alt attributes diff --git a/html2text/cli.py b/html2text/cli.py index 49bef762..5f7c6e53 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -23,6 +23,13 @@ class bcolors: # pragma: no cover '%prog [(filename|url) [encoding]]', version='%prog ' + ".".join(map(str, __version__)) ) + p.add_option( + "--default-image-alt", + dest="default_image_alt", + action="store", + type="str", + default=config.DEFAULT_IMAGE_ALT, + help="The default alt string for images with missing ones") p.add_option( "--pad-tables", dest="pad_tables", diff --git a/html2text/config.py b/html2text/config.py index 7cd5ccfb..9533026c 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -38,6 +38,7 @@ IGNORE_EMPHASIS = False MARK_CODE = False DECODE_ERRORS = 'strict' +DEFAULT_IMAGE_ALT = 'Image' PAD_TABLES = False # Convert links with same href and text to format if they are absolute links diff --git a/test/default_image_alt.html b/test/default_image_alt.html new file mode 100644 index 
00000000..b1f04def --- /dev/null +++ b/test/default_image_alt.html @@ -0,0 +1 @@ +
      diff --git a/test/default_image_alt.md b/test/default_image_alt.md new file mode 100644 index 00000000..5c4f3c80 --- /dev/null +++ b/test/default_image_alt.md @@ -0,0 +1,2 @@ +[![Image](images/google.png)](http://google.com) + From 947a7e5a9ed22cb296c86004a596f68a64aadf96 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sat, 9 Jul 2016 15:33:24 +0530 Subject: [PATCH 325/479] Mention image-alt fix in changelog --- ChangeLog.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index b9b1dfcf..dcb48c05 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,10 @@ +xxxx.x.xx +========= +---- + +* Default image alt text option created and set to a default of "Image" +* Fix #136: --default-image-alt now takes a string as argument + 2016.5.29 ========= ---- From 64fe95e34ec6d63abd1ae96d25b761de02c09493 Mon Sep 17 00:00:00 2001 From: Giles Brown Date: Sat, 9 Jul 2016 03:04:57 -0700 Subject: [PATCH 326/479] Remove check for '/script>' in handle_data to fix #133 (#137) --- html2text/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index bcf22a41..3d090322 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -731,9 +731,6 @@ def o(self, data, puredata=0, force=0): self.outcount += 1 def handle_data(self, data, entity_char=False): - if r'\/script>' in data: - self.quiet -= 1 - if self.style: self.style_def.update(dumb_css_parser(data)) From ab973d0f38c7fe1994bb5fb03d18869fec0312af Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sat, 9 Jul 2016 15:37:47 +0530 Subject: [PATCH 327/479] Added default-image-alt to docs --- docs/usage.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 22040bc7..5a1a6874 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -94,6 +94,7 @@ simple indications of their function. 
- MARK_CODE to wrap 'pre' blocks with [code]...[/code] tags - WRAP_LINKS to decide if links have to be wrapped during text wrapping (implies INLINE_LINKS = False) - DECODE_ERRORS to handle decoding errors. 'strict', 'ignore', 'replace' are the acceptable values. + - DEFAULT_IMAGE_ALT takes a string as value and is used whenever an image tag is missing an `alt` value. The default for this is 'Image' To alter any option the procedure is to create a parser with `parser = html2text.HTML2Text()` and to set the option on the parser. @@ -132,3 +133,4 @@ Command line options | `--no-wrap-links` | Do not wrap links during text wrapping. Implies `--reference-links` | `--decode-errors`=`HANDLER` | What to do in case an error is encountered. `ignore`, `strict`, `replace` etc. | `--pad-tables` | Use padding to make tables look good. +| `--default-image-alt`=`Image_Here` | Inserts the given `alt` text whever images are missing `alt` values. From a4f975d82ce42f93bbca6d0b3e17248b27c0f47a Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sat, 9 Jul 2016 15:39:26 +0530 Subject: [PATCH 328/479] Added #137 to changelog --- ChangeLog.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index dcb48c05..2dd95409 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -4,6 +4,7 @@ xxxx.x.xx * Default image alt text option created and set to a default of "Image" * Fix #136: --default-image-alt now takes a string as argument +* Fix #113: Stop changing quiet levels on \/script tags. 
2016.5.29 ========= From e5fd147e1be500eff3aa8bae24bf1dfef88e97dd Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sun, 10 Jul 2016 08:18:17 +0530 Subject: [PATCH 329/479] Fix tests which broke on backward compatibility - Make default IMAGE_ALT = '' - Fix cli option --- docs/usage.md | 2 +- html2text/__init__.py | 2 +- html2text/cli.py | 1 + html2text/config.py | 2 +- test/test_html2text.py | 4 ++++ 5 files changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 5a1a6874..d0ae62c0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -94,7 +94,7 @@ simple indications of their function. - MARK_CODE to wrap 'pre' blocks with [code]...[/code] tags - WRAP_LINKS to decide if links have to be wrapped during text wrapping (implies INLINE_LINKS = False) - DECODE_ERRORS to handle decoding errors. 'strict', 'ignore', 'replace' are the acceptable values. - - DEFAULT_IMAGE_ALT takes a string as value and is used whenever an image tag is missing an `alt` value. The default for this is 'Image' + - DEFAULT_IMAGE_ALT takes a string as value and is used whenever an image tag is missing an `alt` value. The default for this is an empty string '' to avoid backward breakage To alter any option the procedure is to create a parser with `parser = html2text.HTML2Text()` and to set the option on the parser. 
diff --git a/html2text/__init__.py b/html2text/__init__.py index 14e964fa..5d7cdeca 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -446,7 +446,7 @@ def handle_tag(self, tag, attrs, start): if 'src' in attrs: if not self.images_to_alt: attrs['href'] = attrs['src'] - alt = attrs.get('alt') or self.default_image_alt or '' + alt = attrs.get('alt') or self.default_image_alt # If we have images_with_size, write raw html including width, # height, and alt attributes diff --git a/html2text/cli.py b/html2text/cli.py index 5f7c6e53..eb832bb4 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -286,5 +286,6 @@ class bcolors: # pragma: no cover h.mark_code = options.mark_code h.wrap_links = options.wrap_links h.pad_tables = options.pad_tables + h.default_image_alt = options.default_image_alt wrapwrite(h.handle(data)) diff --git a/html2text/config.py b/html2text/config.py index 9533026c..76f693f0 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -38,7 +38,7 @@ IGNORE_EMPHASIS = False MARK_CODE = False DECODE_ERRORS = 'strict' -DEFAULT_IMAGE_ALT = 'Image' +DEFAULT_IMAGE_ALT = '' PAD_TABLES = False # Convert links with same href and text to format if they are absolute links diff --git a/test/test_html2text.py b/test/test_html2text.py index 724d4d62..48d63a4a 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -114,6 +114,10 @@ def test_func(self): func_args = {} base_fn = os.path.basename(fn).lower() + if base_fn.startswith('default_image_alt'): + module_args['default_image_alt'] = 'Image' + cmdline_args.append('--default-image-alt=Image') + if base_fn.startswith('google'): module_args['google_doc'] = True cmdline_args.append('--googledoc') From 6da08ee83ad5da99c594f3ce06b1ffb8d39bad53 Mon Sep 17 00:00:00 2001 From: Arjoonn Sharma Date: Sun, 10 Jul 2016 08:19:53 +0530 Subject: [PATCH 330/479] Update changelog with default image alt as empty --- ChangeLog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/ChangeLog.rst b/ChangeLog.rst index 2dd95409..92604eae 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -2,7 +2,7 @@ xxxx.x.xx ========= ---- -* Default image alt text option created and set to a default of "Image" +* Default image alt text option created and set to a default of empty string "" to maintain backward compatibility * Fix #136: --default-image-alt now takes a string as argument * Fix #113: Stop changing quiet levels on \/script tags. From d3247c2ada04c63f9c6d08953cd5dc2dd281a69e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bruno=20Reni=C3=A9?= Date: Wed, 20 Jul 2016 15:18:43 +0200 Subject: [PATCH 331/479] Fix DeprecationWarning on Python3: use html.escape (#126) * Fix DeprecationWarning on Python3: use html.escape * Move `escape` to the compat module. Also fix the apostrophe test in Python 3. This caused by a behavior change in `html.escape` compared to `cgi.escape`. `cgi.escape` only quotes double quotes, whereas `html.escape` also takes single quotes (apostrophe) into account. The workaround is to pass `quote=False` to `html.escape`. 
See https://docs.python.org/2/library/cgi.html#cgi.escape and https://docs.python.org/3/library/html.html#html.escape --- html2text/__init__.py | 7 +++---- html2text/compat.py | 4 ++++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index b9d2da0c..471a46e0 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -4,14 +4,13 @@ from __future__ import division import re import sys -import cgi try: from textwrap import wrap except ImportError: # pragma: no cover pass -from html2text.compat import urlparse, HTMLParser +from html2text.compat import urlparse, HTMLParser, html_escape from html2text import config from html2text.utils import ( @@ -178,14 +177,14 @@ def close(self): def handle_charref(self, c): charref = self.charref(c) if not self.code and not self.pre: - charref = cgi.escape(charref) + charref = html_escape(charref) self.handle_data(charref, True) def handle_entityref(self, c): entityref = self.entityref(c) if (not self.code and not self.pre and entityref != ' _place_holder;'): - entityref = cgi.escape(entityref) + entityref = html_escape(entityref) self.handle_data(entityref, True) def handle_starttag(self, tag, attrs): diff --git a/html2text/compat.py b/html2text/compat.py index 2120a41b..60907abf 100644 --- a/html2text/compat.py +++ b/html2text/compat.py @@ -6,8 +6,12 @@ import urlparse import HTMLParser import urllib + from cgi import escape as html_escape else: import urllib.parse as urlparse import html.entities as htmlentitydefs import html.parser as HTMLParser import urllib.request as urllib + from html import escape + def html_escape(s): + return escape(s, quote=False) From 1beb8c611b5f457eb2cdda4b4d6235b55bb9bbcc Mon Sep 17 00:00:00 2001 From: arjoonn sharma Date: Mon, 25 Jul 2016 16:52:47 +0530 Subject: [PATCH 332/479] Update for #126 --- ChangeLog.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 92604eae..4ae5b999 100644 --- 
a/ChangeLog.rst +++ b/ChangeLog.rst @@ -5,6 +5,7 @@ xxxx.x.xx * Default image alt text option created and set to a default of empty string "" to maintain backward compatibility * Fix #136: --default-image-alt now takes a string as argument * Fix #113: Stop changing quiet levels on \/script tags. +* Merge #126: Fix deprecation warning on py3 due to html.escape 2016.5.29 ========= From d1aa1af73b9f14252795b08ee3c24008b0b4ad10 Mon Sep 17 00:00:00 2001 From: arjoonn sharma Date: Wed, 31 Aug 2016 18:27:02 +0530 Subject: [PATCH 333/479] Docs link Make docs link absolute to resolve #141 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 20967f47..56572c40 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Usage: `html2text [(filename|url) [encoding]]` | `--reference-links` | Use reference links instead of links to create markdown | `--mark-code` | Mark preformatted and code blocks with [code]...[/code] -For a complete list of options see the [docs](docs/usage.md) +For a complete list of options see the [docs](https://github.com/Alir3z4/html2text/blob/master/docs/usage.md) Or you can use it from within `Python`: @@ -84,4 +84,4 @@ then open the `./htmlcov/index.html` file in your browser. 
## Documentation -Documentation lives [here](docs/index.md) +Documentation lives [here](https://github.com/Alir3z4/html2text/blob/master/docs/usage.md) From 6e097d01ef2ae9d691487dde50fe05ab62bf257c Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Fri, 16 Sep 2016 03:08:28 +0400 Subject: [PATCH 334/479] Trying to fix Travis CI build for Python 2.6 (#145) --- .travis.yml | 2 +- ChangeLog.rst | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index fec022bd..4d592e83 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,7 +10,7 @@ python: install: - pip install coveralls==0.5 before_script: - - '[ "${TRAVIS_PYTHON_VERSION}" = "2.6" ] && pip install --use-mirrors unittest2 || /bin/true' + - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi - export COVERAGE_PROCESS_START=$PWD/.coveragerc script: - PYTHONPATH=$PYTHONPATH:. coverage run --source=html2text --rcfile=.coveragerc setup.py test -v diff --git a/ChangeLog.rst b/ChangeLog.rst index 4ae5b999..846d66f6 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -6,6 +6,7 @@ xxxx.x.xx * Fix #136: --default-image-alt now takes a string as argument * Fix #113: Stop changing quiet levels on \/script tags. * Merge #126: Fix deprecation warning on py3 due to html.escape +* Fix #145: Running test suite on Travis CI for Python 2.6. 
2016.5.29 ========= From 28f688fa5f1de3393907f68afa9c78758804d42e Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 19 Sep 2016 02:01:18 +0400 Subject: [PATCH 335/479] Add vscode project conf to be ignored --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index a7b8dd2f..3881b4cc 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ dist .coverage .coverage.* env/ -.c9/ \ No newline at end of file +.c9/ +.vscode From ba5e670bc676492a22a89013e0cb56a3af6531a1 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 19 Sep 2016 02:04:31 +0400 Subject: [PATCH 336/479] Bump the version number for a release --- ChangeLog.rst | 3 ++- html2text/__init__.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 846d66f6..fd66cb01 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,4 +1,4 @@ -xxxx.x.xx +2016.9.19 ========= ---- @@ -8,6 +8,7 @@ xxxx.x.xx * Merge #126: Fix deprecation warning on py3 due to html.escape * Fix #145: Running test suite on Travis CI for Python 2.6. 
+ 2016.5.29 ========= ---- diff --git a/html2text/__init__.py b/html2text/__init__.py index 471a46e0..c6ed1e10 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -30,7 +30,7 @@ pad_tables_in_text ) -__version__ = (2016, 5, 29) +__version__ = (2016, 9, 19) # TODO: From 83e53adb90afa9769ed99f343c444b099e47d498 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Sun, 5 Mar 2017 21:51:55 +0100 Subject: [PATCH 337/479] Fix typos --- ChangeLog.rst | 2 +- docs/how_it_works.md | 4 ++-- docs/usage.md | 2 +- html2text/config.py | 4 ++-- html2text/utils.py | 2 +- test/empty-link.html | 2 +- test/empty-link.md | 2 +- test/list_tags_example.html | 2 +- test/list_tags_example.md | 2 +- test/pre.html | 2 +- 10 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index fd66cb01..6d51b457 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -124,7 +124,7 @@ ========== ---- -* Feature #49: Added a images_to_alt option to discard images and keep only their alt. +* Feature #49: Added an images_to_alt option to discard images and keep only their alt. * Feature #50: Protect links, surrounding them with angle brackets to avoid breaking... * Feature: Add ``setup.cfg`` file. diff --git a/docs/how_it_works.md b/docs/how_it_works.md index 725e200b..1b52af31 100644 --- a/docs/how_it_works.md +++ b/docs/how_it_works.md @@ -21,7 +21,7 @@ Used to provide various configuration settings to the converter. 
They are as fol - LINKS_EACH_PARAGRAPH for putting links after every paragraph - BODY_WIDTH for wrapping long lines - SKIP_INTERNAL_LINKS to skip #local-anchor things - - INLNE_LINKS for formatting images and links + - INLINE_LINKS for formatting images and links - PROTECT_LINKS protect from line breaks - GOOGLE_LIST_INDENT no of pixels to indent nested lists - IGNORE_ANCHORS @@ -55,7 +55,7 @@ Some functions are: - name2cp :name to code point - hn :headings - - dumb_preperty_dict :hash of css attrs + - dumb_property_dict :hash of css attrs - dumb_css_parser :returns a hash of css selectors, each containing a hash of css attrs - element_style :hash of final style of element diff --git a/docs/usage.md b/docs/usage.md index eb2c3bc0..8a81629f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -135,4 +135,4 @@ Command line options | `--no-wrap-links` | Do not wrap links during text wrapping. Implies `--reference-links` | `--decode-errors`=`HANDLER` | What to do in case an error is encountered. `ignore`, `strict`, `replace` etc. | `--pad-tables` | Use padding to make tables look good. -| `--default-image-alt`=`Image_Here` | Inserts the given `alt` text whever images are missing `alt` values. +| `--default-image-alt`=`Image_Here` | Inserts the given `alt` text whenever images are missing `alt` values. diff --git a/html2text/config.py b/html2text/config.py index 48e17845..b06e94a4 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -1,6 +1,6 @@ import re -# Use Unicode characters instead of their ascii psuedo-replacements +# Use Unicode characters instead of their ascii pseudo-replacements UNICODE_SNOB = 0 # Marker to use for marking tables for padding post processing @@ -126,6 +126,6 @@ IGNORE_TABLES = False -# Use a single line break after a block element rather an two line breaks. +# Use a single line break after a block element rather than two line breaks. # NOTE: Requires body width setting to be 0. 
SINGLE_LINE_BREAK = False diff --git a/html2text/utils.py b/html2text/utils.py index bd6fc634..ff510a81 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -280,7 +280,7 @@ def pad_tables_in_text(text, right_margin=1): table_buffer, altered_lines, table_widths, table_started = [], [], [], False new_lines = [] for line in lines: - # Toogle table started + # Toggle table started if (config.TABLE_MARKER_FOR_PAD in line): table_started = not table_started if not table_started: diff --git a/test/empty-link.html b/test/empty-link.html index 7f1c3eb1..dad41d09 100644 --- a/test/empty-link.html +++ b/test/empty-link.html @@ -1,6 +1,6 @@

      Processing empty hyperlinks

      -

      This test checks wheter empty hyperlinks still appear in the markdown result.

      +

      This test checks whether empty hyperlinks still appear in the markdown result.

      \ No newline at end of file diff --git a/test/empty-link.md b/test/empty-link.md index fbaf01bc..7a38588f 100644 --- a/test/empty-link.md +++ b/test/empty-link.md @@ -1,6 +1,6 @@ # Processing empty hyperlinks -This test checks wheter empty hyperlinks still appear in the markdown result. +This test checks whether empty hyperlinks still appear in the markdown result. [](http://some.link) diff --git a/test/list_tags_example.html b/test/list_tags_example.html index d0a8be2d..9a5ba7ab 100644 --- a/test/list_tags_example.html +++ b/test/list_tags_example.html @@ -34,6 +34,6 @@

      Example 2

      -
    • somthing else here
    • +
    • something else here
    • some item
    diff --git a/test/list_tags_example.md b/test/list_tags_example.md index 151249d6..71568ef7 100644 --- a/test/list_tags_example.md +++ b/test/list_tags_example.md @@ -33,6 +33,6 @@ Drums 2. some item 3. some item - * somthing else here + * something else here * some item diff --git a/test/pre.html b/test/pre.html index 171074b7..9872fc24 100644 --- a/test/pre.html +++ b/test/pre.html @@ -1,6 +1,6 @@ - initial crowsed pre handling test #1 + initial crowded pre handling test #1
    a
    
    From b825e34008b79ea70486ab216a4b7330c82e026d Mon Sep 17 00:00:00 2001
    From: Alireza Savand 
    Date: Thu, 16 Mar 2017 15:47:38 +0400
    Subject: [PATCH 338/479] Placeholder for future changelogs
    
    ---
     ChangeLog.rst | 5 +++++
     1 file changed, 5 insertions(+)
    
    diff --git a/ChangeLog.rst b/ChangeLog.rst
    index 6d51b457..743e9ea4 100644
    --- a/ChangeLog.rst
    +++ b/ChangeLog.rst
    @@ -1,3 +1,8 @@
    +0000.00.00
    +==========
    +----
    +
    +
     2016.9.19
     =========
     ----
    
    From 3b6f059dfa04cc8694689ca73d6fcfd60a5fb95e Mon Sep 17 00:00:00 2001
    From: Phus Lu 
    Date: Wed, 15 Mar 2017 13:45:24 +0800
    Subject: [PATCH 339/479] fix images link with div wrap
    
    Signed-off-by: Phus Lu 
    ---
     html2text/__init__.py          | 2 ++
     test/images_with_div_wrap.html | 1 +
     test/images_with_div_wrap.md   | 2 ++
     3 files changed, 5 insertions(+)
     create mode 100644 test/images_with_div_wrap.html
     create mode 100644 test/images_with_div_wrap.md
    
    diff --git a/html2text/__init__.py b/html2text/__init__.py
    index c6ed1e10..8a636382 100644
    --- a/html2text/__init__.py
    +++ b/html2text/__init__.py
    @@ -331,6 +331,8 @@ def handle_tag(self, tag, attrs, start):
                         self.p()
                     else:
                         self.soft_br()
    +            elif self.astack and tag == 'div':
    +                pass
                 else:
                     self.p()
     
    diff --git a/test/images_with_div_wrap.html b/test/images_with_div_wrap.html
    new file mode 100644
    index 00000000..6fa6678a
    --- /dev/null
    +++ b/test/images_with_div_wrap.html
    @@ -0,0 +1 @@
    +
    diff --git a/test/images_with_div_wrap.md b/test/images_with_div_wrap.md new file mode 100644 index 00000000..92c37228 --- /dev/null +++ b/test/images_with_div_wrap.md @@ -0,0 +1,2 @@ +[![](http://example.com/img.png)](http://example.com) + From ce7ab6a891a2a76758834a3a90ffcd599d8b6849 Mon Sep 17 00:00:00 2001 From: Phus Lu Date: Thu, 16 Mar 2017 20:32:13 +0800 Subject: [PATCH 340/479] Add https://github.com/Alir3z4/html2text/pull/157 to changelog Signed-off-by: Phus Lu --- ChangeLog.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 743e9ea4..f5bb2f09 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -2,6 +2,8 @@ ========== ---- +* Merge #157: Fix images link with div wrap + 2016.9.19 ========= From 69f94d3154af83efd7a42976f5b6f7844150f11d Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 16 Mar 2017 16:41:36 +0400 Subject: [PATCH 341/479] Update ChangeLog.rst --- ChangeLog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index f5bb2f09..e1a62eec 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -2,7 +2,7 @@ ========== ---- -* Merge #157: Fix images link with div wrap +* Fix #157: Fix images link with div wrap 2016.9.19 From 2daf4e5d0cb45e36137f851e4c2400eb585680d3 Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Fri, 24 Mar 2017 16:36:39 +0000 Subject: [PATCH 342/479] Fixed issue #55: Error when empty title on tag --- html2text/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 8a636382..0612b717 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -427,7 +427,10 @@ def handle_tag(self, tag, attrs, start): self.maybe_automatic_link = None if self.inline_links: try: - title = escape_md(a['title']) + if a['title'] is not None: + title = escape_md(a['title']) + else: + title = "" except KeyError: self.o("](" + escape_md(urlparse.urljoin(self.baseurl, a['href'])) + ")") 
else: From 03d44591999b878d4260facc09804ca4a6adf84e Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Fri, 24 Mar 2017 16:41:48 +0000 Subject: [PATCH 343/479] Added info to authors and changelogs files --- AUTHORS.rst | 1 + ChangeLog.rst | 1 + 2 files changed, 2 insertions(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 93c19380..8d9de81c 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -20,6 +20,7 @@ The AUTHORS/Contributors are (and/or have been): * Etienne Millon * John C F * Mikhail Melnik +* Andres Rey Maintainer: diff --git a/ChangeLog.rst b/ChangeLog.rst index e1a62eec..59a4f594 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -3,6 +3,7 @@ ---- * Fix #157: Fix images link with div wrap +* Fix #55: Fix error when empty title tags 2016.9.19 From d37b0416466797a1f0c254cab6b12d8001e95c96 Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Fri, 24 Mar 2017 16:53:07 +0000 Subject: [PATCH 344/479] Added basic unit testing --- test/empty-title-tag.html | 1 + test/empty-title-tag.md | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 test/empty-title-tag.html create mode 100644 test/empty-title-tag.md diff --git a/test/empty-title-tag.html b/test/empty-title-tag.html new file mode 100644 index 00000000..bc8d3c57 --- /dev/null +++ b/test/empty-title-tag.html @@ -0,0 +1 @@ +This is an A tag with an empty title property \ No newline at end of file diff --git a/test/empty-title-tag.md b/test/empty-title-tag.md new file mode 100644 index 00000000..0dc02eda --- /dev/null +++ b/test/empty-title-tag.md @@ -0,0 +1,2 @@ +[This is an A tag with an empty title property](test.html "" ) + From f060da2fc5cb9ae809d8990e5e28c03b6073595f Mon Sep 17 00:00:00 2001 From: Ciprian Miclaus Date: Fri, 21 Apr 2017 16:04:04 +0100 Subject: [PATCH 345/479] fix the test failures on windows/cygwin due to \r --- html2text/__init__.py | 2 +- test/test_html2text.py | 28 +++++++++++++++++----------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/html2text/__init__.py 
b/html2text/__init__.py index 0612b717..de94b19c 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -660,7 +660,7 @@ def o(self, data, puredata=0, force=0): if self.startpre: #self.out(" :") #TODO: not output when already one there - if not data.startswith("\n"): # <pre>stuff...
    +                if not data.startswith("\n") and not data.startswith("\r\n"):  # <pre>stuff...
                         data = "\n" + data
                     if self.mark_code:
                         self.out("\n[code]")
    diff --git a/test/test_html2text.py b/test/test_html2text.py
    index 8a0259d2..5c9c6795 100644
    --- a/test/test_html2text.py
    +++ b/test/test_html2text.py
    @@ -17,6 +17,15 @@
     import html2text
     
     
    +def cleanup_eol(clean_str):
    +    if os.name == 'nt' or sys.platform == 'cygwin':
    +        # Fix the unwanted CR to CRCRLF replacement
    +        # during text pipelining on Windows/cygwin
    +        # on cygwin, os.name == 'posix', not nt
    +        clean_str = re.sub(r'\r+', '\r', clean_str)
    +        clean_str = clean_str.replace('\r\n', '\n')
    +    return clean_str
    +
     def test_module(fn, google_doc=False, **kwargs):
         h = html2text.HTML2Text()
         h.fn = fn
    @@ -31,9 +40,9 @@ def test_module(fn, google_doc=False, **kwargs):
             setattr(h, k, v)
     
         result = get_baseline(fn)
    -    inf = open(fn)
    -    actual = h.handle(inf.read())
    -    inf.close()
    +    with open(fn) as inf:
    +        actual = cleanup_eol(inf.read())
    +        actual = h.handle(actual)
         return result, actual
     
     
    @@ -56,11 +65,7 @@ def test_command(fn, *args):
     
         actual = out.decode('utf8')
     
    -    if os.name == 'nt':
    -        # Fix the unwanted CR to CRCRLF replacement
    -        # during text pipelining on Windows/cygwin
    -        actual = re.sub(r'\r+', '\r', actual)
    -        actual = actual.replace('\r\n', '\n')
    +    actual = cleanup_eol(actual)
     
         return result, actual
     
    @@ -82,9 +87,10 @@ def get_baseline_name(fn):
     
     def get_baseline(fn):
         name = get_baseline_name(fn)
    -    f = codecs.open(name, mode='r', encoding='utf8')
    -    out = f.read()
    -    f.close()
    +    with codecs.open(name, mode='r', encoding='utf8') as f:
    +        out = f.read()
    +    #f.close()
    +    out = cleanup_eol(out)
         return out
     
     
    
    From ee4862068d6e998da889e4adff8dc6053ea207fa Mon Sep 17 00:00:00 2001
    From: Ciprian Miclaus 
    Date: Fri, 21 Apr 2017 16:25:39 +0100
    Subject: [PATCH 346/479] updated AUTHORS and Changelog docs
    
    ---
     AUTHORS.rst   | 1 +
     ChangeLog.rst | 1 +
     2 files changed, 2 insertions(+)
    
    diff --git a/AUTHORS.rst b/AUTHORS.rst
    index 8d9de81c..76852977 100644
    --- a/AUTHORS.rst
    +++ b/AUTHORS.rst
    @@ -21,6 +21,7 @@ The AUTHORS/Contributors are (and/or have been):
     * John C F 
     * Mikhail Melnik 
     * Andres Rey
    +* Ciprian Miclaus
     
     
     Maintainer:
    diff --git a/ChangeLog.rst b/ChangeLog.rst
    index 59a4f594..e3b2fabf 100644
    --- a/ChangeLog.rst
    +++ b/ChangeLog.rst
    @@ -4,6 +4,7 @@
     
     * Fix #157: Fix images link with div wrap
     * Fix #55: Fix error when empty title tags
    +* Fix #160: The html2text tests are failing on Windows and on Cygwin due to differences in eol handling between windows/*nix
     
     
     2016.9.19
    
    From 1f3ccc5a4bebc1d4ead5bf8b0b1f37ea00572575 Mon Sep 17 00:00:00 2001
    From: Ciprian Miclaus 
    Date: Mon, 24 Apr 2017 10:19:58 +0100
    Subject: [PATCH 347/479] cleanup a commented line
    
    ---
     test/test_html2text.py | 1 -
     1 file changed, 1 deletion(-)
    
    diff --git a/test/test_html2text.py b/test/test_html2text.py
    index 5c9c6795..a9c6efc7 100644
    --- a/test/test_html2text.py
    +++ b/test/test_html2text.py
    @@ -89,7 +89,6 @@ def get_baseline(fn):
         name = get_baseline_name(fn)
         with codecs.open(name, mode='r', encoding='utf8') as f:
             out = f.read()
    -    #f.close()
         out = cleanup_eol(out)
         return out
     
    
    From 2a81874e477fdbcd5a1c6282c2bae70d186ab0b5 Mon Sep 17 00:00:00 2001
    From: Ciprian Miclaus 
    Date: Tue, 2 May 2017 13:34:25 +0100
    Subject: [PATCH 348/479] fix flake8 violations (no functional changes)
    
    ---
     html2text/__init__.py  | 101 +++++++++++++++++++++++++++--------------
     html2text/cli.py       |  19 +++++---
     html2text/compat.py    |   4 ++
     html2text/config.py    |   8 +++-
     html2text/utils.py     |   7 +--
     setup.py               |  13 +++++-
     test/test_html2text.py |  20 +++++---
     test/test_memleak.py   |   7 +--
     8 files changed, 122 insertions(+), 57 deletions(-)
    
    diff --git a/html2text/__init__.py b/html2text/__init__.py
    index de94b19c..5ffa7293 100644
    --- a/html2text/__init__.py
    +++ b/html2text/__init__.py
    @@ -182,8 +182,9 @@ def handle_charref(self, c):
     
         def handle_entityref(self, c):
             entityref = self.entityref(c)
    -        if (not self.code and not self.pre
    -                and entityref != ' _place_holder;'):
    +        if not self.code and not \
    +            self.pre and \
    +                entityref != ' _place_holder;':
                 entityref = html_escape(entityref)
             self.handle_data(entityref, True)
     
    @@ -208,10 +209,11 @@ def previousIndex(self, attrs):
                 i += 1
                 match = 0
     
    -            if ('href' in a) and a['href'] == attrs['href']:
    -                if ('title' in a) or ('title' in attrs):
    -                    if (('title' in a) and ('title' in attrs) and
    -                                a['title'] == attrs['title']):
    +            if 'href' in a and a['href'] == attrs['href']:
    +                if 'title' in a or 'title' in attrs:
    +                    if 'title' in a and \
    +                        'title' in attrs and \
    +                            a['title'] == attrs['title']:
                             match = True
                     else:
                         match = True
    @@ -229,8 +231,8 @@ def handle_emphasis(self, start, tag_style, parent_style):
             # handle Google's text emphasis
             strikethrough = 'line-through' in \
                             tag_emphasis and self.hide_strikethrough
    -        bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
    -        italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
    +        bold = 'bold' in tag_emphasis and 'bold' not in parent_emphasis
    +        italic = 'italic' in tag_emphasis and 'italic' not in parent_emphasis
             fixed = google_fixed_width_font(tag_style) and not \
                 google_fixed_width_font(parent_style) and not self.pre
     
    @@ -292,10 +294,11 @@ def handle_tag(self, tag, attrs, start):
                 if self.tag_callback(self, tag, attrs, start) is True:
                     return
     
    -        # first thing inside the anchor tag is another tag that produces some output
    -        if (start and not self.maybe_automatic_link is None
    -                and tag not in ['p', 'div', 'style', 'dl', 'dt']
    -                and (tag != "img" or self.ignore_images)):
    +        # first thing inside the anchor tag is another tag
    +        # that produces some output
    +        if (start and self.maybe_automatic_link is not None and
    +                tag not in ['p', 'div', 'style', 'dl', 'dt'] and
    +                (tag != "img" or self.ignore_images)):
                 self.o("[")
                 self.maybe_automatic_link = None
                 self.empty_link = False
    @@ -312,7 +315,8 @@ def handle_tag(self, tag, attrs, start):
                     tag_style = element_style(attrs, self.style_def, parent_style)
                     self.tag_stack.append((tag, attrs, tag_style))
                 else:
    -                dummy, attrs, tag_style = self.tag_stack.pop() if self.tag_stack else (None, {}, {})
    +                dummy, attrs, tag_style = self.tag_stack.pop() \
    +                    if self.tag_stack else (None, {}, {})
                     if self.tag_stack:
                         parent_style = self.tag_stack[-1][2]
     
    @@ -404,15 +408,15 @@ def handle_tag(self, tag, attrs, start):
     
             if tag == "a" and not self.ignore_links:
                 if start:
    -                if ('href' in attrs) and \
    -                        (attrs['href'] is not None) and \
    -                        not (self.skip_internal_links and
    -                                 attrs['href'].startswith('#')):
    +                if 'href' in attrs and \
    +                    attrs['href'] is not None and not \
    +                        (self.skip_internal_links and
    +                            attrs['href'].startswith('#')):
                         self.astack.append(attrs)
                         self.maybe_automatic_link = attrs['href']
                         self.empty_link = True
                         if self.protect_links:
    -                        attrs['href'] = '<'+attrs['href']+'>'
    +                        attrs['href'] = '<' + attrs['href'] + '>'
                     else:
                         self.astack.append(None)
                 else:
    @@ -432,10 +436,29 @@ def handle_tag(self, tag, attrs, start):
                                     else:
                                         title = ""
                                 except KeyError:
    -                                self.o("](" + escape_md(urlparse.urljoin(self.baseurl, a['href'])) + ")")
    +                                self.o(
    +                                    "](" +
    +                                    escape_md(
    +                                        urlparse.urljoin(
    +                                            self.baseurl,
    +                                            a['href']
    +                                        )
    +                                    ) +
    +                                    ")"
    +                                )
                                 else:
    -                                self.o("](" + escape_md(urlparse.urljoin(self.baseurl, a['href']))
    -                                       + ' "' + title + '" )')
    +                                self.o(
    +                                    "](" +
    +                                    escape_md(
    +                                        urlparse.urljoin(
    +                                            self.baseurl,
    +                                            a['href']
    +                                        )
    +                                    ) +
    +                                    ' "' +
    +                                    title +
    +                                    '" )'
    +                                )
                             else:
                                 i = self.previousIndex(a)
                                 if i is not None:
    @@ -468,7 +491,7 @@ def handle_tag(self, tag, attrs, start):
                         return
     
                     # If we have a link to create, output the start
    -                if not self.maybe_automatic_link is None:
    +                if self.maybe_automatic_link is not None:
                         href = self.maybe_automatic_link
                         if self.images_to_alt and escape_md(alt) == href and \
                                 self.absolute_url_matcher.match(href):
    @@ -488,7 +511,16 @@ def handle_tag(self, tag, attrs, start):
                         self.o("![" + escape_md(alt) + "]")
                         if self.inline_links:
                             href = attrs.get('href') or ''
    -                        self.o("(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")")
    +                        self.o(
    +                            "(" +
    +                            escape_md(
    +                                urlparse.urljoin(
    +                                    self.baseurl,
    +                                    href
    +                                )
    +                            ) +
    +                            ")"
    +                        )
                         else:
                             i = self.previousIndex(attrs)
                             if i is not None:
    @@ -581,11 +613,11 @@ def handle_tag(self, tag, attrs, start):
                         if start:
                             self.table_start = True
                             if self.pad_tables:
    -                            self.o("<"+config.TABLE_MARKER_FOR_PAD+">")
    +                            self.o("<" + config.TABLE_MARKER_FOR_PAD + ">")
                                 self.o("  \n")
                         else:
                             if self.pad_tables:
    -                            self.o("</"+config.TABLE_MARKER_FOR_PAD+">")
    +                            self.o("</" + config.TABLE_MARKER_FOR_PAD + ">")
                                 self.o("  \n")
                     if tag in ["td", "th"] and start:
                         if self.split_next_td:
    @@ -659,8 +691,9 @@ def o(self, data, puredata=0, force=0):
                     return
     
                 if self.startpre:
    -                #self.out(" :") #TODO: not output when already one there
    -                if not data.startswith("\n") and not data.startswith("\r\n"):  # <pre>stuff...
    +                # self.out(" :") #TODO: not output when already one there
    +                if not data.startswith("\n") and not data.startswith("\r\n"):
    +                    # <pre>stuff...
                         data = "\n" + data
                     if self.mark_code:
                         self.out("\n[code]")
    @@ -673,7 +706,7 @@ def o(self, data, puredata=0, force=0):
                 if self.pre:
                     if not self.list:
                         bq += "    "
    -                #else: list content is already partially indented
    +                # else: list content is already partially indented
                     for i in range(len(self.list)):
                         bq += "    "
                     data = data.replace("\n", "\n" + bq)
    @@ -705,8 +738,8 @@ def o(self, data, puredata=0, force=0):
                         self.out(' ')
                     self.space = 0
     
    -            if self.a and ((self.p_p == 2 and self.links_each_paragraph)
    -                           or force == "end"):
    +            if self.a and ((self.p_p == 2 and self.links_each_paragraph) or
    +                           force == "end"):
                     if force == "end":
                         self.out("\n")
     
    @@ -739,10 +772,10 @@ def handle_data(self, data, entity_char=False):
             if self.style:
                 self.style_def.update(dumb_css_parser(data))
     
    -        if not self.maybe_automatic_link is None:
    +        if self.maybe_automatic_link is not None:
                 href = self.maybe_automatic_link
    -            if (href == data and self.absolute_url_matcher.match(href)
    -                    and self.use_automatic_links):
    +            if (href == data and self.absolute_url_matcher.match(href) and
    +                    self.use_automatic_links):
                     self.o("<" + data + ">")
                     self.empty_link = False
                     return
    @@ -814,7 +847,7 @@ def google_nest_count(self, style):
             nest_count = 0
             if 'margin-left' in style:
                 nest_count = int(style['margin-left'][:-2]) \
    -                         // self.google_list_indent
    +                // self.google_list_indent
     
             return nest_count
     
    diff --git a/html2text/cli.py b/html2text/cli.py
    index c9357879..e155e4fb 100644
    --- a/html2text/cli.py
    +++ b/html2text/cli.py
    @@ -158,7 +158,8 @@ class bcolors:  # pragma: no cover
             action="store_true",
             dest="ignore_tables",
             default=config.IGNORE_TABLES,
    -        help="Ignore table-related tags (table, th, td, tr) while keeping rows."
    +        help="Ignore table-related tags (table, th, td, tr) "
    +             "while keeping rows."
         )
         p.add_option(
             "--single-line-break",
    @@ -211,7 +212,8 @@ class bcolors:  # pragma: no cover
             action="store",
             type="string",
             default=config.DECODE_ERRORS,
    -        help="What to do in case of decode errors.'ignore', 'strict' and 'replace' are acceptable values"
    +        help="What to do in case of decode errors.'ignore', 'strict' and "
    +             "'replace' are acceptable values"
         )
         (options, args) = p.parse_args()
     
    @@ -226,8 +228,11 @@ class bcolors:  # pragma: no cover
             file_ = args[0]
     
             if file_.startswith('http://') or file_.startswith('https://'):
    -            warnings.warn("Support for retrieving html over network is set for deprecation by version (2017, 1, x)",
    -                    DeprecationWarning)
    +            warnings.warn(
    +                "Support for retrieving html over network is set for "
    +                "deprecation by version (2017, 1, x)",
    +                DeprecationWarning
    +            )
                 baseurl = file_
                 j = urllib.urlopen(baseurl)
                 data = j.read()
    @@ -235,7 +240,8 @@ class bcolors:  # pragma: no cover
                     try:
                         from feedparser import _getCharacterEncoding as enc
                     except ImportError:
    -                    enc = lambda x, y: ('utf-8', 1)
    +                    def enc(x, y):
    +                        return ('utf-8', 1)
                     encoding = enc(j.headers, data)[0]
                     if encoding == 'us-ascii':
                         encoding = 'utf-8'
    @@ -245,7 +251,8 @@ class bcolors:  # pragma: no cover
                     try:
                         from chardet import detect
                     except ImportError:
    -                    detect = lambda x: {'encoding': 'utf-8'}
    +                    def detect(x):
    +                        return {'encoding': 'utf-8'}
                     encoding = detect(data)['encoding']
         else:
             data = wrap_read()
    diff --git a/html2text/compat.py b/html2text/compat.py
    index 60907abf..f669f980 100644
    --- a/html2text/compat.py
    +++ b/html2text/compat.py
    @@ -13,5 +13,9 @@
         import html.parser as HTMLParser
         import urllib.request as urllib
         from html import escape
    +
         def html_escape(s):
             return escape(s, quote=False)
    +
    +
    +__all__ = ['HTMLParser', 'html_escape', 'htmlentitydefs', 'urllib', 'urlparse']
    diff --git a/html2text/config.py b/html2text/config.py
    index b06e94a4..bde0eaa1 100644
    --- a/html2text/config.py
    +++ b/html2text/config.py
    @@ -41,7 +41,8 @@
     DEFAULT_IMAGE_ALT = ''
     PAD_TABLES = False
     
    -# Convert links with same href and text to <href> format if they are absolute links
    +# Convert links with same href and text to <href> format
    +# if they are absolute links
     USE_AUTOMATIC_LINKS = True
     
     # For checking space-only lines on line 771
    @@ -52,7 +53,10 @@
     RE_UNORDERED_LIST_MATCHER = re.compile(r'[-\*\+]\s')
     RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])")
     RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])")
    -RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")  # to find links in the text
    +
    +# to find links in the text
    +RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")
    +
     RE_MD_DOT_MATCHER = re.compile(r"""
         ^             # start of line
         (\s*\d+)      # optional whitespace and a number
    diff --git a/html2text/utils.py b/html2text/utils.py
    index ff510a81..24cdd7ea 100644
    --- a/html2text/utils.py
    +++ b/html2text/utils.py
    @@ -12,7 +12,6 @@ def name2cp(k):
     
     
     unifiable_n = {}
    -
     for k in config.UNIFIABLE.keys():
         unifiable_n[name2cp(k)] = config.UNIFIABLE[k]
     
    @@ -245,6 +244,7 @@ def escape_md_section(text, snob=False):
     
         return text
     
    +
     def reformat_table(lines, right_margin):
         """
         Given the lines of a table
    @@ -256,7 +256,7 @@ def reformat_table(lines, right_margin):
             cols = [x.rstrip() for x in line.split('|')]
             max_width = [max(len(x) + right_margin, old_len)
                          for x, old_len in zip(cols, max_width)]
    -    
    +
         # reformat
         new_lines = []
         for line in lines:
    @@ -272,12 +272,13 @@ def reformat_table(lines, right_margin):
             new_lines.append('|'.join(new_cols))
         return new_lines
     
    +
     def pad_tables_in_text(text, right_margin=1):
         """
         Provide padding for tables in the text
         """
         lines = text.split('\n')
    -    table_buffer, altered_lines, table_widths, table_started = [], [], [], False
    +    table_buffer, table_started = [], False
         new_lines = []
         for line in lines:
             # Toggle table started
    diff --git a/setup.py b/setup.py
    index f8c2d249..5968e748 100644
    --- a/setup.py
    +++ b/setup.py
    @@ -3,11 +3,20 @@
     
     from setuptools import setup, Command, find_packages
     
    +
    +def read_md_convert(f):
    +    return convert(f, 'rst')
    +
    +
    +def read_md_open(f):
    +    return open(f, 'r').read()
    +
    +
     try:
         from pypandoc import convert
    -    read_md = lambda f: convert(f, 'rst')
    +    read_md = read_md_convert
     except ImportError:
    -    read_md = lambda f: open(f, 'r').read()
    +    read_md = read_md_open
     
     requires_list = []
     try:
    diff --git a/test/test_html2text.py b/test/test_html2text.py
    index a9c6efc7..4f69e342 100644
    --- a/test/test_html2text.py
    +++ b/test/test_html2text.py
    @@ -1,5 +1,7 @@
     import codecs
     import glob
    +import html2text
    +import logging
     import os
     import re
     import subprocess
    @@ -9,13 +11,11 @@
         import unittest2 as unittest
     else:
         import unittest
    -import logging
    +
     
     logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
                         level=logging.DEBUG)
     
    -import html2text
    -
     
     def cleanup_eol(clean_str):
         if os.name == 'nt' or sys.platform == 'cygwin':
    @@ -26,6 +26,7 @@ def cleanup_eol(clean_str):
             clean_str = clean_str.replace('\r\n', '\n')
         return clean_str
     
    +
     def test_module(fn, google_doc=False, **kwargs):
         h = html2text.HTML2Text()
         h.fn = fn
    @@ -98,19 +99,19 @@ class TestHTML2Text(unittest.TestCase):
     
     
     def generate_test(fn):
    -    def test_mod(self):
    +    def _test_mod(self):
             self.maxDiff = None
             result, actual = test_module(fn, **module_args)
             self.assertEqual(result, actual)
     
    -    def test_cmd(self):
    +    def _test_cmd(self):
             # Because there is no command-line option to control unicode_snob
             if 'unicode_snob' not in module_args:
                 self.maxDiff = None
                 result, actual = test_command(fn, *cmdline_args)
                 self.assertEqual(result, actual)
     
    -    def test_func(self):
    +    def _test_func(self):
             result, actual = test_function(fn, **func_args)
             self.assertEqual(result, actual)
     
    @@ -189,14 +190,19 @@ def test_func(self):
     
         if base_fn not in ['bodywidth_newline.html', 'abbr_tag.html']:
             test_func = None
    +    else:
    +        test_func = _test_func
     
         if base_fn == 'inplace_baseurl_substitution.html':
             module_args['baseurl'] = 'http://brettterpstra.com'
             module_args['body_width'] = 0
             # there is no way to specify baseurl in cli :(
             test_cmd = None
    +    else:
    +        test_cmd = _test_cmd
    +
    +    return _test_mod, test_cmd, test_func
     
    -    return test_mod, test_cmd, test_func
     
     # Originally from http://stackoverflow.com/questions/32899/\
     #    how-to-generate-dynamic-parametrized-unit-tests-in-python
    diff --git a/test/test_memleak.py b/test/test_memleak.py
    index 27999fbc..737999c0 100644
    --- a/test/test_memleak.py
    +++ b/test/test_memleak.py
    @@ -1,14 +1,15 @@
    +import html2text
    +import logging
     import sys
     if sys.version_info[:2] < (2, 7):
         import unittest2 as unittest
     else:
         import unittest
    -import logging
    +
    +
     logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
                         level=logging.DEBUG)
     
    -import html2text
    -
     
     class TestMemleak(unittest.TestCase):
         """
    
    From a6606e9f864976ecea8f1f7db8bb8168fa74e91a Mon Sep 17 00:00:00 2001
    From: Ciprian Miclaus 
    Date: Tue, 2 May 2017 13:34:49 +0100
    Subject: [PATCH 349/479] add flake8 to the travis build
    
    ---
     .travis.yml          | 4 ++++
     ChangeLog.rst        | 1 +
     requirements-dev.txt | 1 +
     3 files changed, 6 insertions(+)
    
    diff --git a/.travis.yml b/.travis.yml
    index 4d592e83..fbaa9117 100644
    --- a/.travis.yml
    +++ b/.travis.yml
    @@ -3,14 +3,18 @@ python:
       - "2.6"
       - "2.7"
       - "pypy"
    +  - "pypy3"
       - "3.2"
       - "3.3"
       - "3.4"
       - "3.5"
    +  - "3.6"
     install:
         - pip install coveralls==0.5
     before_script:
       - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi
    +  # flake8 doesn't support Python 2.6
    +  - if [[ $TRAVIS_PYTHON_VERSION != '2.6' ]]; then pip install flake8==3.3.0;flake8 .; fi
       - export COVERAGE_PROCESS_START=$PWD/.coveragerc
     script:
       - PYTHONPATH=$PYTHONPATH:. coverage run --source=html2text --rcfile=.coveragerc setup.py test -v
    diff --git a/ChangeLog.rst b/ChangeLog.rst
    index e3b2fabf..bf634cf8 100644
    --- a/ChangeLog.rst
    +++ b/ChangeLog.rst
    @@ -5,6 +5,7 @@
     * Fix #157: Fix images link with div wrap
     * Fix #55: Fix error when empty title tags
     * Fix #160: The html2text tests are failing on Windows and on Cygwin due to differences in eol handling between windows/*nix
    +* Feature #164: Housekeeping: Add flake8 to the travis build, cleanup existing flake8 violations, add py3.6 and pypy3 to the travis build
     
     
     2016.9.19
    diff --git a/requirements-dev.txt b/requirements-dev.txt
    index a5b00a3f..e9fdc438 100644
    --- a/requirements-dev.txt
    +++ b/requirements-dev.txt
    @@ -2,3 +2,4 @@ coverage==4.0.3
     py==1.4.31
     pypandoc==1.1.3
     wheel==0.24.0
    +flake8==3.3.0
    
    From 1d2b43787653adade63e786c417054c201ec05d1 Mon Sep 17 00:00:00 2001
    From: Ciprian Miclaus 
    Date: Wed, 14 Jun 2017 22:11:42 +0100
    Subject: [PATCH 350/479] Use setuptools<30 on Py3.2 as support was dropped in
     later versions (#168)
    
    * downgrade setuptools<30 for Py3.2
    * downgrade setuptools<30 for pypy3
    ---
     .travis.yml | 3 +++
     1 file changed, 3 insertions(+)
    
    diff --git a/.travis.yml b/.travis.yml
    index fbaa9117..e6ecf19e 100644
    --- a/.travis.yml
    +++ b/.travis.yml
    @@ -9,6 +9,9 @@ python:
       - "3.4"
       - "3.5"
       - "3.6"
    +before_install:
    +  # Python 3.2 needs setuptools < 30 - support for Py3.2 was dropped after that version
    +  - if [[ $TRAVIS_PYTHON_VERSION == '3.2' || $TRAVIS_PYTHON_VERSION == 'pypy3' ]]; then pip install -U 'setuptools<30'; fi
     install:
         - pip install coveralls==0.5
     before_script:
    
    From ba032c53d74c06bb7bf907ce08f525dd26d52592 Mon Sep 17 00:00:00 2001
    From: Ciprian Miclaus 
    Date: Thu, 15 Jun 2017 23:55:16 +0100
    Subject: [PATCH 351/479] Fix #109: Fix for unexpanded &lt; &gt; &amp; (#169)
    
    ---
     ChangeLog.rst           |  1 +
     html2text/__init__.py   | 14 +++-----------
     test/html-escaping.html |  4 ++--
     test/html-escaping.md   |  4 ++--
     test/test_html2text.py  | 34 +++++++++++++++++++++++++++++++++-
     5 files changed, 41 insertions(+), 16 deletions(-)
    
    diff --git a/ChangeLog.rst b/ChangeLog.rst
    index bf634cf8..2ae2ba30 100644
    --- a/ChangeLog.rst
    +++ b/ChangeLog.rst
    @@ -6,6 +6,7 @@
     * Fix #55: Fix error when empty title tags
     * Fix #160: The html2text tests are failing on Windows and on Cygwin due to differences in eol handling between windows/*nix
     * Feature #164: Housekeeping: Add flake8 to the travis build, cleanup existing flake8 violations, add py3.6 and pypy3 to the travis build
    +* Fix #109: Fix for unexpanded &lt; &gt; &amp;
     
     
     2016.9.19
    diff --git a/html2text/__init__.py b/html2text/__init__.py
    index 5ffa7293..5ae2192c 100644
    --- a/html2text/__init__.py
    +++ b/html2text/__init__.py
    @@ -10,7 +10,7 @@
     except ImportError:  # pragma: no cover
         pass
     
    -from html2text.compat import urlparse, HTMLParser, html_escape
    +from html2text.compat import urlparse, HTMLParser
     from html2text import config
     
     from html2text.utils import (
    @@ -175,18 +175,10 @@ def close(self):
             return outtext
     
         def handle_charref(self, c):
    -        charref = self.charref(c)
    -        if not self.code and not self.pre:
    -            charref = html_escape(charref)
    -        self.handle_data(charref, True)
    +        self.handle_data(self.charref(c), True)
     
         def handle_entityref(self, c):
    -        entityref = self.entityref(c)
    -        if not self.code and not \
    -            self.pre and \
    -                entityref != ' _place_holder;':
    -            entityref = html_escape(entityref)
    -        self.handle_data(entityref, True)
    +        self.handle_data(self.entityref(c), True)
     
         def handle_starttag(self, tag, attrs):
             self.handle_tag(tag, attrs, 1)
    diff --git a/test/html-escaping.html b/test/html-escaping.html
    index b6f1da74..9d805b03 100644
    --- a/test/html-escaping.html
    +++ b/test/html-escaping.html
    @@ -1,3 +1,3 @@
    -<p>Escaped HTML like &lt;div&gt; or &amp; should remain escaped on output</p>
    -<pre>...unless that escaped HTML is in a &lt;pre&gt; tag</pre>
    +<p>Escaped HTML like &lt;div&gt; or &amp; should NOT remain escaped on output</p>
    +<pre>...even when that escaped HTML is in a &lt;pre&gt; tag</pre>
    ...or a <code> tag \ No newline at end of file diff --git a/test/html-escaping.md b/test/html-escaping.md index 19e91eeb..41a318b7 100644 --- a/test/html-escaping.md +++ b/test/html-escaping.md @@ -1,8 +1,8 @@ -Escaped HTML like <div> or & should remain escaped on output +Escaped HTML like
    or & should NOT remain escaped on output - ...unless that escaped HTML is in a
     tag
    +    ...even when that escaped HTML is in a 
     tag
     
     `...or a  tag`
     
    diff --git a/test/test_html2text.py b/test/test_html2text.py
    index 4f69e342..4d8cd2e9 100644
    --- a/test/test_html2text.py
    +++ b/test/test_html2text.py
    @@ -95,7 +95,39 @@ def get_baseline(fn):
     
     
     class TestHTML2Text(unittest.TestCase):
    -    pass
    +
    +    def test_html_escape(self):
    +        self.assertEqual(
    +            html2text.compat.html_escape('
    and then
    & other tags'), + '<pre>and then<div> & other tags' + ) + + def test_unescape(self): + self.assertEqual( + '
    and then
    & other tags', + html2text.unescape( + '<pre>and then<div> & other tags' + ) + ) + + def _skip_certain_tags(self, h2t, tag, attrs, start): + if tag == 'b': + return True + + def test_tag_callback(self): + h = html2text.HTML2Text() + h.tag_callback = self._skip_certain_tags + ret = h.handle( + 'this is a txt and this is a' + ' with text and ' + 'some italics too.' + ) + self.assertEqual( + ret, + 'this is a txt and this is a' + ' with text and ' + 'some _italics_ too.\n\n' + ) def generate_test(fn): From 7ae0a2faef2ebf1197d71927dd8924c9a71cf05a Mon Sep 17 00:00:00 2001 From: Toshihiro Kamiya Date: Sun, 25 Jun 2017 20:33:43 +0900 Subject: [PATCH 352/479] Fix #143: Fix line wrapping for the lines starting with **bold** --- AUTHORS.rst | 2 +- ChangeLog.rst | 1 + html2text/utils.py | 2 +- test/bold_long_line.html | 3 +++ test/bold_long_line.md | 3 +++ 5 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 test/bold_long_line.html create mode 100644 test/bold_long_line.md diff --git a/AUTHORS.rst b/AUTHORS.rst index 76852977..152762c6 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -22,7 +22,7 @@ The AUTHORS/Contributors are (and/or have been): * Mikhail Melnik * Andres Rey * Ciprian Miclaus - +* Toshihiro Kamiya Maintainer: diff --git a/ChangeLog.rst b/ChangeLog.rst index 2ae2ba30..0b89ebd4 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -7,6 +7,7 @@ * Fix #160: The html2text tests are failing on Windows and on Cygwin due to differences in eol handling between windows/*nix * Feature #164: Housekeeping: Add flake8 to the travis build, cleanup existing flake8 violations, add py3.6 and pypy3 to the travis build * Fix #109: Fix for unexpanded < > & +* Fix #143: Fix line wrapping for the lines starting with bold 2016.9.19 diff --git a/html2text/utils.py b/html2text/utils.py index 24cdd7ea..ebcb34c3 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -190,7 +190,7 @@ def skipwrap(para, wrap_links): # I'm not sure what this is for; I thought it was to 
detect lists, # but there's a
    -inside- case in one of the tests that # also depends upon it. - if stripped[0:1] == '-' or stripped[0:1] == '*': + if stripped[0:1] == '-' or (stripped[0:1] == '*' and not stripped[0:2] == '**'): return True # If the text begins with a single -, *, or +, followed by a space, diff --git a/test/bold_long_line.html b/test/bold_long_line.html new file mode 100644 index 00000000..2aba478d --- /dev/null +++ b/test/bold_long_line.html @@ -0,0 +1,3 @@ +

    +text and a very long long long long long long long long long long long long long long long long long long long long line +

    diff --git a/test/bold_long_line.md b/test/bold_long_line.md new file mode 100644 index 00000000..fc1119ed --- /dev/null +++ b/test/bold_long_line.md @@ -0,0 +1,3 @@ +**text** and a very long long long long long long long long long long long +long long long long long long long long long line + From 513bd4f3f3867490963c99e0d7a713d2cedd4ebd Mon Sep 17 00:00:00 2001 From: Toshihiro Kamiya Date: Mon, 26 Jun 2017 05:27:12 +0900 Subject: [PATCH 353/479] pep8 issue --- html2text/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/utils.py b/html2text/utils.py index ebcb34c3..dc5955f6 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -190,7 +190,7 @@ def skipwrap(para, wrap_links): # I'm not sure what this is for; I thought it was to detect lists, # but there's a
    -inside- case in one of the tests that # also depends upon it. - if stripped[0:1] == '-' or (stripped[0:1] == '*' and not stripped[0:2] == '**'): + if stripped[0:1] in ('-', '*') and not stripped[0:2] == '**': return True # If the text begins with a single -, *, or +, followed by a space, From 0d93c8070b25b2523cb85f8e4016a172e83e087c Mon Sep 17 00:00:00 2001 From: Matt Dennewitz Date: Thu, 27 Jul 2017 18:24:48 -0500 Subject: [PATCH 354/479] adds support for "700", "800", and "900" as bold style indicators google, and presumably others, use `font-weight: 700` to make bold a region of text. this pr polls tag styles for both `bold` and `700` (via new config value, `BOLD_TEXT_STYLE_VALUES`) to be certain. --- AUTHORS.rst | 1 + ChangeLog.rst | 6 ++++++ html2text/__init__.py | 10 +++++++++- html2text/config.py | 3 +++ test/google-like_font-properties.html | 6 ++++++ test/google-like_font-properties.md | 6 ++++++ 6 files changed, 31 insertions(+), 1 deletion(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 152762c6..c2d5aa41 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -23,6 +23,7 @@ The AUTHORS/Contributors are (and/or have been): * Andres Rey * Ciprian Miclaus * Toshihiro Kamiya +* Matt Dennewitz Maintainer: diff --git a/ChangeLog.rst b/ChangeLog.rst index 0b89ebd4..9656ffad 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -9,6 +9,12 @@ * Fix #109: Fix for unexpanded < > & * Fix #143: Fix line wrapping for the lines starting with bold +2017.7.27 +========= +---- + +* Adds support for numeric bold text indication in `font-weight`, + as used by Google (and presumably others.) 
2016.9.19 ========= diff --git a/html2text/__init__.py b/html2text/__init__.py index 5ae2192c..11ffd398 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -223,7 +223,15 @@ def handle_emphasis(self, start, tag_style, parent_style): # handle Google's text emphasis strikethrough = 'line-through' in \ tag_emphasis and self.hide_strikethrough - bold = 'bold' in tag_emphasis and 'bold' not in parent_emphasis + + # google and others may mark a font's weight as `bold` or `700` + bold = False + for bold_marker in config.BOLD_TEXT_STYLE_VALUES: + bold = (bold_marker in tag_emphasis + and bold_marker not in parent_emphasis) + if bold is True: + break + italic = 'italic' in tag_emphasis and 'italic' not in parent_emphasis fixed = google_fixed_width_font(tag_style) and not \ google_fixed_width_font(parent_style) and not self.pre diff --git a/html2text/config.py b/html2text/config.py index bde0eaa1..d0656b0a 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -31,6 +31,9 @@ # Number of pixels Google indents nested lists GOOGLE_LIST_INDENT = 36 +# Values Google and others may use to indicate bold text +BOLD_TEXT_STYLE_VALUES = ('bold', '700', '800', '900') + IGNORE_ANCHORS = False IGNORE_IMAGES = False IMAGES_TO_ALT = False diff --git a/test/google-like_font-properties.html b/test/google-like_font-properties.html index cff3ebd6..6f330b6b 100644 --- a/test/google-like_font-properties.html +++ b/test/google-like_font-properties.html @@ -5,6 +5,12 @@

    font-weight: bold

    FONT-WEIGHT: BOLD

    +

    font-weight: 700

    +

    FONT-WEIGHT: 700

    +

    font-weight: 800

    +

    FONT-WEIGHT: 800

    +

    font-weight: 900

    +

    FONT-WEIGHT: 900

    font-style: italic

    FONT-STYLE: ITALIC

    diff --git a/test/google-like_font-properties.md b/test/google-like_font-properties.md index c7a0312d..c8a2e9c6 100644 --- a/test/google-like_font-properties.md +++ b/test/google-like_font-properties.md @@ -1,5 +1,11 @@ **font-weight: bold** **FONT-WEIGHT: BOLD** +**font-weight: 700** +**FONT-WEIGHT: 700** +**font-weight: 800** +**FONT-WEIGHT: 800** +**font-weight: 900** +**FONT-WEIGHT: 900** _font-style: italic_ _FONT-STYLE: ITALIC_ _**font-weight: bold;font-style: italic**_ From 90762734002a49fc523ebbf32bac1d7c1f684695 Mon Sep 17 00:00:00 2001 From: Matt Dennewitz Date: Fri, 28 Jul 2017 10:48:51 -0500 Subject: [PATCH 355/479] moves changelog note into `0000.00.00` range. bold check made implicit. --- ChangeLog.rst | 6 +----- html2text/__init__.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 9656ffad..9f868d61 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -8,14 +8,10 @@ * Feature #164: Housekeeping: Add flake8 to the travis build, cleanup existing flake8 violations, add py3.6 and pypy3 to the travis build * Fix #109: Fix for unexpanded < > & * Fix #143: Fix line wrapping for the lines starting with bold - -2017.7.27 -========= ----- - * Adds support for numeric bold text indication in `font-weight`, as used by Google (and presumably others.) 
+ 2016.9.19 ========= ---- diff --git a/html2text/__init__.py b/html2text/__init__.py index 11ffd398..b82c800d 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -229,7 +229,7 @@ def handle_emphasis(self, start, tag_style, parent_style): for bold_marker in config.BOLD_TEXT_STYLE_VALUES: bold = (bold_marker in tag_emphasis and bold_marker not in parent_emphasis) - if bold is True: + if bold: break italic = 'italic' in tag_emphasis and 'italic' not in parent_emphasis From b2765e279e1218104df0d90e6e4fc00134b7108d Mon Sep 17 00:00:00 2001 From: Jonathan Sundqvist Date: Fri, 11 Aug 2017 23:26:42 +0200 Subject: [PATCH 356/479] Fix issue with emphasis and whitespace --- AUTHORS.rst | 1 + html2text/__init__.py | 51 ++++++++++++++++++++++++++++++++---- test/edge_case_emphasis.html | 7 +++++ test/edge_case_emphasis.md | 14 ++++++++++ 4 files changed, 68 insertions(+), 5 deletions(-) create mode 100644 test/edge_case_emphasis.html create mode 100644 test/edge_case_emphasis.md diff --git a/AUTHORS.rst b/AUTHORS.rst index 152762c6..821c0dd0 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -23,6 +23,7 @@ The AUTHORS/Contributors are (and/or have been): * Andres Rey * Ciprian Miclaus * Toshihiro Kamiya +* Jonathan Sundqvist Maintainer: diff --git a/html2text/__init__.py b/html2text/__init__.py index 5ae2192c..4c571508 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -119,6 +119,10 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.abbr_data = None # last inner HTML (for abbr being defined) self.abbr_list = {} # stack of abbreviations to write later self.baseurl = baseurl + self.stressed = False + self.preceding_stressed = False + self.preceding_data = None + self.current_tag = None try: del unifiable_n[name2cp('nbsp')] @@ -276,6 +280,7 @@ def handle_emphasis(self, start, tag_style, parent_style): self.quiet -= 1 def handle_tag(self, tag, attrs, start): + self.current_tag = tag # attrs is None for endtags if attrs is None: 
attrs = {} @@ -368,15 +373,37 @@ def handle_tag(self, tag, attrs, start): self.blockquote -= 1 self.p() + def no_preceding_space(self): + if self.preceding_data and re.match(r'[^\s]', self.preceding_data[-1]): + return True + if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: - self.o(self.emphasis_mark) + if start and no_preceding_space(self): + emphasis = ' ' + self.emphasis_mark + else: + emphasis = self.emphasis_mark + + self.o(emphasis) + if start: + self.stressed = True if tag in ['strong', 'b'] and not self.ignore_emphasis: - self.o(self.strong_mark) - if tag in ['del', 'strike', 's']: + if start and no_preceding_space(self): + strong = ' ' + self.strong_mark + else: + strong = self.strong_mark + + self.o(strong) if start: - self.o('~~') + self.stressed = True + if tag in ['del', 'strike', 's']: + if start and no_preceding_space(self): + strike = ' ~~' else: - self.o('~~') + strike = '~~' + + self.o(strike) + if start: + self.stressed = True if self.google_doc: if not self.inheader: @@ -761,6 +788,19 @@ def o(self, data, puredata=0, force=0): self.outcount += 1 def handle_data(self, data, entity_char=False): + + if self.stressed: + data = data.strip() + self.stressed = False + self.preceding_stressed = True + elif (self.preceding_stressed + and re.match(r'[^\s.!?]', data[0]) + and not hn(self.current_tag) + and self.current_tag not in ['a', 'code', 'pre']): + # should match a letter or common punctuation + data = ' ' + data + self.preceding_stressed = False + if self.style: self.style_def.update(dumb_css_parser(data)) @@ -778,6 +818,7 @@ def handle_data(self, data, entity_char=False): if not self.code and not self.pre and not entity_char: data = escape_md_section(data, snob=self.escape_snob) + self.preceding_data = data self.o(data, 1) def unknown_decl(self, data): # pragma: no cover diff --git a/test/edge_case_emphasis.html b/test/edge_case_emphasis.html new file mode 100644 index 00000000..99d14008 --- /dev/null +++ b/test/edge_case_emphasis.html @@ 
-0,0 +1,7 @@ +

    emphasis

    +

    emphasis: some text

    +

    repeat: again

    +

    separate emphasis some more text

    +

    emphasis.

    +

    emphasis?

    +

    emphasis!

    diff --git a/test/edge_case_emphasis.md b/test/edge_case_emphasis.md new file mode 100644 index 00000000..5ec339e7 --- /dev/null +++ b/test/edge_case_emphasis.md @@ -0,0 +1,14 @@ +_emphasis_ + +_emphasis:_ some text + +_repeat:_ again + +separate _emphasis_ some more text + +_emphasis_. + +_emphasis_? + +_emphasis_! + From a18c358d2f3b6fa0e7a991b65007529e62a11bd7 Mon Sep 17 00:00:00 2001 From: Jonathan Sundqvist Date: Sat, 12 Aug 2017 00:40:59 +0200 Subject: [PATCH 357/479] Flake fix --- html2text/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 1116b506..b2507560 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -382,7 +382,8 @@ def handle_tag(self, tag, attrs, start): self.p() def no_preceding_space(self): - if self.preceding_data and re.match(r'[^\s]', self.preceding_data[-1]): + if (self.preceding_data + and re.match(r'[^\s]', self.preceding_data[-1])): return True if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: From 5aa7c1103f0bfbf7f1dbb5ea359f9441699e055a Mon Sep 17 00:00:00 2001 From: Jonathan Sundqvist Date: Sat, 12 Aug 2017 20:22:35 +0200 Subject: [PATCH 358/479] Better styling and increase test cases --- ChangeLog.rst | 1 + html2text/__init__.py | 8 ++++---- test/edge_case_emphasis.html | 7 ------- test/emphasis_preserved_whitespace.html | 20 +++++++++++++++++++ ...is.md => emphasis_preserved_whitespace.md} | 14 +++++++++++++ 5 files changed, 39 insertions(+), 11 deletions(-) delete mode 100644 test/edge_case_emphasis.html create mode 100644 test/emphasis_preserved_whitespace.html rename test/{edge_case_emphasis.md => emphasis_preserved_whitespace.md} (52%) diff --git a/ChangeLog.rst b/ChangeLog.rst index 9f868d61..e7f11e90 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -10,6 +10,7 @@ * Fix #143: Fix line wrapping for the lines starting with bold * Adds support for numeric bold text indication in `font-weight`, as used by Google (and 
presumably others.) +* Fix #173 and #142: Stripping whitespace in crucial markdown and adding whitespace as necessary 2016.9.19 diff --git a/html2text/__init__.py b/html2text/__init__.py index b2507560..6398baea 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -382,9 +382,8 @@ def handle_tag(self, tag, attrs, start): self.p() def no_preceding_space(self): - if (self.preceding_data - and re.match(r'[^\s]', self.preceding_data[-1])): - return True + return (self.preceding_data + and re.match(r'[^\s]', self.preceding_data[-1])) if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: if start and no_preceding_space(self): @@ -395,6 +394,7 @@ def no_preceding_space(self): self.o(emphasis) if start: self.stressed = True + if tag in ['strong', 'b'] and not self.ignore_emphasis: if start and no_preceding_space(self): strong = ' ' + self.strong_mark @@ -404,6 +404,7 @@ def no_preceding_space(self): self.o(strong) if start: self.stressed = True + if tag in ['del', 'strike', 's']: if start and no_preceding_space(self): strike = ' ~~' @@ -797,7 +798,6 @@ def o(self, data, puredata=0, force=0): self.outcount += 1 def handle_data(self, data, entity_char=False): - if self.stressed: data = data.strip() self.stressed = False diff --git a/test/edge_case_emphasis.html b/test/edge_case_emphasis.html deleted file mode 100644 index 99d14008..00000000 --- a/test/edge_case_emphasis.html +++ /dev/null @@ -1,7 +0,0 @@ -

    emphasis

    -

    emphasis: some text

    -

    repeat: again

    -

    separate emphasis some more text

    -

    emphasis.

    -

    emphasis?

    -

    emphasis!

    diff --git a/test/emphasis_preserved_whitespace.html b/test/emphasis_preserved_whitespace.html new file mode 100644 index 00000000..7fcd9b83 --- /dev/null +++ b/test/emphasis_preserved_whitespace.html @@ -0,0 +1,20 @@ +

    emphasis

    +

    emphasis: some text

    +

    repeat: again

    + +

    bold

    +

    bold: some text

    +

    repeat: again

    + +

    strike

    +

    strike: some text

    +

    strike: again

    + +

    separate emphasis some more text

    + + +

    emphasis.

    +

    emphasis?

    +

    emphasis!

    + +

    em1em2

    diff --git a/test/edge_case_emphasis.md b/test/emphasis_preserved_whitespace.md similarity index 52% rename from test/edge_case_emphasis.md rename to test/emphasis_preserved_whitespace.md index 5ec339e7..42bb9161 100644 --- a/test/edge_case_emphasis.md +++ b/test/emphasis_preserved_whitespace.md @@ -4,6 +4,18 @@ _emphasis:_ some text _repeat:_ again +**bold** + +**bold:** some text + +**repeat:** again + +~~strike~~ + +~~strike:~~ some text + +~~strike:~~ again + separate _emphasis_ some more text _emphasis_. @@ -12,3 +24,5 @@ _emphasis_? _emphasis_! +_em1_ _em2_ + From 652d4f35063001143607e1be3ccf5d283a21b778 Mon Sep 17 00:00:00 2001 From: Jonathan Sundqvist Date: Mon, 14 Aug 2017 19:17:30 +0200 Subject: [PATCH 359/479] Refactor imports and title for links --- html2text/__init__.py | 64 ++++++++++++++--------------------------- test/empty-title-tag.md | 2 +- test/link_titles.md | 2 +- 3 files changed, 23 insertions(+), 45 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 6398baea..983deab0 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -30,6 +30,13 @@ pad_tables_in_text ) +try: + chr = unichr + nochr = unicode('') +except NameError: + # python3 uses chr + nochr = str('') + __version__ = (2016, 9, 19) @@ -151,22 +158,15 @@ def outtextf(self, s): def close(self): HTMLParser.HTMLParser.close(self) - try: - nochr = unicode('') - unicode_character = unichr - except NameError: - nochr = str('') - unicode_character = chr - self.pbr() self.o('', 0, 'end') outtext = nochr.join(self.outtextlist) if self.unicode_snob: - nbsp = unicode_character(name2cp('nbsp')) + nbsp = chr(name2cp('nbsp')) else: - nbsp = unicode_character(32) + nbsp = chr(32) try: outtext = outtext.replace(unicode(' _place_holder;'), nbsp) except NameError: @@ -435,6 +435,12 @@ def no_preceding_space(self): self.abbr_title = None self.abbr_data = '' + def link_url(self, link, title=""): + url = urlparse.urljoin(self.baseurl, link) + title = ' 
"{0}"'.format(title) if title.strip() else '' + self.o(']({url}{title})'.format(url=escape_md(url), + title=title)) + if tag == "a" and not self.ignore_links: if start: if 'href' in attrs and \ @@ -460,34 +466,12 @@ def no_preceding_space(self): self.maybe_automatic_link = None if self.inline_links: try: - if a['title'] is not None: - title = escape_md(a['title']) - else: - title = "" + title = a['title'] if a['title'] else '' + title = escape_md(title) except KeyError: - self.o( - "](" + - escape_md( - urlparse.urljoin( - self.baseurl, - a['href'] - ) - ) + - ")" - ) + link_url(self, a['href'], '') else: - self.o( - "](" + - escape_md( - urlparse.urljoin( - self.baseurl, - a['href'] - ) - ) + - ' "' + - title + - '" )' - ) + link_url(self, a['href'], title) else: i = self.previousIndex(a) if i is not None: @@ -844,10 +828,7 @@ def charref(self, name): return unifiable_n[c] else: try: - try: - return unichr(c) - except NameError: # Python3 - return chr(c) + return chr(c) except ValueError: # invalid unicode return '' @@ -863,10 +844,7 @@ def entityref(self, c): if c == 'nbsp': return config.UNIFIABLE[c] else: - try: - return unichr(name2cp(c)) - except NameError: # Python3 - return chr(name2cp(c)) + return chr(name2cp(c)) def replaceEntities(self, s): s = s.group(1) diff --git a/test/empty-title-tag.md b/test/empty-title-tag.md index 0dc02eda..ce342288 100644 --- a/test/empty-title-tag.md +++ b/test/empty-title-tag.md @@ -1,2 +1,2 @@ -[This is an A tag with an empty title property](test.html "" ) +[This is an A tag with an empty title property](test.html) diff --git a/test/link_titles.md b/test/link_titles.md index d68fb7e8..930c691d 100644 --- a/test/link_titles.md +++ b/test/link_titles.md @@ -1,3 +1,3 @@ -[ first example](http://example.com "MyTitle" ) +[ first example](http://example.com "MyTitle") [ second example](http://example.com) From d9a372635dd3c2fdd8d57ab475ebaef300c6904d Mon Sep 17 00:00:00 2001 From: Simon Meers Date: Wed, 6 Sep 2017 12:33:29 +1000 
Subject: [PATCH 360/479] semi-gracefully handle colspan attributes on tables when padding --- html2text/utils.py | 10 ++++++++++ test/pad_table.html | 15 ++++++++++++++- test/pad_table.md | 9 ++++++++- 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/html2text/utils.py b/html2text/utils.py index dc5955f6..72004223 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -254,6 +254,16 @@ def reformat_table(lines, right_margin): max_width = [len(x.rstrip()) + right_margin for x in lines[0].split('|')] for line in lines: cols = [x.rstrip() for x in line.split('|')] + + # don't drop any data if colspan attributes result in unequal lengths + if len(cols) < len(max_width): + cols += [''] * (len(max_width) - len(cols)) + elif len(max_width) < len(cols): + max_width += [ + len(x) + right_margin for x in + cols[-(len(cols) - len(max_width)):] + ] + max_width = [max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)] diff --git a/test/pad_table.html b/test/pad_table.html index d2966aae..7d3b733a 100644 --- a/test/pad_table.html +++ b/test/pad_table.html @@ -22,5 +22,18 @@ Content 1 Content 2 longer 200 Image! -something else entirely +something else entirely
    + + + + + + + + + + + +
    OneTwoThree
    ABC
    AB+C
    A+BC
    A+B+C
    + diff --git a/test/pad_table.md b/test/pad_table.md index 70ad82f4..0c82aa54 100644 --- a/test/pad_table.md +++ b/test/pad_table.md @@ -24,5 +24,12 @@ Header 1 | Header 2 | Header 3 Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! Content 1 | Content 2 longer | ![200](http://lorempixel.com/200/200) Image! -something else entirely +something else entirely +One | Two | Three +------|-----|------- +A | B | C +A | B+C +A+B | C +A+B+C + From b96b3887041f7b753c0944ca32f939620d922eca Mon Sep 17 00:00:00 2001 From: Simon Meers Date: Wed, 6 Sep 2017 12:39:20 +1000 Subject: [PATCH 361/479] update authors/changelog as per contribution guidelines --- AUTHORS.rst | 1 + ChangeLog.rst | 1 + 2 files changed, 2 insertions(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 6222cc33..5c7a4714 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -25,6 +25,7 @@ The AUTHORS/Contributors are (and/or have been): * Toshihiro Kamiya * Matt Dennewitz * Jonathan Sundqvist +* Simon Meers Maintainer: diff --git a/ChangeLog.rst b/ChangeLog.rst index e7f11e90..d4f67384 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -11,6 +11,7 @@ * Adds support for numeric bold text indication in `font-weight`, as used by Google (and presumably others.) * Fix #173 and #142: Stripping whitespace in crucial markdown and adding whitespace as necessary +* Don't drop any cell data on tables uneven row lengths (e.g. colspan in use) 2016.9.19 From 686e7bae400e217258276d44c461d0f6e9beb859 Mon Sep 17 00:00:00 2001 From: Simon Meers Date: Wed, 6 Sep 2017 12:51:46 +1000 Subject: [PATCH 362/479] include test for shorter opening row --- test/pad_table.html | 12 ++++++++++++ test/pad_table.md | 7 +++++++ 2 files changed, 19 insertions(+) diff --git a/test/pad_table.html b/test/pad_table.html index 7d3b733a..3fdd40c0 100644 --- a/test/pad_table.html +++ b/test/pad_table.html @@ -36,4 +36,16 @@ + + + + + + + + + + +
    One+TwoThree
    ABC
    AB+C
    A+BC
    A+B+C
    + diff --git a/test/pad_table.md b/test/pad_table.md index 0c82aa54..167fc6ec 100644 --- a/test/pad_table.md +++ b/test/pad_table.md @@ -32,4 +32,11 @@ A | B+C A+B | C A+B+C +One+Two | Three +--------|------- +A | B | C +A | B+C +A+B | C +A+B+C + From fc88ba906e0c2e2aecba344eac291a83e08b40cb Mon Sep 17 00:00:00 2001 From: Simon Meers Date: Fri, 8 Sep 2017 12:16:07 +1000 Subject: [PATCH 363/479] remove redundant length calculations. See #183 --- html2text/utils.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/html2text/utils.py b/html2text/utils.py index 72004223..d382e00f 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -252,17 +252,20 @@ def reformat_table(lines, right_margin): """ # find the maximum width of the columns max_width = [len(x.rstrip()) + right_margin for x in lines[0].split('|')] + max_cols = len(max_width) for line in lines: cols = [x.rstrip() for x in line.split('|')] + num_cols = len(cols) # don't drop any data if colspan attributes result in unequal lengths - if len(cols) < len(max_width): - cols += [''] * (len(max_width) - len(cols)) - elif len(max_width) < len(cols): + if num_cols < max_cols: + cols += [''] * (max_cols - num_cols) + elif max_cols < num_cols: max_width += [ len(x) + right_margin for x in - cols[-(len(cols) - len(max_width)):] + cols[-(num_cols - max_cols):] ] + max_cols = num_cols max_width = [max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)] From e330da7ee28a9883bab28db57abe5ab7a2349998 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 4 Oct 2017 06:37:37 +0000 Subject: [PATCH 364/479] Update the version to the latest and getting it ready to release --- ChangeLog.rst | 2 +- html2text/__init__.py | 2 +- requirements-dev.txt | 10 +++++----- setup.py | 1 + 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index d4f67384..19917e5d 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,4 +1,4 @@ -0000.00.00 
+2017.10.4 ========== ---- diff --git a/html2text/__init__.py b/html2text/__init__.py index 983deab0..20ef3544 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -37,7 +37,7 @@ # python3 uses chr nochr = str('') -__version__ = (2016, 9, 19) +__version__ = (2017, 10, 4) # TODO: diff --git a/requirements-dev.txt b/requirements-dev.txt index e9fdc438..800eda13 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ -coverage==4.0.3 -py==1.4.31 -pypandoc==1.1.3 -wheel==0.24.0 -flake8==3.3.0 +coverage==4.4.1 +py==1.4.34 +pypandoc==1.4 +wheel==0.30.0 +flake8==3.4.1 diff --git a/setup.py b/setup.py index 5968e748..2541f542 100644 --- a/setup.py +++ b/setup.py @@ -78,6 +78,7 @@ def run(self): 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', ], entry_points=""" [console_scripts] From ce2c7e4388196b0589a0e551359eff2eb0506437 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A4=9C=E9=9B=80?= Date: Wed, 1 Nov 2017 20:12:39 +0800 Subject: [PATCH 365/479] fix bug #188 --- html2text/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 20ef3544..d7402a06 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -437,8 +437,8 @@ def no_preceding_space(self): def link_url(self, link, title=""): url = urlparse.urljoin(self.baseurl, link) - title = ' "{0}"'.format(title) if title.strip() else '' - self.o(']({url}{title})'.format(url=escape_md(url), + title = u' "{0}"'.format(title) if title.strip() else '' + self.o(u']({url}{title})'.format(url=escape_md(url), title=title)) if tag == "a" and not self.ignore_links: From 97b9d2fdda0065c43067ae1169f3c109e1458120 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A4=9C=E9=9B=80?= Date: Wed, 1 Nov 2017 22:07:23 +0800 Subject: [PATCH 366/479] fix visual indent --- html2text/__init__.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index d7402a06..11bbcaa8 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -439,7 +439,7 @@ def link_url(self, link, title=""): url = urlparse.urljoin(self.baseurl, link) title = u' "{0}"'.format(title) if title.strip() else '' self.o(u']({url}{title})'.format(url=escape_md(url), - title=title)) + title=title)) if tag == "a" and not self.ignore_links: if start: From d88efc1475997fc9332cb924587d574aa264179a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A4=9C=E9=9B=80?= Date: Fri, 24 Nov 2017 16:18:56 +0800 Subject: [PATCH 367/479] import unicode_literals --- html2text/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 11bbcaa8..76114055 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -2,6 +2,7 @@ # coding: utf-8 """html2text: Turn HTML into equivalent Markdown-structured text.""" from __future__ import division +from __future__ import unicode_literals import re import sys @@ -437,9 +438,9 @@ def no_preceding_space(self): def link_url(self, link, title=""): url = urlparse.urljoin(self.baseurl, link) - title = u' "{0}"'.format(title) if title.strip() else '' - self.o(u']({url}{title})'.format(url=escape_md(url), - title=title)) + title = ' "{0}"'.format(title) if title.strip() else '' + self.o(']({url}{title})'.format(url=escape_md(url), + title=title)) if tag == "a" and not self.ignore_links: if start: From b7d4758bb2a5ed368246d3f69c82014fa2aa8df6 Mon Sep 17 00:00:00 2001 From: Kurt McKee Date: Sun, 24 Dec 2017 21:25:06 -0600 Subject: [PATCH 368/479] Support the tag --- AUTHORS.rst | 1 + ChangeLog.rst | 7 +++++++ html2text/__init__.py | 3 ++- test/kbd_tag.html | 1 + test/kbd_tag.md | 2 ++ 5 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 test/kbd_tag.html create mode 100644 test/kbd_tag.md diff --git a/AUTHORS.rst b/AUTHORS.rst index 
5c7a4714..488eeec0 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -26,6 +26,7 @@ The AUTHORS/Contributors are (and/or have been): * Matt Dennewitz * Jonathan Sundqvist * Simon Meers +* Kurt McKee Maintainer: diff --git a/ChangeLog.rst b/ChangeLog.rst index 19917e5d..8395d981 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,10 @@ +0000.00.00 +========== +---- + +* Add support for the <kbd> tag. + + 2017.10.4 ========== ---- diff --git a/html2text/__init__.py b/html2text/__init__.py index 76114055..2acbfe41 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -421,9 +421,10 @@ def no_preceding_space(self): # handle some font attributes, but leave headers clean self.handle_emphasis(start, tag_style, parent_style) - if tag in ["code", "tt"] and not self.pre: + if tag in ["kbd", "code", "tt"] and not self.pre: self.o('`') # TODO: `` `this` `` self.code = not self.code + if tag == "abbr": if start: self.abbr_title = None diff --git a/test/kbd_tag.html b/test/kbd_tag.html new file mode 100644 index 00000000..4c2b92a2 --- /dev/null +++ b/test/kbd_tag.html @@ -0,0 +1 @@ +Press <kbd>[CTRL]+c</kbd> to copy. diff --git a/test/kbd_tag.md b/test/kbd_tag.md new file mode 100644 index 00000000..7fd30339 --- /dev/null +++ b/test/kbd_tag.md @@ -0,0 +1,2 @@ +Press `[CTRL]+c` to copy. + From 8673bdcf5d7b91e279a60f6963e0cbd536d25d97 Mon Sep 17 00:00:00 2001 From: Kurt McKee Date: Sun, 24 Dec 2017 16:02:54 -0600 Subject: [PATCH 369/479] Support the <q> tag The opening and closing quotation marks can be customized. --- ChangeLog.rst | 1 + html2text/__init__.py | 12 ++++++++++++ test/q_tag.html | 1 + test/q_tag.md | 2 ++ 4 files changed, 16 insertions(+) create mode 100644 test/q_tag.html create mode 100644 test/q_tag.md diff --git a/ChangeLog.rst b/ChangeLog.rst index 8395d981..2f362d1d 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -3,6 +3,7 @@ ---- * Add support for the <kbd> tag. +* Add support for the <q> tag.
2017.10.4 diff --git a/html2text/__init__.py b/html2text/__init__.py index 2acbfe41..4e0701e5 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -114,6 +114,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.pre = 0 self.startpre = 0 self.code = False + self.quote = False self.br_toggle = '' self.lastWasNL = 0 self.lastWasList = False @@ -437,6 +438,17 @@ def no_preceding_space(self): self.abbr_title = None self.abbr_data = '' + if tag == "q": + # Different languages use different quotation marks. + # It should be easy to change the punctuation characters. + if not self.quote: + # Open quote + self.o('"') + else: + # Close quote + self.o('"') + self.quote = not self.quote + def link_url(self, link, title=""): url = urlparse.urljoin(self.baseurl, link) title = ' "{0}"'.format(title) if title.strip() else '' diff --git a/test/q_tag.html b/test/q_tag.html new file mode 100644 index 00000000..63e03608 --- /dev/null +++ b/test/q_tag.html @@ -0,0 +1 @@ +If this is a test, he said, then it should pass. diff --git a/test/q_tag.md b/test/q_tag.md new file mode 100644 index 00000000..be2109bc --- /dev/null +++ b/test/q_tag.md @@ -0,0 +1,2 @@ +"If this is a test," he said, "then it should pass". 
+ From ba2bec7b77fc55672368b2ff9fa9c751940db470 Mon Sep 17 00:00:00 2001 From: Kurt McKee Date: Sat, 6 Jan 2018 23:33:01 -0600 Subject: [PATCH 370/479] Allow users to configure quote characters --- html2text/__init__.py | 10 ++++------ html2text/cli.py | 18 ++++++++++++++++++ html2text/config.py | 7 +++++++ 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 4e0701e5..46ae36f1 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -89,6 +89,8 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.pad_tables = config.PAD_TABLES # covered in cli self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli self.tag_callback = None + self.open_quote = config.OPEN_QUOTE # covered in cli + self.close_quote = config.CLOSE_QUOTE # covered in cli if out is None: # pragma: no cover self.out = self.outtextf @@ -439,14 +441,10 @@ def no_preceding_space(self): self.abbr_data = '' if tag == "q": - # Different languages use different quotation marks. - # It should be easy to change the punctuation characters. 
if not self.quote: - # Open quote - self.o('"') + self.o(self.open_quote) else: - # Close quote - self.o('"') + self.o(self.close_quote) self.quote = not self.quote def link_url(self, link, title=""): diff --git a/html2text/cli.py b/html2text/cli.py index e155e4fb..99b563b0 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -215,6 +215,22 @@ class bcolors: # pragma: no cover help="What to do in case of decode errors.'ignore', 'strict' and " "'replace' are acceptable values" ) + p.add_option( + "--open-quote", + dest="open_quote", + action="store", + type="str", + default=config.OPEN_QUOTE, + help="The character used to open quotes (default: '{}')".format(config.OPEN_QUOTE) + ) + p.add_option( + "--close-quote", + dest="close_quote", + action="store", + type="str", + default=config.CLOSE_QUOTE, + help="The character used to close quotes (default: '{}')".format(config.CLOSE_QUOTE) + ) (options, args) = p.parse_args() # process input @@ -302,5 +318,7 @@ def detect(x): h.wrap_links = options.wrap_links h.pad_tables = options.pad_tables h.default_image_alt = options.default_image_alt + h.open_quote = options.open_quote + h.close_quote = options.close_quote wrapwrite(h.handle(data)) diff --git a/html2text/config.py b/html2text/config.py index d0656b0a..6ccf3875 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re # Use Unicode characters instead of their ascii pseudo-replacements @@ -136,3 +138,8 @@ # Use a single line break after a block element rather than two line breaks. # NOTE: Requires body width setting to be 0. SINGLE_LINE_BREAK = False + + +# Use double quotation marks when converting the tag. 
+OPEN_QUOTE = '"' +CLOSE_QUOTE = '"' From 64e5773fc27dc3dbcd9f0c8c28d5d9d882c2f729 Mon Sep 17 00:00:00 2001 From: Kurt McKee Date: Sun, 7 Jan 2018 00:22:42 -0600 Subject: [PATCH 371/479] Fix unit test errors found by Travis CI --- html2text/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/html2text/cli.py b/html2text/cli.py index 99b563b0..0451e02c 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -221,7 +221,7 @@ class bcolors: # pragma: no cover action="store", type="str", default=config.OPEN_QUOTE, - help="The character used to open quotes (default: '{}')".format(config.OPEN_QUOTE) + help="The character used to open quotes", ) p.add_option( "--close-quote", @@ -229,7 +229,7 @@ class bcolors: # pragma: no cover action="store", type="str", default=config.CLOSE_QUOTE, - help="The character used to close quotes (default: '{}')".format(config.CLOSE_QUOTE) + help="The character used to close quotes", ) (options, args) = p.parse_args() From 82544d4f7b4b584addcb1e44962a9749ccd245f4 Mon Sep 17 00:00:00 2001 From: Kurt McKee Date: Mon, 8 Jan 2018 21:57:04 -0600 Subject: [PATCH 372/479] Document the new options related to the <q> tag --- docs/usage.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 8a81629f..cc80f1c1 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -96,6 +96,8 @@ simple indications of their function. - WRAP_LINKS to decide if links have to be wrapped during text wrapping (implies INLINE_LINKS = False) - DECODE_ERRORS to handle decoding errors. 'strict', 'ignore', 'replace' are the acceptable values. - DEFAULT_IMAGE_ALT takes a string as value and is used whenever an image tag is missing an `alt` value. The default for this is an empty string '' to avoid backward breakage + - OPEN_QUOTE is the character used to open a quote when replacing the `<q>` tag. It defaults to `"`. + - CLOSE_QUOTE is the character used to close a quote when replacing the `<q>` tag. It defaults to `"`.
To alter any option the procedure is to create a parser with `parser = html2text.HTML2Text()` and to set the option on the parser. @@ -136,3 +138,5 @@ Command line options | `--decode-errors`=`HANDLER` | What to do in case an error is encountered. `ignore`, `strict`, `replace` etc. | `--pad-tables` | Use padding to make tables look good. | `--default-image-alt`=`Image_Here` | Inserts the given `alt` text whenever images are missing `alt` values. +| `--open-quote`=`"` | Inserts the given text when opening a quote. Defaults to `"`. +| `--close-quote`=`"` | Inserts the given text when closing a quote. Defaults to `"`. From eb9a3482198b16ae7c649dc0853daf4c03ebf3bc Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Tue, 9 Jan 2018 08:28:44 +0400 Subject: [PATCH 373/479] Update ChangeLog.rst --- ChangeLog.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 2f362d1d..c14750e2 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,9 +1,10 @@ -0000.00.00 -========== +2018.9.1 +======== ---- -* Add support for the tag. -* Add support for the tag. +* Fix #188: Non-ASCII in title attribute causes encode error. +* Feature #194: Add support for the tag. +* Feature #193: Add support for the tag. 
2017.10.4 From d903cf6e5653f68e449f0aa1edfc2e0cab704a55 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Tue, 9 Jan 2018 04:48:13 +0000 Subject: [PATCH 374/479] Update versions to latest --- html2text/__init__.py | 2 +- requirements-dev.txt | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 46ae36f1..b68de99d 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -38,7 +38,7 @@ # python3 uses chr nochr = str('') -__version__ = (2017, 10, 4) +__version__ = (2018, 9, 1) # TODO: diff --git a/requirements-dev.txt b/requirements-dev.txt index 800eda13..7288811a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ -coverage==4.4.1 -py==1.4.34 +coverage==4.4.2 +py==1.5.2 pypandoc==1.4 wheel==0.30.0 -flake8==3.4.1 +flake8==3.5.0 From bef7ec082173c52fa8ea89c17ee67398e96ed74b Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 10 Jan 2018 06:03:19 +0000 Subject: [PATCH 375/479] update version to the correct one --- html2text/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index b68de99d..a04fc751 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -38,7 +38,7 @@ # python3 uses chr nochr = str('') -__version__ = (2018, 9, 1) +__version__ = (2018, 1, 9) # TODO: From be182cb4337ed69f0103a3c4b1b194a515d02591 Mon Sep 17 00:00:00 2001 From: Eddie Lebow Date: Thu, 28 Jun 2018 23:09:58 -0400 Subject: [PATCH 376/479] Add usage documentation for emphasis_mark and strong_mark --- docs/usage.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index cc80f1c1..d1ccdff6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -99,6 +99,11 @@ simple indications of their function. - OPEN_QUOTE is the character used to open a quote when replacing the `` tag. It defaults to `"`. - CLOSE_QUOTE is the character used to close a quote when replacing the `` tag. 
It defaults to `"`. +Options that are not in the config.py file: + + - emphasis_mark is the character used when replacing the `` tag. It defaults to `_`. + - strong_mark is the characer used when replacing the `` tag. It defaults to `**`. + To alter any option the procedure is to create a parser with `parser = html2text.HTML2Text()` and to set the option on the parser. example: `parser.unicode_snob = True` to set the UNICODE_SNOB option. From 40a9c616ee28befb678f6338130d3beda07a9931 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacek=20Ko=C5=82odziej?= Date: Sat, 14 Jul 2018 18:42:57 +0200 Subject: [PATCH 377/479] Fix handling ‎ and ‏ marks - mid-text within stressed tags - right after stressed tags Fixes #201. --- ChangeLog.rst | 7 +++++++ html2text/__init__.py | 11 ++++++++++- test/lrm_after_i.html | 1 + test/lrm_after_i.md | 2 ++ test/lrm_inside_i.html | 1 + test/lrm_inside_i.md | 2 ++ test/rlm_inside_strong.html | 1 + test/rlm_inside_strong.md | 2 ++ 8 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 test/lrm_after_i.html create mode 100644 test/lrm_after_i.md create mode 100644 test/lrm_inside_i.html create mode 100644 test/lrm_inside_i.md create mode 100644 test/rlm_inside_strong.html create mode 100644 test/rlm_inside_strong.md diff --git a/ChangeLog.rst b/ChangeLog.rst index c14750e2..37d66c4f 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,10 @@ +0000.00.00 +========== +---- + +* Fix #201: handle ‎/‏ marks mid-text within stressed tags or right after stressed tags + + 2018.9.1 ======== ---- diff --git a/html2text/__init__.py b/html2text/__init__.py index a04fc751..06b8a24d 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -186,7 +186,16 @@ def handle_charref(self, c): self.handle_data(self.charref(c), True) def handle_entityref(self, c): - self.handle_data(self.entityref(c), True) + ref = self.entityref(c) + + # ref may be an empty string (e.g. for ‎/‏ markers that should + # not contribute to the final output). 
+ # self.handle_data cannot handle a zero-length string right after a + # stressed tag or mid-text within a stressed tag (text get split and + # self.stressed/self.preceding_stressed gets switched after the first + # part of that text). + if ref: + self.handle_data(ref, True) def handle_starttag(self, tag, attrs): self.handle_tag(tag, attrs, 1) diff --git a/test/lrm_after_i.html b/test/lrm_after_i.html new file mode 100644 index 00000000..ab65a679 --- /dev/null +++ b/test/lrm_after_i.html @@ -0,0 +1 @@ +Foo‎ \ No newline at end of file diff --git a/test/lrm_after_i.md b/test/lrm_after_i.md new file mode 100644 index 00000000..46c9efa1 --- /dev/null +++ b/test/lrm_after_i.md @@ -0,0 +1,2 @@ +_Foo_ + diff --git a/test/lrm_inside_i.html b/test/lrm_inside_i.html new file mode 100644 index 00000000..1e51164d --- /dev/null +++ b/test/lrm_inside_i.html @@ -0,0 +1 @@ +Foo‎bar \ No newline at end of file diff --git a/test/lrm_inside_i.md b/test/lrm_inside_i.md new file mode 100644 index 00000000..48926295 --- /dev/null +++ b/test/lrm_inside_i.md @@ -0,0 +1,2 @@ +_Foo bar_ + diff --git a/test/rlm_inside_strong.html b/test/rlm_inside_strong.html new file mode 100644 index 00000000..34b58f72 --- /dev/null +++ b/test/rlm_inside_strong.html @@ -0,0 +1 @@ +Foo‏bar \ No newline at end of file diff --git a/test/rlm_inside_strong.md b/test/rlm_inside_strong.md new file mode 100644 index 00000000..a172d71a --- /dev/null +++ b/test/rlm_inside_strong.md @@ -0,0 +1,2 @@ +**Foo bar** + From f62100c32451803d559d79c7bbd3000a30429878 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacek=20Ko=C5=82odziej?= Date: Sun, 15 Jul 2018 12:00:50 +0200 Subject: [PATCH 378/479] Update AUTHORS --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 488eeec0..6cdb3b80 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -27,6 +27,7 @@ The AUTHORS/Contributors are (and/or have been): * Jonathan Sundqvist * Simon Meers * Kurt McKee +* Jacek Kołodziej Maintainer: From 
f8fda1de84014e776ac4f5391c10f5fefc03fd83 Mon Sep 17 00:00:00 2001 From: GermainZ Date: Sun, 21 Oct 2018 00:34:48 +0200 Subject: [PATCH 379/479] Add support for wrapping list items --- AUTHORS.rst | 1 + ChangeLog.rst | 6 ++++++ docs/usage.md | 2 ++ html2text/__init__.py | 15 ++++++++++----- html2text/cli.py | 8 ++++++++ html2text/config.py | 3 +++ html2text/utils.py | 4 ++-- test/test_html2text.py | 4 ++++ test/wrap_list_items_example.html | 13 +++++++++++++ test/wrap_list_items_example.md | 14 ++++++++++++++ 10 files changed, 63 insertions(+), 7 deletions(-) create mode 100644 test/wrap_list_items_example.html create mode 100644 test/wrap_list_items_example.md diff --git a/AUTHORS.rst b/AUTHORS.rst index 488eeec0..43a03538 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -27,6 +27,7 @@ The AUTHORS/Contributors are (and/or have been): * Jonathan Sundqvist * Simon Meers * Kurt McKee +* Germain Z. Maintainer: diff --git a/ChangeLog.rst b/ChangeLog.rst index c14750e2..3882c2b3 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,9 @@ +0000.0.0 +======== +---- + +* Add support for wrapping list items. + 2018.9.1 ======== ---- diff --git a/docs/usage.md b/docs/usage.md index d1ccdff6..8263207c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -94,6 +94,7 @@ simple indications of their function. - USE_AUTOMATIC_LINKS to convert http://xyz to - MARK_CODE to wrap 'pre' blocks with [code]...[/code] tags - WRAP_LINKS to decide if links have to be wrapped during text wrapping (implies INLINE_LINKS = False) + - WRAP_LIST_ITEMS to decide if list items have to be wrapped during text wrapping - DECODE_ERRORS to handle decoding errors. 'strict', 'ignore', 'replace' are the acceptable values. - DEFAULT_IMAGE_ALT takes a string as value and is used whenever an image tag is missing an `alt` value. The default for this is an empty string '' to avoid backward breakage - OPEN_QUOTE is the character used to open a quote when replacing the `` tag. It defaults to `"`. 
@@ -140,6 +141,7 @@ Command line options | `--links-after-para` | Put the links after the paragraph and not at end of document | `--mark-code` | Mark code with [code]...[/code] blocks | `--no-wrap-links` | Do not wrap links during text wrapping. Implies `--reference-links` +| `--wrap-list-items` | Wrap list items during text wrapping. | `--decode-errors`=`HANDLER` | What to do in case an error is encountered. `ignore`, `strict`, `replace` etc. | `--pad-tables` | Use padding to make tables look good. | `--default-image-alt`=`Image_Here` | Inserts the given `alt` text whenever images are missing `alt` values. diff --git a/html2text/__init__.py b/html2text/__init__.py index a04fc751..b876118e 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -85,6 +85,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.use_automatic_links = config.USE_AUTOMATIC_LINKS # covered in cli self.hide_strikethrough = False # covered in cli self.mark_code = config.MARK_CODE + self.wrap_list_items = config.WRAP_LIST_ITEMS # covered in cli self.wrap_links = config.WRAP_LINKS # covered in cli self.pad_tables = config.PAD_TABLES # covered in cli self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli @@ -904,11 +905,15 @@ def optwrap(self, text): self.inline_links = False for para in text.split("\n"): if len(para) > 0: - if not skipwrap(para, self.wrap_links): - result += "\n".join( - wrap(para, self.body_width, break_long_words=False) - ) - if para.endswith(' '): + if not skipwrap(para, self.wrap_links, self.wrap_list_items): + indent = '' + if para.startswith(' ' + self.ul_item_mark): + indent = ' ' # For list items. 
+ wrapped = wrap(para, self.body_width, + break_long_words=False, + subsequent_indent=indent) + result += "\n".join(wrapped) + if indent or para.endswith(' '): result += " \n" newlines = 1 else: diff --git a/html2text/cli.py b/html2text/cli.py index 0451e02c..43731ae3 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -44,6 +44,13 @@ class bcolors: # pragma: no cover default=config.WRAP_LINKS, help="wrap links during conversion" ) + p.add_option( + "--wrap-list-items", + dest="wrap_list_items", + action="store_true", + default=config.WRAP_LIST_ITEMS, + help="wrap list items during conversion" + ) p.add_option( "--ignore-emphasis", dest="ignore_emphasis", @@ -316,6 +323,7 @@ def detect(x): h.links_each_paragraph = options.links_each_paragraph h.mark_code = options.mark_code h.wrap_links = options.wrap_links + h.wrap_list_items = options.wrap_list_items h.pad_tables = options.pad_tables h.default_image_alt = options.default_image_alt h.open_quote = options.open_quote diff --git a/html2text/config.py b/html2text/config.py index 6ccf3875..5cbc35ad 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -30,6 +30,9 @@ # WRAP_LINKS = True WRAP_LINKS = True +# Wrap list items. +WRAP_LIST_ITEMS = False + # Number of pixels Google indents nested lists GOOGLE_LIST_INDENT = 36 diff --git a/html2text/utils.py b/html2text/utils.py index d382e00f..0f15469a 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -171,7 +171,7 @@ def list_numbering_start(attrs): return 0 -def skipwrap(para, wrap_links): +def skipwrap(para, wrap_links, wrap_list_items): # If it appears to contain a link # don't wrap if (len(config.RE_LINK.findall(para)) > 0) and not wrap_links: @@ -191,7 +191,7 @@ def skipwrap(para, wrap_links): # but there's a
    -inside- case in one of the tests that # also depends upon it. if stripped[0:1] in ('-', '*') and not stripped[0:2] == '**': - return True + return not wrap_list_items # If the text begins with a single -, *, or +, followed by a space, # or an integer, followed by a ., followed by a space (in either diff --git a/test/test_html2text.py b/test/test_html2text.py index 4d8cd2e9..76ecb8f4 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -220,6 +220,10 @@ def _test_func(self): module_args['pad_tables'] = True cmdline_args.append('--pad-tables') + if base_fn.startswith('wrap_list_items'): + module_args['wrap_list_items'] = True + cmdline_args.append('--wrap-list-items') + if base_fn not in ['bodywidth_newline.html', 'abbr_tag.html']: test_func = None else: diff --git a/test/wrap_list_items_example.html b/test/wrap_list_items_example.html new file mode 100644 index 00000000..26d5d9de --- /dev/null +++ b/test/wrap_list_items_example.html @@ -0,0 +1,13 @@ +
      +
    • One two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty.
    • +
    • One two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty.
    • +
    + +Text between lists. + +
      +
    • One two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty.
    • +
    • One two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty.
    • +
    + +Text after list. diff --git a/test/wrap_list_items_example.md b/test/wrap_list_items_example.md new file mode 100644 index 00000000..0708078d --- /dev/null +++ b/test/wrap_list_items_example.md @@ -0,0 +1,14 @@ + * One two three four five six seven eight nine ten eleven twelve thirteen + fourteen fifteen sixteen seventeen eighteen nineteen twenty. + * One two three four five six seven eight nine ten eleven twelve thirteen + fourteen fifteen sixteen seventeen eighteen nineteen twenty. + +Text between lists. + + * One two three four five six seven eight nine ten eleven twelve thirteen + fourteen fifteen sixteen seventeen eighteen nineteen twenty. + * One two three four five six seven eight nine ten eleven twelve thirteen + fourteen fifteen sixteen seventeen eighteen nineteen twenty. + +Text after list. + From 4a73e6472faba50d5a95fa3b4736bb180ab07a9a Mon Sep 17 00:00:00 2001 From: jonathan vanasco Date: Mon, 22 Oct 2018 16:16:31 -0400 Subject: [PATCH 380/479] `images_as_html` control --- AUTHORS.rst | 1 + ChangeLog.rst | 7 +++++++ docs/how_it_works.md | 2 ++ docs/usage.md | 2 ++ html2text/__init__.py | 5 +++-- html2text/cli.py | 9 +++++++++ html2text/config.py | 1 + test/images_as_html.html | 10 ++++++++++ test/images_as_html.md | 7 +++++++ test/test_html2text.py | 4 ++++ 10 files changed, 46 insertions(+), 2 deletions(-) create mode 100644 test/images_as_html.html create mode 100644 test/images_as_html.md diff --git a/AUTHORS.rst b/AUTHORS.rst index 488eeec0..a020ed2a 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -27,6 +27,7 @@ The AUTHORS/Contributors are (and/or have been): * Jonathan Sundqvist * Simon Meers * Kurt McKee +* Jonathan Vanasco Maintainer: diff --git a/ChangeLog.rst b/ChangeLog.rst index c14750e2..c9c3347f 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -2,6 +2,13 @@ ======== ---- +* Feature #213: `images_as_html` config option to always generate an `img` html tag. preserves "height", "width" and "alt" if possible. 
+ + +2018.9.1 +======== +---- + * Fix #188: Non-ASCII in title attribute causes encode error. * Feature #194: Add support for the tag. * Feature #193: Add support for the tag. diff --git a/docs/how_it_works.md b/docs/how_it_works.md index 1b52af31..b9ef81ee 100644 --- a/docs/how_it_works.md +++ b/docs/how_it_works.md @@ -26,6 +26,7 @@ Used to provide various configuration settings to the converter. They are as fol - GOOGLE_LIST_INDENT no of pixels to indent nested lists - IGNORE_ANCHORS - IGNORE_IMAGES + - IMAGES_AS_HTML always generate HTML tags for images; preserves `height`, `width`, `alt` if possible. - IMAGES_TO_ALT - IMAGES_WITH_SIZE - IGNORE_EMPHASIS @@ -86,6 +87,7 @@ Command line interface for the code. |`--ignore-images` | Do not include any formatting for images |`--images-to-alt` | Discard image data, only keep alt text |`--images-with-size` | Write image tags with height and width attrs as raw html to retain dimensions +|`--images-as-html` | Always write image tags as raw html; preserves "height", "width" and "alt" if possible. |`-g`, `--google-doc` | Convert an html-exported Google Document |`-d`, `--dash-unordered-list` | Use a dash rather than a star for unordered list items |`-b` `BODY_WIDTH`, `--body-width`=`BODY_WIDTH` | Number of characters per output line, `0` for no wrap diff --git a/docs/usage.md b/docs/usage.md index d1ccdff6..2b4c492e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -72,6 +72,7 @@ simple indications of their function. - GOOGLE_LIST_INDENT no of pixels to indent nested lists - IGNORE_ANCHORS - IGNORE_IMAGES + - IMAGES_AS_HTML always generate HTML tags for images; preserves `height`, `width`, `alt` if possible. 
- IMAGES_TO_ALT - IMAGES_WITH_SIZE - IGNORE_EMPHASIS @@ -120,6 +121,7 @@ Command line options | `--ignore-links` | Do not include any formatting for links |`--protect-links` | Protect links from line breaks surrounding them "+" with angle brackets |`--ignore-images` | Do not include any formatting for images +|`--images-as-html` | Always write image tags as raw html; preserves "height", "width" and "alt" if possible. |`--images-to-alt` | Discard image data, only keep alt text |`--images-with-size` | Write image tags with height and width attrs as raw html to retain dimensions |`-g`, `--google-doc` | Convert an html-exported Google Document diff --git a/html2text/__init__.py b/html2text/__init__.py index a04fc751..1428676b 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -72,6 +72,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.google_list_indent = config.GOOGLE_LIST_INDENT # covered in cli self.ignore_links = config.IGNORE_ANCHORS # covered in cli self.ignore_images = config.IGNORE_IMAGES # covered in cli + self.images_as_html = config.IMAGES_AS_HTML # covered in cli self.images_to_alt = config.IMAGES_TO_ALT # covered in cli self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli @@ -503,8 +504,8 @@ def link_url(self, link, title=""): # If we have images_with_size, write raw html including width, # height, and alt attributes - if self.images_with_size and \ - ("width" in attrs or "height" in attrs): + if self.images_as_html or (self.images_with_size and \ + ("width" in attrs or "height" in attrs)): self.o(" + +An image with a width attr + +An image with a height attr + +An image with width and height + + + diff --git a/test/images_as_html.md b/test/images_as_html.md new file mode 100644 index 00000000..91f1cc18 --- /dev/null +++ b/test/images_as_html.md @@ -0,0 +1,7 @@ + An image with a width attr An image with a height attr +An
+image with width and height + diff --git a/test/test_html2text.py b/test/test_html2text.py index 4d8cd2e9..4b740abf 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -190,6 +190,10 @@ def _test_func(self): module_args['protect_links'] = True cmdline_args.append('--protect-links') + if base_fn.startswith('images_as_html'): + module_args['images_as_html'] = True + cmdline_args.append('--images-as-html') + if base_fn.startswith('images_to_alt'): module_args['images_to_alt'] = True cmdline_args.append('--images-to-alt') From 5a64a5d0de3e3b2b4f6a152f833831fdae8e3b3c Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Mon, 3 Dec 2018 09:31:45 +0100 Subject: [PATCH 381/479] Add missing space in decoding error message --- html2text/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/cli.py b/html2text/cli.py index 0451e02c..516b20b8 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -283,7 +283,7 @@ def detect(x): except UnicodeDecodeError as err: warning = bcolors.WARNING + "Warning:" + bcolors.ENDC warning += ' Use the ' + bcolors.OKGREEN - warning += '--decode-errors=ignore' + bcolors.ENDC + 'flag.' + warning += '--decode-errors=ignore' + bcolors.ENDC + ' flag.' print(warning) raise err From 45a71e0a2b23eb1ad9b13e5af032f90ab7d98740 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Mon, 3 Dec 2018 09:51:11 +0100 Subject: [PATCH 382/479] Simplify decoding bytes.decode() in Python 2.6 does have the errors argument, but it is positional only. 
--- html2text/cli.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/html2text/cli.py b/html2text/cli.py index 0451e02c..82f373dc 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -275,11 +275,7 @@ def detect(x): if hasattr(data, 'decode'): try: - try: - data = data.decode(encoding, errors=options.decode_errors) - except TypeError: - # python 2.6.x does not have the errors option - data = data.decode(encoding) + data = data.decode(encoding, options.decode_errors) except UnicodeDecodeError as err: warning = bcolors.WARNING + "Warning:" + bcolors.ENDC warning += ' Use the ' + bcolors.OKGREEN From ced1b06eb75f09342f75f23152298a87d9d5e64d Mon Sep 17 00:00:00 2001 From: cclauss Date: Mon, 7 Jan 2019 13:40:24 +0100 Subject: [PATCH 383/479] Travis CI: Add Python 3.7 to testing --- .travis.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.travis.yml b/.travis.yml index e6ecf19e..ee153960 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,10 @@ python: - "3.4" - "3.5" - "3.6" +matrix: + include: + - python: "3.7" + dist: xenial # required for Python >= 3.7 (travis-ci/travis-ci#9069) before_install: # Python 3.2 needs setuptools < 30 - support for Py3.2 was dropped after that version - if [[ $TRAVIS_PYTHON_VERSION == '3.2' || $TRAVIS_PYTHON_VERSION == 'pypy3' ]]; then pip install -U 'setuptools<30'; fi From b7495c9aef98b086c5cb4b29284094bc20c90ea7 Mon Sep 17 00:00:00 2001 From: cclauss Date: Mon, 7 Jan 2019 13:54:15 +0100 Subject: [PATCH 384/479] Drop Python 3.2 support to satisfy Cython --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index ee153960..bcaad5d6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,7 @@ python: - "2.7" - "pypy" - "pypy3" - - "3.2" + # - "3.2" # Cython needs Python 2.6+ or 3.3+ - "3.3" - "3.4" - "3.5" From bf9360f987c3013e1b4294d340fc6097925712a6 Mon Sep 17 00:00:00 2001 From: Mark Walker Date: Fri, 22 Feb 2019 10:05:35 +0000 Subject: [PATCH 
385/479] Fix change log version name --- ChangeLog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 4562c734..fd3364da 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -6,7 +6,7 @@ * Fix #201: handle ‎/‏ marks mid-text within stressed tags or right after stressed tags. * Feature #213: `images_as_html` config option to always generate an `img` html tag. preserves "height", "width" and "alt" if possible. -2018.9.1 +2018.1.9 ======== ---- From 53497e7279c6008b083392476e2d5aff0b5af3c4 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Fri, 22 Feb 2019 18:05:40 -0800 Subject: [PATCH 386/479] Use HTTPS in docs and comments where available --- ChangeLog.rst | 2 +- README.md | 10 +++++----- docs/index.md | 2 +- docs/usage.md | 2 +- test/test_html2text.py | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 4562c734..33f8c6ea 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -183,7 +183,7 @@ ---- * Feature #29, #27: Add simple table support with bypass option. -* Fix #20: Replace project website with: http://alir3z4.github.io/html2text/ . +* Fix #20: Replace project website with: https://alir3z4.github.io/html2text/ . 
2014.9.8 diff --git a/README.md b/README.md index 56572c40..2cf6c2e6 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # html2text -[![Build Status](https://secure.travis-ci.org/Alir3z4/html2text.png)](http://travis-ci.org/Alir3z4/html2text) +[![Build Status](https://secure.travis-ci.org/Alir3z4/html2text.png)](https://travis-ci.org/Alir3z4/html2text) [![Coverage Status](https://coveralls.io/repos/Alir3z4/html2text/badge.png)](https://coveralls.io/r/Alir3z4/html2text) [![Downloads](http://badge.kloud51.com/pypi/d/html2text.png)](https://pypi.python.org/pypi/html2text/) [![Version](http://badge.kloud51.com/pypi/v/html2text.png)](https://pypi.python.org/pypi/html2text/) @@ -44,17 +44,17 @@ Or with some configuration options: >>> h = html2text.HTML2Text() >>> # Ignore converting links from HTML >>> h.ignore_links = True ->>> print h.handle("

    Hello, world!") +>>> print h.handle("

    Hello, world!") Hello, world! ->>> print(h.handle("

    Hello, world!")) +>>> print(h.handle("

    Hello, world!")) Hello, world! >>> # Don't Ignore links anymore, I like links >>> h.ignore_links = False ->>> print(h.handle("

    Hello, world!")) -Hello, [world](http://earth.google.com/)! +>>> print(h.handle("

    Hello, world!")) +Hello, [world](https://www.google.com/earth/)! ``` diff --git a/docs/index.md b/docs/index.md index 98529239..5e299d8d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -3,7 +3,7 @@ Html2Text 1. [About](about.md) 2. [Authors](../AUTHORS.rst) -3. [What is markdown](http://daringfireball.net/projects/markdown/) +3. [What is markdown](https://daringfireball.net/projects/markdown/) 4. [How it works](how_it_works.md) 5. [Usage](usage.md) 6. [Contributing](contributing.md) diff --git a/docs/usage.md b/docs/usage.md index d34c6c93..4cbb232b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -138,7 +138,7 @@ Command line options | `--ignore-emphasis` | Ignore all emphasis formatting in the html. | `-e`, `--asterisk-emphasis` | Use asterisk rather than underscore to emphasize text | `--unicode-snob` | Use unicode throughout instead of ASCII -| `--no-automatic-links` | Do not use automatic links like +| `--no-automatic-links` | Do not use automatic links like | `--no-skip-internal-links` | Turn off skipping of internal links | `--links-after-para` | Put the links after the paragraph and not at end of document | `--mark-code` | Mark code with [code]...[/code] blocks diff --git a/test/test_html2text.py b/test/test_html2text.py index 9b7e735a..2f3878a4 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -244,7 +244,7 @@ def _test_func(self): return _test_mod, test_cmd, test_func -# Originally from http://stackoverflow.com/questions/32899/\ +# Originally from https://stackoverflow.com/questions/32899/how-do-you-generate-dynamic-parameterized-unit-tests-in-python # how-to-generate-dynamic-parametrized-unit-tests-in-python test_dir_name = os.path.dirname(os.path.realpath(__file__)) for fn in glob.glob("%s/*.html" % test_dir_name): From 224fcb66133388fcc3cd46cf707b76913c99d450 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Fri, 22 Feb 2019 18:31:28 -0800 Subject: [PATCH 387/479] Mark wheel as universal 
https://wheel.readthedocs.io/en/stable/user_guide.html?highlight=universal#user-guide --- setup.cfg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.cfg b/setup.cfg index b88034e4..1eee7db0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,5 @@ +[bdist_wheel] +universal = 1 + [metadata] description-file = README.md From d955dd65d14989c1ae5ac168fddc7c18edd08fa1 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Fri, 22 Feb 2019 18:41:00 -0800 Subject: [PATCH 388/479] Correct capitalization of PyPI as spelled on https://pypi.org/ --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 56572c40..e955fcf2 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ Hello, [world](http://earth.google.com/)! ## How to install -`html2text` is available on pypi +`html2text` is available on PyPI https://pypi.python.org/pypi/html2text ``` From 2973cd71497eef9d433b90252e182abcf09f6327 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Fri, 22 Feb 2019 18:49:58 -0800 Subject: [PATCH 389/479] Update all pypi.python.org URLs to pypi.org For details on the new PyPI, see the blog post: https://pythoninsider.blogspot.ca/2018/04/new-pypi-launched-legacy-pypi-shutting.html --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 56572c40..d80a1c97 100644 --- a/README.md +++ b/README.md @@ -2,11 +2,11 @@ [![Build Status](https://secure.travis-ci.org/Alir3z4/html2text.png)](http://travis-ci.org/Alir3z4/html2text) [![Coverage Status](https://coveralls.io/repos/Alir3z4/html2text/badge.png)](https://coveralls.io/r/Alir3z4/html2text) -[![Downloads](http://badge.kloud51.com/pypi/d/html2text.png)](https://pypi.python.org/pypi/html2text/) -[![Version](http://badge.kloud51.com/pypi/v/html2text.png)](https://pypi.python.org/pypi/html2text/) -[![Wheel?](http://badge.kloud51.com/pypi/wheel/html2text.png)](https://pypi.python.org/pypi/html2text/) 
-[![Format](http://badge.kloud51.com/pypi/format/html2text.png)](https://pypi.python.org/pypi/html2text/) -[![License](http://badge.kloud51.com/pypi/license/html2text.png)](https://pypi.python.org/pypi/html2text/) +[![Downloads](http://badge.kloud51.com/pypi/d/html2text.png)](https://pypi.org/project/html2text/) +[![Version](http://badge.kloud51.com/pypi/v/html2text.png)](https://pypi.org/project/html2text/) +[![Wheel?](http://badge.kloud51.com/pypi/wheel/html2text.png)](https://pypi.org/project/html2text/) +[![Format](http://badge.kloud51.com/pypi/format/html2text.png)](https://pypi.org/project/html2text/) +[![License](http://badge.kloud51.com/pypi/license/html2text.png)](https://pypi.org/project/html2text/) html2text is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). @@ -64,7 +64,7 @@ Hello, [world](http://earth.google.com/)! ## How to install `html2text` is available on pypi -https://pypi.python.org/pypi/html2text +https://pypi.org/project/html2text/ ``` $ pip install html2text From 680b2c9bb6e8be013087e8a300f79bd195d1d4c6 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Sat, 23 Feb 2019 09:09:51 -0800 Subject: [PATCH 390/479] Fix flake8 warnings to get Travis to be green --- html2text/__init__.py | 6 ++++-- html2text/cli.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 2eeb0292..e20050b2 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -514,8 +514,10 @@ def link_url(self, link, title=""): # If we have images_with_size, write raw html including width, # height, and alt attributes - if self.images_as_html or (self.images_with_size and \ - ("width" in attrs or "height" in attrs)): + if self.images_as_html or ( + self.images_with_size and + ("width" in attrs or "height" in attrs) + ): self.o(" Date: Fri, 22 Feb 2019 17:07:30 -0800 Subject: [PATCH 
391/479] Drop support for EOL Pythons Python 2.6 and 3.3 are end of life. They are no longer receiving bug fixes, including for security issues. Python 2.6 went EOL on 2013-10-29 and 3.3 on 2017-09-29. For additional details on supported Python versions, see: Supported: https://devguide.python.org/#status-of-python-branches EOL: https://devguide.python.org/devcycle/#end-of-life-branches Removing support for EOL Pythons will reduce testing and maintenance resources while allowing the library to move towards a modern Python 3 style. Using pypinfo, we can show the PyPI download statistics, which show very low numbers for EOL Pythons. | python_version | percent | download_count | | -------------- | ------: | -------------: | | 2.7 | 60.79% | 251,480 | | 3.6 | 29.19% | 120,760 | | 3.5 | 6.70% | 27,712 | | 3.7 | 2.64% | 10,928 | | 3.4 | 0.67% | 2,790 | | 2.6 | 0.01% | 27 | | 3.8 | 0.01% | 21 | | 3.2 | 0.00% | 1 | | 3.3 | 0.00% | 1 | | Total | | 413,720 | This allows for some code clean ups: - Modern setuptools provides a test command - Modern setuptools supports Markdown with long_description_content_type - unittest2 features are now always included by default - textwrap is always available - Can use automatic positions in string formatting - Can use dict comprehensions - Use python_requires to guard against outdated Pythons - Use context manager - Assume .decode() method exists - Upgrade deprecated optparse to argparse For additional opinions for why it would be good to drop Python 2.6, see: https://snarky.ca/stop-using-python-2-6/ http://www.curiousefficiency.org/posts/2015/04/stop-supporting-python26.html https://hugovk.github.io/drop-python/2.6/ --- .travis.yml | 27 ++---- ChangeLog.rst | 1 + README.md | 4 +- html2text/__init__.py | 21 ++--- html2text/cli.py | 200 ++++++++++++++++++----------------------- html2text/config.py | 2 +- html2text/utils.py | 15 ++-- requirements-dev.txt | 1 - setup.py | 76 ++++------------ test/test_html2text.py | 11 +-- test/test_memleak.py | 6 +- 11
files changed, 131 insertions(+), 233 deletions(-) diff --git a/.travis.yml b/.travis.yml index bcaad5d6..95c71d03 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,31 +1,22 @@ +dist: xenial language: python python: - - "2.6" - "2.7" - - "pypy" - - "pypy3" - # - "3.2" # Cython needs Python 2.6+ or 3.3+ - - "3.3" - "3.4" - "3.5" - "3.6" -matrix: - include: - - python: "3.7" - dist: xenial # required for Python >= 3.7 (travis-ci/travis-ci#9069) -before_install: - # Python 3.2 needs setuptools < 30 - support for Py3.2 was dropped after that version - - if [[ $TRAVIS_PYTHON_VERSION == '3.2' || $TRAVIS_PYTHON_VERSION == 'pypy3' ]]; then pip install -U 'setuptools<30'; fi + - "3.7" + - "pypy2.7-6.0" + - "pypy3.5-6.0" install: - - pip install coveralls==0.5 + - pip install coveralls before_script: - - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi - # flake8 doesn't support Python 2.6 - - if [[ $TRAVIS_PYTHON_VERSION != '2.6' ]]; then pip install flake8==3.3.0;flake8 .; fi + - pip install flake8 + - flake8 - export COVERAGE_PROCESS_START=$PWD/.coveragerc script: - - PYTHONPATH=$PYTHONPATH:. coverage run --source=html2text --rcfile=.coveragerc setup.py test -v + - coverage run --source=html2text setup.py test - coverage combine - coverage report after_success: - coveralls --rcfile=.coveragerc + coveralls diff --git a/ChangeLog.rst b/ChangeLog.rst index 4562c734..50ab287b 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -5,6 +5,7 @@ * Add support for wrapping list items. * Fix #201: handle ‎/‏ marks mid-text within stressed tags or right after stressed tags. * Feature #213: `images_as_html` config option to always generate an `img` html tag. preserves "height", "width" and "alt" if possible. +* Remove support for end-of-life Pythons. Now requires Python 2.7 or 3.4+. 
2018.9.1 ======== diff --git a/README.md b/README.md index 56572c40..d57e7084 100644 --- a/README.md +++ b/README.md @@ -73,10 +73,10 @@ $ pip install html2text ## How to run unit tests - PYTHONPATH=$PYTHONPATH:. coverage run --source=html2text setup.py test -v + coverage run --source=html2text setup.py test To see the coverage results: - + coverage combine coverage html diff --git a/html2text/__init__.py b/html2text/__init__.py index e20050b2..333906c5 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -5,11 +5,7 @@ from __future__ import unicode_literals import re import sys - -try: - from textwrap import wrap -except ImportError: # pragma: no cover - pass +from textwrap import wrap from html2text.compat import urlparse, HTMLParser from html2text import config @@ -460,7 +456,7 @@ def no_preceding_space(self): def link_url(self, link, title=""): url = urlparse.urljoin(self.baseurl, link) - title = ' "{0}"'.format(title) if title.strip() else '' + title = ' "{}"'.format(title) if title.strip() else '' self.o(']({url}{title})'.format(url=escape_md(url), title=title)) @@ -637,14 +633,14 @@ def link_url(self, link, title=""): self.soft_br() if tag in ["td", "th"]: if start: - self.o('<{0}>\n\n'.format(tag)) + self.o('<{}>\n\n'.format(tag)) else: - self.o('\n'.format(tag)) + self.o('\n'.format(tag)) else: if start: - self.o('<{0}>'.format(tag)) + self.o('<{}>'.format(tag)) else: - self.o(''.format(tag)) + self.o(''.format(tag)) else: if tag == "table": @@ -849,7 +845,7 @@ def charref(self, name): else: c = int(name) - if not self.unicode_snob and c in unifiable_n.keys(): + if not self.unicode_snob and c in unifiable_n: return unifiable_n[c] else: try: @@ -858,7 +854,7 @@ def charref(self, name): return '' def entityref(self, c): - if not self.unicode_snob and c in config.UNIFIABLE.keys(): + if not self.unicode_snob and c in config.UNIFIABLE: return config.UNIFIABLE[c] else: try: @@ -907,7 +903,6 @@ def optwrap(self, text): if not self.body_width: return 
text - assert wrap, "Requires Python 2.3." result = '' newlines = 0 # I cannot think of a better solution for now. diff --git a/html2text/cli.py b/html2text/cli.py index 7645848f..78ece65a 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -1,4 +1,4 @@ -import optparse +import argparse import warnings from html2text.compat import urllib @@ -19,73 +19,68 @@ class bcolors: # pragma: no cover BOLD = '\033[1m' UNDERLINE = '\033[4m' - p = optparse.OptionParser( - '%prog [(filename|url) [encoding]]', - version='%prog ' + ".".join(map(str, __version__)) - ) - p.add_option( + p = argparse.ArgumentParser() + p.add_argument( "--default-image-alt", dest="default_image_alt", - action="store", - type="str", default=config.DEFAULT_IMAGE_ALT, help="The default alt string for images with missing ones") - p.add_option( + p.add_argument( "--pad-tables", dest="pad_tables", action="store_true", default=config.PAD_TABLES, help="pad the cells to equal column width in tables" ) - p.add_option( + p.add_argument( "--no-wrap-links", dest="wrap_links", action="store_false", default=config.WRAP_LINKS, help="wrap links during conversion" ) - p.add_option( + p.add_argument( "--wrap-list-items", dest="wrap_list_items", action="store_true", default=config.WRAP_LIST_ITEMS, help="wrap list items during conversion" ) - p.add_option( + p.add_argument( "--ignore-emphasis", dest="ignore_emphasis", action="store_true", default=config.IGNORE_EMPHASIS, help="don't include any formatting for emphasis" ) - p.add_option( + p.add_argument( "--reference-links", dest="inline_links", action="store_false", default=config.INLINE_LINKS, help="use reference style links instead of inline links" ) - p.add_option( + p.add_argument( "--ignore-links", dest="ignore_links", action="store_true", default=config.IGNORE_ANCHORS, help="don't include any formatting for links") - p.add_option( + p.add_argument( "--protect-links", dest="protect_links", action="store_true", default=config.PROTECT_LINKS, help=("protect links 
from line breaks surrounding them " + "with angle brackets")) - p.add_option( + p.add_argument( "--ignore-images", dest="ignore_images", action="store_true", default=config.IGNORE_IMAGES, help="don't include any formatting for images" ) - p.add_option( + p.add_argument( "--images-as-html", dest="images_as_html", action="store_true", @@ -95,14 +90,14 @@ class bcolors: # pragma: no cover "and `alt` if possible." ) ) - p.add_option( + p.add_argument( "--images-to-alt", dest="images_to_alt", action="store_true", default=config.IMAGES_TO_ALT, help="Discard image data, only keep alt text" ) - p.add_option( + p.add_argument( "--images-with-size", dest="images_with_size", action="store_true", @@ -110,44 +105,42 @@ class bcolors: # pragma: no cover help="Write image tags with height and width attrs as raw html to " "retain dimensions" ) - p.add_option( + p.add_argument( "-g", "--google-doc", action="store_true", dest="google_doc", default=False, help="convert an html-exported Google Document" ) - p.add_option( + p.add_argument( "-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash", default=False, help="use a dash rather than a star for unordered list items" ) - p.add_option( + p.add_argument( "-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk", default=False, help="use an asterisk rather than an underscore for emphasized text" ) - p.add_option( + p.add_argument( "-b", "--body-width", dest="body_width", - action="store", - type="int", + type=int, default=config.BODY_WIDTH, help="number of characters per output line, 0 for no wrap" ) - p.add_option( + p.add_argument( "-i", "--google-list-indent", dest="list_indent", - action="store", - type="int", + type=int, default=config.GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists" ) - p.add_option( + p.add_argument( "-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough", @@ -155,7 +148,7 @@ class bcolors: # pragma: no cover help="hide strike-through 
text. only relevant when -g is " "specified as well" ) - p.add_option( + p.add_argument( "--escape-all", action="store_true", dest="escape_snob", @@ -163,14 +156,14 @@ class bcolors: # pragma: no cover help="Escape all special characters. Output is less readable, but " "avoids corner case formatting issues." ) - p.add_option( + p.add_argument( "--bypass-tables", action="store_true", dest="bypass_tables", default=config.BYPASS_TABLES, help="Format tables in HTML rather than Markdown syntax." ) - p.add_option( + p.add_argument( "--ignore-tables", action="store_true", dest="ignore_tables", @@ -178,7 +171,7 @@ class bcolors: # pragma: no cover help="Ignore table-related tags (table, th, td, tr) " "while keeping rows." ) - p.add_option( + p.add_argument( "--single-line-break", action="store_true", dest="single_line_break", @@ -188,152 +181,129 @@ class bcolors: # pragma: no cover "line breaks. NOTE: Requires --body-width=0" ) ) - p.add_option( + p.add_argument( "--unicode-snob", action="store_true", dest="unicode_snob", default=config.UNICODE_SNOB, help="Use unicode throughout document" ) - p.add_option( + p.add_argument( "--no-automatic-links", action="store_false", dest="use_automatic_links", default=config.USE_AUTOMATIC_LINKS, help="Do not use automatic links wherever applicable" ) - p.add_option( + p.add_argument( "--no-skip-internal-links", action="store_false", dest="skip_internal_links", default=config.SKIP_INTERNAL_LINKS, help="Do not skip internal links" ) - p.add_option( + p.add_argument( "--links-after-para", action="store_true", dest="links_each_paragraph", default=config.LINKS_EACH_PARAGRAPH, help="Put links after each paragraph instead of document" ) - p.add_option( + p.add_argument( "--mark-code", action="store_true", dest="mark_code", default=config.MARK_CODE, help="Mark program code blocks with [code]...[/code]" ) - p.add_option( + p.add_argument( "--decode-errors", dest="decode_errors", - action="store", - type="string", default=config.DECODE_ERRORS, 
help="What to do in case of decode errors.'ignore', 'strict' and " "'replace' are acceptable values" ) - p.add_option( + p.add_argument( "--open-quote", dest="open_quote", - action="store", - type="str", default=config.OPEN_QUOTE, help="The character used to open quotes", ) - p.add_option( + p.add_argument( "--close-quote", dest="close_quote", - action="store", - type="str", default=config.CLOSE_QUOTE, help="The character used to close quotes", ) - (options, args) = p.parse_args() - - # process input - encoding = "utf-8" - if len(args) == 2: - encoding = args[1] - elif len(args) > 2: - p.error('Too many arguments') - - if len(args) > 0 and args[0] != '-': # pragma: no cover - file_ = args[0] + p.add_argument( + '--version', + action='version', + version='.'.join(map(str, __version__)) + ) + p.add_argument('filename', nargs='?') + p.add_argument('encoding', nargs='?', default='utf-8') + args = p.parse_args() - if file_.startswith('http://') or file_.startswith('https://'): + if args.filename and args.filename != '-': # pragma: no cover + if (args.filename.startswith('http://') or + args.filename.startswith('https://')): warnings.warn( "Support for retrieving html over network is set for " "deprecation by version (2017, 1, x)", DeprecationWarning ) - baseurl = file_ + baseurl = args.filename j = urllib.urlopen(baseurl) data = j.read() - if encoding is None: - try: - from feedparser import _getCharacterEncoding as enc - except ImportError: - def enc(x, y): - return ('utf-8', 1) - encoding = enc(j.headers, data)[0] - if encoding == 'us-ascii': - encoding = 'utf-8' else: - data = open(file_, 'rb').read() - if encoding is None: - try: - from chardet import detect - except ImportError: - def detect(x): - return {'encoding': 'utf-8'} - encoding = detect(data)['encoding'] + with open(args.filename, 'rb') as fp: + data = fp.read() else: data = wrap_read() - if hasattr(data, 'decode'): - try: - data = data.decode(encoding, options.decode_errors) - except UnicodeDecodeError as 
err: - warning = bcolors.WARNING + "Warning:" + bcolors.ENDC - warning += ' Use the ' + bcolors.OKGREEN - warning += '--decode-errors=ignore' + bcolors.ENDC + ' flag.' - print(warning) - raise err + try: + data = data.decode(args.encoding, args.decode_errors) + except UnicodeDecodeError as err: + warning = bcolors.WARNING + "Warning:" + bcolors.ENDC + warning += ' Use the ' + bcolors.OKGREEN + warning += '--decode-errors=ignore' + bcolors.ENDC + ' flag.' + print(warning) + raise err h = HTML2Text(baseurl=baseurl) # handle options - if options.ul_style_dash: + if args.ul_style_dash: h.ul_item_mark = '-' - if options.em_style_asterisk: + if args.em_style_asterisk: h.emphasis_mark = '*' h.strong_mark = '__' - h.body_width = options.body_width - h.google_list_indent = options.list_indent - h.ignore_emphasis = options.ignore_emphasis - h.ignore_links = options.ignore_links - h.protect_links = options.protect_links - h.ignore_images = options.ignore_images - h.images_as_html = options.images_as_html - h.images_to_alt = options.images_to_alt - h.images_with_size = options.images_with_size - h.google_doc = options.google_doc - h.hide_strikethrough = options.hide_strikethrough - h.escape_snob = options.escape_snob - h.bypass_tables = options.bypass_tables - h.ignore_tables = options.ignore_tables - h.single_line_break = options.single_line_break - h.inline_links = options.inline_links - h.unicode_snob = options.unicode_snob - h.use_automatic_links = options.use_automatic_links - h.skip_internal_links = options.skip_internal_links - h.links_each_paragraph = options.links_each_paragraph - h.mark_code = options.mark_code - h.wrap_links = options.wrap_links - h.wrap_list_items = options.wrap_list_items - h.pad_tables = options.pad_tables - h.default_image_alt = options.default_image_alt - h.open_quote = options.open_quote - h.close_quote = options.close_quote + h.body_width = args.body_width + h.google_list_indent = args.list_indent + h.ignore_emphasis = args.ignore_emphasis + 
h.ignore_links = args.ignore_links + h.protect_links = args.protect_links + h.ignore_images = args.ignore_images + h.images_as_html = args.images_as_html + h.images_to_alt = args.images_to_alt + h.images_with_size = args.images_with_size + h.google_doc = args.google_doc + h.hide_strikethrough = args.hide_strikethrough + h.escape_snob = args.escape_snob + h.bypass_tables = args.bypass_tables + h.ignore_tables = args.ignore_tables + h.single_line_break = args.single_line_break + h.inline_links = args.inline_links + h.unicode_snob = args.unicode_snob + h.use_automatic_links = args.use_automatic_links + h.skip_internal_links = args.skip_internal_links + h.links_each_paragraph = args.links_each_paragraph + h.mark_code = args.mark_code + h.wrap_links = args.wrap_links + h.wrap_list_items = args.wrap_list_items + h.pad_tables = args.pad_tables + h.default_image_alt = args.default_image_alt + h.open_quote = args.open_quote + h.close_quote = args.close_quote wrapwrite(h.handle(data)) diff --git a/html2text/config.py b/html2text/config.py index d6db8efe..7205e439 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -14,7 +14,7 @@ # Put the links after each paragraph instead of at the end. LINKS_EACH_PARAGRAPH = 0 -# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) +# Wrap long lines at position. 0 for no wrapping. 
BODY_WIDTH = 78 # Don't show internal links (href="#local-anchor") -- corresponding link diff --git a/html2text/utils.py b/html2text/utils.py index 0f15469a..fb3f6fe5 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -12,7 +12,7 @@ def name2cp(k): unifiable_n = {} -for k in config.UNIFIABLE.keys(): +for k in config.UNIFIABLE: unifiable_n[name2cp(k)] = config.UNIFIABLE[k] @@ -30,12 +30,10 @@ def dumb_property_dict(style): """ :returns: A hash of css attributes """ - out = dict([(x.strip().lower(), y.strip().lower()) for x, y in - [z.split(':', 1) for z in - style.split(';') if ':' in z - ] - ] - ) + out = { + x.strip().lower(): y.strip().lower() + for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z] + } return out @@ -59,8 +57,7 @@ def dumb_css_parser(data): # support older pythons elements = [x.split('{') for x in data.split('}') if '{' in x.strip()] try: - elements = dict([(a.strip(), dumb_property_dict(b)) - for a, b in elements]) + elements = {a.strip(): dumb_property_dict(b) for a, b in elements} except ValueError: # pragma: no cover elements = {} # not that important diff --git a/requirements-dev.txt b/requirements-dev.txt index 7288811a..5f7b544e 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,4 @@ coverage==4.4.2 py==1.5.2 -pypandoc==1.4 wheel==0.30.0 flake8==3.5.0 diff --git a/setup.py b/setup.py index 2541f542..143a9162 100644 --- a/setup.py +++ b/setup.py @@ -1,65 +1,23 @@ # coding: utf-8 -import sys +from setuptools import setup -from setuptools import setup, Command, find_packages - -def read_md_convert(f): - return convert(f, 'rst') - - -def read_md_open(f): - return open(f, 'r').read() - - -try: - from pypandoc import convert - read_md = read_md_convert -except ImportError: - read_md = read_md_open - -requires_list = [] -try: - import unittest2 as unittest -except ImportError: - import unittest -else: - if sys.version_info <= (2, 6): - requires_list.append("unittest2") - - -class RunTests(Command): - 
""" - New setup.py command to run all tests for the package. - """ - description = "run all tests for the package" - - user_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass - - def run(self): - tests = unittest.TestLoader().discover('.') - runner = unittest.TextTestRunner() - results = runner.run(tests) - sys.exit(not results.wasSuccessful()) +def readall(f): + with open(f) as fp: + return fp.read() setup( name="html2text", version=".".join(map(str, __import__('html2text').__version__)), description="Turn HTML into equivalent Markdown-structured text.", - long_description=read_md('README.md'), + long_description=readall('README.md'), + long_description_content_type="text/markdown", author="Aaron Swartz", author_email="me@aaronsw.com", maintainer='Alireza Savand', maintainer_email='alireza.savand@gmail.com', url='https://github.com/Alir3z4/html2text/', - cmdclass={'test': RunTests}, platforms='OS Independent', classifiers=[ 'Development Status :: 5 - Production/Stable', @@ -68,25 +26,23 @@ def run(self): 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.4', - 'Programming Language :: Python :: 2.5', - 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.0', - 'Programming Language :: Python :: 3.2', - 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: Implementation :: CPython', + 'Programming Language :: Python :: Implementation :: PyPy', ], - entry_points=""" - [console_scripts] - html2text=html2text.cli:main - """, + python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*", + entry_points={ + 'console_scripts': [ + 
'html2text = html2text.cli:main', + ] + }, license='GNU GPL 3', - requires=requires_list, - packages=find_packages(exclude=['test']), + packages=['html2text'], include_package_data=True, zip_safe=False, ) diff --git a/test/test_html2text.py b/test/test_html2text.py index 9b7e735a..398d5430 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -6,11 +6,7 @@ import re import subprocess import sys - -if sys.version_info[:2] < (2, 7): - import unittest2 as unittest -else: - import unittest +import unittest logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', @@ -79,7 +75,7 @@ def test_function(fn, **kwargs): def get_dump_name(fn, suffix): - return '%s-%s_output.md' % (os.path.splitext(fn)[0], suffix) + return '{}-{}_output.md'.format(os.path.splitext(fn)[0], suffix) def get_baseline_name(fn): @@ -255,6 +251,3 @@ def _test_func(self): setattr(TestHTML2Text, test_name + "_cmd", test_c) if test_func: setattr(TestHTML2Text, test_name + "_func", test_func) - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_memleak.py b/test/test_memleak.py index 737999c0..03b9170a 100644 --- a/test/test_memleak.py +++ b/test/test_memleak.py @@ -1,10 +1,6 @@ import html2text import logging -import sys -if sys.version_info[:2] < (2, 7): - import unittest2 as unittest -else: - import unittest +import unittest logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', From c7c2ef5229251af2c88610cb8a886f1a3ce6ec22 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Sat, 23 Feb 2019 10:07:22 -0800 Subject: [PATCH 392/479] Remove unused py from requirements-dev.txt --- requirements-dev.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 7288811a..488eb7de 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,4 @@ coverage==4.4.2 -py==1.5.2 pypandoc==1.4 wheel==0.30.0 flake8==3.5.0 From de2020633cd4b163cd7266e5f54aeca0872f124c Mon Sep 17 00:00:00 2001 From: Jon Dufresne 
Date: Mon, 25 Feb 2019 11:19:49 -0800 Subject: [PATCH 393/479] Fix flake8 warning by using a short link --- test/test_html2text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_html2text.py b/test/test_html2text.py index b3cb3967..fde0a842 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -240,7 +240,7 @@ def _test_func(self): return _test_mod, test_cmd, test_func -# Originally from https://stackoverflow.com/questions/32899/how-do-you-generate-dynamic-parameterized-unit-tests-in-python +# Originally from https://stackoverflow.com/q/32899 # how-to-generate-dynamic-parametrized-unit-tests-in-python test_dir_name = os.path.dirname(os.path.realpath(__file__)) for fn in glob.glob("%s/*.html" % test_dir_name): From 1435436f7bc18d99f5b36f44d345496c282eddba Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Sat, 23 Feb 2019 07:53:38 -0800 Subject: [PATCH 394/479] Remove unused and unnecessary logging from tests --- test/test_html2text.py | 5 ----- test/test_memleak.py | 5 ----- 2 files changed, 10 deletions(-) diff --git a/test/test_html2text.py b/test/test_html2text.py index fde0a842..d296a7ba 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -1,7 +1,6 @@ import codecs import glob import html2text -import logging import os import re import subprocess @@ -9,10 +8,6 @@ import unittest -logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', - level=logging.DEBUG) - - def cleanup_eol(clean_str): if os.name == 'nt' or sys.platform == 'cygwin': # Fix the unwanted CR to CRCRLF replacement diff --git a/test/test_memleak.py b/test/test_memleak.py index 03b9170a..442692c3 100644 --- a/test/test_memleak.py +++ b/test/test_memleak.py @@ -1,12 +1,7 @@ import html2text -import logging import unittest -logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', - level=logging.DEBUG) - - class TestMemleak(unittest.TestCase): """ See https://github.com/Alir3z4/html2text/issues/13 for more From 
22b2a43ad6118f041b40ef79fabf83d98a665acf Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Sat, 23 Feb 2019 10:18:51 -0800 Subject: [PATCH 395/479] Remove unnecessary `description-file = README.md` from setup.cfg setuptools include README.md as the description by default. For details, see: https://github.com/pypa/setuptools/blob/6575418283ec605f6bca82c525f6680d897ce282/setuptools/command/sdist.py#L40-L41 --- setup.cfg | 3 --- 1 file changed, 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index 1eee7db0..2a9acf13 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,2 @@ [bdist_wheel] universal = 1 - -[metadata] -description-file = README.md From b6965a73874107640260019cfeb7256eed5cbc15 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Sat, 23 Feb 2019 06:32:43 -0800 Subject: [PATCH 396/479] Remove long deprecated support for retrieving HTML over the network The feature was deprecated in commit 5672e2dd2c5bf918275acaf1142b36beba1b9ac5 (2016-05-27) and was originally slated for removal in 2017. Fixes #117 --- ChangeLog.rst | 1 + README.md | 2 +- html2text/cli.py | 17 ++--------------- html2text/compat.py | 4 +--- 4 files changed, 5 insertions(+), 19 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 9e40057d..981cb139 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -6,6 +6,7 @@ * Fix #201: handle ‎/‏ marks mid-text within stressed tags or right after stressed tags. * Feature #213: `images_as_html` config option to always generate an `img` html tag. preserves "height", "width" and "alt" if possible. * Remove support for end-of-life Pythons. Now requires Python 2.7 or 3.4+. +* Remove support for retrieving HTML over the network. 2018.1.9 ======== diff --git a/README.md b/README.md index 72b1928f..21b1f06c 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ html2text is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). 
-Usage: `html2text [(filename|url) [encoding]]` +Usage: `html2text [filename [encoding]]` | Option | Description |--------------------------------------------------------|--------------------------------------------------- diff --git a/html2text/cli.py b/html2text/cli.py index 78ece65a..25457ba1 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -1,7 +1,5 @@ import argparse -import warnings -from html2text.compat import urllib from html2text import HTML2Text, config, __version__ from html2text.utils import wrapwrite, wrap_read @@ -245,19 +243,8 @@ class bcolors: # pragma: no cover args = p.parse_args() if args.filename and args.filename != '-': # pragma: no cover - if (args.filename.startswith('http://') or - args.filename.startswith('https://')): - warnings.warn( - "Support for retrieving html over network is set for " - "deprecation by version (2017, 1, x)", - DeprecationWarning - ) - baseurl = args.filename - j = urllib.urlopen(baseurl) - data = j.read() - else: - with open(args.filename, 'rb') as fp: - data = fp.read() + with open(args.filename, 'rb') as fp: + data = fp.read() else: data = wrap_read() diff --git a/html2text/compat.py b/html2text/compat.py index f669f980..9130a7ba 100644 --- a/html2text/compat.py +++ b/html2text/compat.py @@ -5,17 +5,15 @@ import htmlentitydefs import urlparse import HTMLParser - import urllib from cgi import escape as html_escape else: import urllib.parse as urlparse import html.entities as htmlentitydefs import html.parser as HTMLParser - import urllib.request as urllib from html import escape def html_escape(s): return escape(s, quote=False) -__all__ = ['HTMLParser', 'html_escape', 'htmlentitydefs', 'urllib', 'urlparse'] +__all__ = ['HTMLParser', 'html_escape', 'htmlentitydefs', 'urlparse'] From 5877371302cd604767052e4fc77db837e6a8076f Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Mon, 25 Feb 2019 13:26:22 -0800 Subject: [PATCH 397/479] Add a __main__.py file to the package Allows executing the command line interface 
with Python's -m option: python -m html2text ... --- ChangeLog.rst | 1 + html2text/__init__.py | 6 ------ html2text/__main__.py | 3 +++ setup.py | 1 + test/test_html2text.py | 2 +- 5 files changed, 6 insertions(+), 7 deletions(-) create mode 100644 html2text/__main__.py diff --git a/ChangeLog.rst b/ChangeLog.rst index 981cb139..4255ef88 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -7,6 +7,7 @@ * Feature #213: `images_as_html` config option to always generate an `img` html tag. preserves "height", "width" and "alt" if possible. * Remove support for end-of-life Pythons. Now requires Python 2.7 or 3.4+. * Remove support for retrieving HTML over the network. +* Add ``__main__.py`` module to allow running the CLI using ``python -m html2text ...``. 2018.1.9 ======== diff --git a/html2text/__init__.py b/html2text/__init__.py index 333906c5..52806fe1 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -954,9 +954,3 @@ def unescape(s, unicode_snob=False): h.unicode_snob = unicode_snob return h.unescape(s) - - -if __name__ == "__main__": - from html2text.cli import main - - main() diff --git a/html2text/__main__.py b/html2text/__main__.py new file mode 100644 index 00000000..6a5a4f85 --- /dev/null +++ b/html2text/__main__.py @@ -0,0 +1,3 @@ +from html2text.cli import main + +main() diff --git a/setup.py b/setup.py index 143a9162..7708f255 100644 --- a/setup.py +++ b/setup.py @@ -45,4 +45,5 @@ def readall(f): packages=['html2text'], include_package_data=True, zip_safe=False, + test_suite='test', ) diff --git a/test/test_html2text.py b/test/test_html2text.py index d296a7ba..7ecd5b50 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -40,7 +40,7 @@ def test_module(fn, google_doc=False, **kwargs): def test_command(fn, *args): args = list(args) - cmd = [sys.executable, '-m', 'html2text.__init__'] + cmd = [sys.executable, '-m', 'html2text'] if '--googledoc' in args: args.remove('--googledoc') From 69136508fa33e91cf773929022838c5131a9ca13 Mon Sep 
17 00:00:00 2001 From: Jon Dufresne Date: Mon, 25 Feb 2019 19:26:49 -0800 Subject: [PATCH 398/479] Remove unused options from setup.py Remove include_package_data. The package does not contain any data files, only Python files. Remove zip_safe. As the package does not contain any data files, it is zip safe. --- setup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.py b/setup.py index 7708f255..e705a95a 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,5 @@ def readall(f): }, license='GNU GPL 3', packages=['html2text'], - include_package_data=True, - zip_safe=False, test_suite='test', ) From 17b4f29b8a89a21d9cee74c4fb1896bec6909df9 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Mon, 25 Feb 2019 19:49:53 -0800 Subject: [PATCH 399/479] Remove unused get_dump_name() Unused since 9125b6c5261f5cb5fde5b9d22ee121d2920813bf. Nothing dumps output files. --- .gitignore | 1 - test/test_html2text.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/.gitignore b/.gitignore index 3881b4cc..b7ecb128 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ *.py[co] *.bak -test/*output.md build dist *.egg-info diff --git a/test/test_html2text.py b/test/test_html2text.py index 7ecd5b50..c1cd4e5d 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -69,10 +69,6 @@ def test_function(fn, **kwargs): return result, actual -def get_dump_name(fn, suffix): - return '{}-{}_output.md'.format(os.path.splitext(fn)[0], suffix) - - def get_baseline_name(fn): return os.path.splitext(fn)[0] + '.md' From 61d668fe50daaa3c16912f6a11947830f26e568e Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Tue, 26 Feb 2019 06:42:34 -0800 Subject: [PATCH 400/479] Enable pip cache in Travis CI Reduce load on PyPI servers and slightly speed up builds. 
For more information, see: https://docs.travis-ci.com/user/caching/#pip-cache --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 95c71d03..d0e02bf8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,6 @@ dist: xenial language: python +cache: pip python: - "2.7" - "3.4" From 2d3ef004c473475275fb5a4c7ec5db2b40841120 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Mon, 25 Feb 2019 20:52:50 -0800 Subject: [PATCH 401/479] Use False (not 0) in config.py --- html2text/config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/html2text/config.py b/html2text/config.py index 7205e439..6c5506a8 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -3,16 +3,16 @@ import re # Use Unicode characters instead of their ascii pseudo-replacements -UNICODE_SNOB = 0 +UNICODE_SNOB = False # Marker to use for marking tables for padding post processing TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding" # Escape all special characters. Output is less readable, but avoids # corner case formatting issues. -ESCAPE_SNOB = 0 +ESCAPE_SNOB = False # Put the links after each paragraph instead of at the end. -LINKS_EACH_PARAGRAPH = 0 +LINKS_EACH_PARAGRAPH = False # Wrap long lines at position. 0 for no wrapping. 
BODY_WIDTH = 78 From 46ecef3afb05d281f377de2ef07d4c17cc29ae6e Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Tue, 26 Feb 2019 17:43:22 -0800 Subject: [PATCH 402/479] Fix extra space after a stressed tag followed by an entity Fixes #237 --- ChangeLog.rst | 1 + html2text/__init__.py | 12 ++++++------ test/stressed_with_html_entities.html | 1 + test/stressed_with_html_entities.md | 2 ++ test/test_html2text.py | 7 ++++++- 5 files changed, 16 insertions(+), 7 deletions(-) create mode 100644 test/stressed_with_html_entities.html create mode 100644 test/stressed_with_html_entities.md diff --git a/ChangeLog.rst b/ChangeLog.rst index 4255ef88..e4f69203 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -8,6 +8,7 @@ * Remove support for end-of-life Pythons. Now requires Python 2.7 or 3.4+. * Remove support for retrieving HTML over the network. * Add ``__main__.py`` module to allow running the CLI using ``python -m html2text ...``. +* Fix #237: correct spacing when a HTML entity follows a non-stressed tags which follow a stressed tag. 
2018.1.9 ======== diff --git a/html2text/__init__.py b/html2text/__init__.py index 52806fe1..f921b5be 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -807,12 +807,12 @@ def handle_data(self, data, entity_char=False): data = data.strip() self.stressed = False self.preceding_stressed = True - elif (self.preceding_stressed - and re.match(r'[^\s.!?]', data[0]) - and not hn(self.current_tag) - and self.current_tag not in ['a', 'code', 'pre']): - # should match a letter or common punctuation - data = ' ' + data + elif self.preceding_stressed: + if (re.match(r'[^\s.!?]', data[0]) + and not hn(self.current_tag) + and self.current_tag not in ['a', 'code', 'pre']): + # should match a letter or common punctuation + data = ' ' + data self.preceding_stressed = False if self.style: diff --git a/test/stressed_with_html_entities.html b/test/stressed_with_html_entities.html new file mode 100644 index 00000000..de925e2b --- /dev/null +++ b/test/stressed_with_html_entities.html @@ -0,0 +1 @@ +

    hello world ><

    diff --git a/test/stressed_with_html_entities.md b/test/stressed_with_html_entities.md new file mode 100644 index 00000000..6c59c1e9 --- /dev/null +++ b/test/stressed_with_html_entities.md @@ -0,0 +1,2 @@ +**hello** world >< + diff --git a/test/test_html2text.py b/test/test_html2text.py index c1cd4e5d..8f746b89 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -215,7 +215,12 @@ def _test_func(self): module_args['wrap_list_items'] = True cmdline_args.append('--wrap-list-items') - if base_fn not in ['bodywidth_newline.html', 'abbr_tag.html']: + func_compat = [ + 'abbr_tag.html', + 'bodywidth_newline.html', + 'stressed_with_html_entities.html' + ] + if base_fn not in func_compat: test_func = None else: test_func = _test_func From bf87d8a2442c230c7f0559fcc08d0b79af451aa2 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Tue, 26 Feb 2019 18:16:06 -0800 Subject: [PATCH 403/479] Add tox.ini to easily test all platforms locally To test all platforms locally, run the single command: tox --- .gitignore | 2 ++ .travis.yml | 40 +++++++++++++++++++++++----------------- MANIFEST.in | 1 + README.md | 3 +-- docs/test.md | 4 +--- tox.ini | 20 ++++++++++++++++++++ 6 files changed, 48 insertions(+), 22 deletions(-) create mode 100644 tox.ini diff --git a/.gitignore b/.gitignore index b7ecb128..a219c128 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ dist env/ .c9/ .vscode +.tox/ +htmlcov/ diff --git a/.travis.yml b/.travis.yml index d0e02bf8..4dfbf9e7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,23 +1,29 @@ dist: xenial language: python cache: pip -python: - - "2.7" - - "3.4" - - "3.5" - - "3.6" - - "3.7" - - "pypy2.7-6.0" - - "pypy3.5-6.0" + +matrix: + include: + - env: TOXENV=flake8 + - python: "2.7" + env: TOXENV=py27 + - python: "3.4" + env: TOXENV=py34 + - python: "3.5" + env: TOXENV=py35 + - python: "3.6" + env: TOXENV=py36 + - python: "3.7" + env: TOXENV=py37 + - python: "pypy2.7-6.0" + env: TOXENV=pypy + - python: "pypy3.5-6.0" + env: 
TOXENV=pypy3 + install: - - pip install coveralls -before_script: - - pip install flake8 - - flake8 - - export COVERAGE_PROCESS_START=$PWD/.coveragerc + - pip install tox script: - - coverage run --source=html2text setup.py test - - coverage combine - - coverage report + - tox after_success: - coveralls + - pip install coveralls + - coveralls diff --git a/MANIFEST.in b/MANIFEST.in index 6217360c..28e28387 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,4 +2,5 @@ include COPYING include README.md include ChangeLog.rst include AUTHORS.rst +include tox.ini recursive-include test *.html *.md *.py diff --git a/README.md b/README.md index 21b1f06c..d7d19ed6 100644 --- a/README.md +++ b/README.md @@ -73,11 +73,10 @@ $ pip install html2text ## How to run unit tests - coverage run --source=html2text setup.py test + tox To see the coverage results: - coverage combine coverage html then open the `./htmlcov/index.html` file in your browser. diff --git a/docs/test.md b/docs/test.md index 1ff19d91..22c64111 100644 --- a/docs/test.md +++ b/docs/test.md @@ -6,12 +6,10 @@ Testing is essential. Run the tests ------------- -`PYTHONPATH=$PYTHONPATH:. coverage run --source=html2text setup.py test -v` +`tox` Coverage results can be seen with -`coverage combine` - `coverage html` and then opening the `./htmlcov/index.html` file with your browser. 
diff --git a/tox.ini b/tox.ini new file mode 100644 index 00000000..ca4e8c42 --- /dev/null +++ b/tox.ini @@ -0,0 +1,20 @@ +[tox] +envlist = + flake8 + py{27,py34,py35,py36,py,py3} +minversion = 1.9 + +[testenv] +commands = + coverage run setup.py test + coverage combine + coverage report +deps = + coverage + +[testenv:flake8] +commands = + flake8 +deps = + flake8 +skip_install = true From e0c29574a8958a720c0e12550c40d39d5d41db69 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Wed, 27 Feb 2019 07:12:02 -0800 Subject: [PATCH 404/479] Add myself as a contributor --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index ebda6d18..3ec70c5b 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -30,6 +30,7 @@ The AUTHORS/Contributors are (and/or have been): * Germain Z. * Jacek Kołodziej * Jonathan Vanasco +* Jon Dufresne Maintainer: From 975819b47d27dcfaf766acf4a5192878f75c5662 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Wed, 27 Feb 2019 07:17:49 -0800 Subject: [PATCH 405/479] Run all argument-less tests as a func test Helps ensure the func interface works as expected. 
--- test/test_html2text.py | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/test/test_html2text.py b/test/test_html2text.py index 8f746b89..47d8330a 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -137,35 +137,43 @@ def _test_func(self): module_args = {} cmdline_args = [] func_args = {} + skip_func = False base_fn = os.path.basename(fn).lower() if base_fn.startswith('default_image_alt'): module_args['default_image_alt'] = 'Image' cmdline_args.append('--default-image-alt=Image') + skip_func = True if base_fn.startswith('google'): module_args['google_doc'] = True cmdline_args.append('--googledoc') + skip_func = True if base_fn.find('unicode') >= 0: module_args['unicode_snob'] = True + skip_func = True if base_fn.find('flip_emphasis') >= 0: module_args['emphasis_mark'] = '*' module_args['strong_mark'] = '__' cmdline_args.append('-e') + skip_func = True if base_fn.find('escape_snob') >= 0: module_args['escape_snob'] = True cmdline_args.append('--escape-all') + skip_func = True if base_fn.find('table_bypass') >= 0: module_args['bypass_tables'] = True cmdline_args.append('--bypass-tables') + skip_func = True if base_fn.startswith('table_ignore'): module_args['ignore_tables'] = True cmdline_args.append('--ignore-tables') + skip_func = True if base_fn.startswith('bodywidth'): # module_args['unicode_snob'] = True @@ -176,63 +184,70 @@ def _test_func(self): if base_fn.startswith('protect_links'): module_args['protect_links'] = True cmdline_args.append('--protect-links') + skip_func = True if base_fn.startswith('images_as_html'): module_args['images_as_html'] = True cmdline_args.append('--images-as-html') + skip_func = True if base_fn.startswith('images_to_alt'): module_args['images_to_alt'] = True cmdline_args.append('--images-to-alt') + skip_func = True if base_fn.startswith('images_with_size'): module_args['images_with_size'] = True cmdline_args.append('--images-with-size') + skip_func = True if 
base_fn.startswith('single_line_break'): module_args['body_width'] = 0 cmdline_args.append('--body-width=0') module_args['single_line_break'] = True cmdline_args.append('--single-line-break') + skip_func = True if base_fn.startswith('no_inline_links'): module_args['inline_links'] = False cmdline_args.append('--reference-links') + skip_func = True if base_fn.startswith('no_wrap_links'): module_args['wrap_links'] = False cmdline_args.append('--no-wrap-links') + skip_func = True if base_fn.startswith('mark_code'): module_args['mark_code'] = True cmdline_args.append('--mark-code') + skip_func = True if base_fn.startswith('pad_table'): module_args['pad_tables'] = True cmdline_args.append('--pad-tables') + skip_func = True if base_fn.startswith('wrap_list_items'): module_args['wrap_list_items'] = True cmdline_args.append('--wrap-list-items') - - func_compat = [ - 'abbr_tag.html', - 'bodywidth_newline.html', - 'stressed_with_html_entities.html' - ] - if base_fn not in func_compat: - test_func = None - else: - test_func = _test_func + skip_func = True if base_fn == 'inplace_baseurl_substitution.html': module_args['baseurl'] = 'http://brettterpstra.com' module_args['body_width'] = 0 + func_args['baseurl'] = 'http://brettterpstra.com' + func_args['bodywidth'] = 0 # there is no way to specify baseurl in cli :( test_cmd = None else: test_cmd = _test_cmd + if skip_func: + test_func = None + else: + test_func = _test_func + return _test_mod, test_cmd, test_func From 6de63e14b7caed03fbd9858d9e9ee49293c02bc7 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Wed, 27 Feb 2019 17:52:16 -0800 Subject: [PATCH 406/479] Correct fixed-space literal rst syntax Requires two back-quotes, not one. 
http://docutils.sourceforge.net/docs/user/rst/quickstart.html#text-styles --- ChangeLog.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index e4f69203..6ee928b0 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -4,7 +4,7 @@ * Add support for wrapping list items. * Fix #201: handle ‎/‏ marks mid-text within stressed tags or right after stressed tags. -* Feature #213: `images_as_html` config option to always generate an `img` html tag. preserves "height", "width" and "alt" if possible. +* Feature #213: ``images_as_html`` config option to always generate an ``img`` html tag. preserves "height", "width" and "alt" if possible. * Remove support for end-of-life Pythons. Now requires Python 2.7 or 3.4+. * Remove support for retrieving HTML over the network. * Add ``__main__.py`` module to allow running the CLI using ``python -m html2text ...``. @@ -29,7 +29,7 @@ * Feature #164: Housekeeping: Add flake8 to the travis build, cleanup existing flake8 violations, add py3.6 and pypy3 to the travis build * Fix #109: Fix for unexpanded < > & * Fix #143: Fix line wrapping for the lines starting with bold -* Adds support for numeric bold text indication in `font-weight`, +* Adds support for numeric bold text indication in ``font-weight``, as used by Google (and presumably others.) * Fix #173 and #142: Stripping whitespace in crucial markdown and adding whitespace as necessary * Don't drop any cell data on tables uneven row lengths (e.g. colspan in use) @@ -80,8 +80,8 @@ ========= ---- -* Fix #38: Long links wrapping controlled by `--no-wrap-links`. -* Note: `--no-wrap-links` implies `--reference-links` +* Fix #38: Long links wrapping controlled by ``--no-wrap-links``. +* Note: ``--no-wrap-links`` implies ``--reference-links`` * Feature #83: Add callback-on-tag. * Fix #87: Decode errors can be handled via command line. * Feature #95: Docs, decode errors spelling mistake. 
@@ -143,7 +143,7 @@ ========= ---- -* Fix #38: Anchor tags with empty text or with `` tags inside are no longer stripped. +* Fix #38: Anchor tags with empty text or with ```` tags inside are no longer stripped. 2014.12.29 @@ -170,8 +170,8 @@ ========= ---- -* Feature: Update `README.md` with usage examples. -* Fix #35: Remove `py_modules` from `setup.py`. +* Feature: Update ``README.md`` with usage examples. +* Fix #35: Remove ``py_modules`` from ``setup.py``. * Fix #36: Excludes tests from being installed as a separate module. * Fix #37: Don't hardcode the path to the installed binary. * Fix: Readme typo in running cli. From b616e5b7f372538ab3afaaaf94d6969ddd2cf93c Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Wed, 27 Feb 2019 18:07:17 -0800 Subject: [PATCH 407/479] Correct default tox matrix Previously was missing py37 and some targets had too many "py" substrings. --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index ca4e8c42..e6201ad5 100644 --- a/tox.ini +++ b/tox.ini @@ -1,7 +1,7 @@ [tox] envlist = flake8 - py{27,py34,py35,py36,py,py3} + py{27,34,35,36,37,py,py3} minversion = 1.9 [testenv] From 5443b50490cb11f3b964674b80e7f92d38be0d08 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Tue, 26 Feb 2019 19:00:12 -0800 Subject: [PATCH 408/479] Remove unused unescape() function and related support code --- ChangeLog.rst | 6 ++++++ docs/how_it_works.md | 3 --- docs/usage.md | 1 - html2text/__init__.py | 17 ----------------- html2text/compat.py | 8 +------- html2text/config.py | 1 - test/test_html2text.py | 15 --------------- 7 files changed, 7 insertions(+), 44 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 6ee928b0..0041dcd7 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -9,6 +9,12 @@ * Remove support for retrieving HTML over the network. * Add ``__main__.py`` module to allow running the CLI using ``python -m html2text ...``. 
* Fix #237: correct spacing when a HTML entity follows a non-stressed tags which follow a stressed tag. +* Remove unused or deprecated: + * ``html2text.compat.escape()`` + * ``html2text.config.RE_UNESCAPE`` + * ``html2text.HTML2Text.replaceEntities()`` + * ``html2text.HTML2Text.unescape()`` + * ``html2text.unescape()`` 2018.1.9 ======== diff --git a/docs/how_it_works.md b/docs/how_it_works.md index b9ef81ee..ef7d850f 100644 --- a/docs/how_it_works.md +++ b/docs/how_it_works.md @@ -36,7 +36,6 @@ Used to provide various configuration settings to the converter. They are as fol - UNIFIABLE is a dictionary which maps unicode abbreviations to ASCII values - RE_SPACE for finding space-only lines - - RE_UNESCAPE for finding html entities like   - RE_ORDERED_LIST_MATCHER for matching ordered lists in MD - RE_UNORDERED_LIST_MATCHER for matching unordered list matcher in MD - RE_MD_CHARS_MATCHER for matching Md \,[,],( and ) @@ -133,8 +132,6 @@ The class defines methods: - unknown_decl - charref - entityref - - replaceEntities - - unescape - google_nest_count - optwrap diff --git a/docs/usage.md b/docs/usage.md index 4cbb232b..a1758d30 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -82,7 +82,6 @@ simple indications of their function. 
- UNIFIABLE is a dictionary which maps unicode abbreviations to ASCII values - RE_SPACE for finding space-only lines - - RE_UNESCAPE for finding html entities like   - RE_ORDERED_LIST_MATCHER for matching ordered lists in MD - RE_UNORDERED_LIST_MATCHER for matching unordered list matcher in MD - RE_MD_CHARS_MATCHER for matching Md \,[,],( and ) diff --git a/html2text/__init__.py b/html2text/__init__.py index f921b5be..cb85123f 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -867,16 +867,6 @@ def entityref(self, c): else: return chr(name2cp(c)) - def replaceEntities(self, s): - s = s.group(1) - if s[0] == "#": - return self.charref(s[1:]) - else: - return self.entityref(s) - - def unescape(self, s): - return config.RE_UNESCAPE.sub(self.replaceEntities, s) - def google_nest_count(self, style): """ Calculate the nesting count of google doc lists @@ -947,10 +937,3 @@ def html2text(html, baseurl='', bodywidth=None): h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth) return h.handle(html) - - -def unescape(s, unicode_snob=False): - h = HTML2Text() - h.unicode_snob = unicode_snob - - return h.unescape(s) diff --git a/html2text/compat.py b/html2text/compat.py index 9130a7ba..e19baef2 100644 --- a/html2text/compat.py +++ b/html2text/compat.py @@ -5,15 +5,9 @@ import htmlentitydefs import urlparse import HTMLParser - from cgi import escape as html_escape else: import urllib.parse as urlparse import html.entities as htmlentitydefs import html.parser as HTMLParser - from html import escape - def html_escape(s): - return escape(s, quote=False) - - -__all__ = ['HTMLParser', 'html_escape', 'htmlentitydefs', 'urlparse'] +__all__ = ['HTMLParser', 'htmlentitydefs', 'urlparse'] diff --git a/html2text/config.py b/html2text/config.py index 6c5506a8..e2a04f9d 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -57,7 +57,6 @@ # For checking space-only lines on line 771 RE_SPACE = re.compile(r'\s\+') -RE_UNESCAPE = 
re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));") RE_ORDERED_LIST_MATCHER = re.compile(r'\d+\.\s') RE_UNORDERED_LIST_MATCHER = re.compile(r'[-\*\+]\s') RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])") diff --git a/test/test_html2text.py b/test/test_html2text.py index 47d8330a..8f3f41b4 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -82,21 +82,6 @@ def get_baseline(fn): class TestHTML2Text(unittest.TestCase): - - def test_html_escape(self): - self.assertEqual( - html2text.compat.html_escape('
    and then
    & other tags'), - '<pre>and then<div> & other tags' - ) - - def test_unescape(self): - self.assertEqual( - '
    and then
    & other tags', - html2text.unescape( - '<pre>and then<div> & other tags' - ) - ) - def _skip_certain_tags(self, h2t, tag, attrs, start): if tag == 'b': return True From c8c2bccd4176088a453245af7ff09742fa8cfbb8 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Wed, 27 Feb 2019 18:03:26 -0800 Subject: [PATCH 409/479] Simplify subprocess call in tests with subprocess.check_output() --- test/test_html2text.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_html2text.py b/test/test_html2text.py index 8f3f41b4..a48b8c5d 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -52,8 +52,7 @@ def test_command(fn, *args): cmd += [fn] result = get_baseline(fn) - pid = subprocess.Popen(cmd, stdout=subprocess.PIPE) - out, _ = pid.communicate() + out = subprocess.check_output(cmd) actual = out.decode('utf8') From 0fbb8343d6988b0523fba493f7c9ba2b9d471226 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Wed, 27 Feb 2019 21:37:55 -0800 Subject: [PATCH 410/479] Remove unused method unknown_decl The function is empty, same as the parent class's implementation. Just use inheritance rather than re-defining it. --- docs/how_it_works.md | 1 - html2text/__init__.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/docs/how_it_works.md b/docs/how_it_works.md index ef7d850f..eed042b1 100644 --- a/docs/how_it_works.md +++ b/docs/how_it_works.md @@ -129,7 +129,6 @@ The class defines methods: - soft_br - o - handle_data - - unknown_decl - charref - entityref - google_nest_count diff --git a/html2text/__init__.py b/html2text/__init__.py index cb85123f..6569f75c 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -835,10 +835,6 @@ def handle_data(self, data, entity_char=False): self.preceding_data = data self.o(data, 1) - def unknown_decl(self, data): # pragma: no cover - # TODO: what is this doing here? 
- pass - def charref(self, name): if name[0] in ['x', 'X']: c = int(name[1:], 16) From c84c168e535b42e3434c3a6d4f23b5ba45fcb297 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Thu, 28 Feb 2019 17:48:40 -0800 Subject: [PATCH 411/479] Fix ticket number in changelog --- ChangeLog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 0041dcd7..d8a943fd 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -8,7 +8,7 @@ * Remove support for end-of-life Pythons. Now requires Python 2.7 or 3.4+. * Remove support for retrieving HTML over the network. * Add ``__main__.py`` module to allow running the CLI using ``python -m html2text ...``. -* Fix #237: correct spacing when a HTML entity follows a non-stressed tags which follow a stressed tag. +* Fix #238: correct spacing when a HTML entity follows a non-stressed tags which follow a stressed tag. * Remove unused or deprecated: * ``html2text.compat.escape()`` * ``html2text.config.RE_UNESCAPE`` From 57de7529195e0127a0cfbf1567859754f0e71620 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Wed, 27 Feb 2019 21:53:09 -0800 Subject: [PATCH 412/479] Handle LEFT-TO-RIGHT MARK after a `` tag Fixes #208 --- ChangeLog.rst | 1 + html2text/__init__.py | 5 +++++ test/lrm_after_b.html | 1 + test/lrm_after_b.md | 2 ++ 4 files changed, 9 insertions(+) create mode 100644 test/lrm_after_b.html create mode 100644 test/lrm_after_b.md diff --git a/ChangeLog.rst b/ChangeLog.rst index d8a943fd..08907ded 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -15,6 +15,7 @@ * ``html2text.HTML2Text.replaceEntities()`` * ``html2text.HTML2Text.unescape()`` * ``html2text.unescape()`` +* Fix #208: handle LEFT-TO-RIGHT MARK after a stressed tag. 
2018.1.9 ======== diff --git a/html2text/__init__.py b/html2text/__init__.py index 6569f75c..e189c663 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -803,6 +803,11 @@ def o(self, data, puredata=0, force=0): self.outcount += 1 def handle_data(self, data, entity_char=False): + if not data: + # Data may be empty for some HTML entities. For example, + # LEFT-TO-RIGHT MARK. + return + if self.stressed: data = data.strip() self.stressed = False diff --git a/test/lrm_after_b.html b/test/lrm_after_b.html new file mode 100644 index 00000000..89932aa3 --- /dev/null +++ b/test/lrm_after_b.html @@ -0,0 +1 @@ +b‎ diff --git a/test/lrm_after_b.md b/test/lrm_after_b.md new file mode 100644 index 00000000..9c875b21 --- /dev/null +++ b/test/lrm_after_b.md @@ -0,0 +1,2 @@ +**b** + From fd669a7ea37e675f87c36d379489fb78cebd4d37 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Tue, 26 Feb 2019 21:00:42 -0800 Subject: [PATCH 413/479] Use pytest to run the test suite Better handles coverage allowing the removal of the coveragerc and sitecustomize.py files. Has builtin support for parametrized data. Has a visually nicer test runner and error messages. 
--- .coveragerc | 23 --- setup.py | 1 - sitecustomize.py | 9 -- test/test_html2text.py | 332 ++++++++++++++++++++--------------------- test/test_memleak.py | 31 ++-- tox.ini | 7 +- 6 files changed, 174 insertions(+), 229 deletions(-) delete mode 100644 .coveragerc delete mode 100644 sitecustomize.py diff --git a/.coveragerc b/.coveragerc deleted file mode 100644 index 11441c9c..00000000 --- a/.coveragerc +++ /dev/null @@ -1,23 +0,0 @@ -[run] -branch=True -parallel=True -cover_pylib=False -source=./html2text - -[report] -# Regexes for lines to exclude from consideration -exclude_lines = - # Have to re-enable the standard pragma - pragma: no cover - - # Don't complain about missing debug-only code: - def __repr__ - if self\.debug - - # Don't complain if tests don't hit defensive assertion code: - raise AssertionError - raise NotImplementedError - - # Don't complain if non-runnable code isn't run: - if 0: - if __name__ == .__main__.: diff --git a/setup.py b/setup.py index e705a95a..ce3069c5 100644 --- a/setup.py +++ b/setup.py @@ -43,5 +43,4 @@ def readall(f): }, license='GNU GPL 3', packages=['html2text'], - test_suite='test', ) diff --git a/sitecustomize.py b/sitecustomize.py deleted file mode 100644 index 973a2655..00000000 --- a/sitecustomize.py +++ /dev/null @@ -1,9 +0,0 @@ -import os - -import coverage - - -here = os.getcwd() -config_file = os.path.join(here, '.coveragerc') -os.environ['COVERAGE_PROCESS_START'] = config_file -coverage.process_startup() diff --git a/test/test_html2text.py b/test/test_html2text.py index a48b8c5d..8b07ae41 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -2,10 +2,13 @@ import glob import html2text import os +import pytest import re import subprocess import sys -import unittest + + +skip = object() def cleanup_eol(clean_str): @@ -18,28 +21,160 @@ def cleanup_eol(clean_str): return clean_str -def test_module(fn, google_doc=False, **kwargs): +def generate_testdata(): + test_dir_name = 
os.path.dirname(os.path.realpath(__file__)) + for fn in glob.glob("%s/*.html" % test_dir_name): + module_args = {} + cmdline_args = [] + func_args = {} + base_fn = os.path.basename(fn).lower() + + if base_fn.startswith('default_image_alt'): + module_args['default_image_alt'] = 'Image' + cmdline_args.append('--default-image-alt=Image') + func_args = skip + + if base_fn.startswith('google'): + module_args['google_doc'] = True + cmdline_args.append('--googledoc') + func_args = skip + + if base_fn.find('unicode') >= 0: + module_args['unicode_snob'] = True + # There is no command-line option to control unicode_snob. + cmdline_args = skip + func_args = skip + + if base_fn.find('flip_emphasis') >= 0: + module_args['emphasis_mark'] = '*' + module_args['strong_mark'] = '__' + cmdline_args.append('-e') + func_args = skip + + if base_fn.find('escape_snob') >= 0: + module_args['escape_snob'] = True + cmdline_args.append('--escape-all') + func_args = skip + + if base_fn.find('table_bypass') >= 0: + module_args['bypass_tables'] = True + cmdline_args.append('--bypass-tables') + func_args = skip + + if base_fn.startswith('table_ignore'): + module_args['ignore_tables'] = True + cmdline_args.append('--ignore-tables') + func_args = skip + + if base_fn.startswith('bodywidth'): + module_args['body_width'] = 0 + cmdline_args.append('--body-width=0') + func_args['bodywidth'] = 0 + + if base_fn.startswith('protect_links'): + module_args['protect_links'] = True + cmdline_args.append('--protect-links') + func_args = skip + + if base_fn.startswith('images_as_html'): + module_args['images_as_html'] = True + cmdline_args.append('--images-as-html') + func_args = skip + + if base_fn.startswith('images_to_alt'): + module_args['images_to_alt'] = True + cmdline_args.append('--images-to-alt') + func_args = skip + + if base_fn.startswith('images_with_size'): + module_args['images_with_size'] = True + cmdline_args.append('--images-with-size') + func_args = skip + + if 
base_fn.startswith('single_line_break'): + module_args['body_width'] = 0 + cmdline_args.append('--body-width=0') + module_args['single_line_break'] = True + cmdline_args.append('--single-line-break') + func_args = skip + + if base_fn.startswith('no_inline_links'): + module_args['inline_links'] = False + cmdline_args.append('--reference-links') + func_args = skip + + if base_fn.startswith('no_wrap_links'): + module_args['wrap_links'] = False + cmdline_args.append('--no-wrap-links') + func_args = skip + + if base_fn.startswith('mark_code'): + module_args['mark_code'] = True + cmdline_args.append('--mark-code') + func_args = skip + + if base_fn.startswith('pad_table'): + module_args['pad_tables'] = True + cmdline_args.append('--pad-tables') + func_args = skip + + if base_fn.startswith('wrap_list_items'): + module_args['wrap_list_items'] = True + cmdline_args.append('--wrap-list-items') + func_args = skip + + if base_fn == 'inplace_baseurl_substitution.html': + module_args['baseurl'] = 'http://brettterpstra.com' + module_args['body_width'] = 0 + func_args['baseurl'] = 'http://brettterpstra.com' + func_args['bodywidth'] = 0 + # CLI doesn't support baseurl. 
+ cmdline_args = skip + + yield fn, module_args, cmdline_args, func_args + + +def generate_module_testdata(): + for fn, module_args, cmdline_args, func_args in generate_testdata(): + yield fn, module_args + + +def generate_command_testdata(): + for fn, module_args, cmdline_args, func_args in generate_testdata(): + if cmdline_args is not skip: + yield fn, cmdline_args + + +def generate_function_testdata(): + for fn, module_args, cmdline_args, func_args in generate_testdata(): + if func_args is not skip: + yield fn, func_args + + +@pytest.mark.parametrize("fn,module_args", generate_module_testdata()) +def test_module(fn, module_args): h = html2text.HTML2Text() h.fn = fn - if google_doc: + if module_args.pop('google_doc', False): h.google_doc = True h.ul_item_mark = '-' h.body_width = 0 h.hide_strikethrough = True - for k, v in kwargs.items(): + for k, v in module_args.items(): setattr(h, k, v) result = get_baseline(fn) with open(fn) as inf: actual = cleanup_eol(inf.read()) actual = h.handle(actual) - return result, actual + assert result == actual -def test_command(fn, *args): - args = list(args) +@pytest.mark.parametrize("fn,cmdline_args", generate_command_testdata()) +def test_command(fn, cmdline_args): + args = list(cmdline_args) cmd = [sys.executable, '-m', 'html2text'] if '--googledoc' in args: @@ -58,14 +193,15 @@ def test_command(fn, *args): actual = cleanup_eol(actual) - return result, actual + assert result == actual -def test_function(fn, **kwargs): +@pytest.mark.parametrize("fn,func_args", generate_function_testdata()) +def test_function(fn, func_args): with open(fn) as inf: - actual = html2text.html2text(inf.read(), **kwargs) + actual = html2text.html2text(inf.read(), **func_args) result = get_baseline(fn) - return result, actual + assert result == actual def get_baseline_name(fn): @@ -80,169 +216,17 @@ def get_baseline(fn): return out -class TestHTML2Text(unittest.TestCase): - def _skip_certain_tags(self, h2t, tag, attrs, start): +def 
test_tag_callback(): + def _skip_certain_tags(h2t, tag, attrs, start): if tag == 'b': return True - def test_tag_callback(self): - h = html2text.HTML2Text() - h.tag_callback = self._skip_certain_tags - ret = h.handle( - 'this is a txt and this is a' - ' with text and ' - 'some italics too.' - ) - self.assertEqual( - ret, - 'this is a txt and this is a' - ' with text and ' - 'some _italics_ too.\n\n' - ) - - -def generate_test(fn): - def _test_mod(self): - self.maxDiff = None - result, actual = test_module(fn, **module_args) - self.assertEqual(result, actual) - - def _test_cmd(self): - # Because there is no command-line option to control unicode_snob - if 'unicode_snob' not in module_args: - self.maxDiff = None - result, actual = test_command(fn, *cmdline_args) - self.assertEqual(result, actual) - - def _test_func(self): - result, actual = test_function(fn, **func_args) - self.assertEqual(result, actual) - - module_args = {} - cmdline_args = [] - func_args = {} - skip_func = False - base_fn = os.path.basename(fn).lower() - - if base_fn.startswith('default_image_alt'): - module_args['default_image_alt'] = 'Image' - cmdline_args.append('--default-image-alt=Image') - skip_func = True - - if base_fn.startswith('google'): - module_args['google_doc'] = True - cmdline_args.append('--googledoc') - skip_func = True - - if base_fn.find('unicode') >= 0: - module_args['unicode_snob'] = True - skip_func = True - - if base_fn.find('flip_emphasis') >= 0: - module_args['emphasis_mark'] = '*' - module_args['strong_mark'] = '__' - cmdline_args.append('-e') - skip_func = True - - if base_fn.find('escape_snob') >= 0: - module_args['escape_snob'] = True - cmdline_args.append('--escape-all') - skip_func = True - - if base_fn.find('table_bypass') >= 0: - module_args['bypass_tables'] = True - cmdline_args.append('--bypass-tables') - skip_func = True - - if base_fn.startswith('table_ignore'): - module_args['ignore_tables'] = True - cmdline_args.append('--ignore-tables') - skip_func = True - 
- if base_fn.startswith('bodywidth'): - # module_args['unicode_snob'] = True - module_args['body_width'] = 0 - cmdline_args.append('--body-width=0') - func_args['bodywidth'] = 0 - - if base_fn.startswith('protect_links'): - module_args['protect_links'] = True - cmdline_args.append('--protect-links') - skip_func = True - - if base_fn.startswith('images_as_html'): - module_args['images_as_html'] = True - cmdline_args.append('--images-as-html') - skip_func = True - - if base_fn.startswith('images_to_alt'): - module_args['images_to_alt'] = True - cmdline_args.append('--images-to-alt') - skip_func = True - - if base_fn.startswith('images_with_size'): - module_args['images_with_size'] = True - cmdline_args.append('--images-with-size') - skip_func = True - - if base_fn.startswith('single_line_break'): - module_args['body_width'] = 0 - cmdline_args.append('--body-width=0') - module_args['single_line_break'] = True - cmdline_args.append('--single-line-break') - skip_func = True - - if base_fn.startswith('no_inline_links'): - module_args['inline_links'] = False - cmdline_args.append('--reference-links') - skip_func = True - - if base_fn.startswith('no_wrap_links'): - module_args['wrap_links'] = False - cmdline_args.append('--no-wrap-links') - skip_func = True - - if base_fn.startswith('mark_code'): - module_args['mark_code'] = True - cmdline_args.append('--mark-code') - skip_func = True - - if base_fn.startswith('pad_table'): - module_args['pad_tables'] = True - cmdline_args.append('--pad-tables') - skip_func = True - - if base_fn.startswith('wrap_list_items'): - module_args['wrap_list_items'] = True - cmdline_args.append('--wrap-list-items') - skip_func = True - - if base_fn == 'inplace_baseurl_substitution.html': - module_args['baseurl'] = 'http://brettterpstra.com' - module_args['body_width'] = 0 - func_args['baseurl'] = 'http://brettterpstra.com' - func_args['bodywidth'] = 0 - # there is no way to specify baseurl in cli :( - test_cmd = None - else: - test_cmd = _test_cmd 
- - if skip_func: - test_func = None - else: - test_func = _test_func - - return _test_mod, test_cmd, test_func - - -# Originally from https://stackoverflow.com/q/32899 -# how-to-generate-dynamic-parametrized-unit-tests-in-python -test_dir_name = os.path.dirname(os.path.realpath(__file__)) -for fn in glob.glob("%s/*.html" % test_dir_name): - test_name = 'test_%s' % os.path.splitext(os.path.basename(fn))[0].lower() - test_m, test_c, test_func = generate_test(fn) - setattr(TestHTML2Text, test_name + "_mod", test_m) - if test_c: - setattr(TestHTML2Text, test_name + "_cmd", test_c) - if test_func: - setattr(TestHTML2Text, test_name + "_func", test_func) + h = html2text.HTML2Text() + h.tag_callback = _skip_certain_tags + ret = h.handle( + 'this is a txt and this is a with text and ' + 'some italics too.' + ) + assert ret == ( + 'this is a txt and this is a with text and some _italics_ too.\n\n' + ) diff --git a/test/test_memleak.py b/test/test_memleak.py index 442692c3..019b4e3f 100644 --- a/test/test_memleak.py +++ b/test/test_memleak.py @@ -1,24 +1,19 @@ import html2text -import unittest +# See https://github.com/Alir3z4/html2text/issues/13 for more information. -class TestMemleak(unittest.TestCase): - """ - See https://github.com/Alir3z4/html2text/issues/13 for more - information on this. - """ +INSTR = 'miow ' - def setUp(self): - self.instr = 'miow ' - def test_same_string(self): - h2t = html2text.HTML2Text() - result = h2t.handle(self.instr) - # Now, we shouldn't get leak of the previous run to the new one - self.assertEqual(h2t.handle(self.instr), result) +def test_same_string(): + h2t = html2text.HTML2Text() + result = h2t.handle(INSTR) + # Now, we shouldn't get leak of the previous run to the new one. 
+ assert h2t.handle(INSTR) == result - def test_empty_string(self): - h2t = html2text.HTML2Text() - h2t.handle(self.instr) - # And even less when the input is empty - self.assertEqual(h2t.handle(''), '\n\n') + +def test_empty_string(): + h2t = html2text.HTML2Text() + h2t.handle(INSTR) + # And even less when the input is empty. + assert h2t.handle('') == '\n\n' diff --git a/tox.ini b/tox.ini index e6201ad5..f47c682c 100644 --- a/tox.ini +++ b/tox.ini @@ -6,11 +6,10 @@ minversion = 1.9 [testenv] commands = - coverage run setup.py test - coverage combine - coverage report + pytest --cov=html2text deps = - coverage + pytest + pytest-cov [testenv:flake8] commands = From ca32fb8ecec7f0f60fdb864e13c96a8c58f00b9f Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Sat, 2 Mar 2019 11:16:52 -0800 Subject: [PATCH 414/479] Remove 'pragma' comments for a more accurate coverage report It is okay that these lines are not yet tested, but we don't need to hide that fact from the coverage report. As time goes on, we may add tests for them. If that happens, it should be reflected in the coverage metrics. --- html2text/__init__.py | 6 +++--- html2text/cli.py | 4 ++-- html2text/utils.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 6569f75c..af3179c2 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -90,9 +90,9 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.open_quote = config.OPEN_QUOTE # covered in cli self.close_quote = config.CLOSE_QUOTE # covered in cli - if out is None: # pragma: no cover + if out is None: self.out = self.outtextf - else: # pragma: no cover + else: self.out = out # empty list to store output characters before they are "joined" @@ -209,7 +209,7 @@ def previousIndex(self, attrs): self.a list. 
If the set of attributes is not found, returns None :rtype: int """ - if 'href' not in attrs: # pragma: no cover + if 'href' not in attrs: return None i = -1 for a in self.a: diff --git a/html2text/cli.py b/html2text/cli.py index 25457ba1..1d25dac5 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -7,7 +7,7 @@ def main(): baseurl = '' - class bcolors: # pragma: no cover + class bcolors: HEADER = '\033[95m' OKBLUE = '\033[94m' OKGREEN = '\033[92m' @@ -242,7 +242,7 @@ class bcolors: # pragma: no cover p.add_argument('encoding', nargs='?', default='utf-8') args = p.parse_args() - if args.filename and args.filename != '-': # pragma: no cover + if args.filename and args.filename != '-': with open(args.filename, 'rb') as fp: data = fp.read() else: diff --git a/html2text/utils.py b/html2text/utils.py index fb3f6fe5..e6540379 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -20,7 +20,7 @@ def hn(tag): if tag[0] == 'h' and len(tag) == 2: try: n = int(tag[1]) - if n in range(1, 10): # pragma: no branch + if n in range(1, 10): return n except ValueError: return 0 @@ -58,7 +58,7 @@ def dumb_css_parser(data): elements = [x.split('{') for x in data.split('}') if '{' in x.strip()] try: elements = {a.strip(): dumb_property_dict(b) for a, b in elements} - except ValueError: # pragma: no cover + except ValueError: elements = {} # not that important return elements @@ -208,7 +208,7 @@ def wrapwrite(text): sys.stdout.write(text) -def wrap_read(): # pragma: no cover +def wrap_read(): """ :rtype: str """ From ce7fb974a7be23318ebe99af678a7b6e04d8a6bf Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Sat, 2 Mar 2019 11:50:11 -0800 Subject: [PATCH 415/479] Replace 0/1 with False/True when used as a bool --- html2text/__init__.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 6569f75c..ac1faf67 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ 
-101,8 +101,8 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.quiet = 0 self.p_p = 0 # number of newline character to print before next output self.outcount = 0 - self.start = 1 - self.space = 0 + self.start = True + self.space = False self.a = [] self.astack = [] self.maybe_automatic_link = None @@ -111,12 +111,12 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.acount = 0 self.list = [] self.blockquote = 0 - self.pre = 0 - self.startpre = 0 + self.pre = False + self.startpre = False self.code = False self.quote = False self.br_toggle = '' - self.lastWasNL = 0 + self.lastWasNL = False self.lastWasList = False self.style = 0 self.style_def = {} @@ -214,7 +214,7 @@ def previousIndex(self, attrs): i = -1 for a in self.a: i += 1 - match = 0 + match = False if 'href' in a and a['href'] == attrs['href']: if 'title' in a or 'title' in attrs: @@ -272,7 +272,7 @@ def handle_emphasis(self, start, tag_style, parent_style): if bold or italic or fixed: # there must not be whitespace before closing emphasis mark self.emphasis -= 1 - self.space = 0 + self.space = False if fixed: if self.drop_white_space: # empty emphasis, drop it @@ -386,7 +386,7 @@ def handle_tag(self, tag, attrs, start): if start: self.p() self.o('> ', 0, 1) - self.start = 1 + self.start = True self.blockquote += 1 else: self.blockquote -= 1 @@ -616,7 +616,7 @@ def link_url(self, link, title=""): elif li['name'] == "ol": li['num'] += 1 self.o(str(li['num']) + ". 
") - self.start = 1 + self.start = True if tag in ["table", "tr", "td", "th"]: if self.ignore_tables: @@ -673,10 +673,10 @@ def link_url(self, link, title=""): if tag == "pre": if start: - self.startpre = 1 - self.pre = 1 + self.startpre = True + self.pre = True else: - self.pre = 0 + self.pre = False if self.mark_code: self.out("\n[/code]") self.p() @@ -719,7 +719,7 @@ def o(self, data, puredata=0, force=0): # (see entityref) data = re.sub(r'\s+', r' ', data) if data and data[0] == ' ': - self.space = 1 + self.space = True data = data[1:] if not data and not force: return @@ -746,31 +746,31 @@ def o(self, data, puredata=0, force=0): data = data.replace("\n", "\n" + bq) if self.startpre: - self.startpre = 0 + self.startpre = False if self.list: # use existing initial indentation data = data.lstrip("\n") if self.start: - self.space = 0 + self.space = False self.p_p = 0 - self.start = 0 + self.start = False if force == 'end': # It's the end. self.p_p = 0 self.out("\n") - self.space = 0 + self.space = False if self.p_p: self.out((self.br_toggle + '\n' + bq) * self.p_p) - self.space = 0 + self.space = False self.br_toggle = '' if self.space: if not self.lastWasNL: self.out(' ') - self.space = 0 + self.space = False if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"): From 74a89a909aaa0ea8ae98d30a51b5c518b178ff03 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Sat, 2 Mar 2019 11:13:12 -0800 Subject: [PATCH 416/479] Remove shebang from __init__.py The file is not executable, no should not be executed directly, but as python -m html2text ... Or from the setuptools installed entry point. 
--- html2text/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 6569f75c..ddc645cc 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding: utf-8 """html2text: Turn HTML into equivalent Markdown-structured text.""" from __future__ import division From 26e986679b9070f7a52e7cc1446970166ae8bbf8 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Sat, 2 Mar 2019 10:59:52 -0800 Subject: [PATCH 417/479] Introduce isort for automated consistent sorting of Python imports This tool automates import sorting across Python files. This removes the need for contributors to think about the project's styling preferences for imports. Just let the tool do it instead. isort is a well established tool used by many projects. https://github.com/timothycrosley/isort --- .travis.yml | 1 + html2text/__init__.py | 25 ++++++++++++------------- html2text/cli.py | 4 ++-- html2text/compat.py | 1 - setup.cfg | 6 ++++++ test/test_html2text.py | 4 ++-- tox.ini | 8 ++++++++ 7 files changed, 31 insertions(+), 18 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4dfbf9e7..159d65fa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ cache: pip matrix: include: - env: TOXENV=flake8 + - env: TOXENV=isort - python: "2.7" env: TOXENV=py27 - python: "3.4" diff --git a/html2text/__init__.py b/html2text/__init__.py index ddc645cc..648b7a2a 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -1,29 +1,28 @@ # coding: utf-8 """html2text: Turn HTML into equivalent Markdown-structured text.""" -from __future__ import division -from __future__ import unicode_literals +from __future__ import division, unicode_literals + import re import sys from textwrap import wrap -from html2text.compat import urlparse, HTMLParser from html2text import config - +from html2text.compat import HTMLParser, urlparse from html2text.utils import ( - name2cp, - unifiable_n, - 
google_text_emphasis, - google_fixed_width_font, + dumb_css_parser, element_style, - hn, - google_has_height, escape_md, + escape_md_section, + google_fixed_width_font, + google_has_height, google_list_style, + google_text_emphasis, + hn, list_numbering_start, - dumb_css_parser, - escape_md_section, + name2cp, + pad_tables_in_text, skipwrap, - pad_tables_in_text + unifiable_n, ) try: diff --git a/html2text/cli.py b/html2text/cli.py index 25457ba1..88de1476 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -1,7 +1,7 @@ import argparse -from html2text import HTML2Text, config, __version__ -from html2text.utils import wrapwrite, wrap_read +from html2text import HTML2Text, __version__, config +from html2text.utils import wrap_read, wrapwrite def main(): diff --git a/html2text/compat.py b/html2text/compat.py index e19baef2..fde6c16c 100644 --- a/html2text/compat.py +++ b/html2text/compat.py @@ -1,6 +1,5 @@ import sys - if sys.version_info[0] == 2: import htmlentitydefs import urlparse diff --git a/setup.cfg b/setup.cfg index 2a9acf13..22315413 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,8 @@ [bdist_wheel] universal = 1 + +[isort] +combine_as_imports = True +include_trailing_comma = True +line_length = 88 +multi_line_output = 3 diff --git a/test/test_html2text.py b/test/test_html2text.py index 8b07ae41..013ee179 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -1,12 +1,12 @@ import codecs import glob -import html2text import os -import pytest import re import subprocess import sys +import html2text +import pytest skip = object() diff --git a/tox.ini b/tox.ini index f47c682c..be290b1d 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,7 @@ [tox] envlist = flake8 + isort py{27,34,35,36,37,py,py3} minversion = 1.9 @@ -17,3 +18,10 @@ commands = deps = flake8 skip_install = true + +[testenv:isort] +commands = + isort --check-only --diff +deps = + isort +skip_install = true From 8194cda63b77b1d4538cff0a4146d75858ac3d97 Mon Sep 17 00:00:00 2001 From: 
Jon Dufresne Date: Sat, 2 Mar 2019 11:24:57 -0800 Subject: [PATCH 418/479] Introduce black to automate Python code formatting Use Black, a Python code formatter, to automate all source code formatting. Helps new contributors forget about the project's code formatting preference and just lets the tool handle it. https://github.com/ambv/black > Black is the uncompromising Python code formatter. By using it, you > agree to cede control over minutiae of hand-formatting. In return, > Black gives you speed, determinism, and freedom from pycodestyle > nagging about formatting. You will save time and mental energy for > more important matters. --- .travis.yml | 8 +- html2text/__init__.py | 306 +++++++++++++++++++++-------------------- html2text/cli.py | 140 ++++++++++--------- html2text/compat.py | 2 +- html2text/config.py | 120 ++++++++-------- html2text/utils.py | 119 ++++++++-------- setup.cfg | 6 + setup.py | 50 ++++--- test/test_html2text.py | 156 +++++++++++---------- test/test_memleak.py | 4 +- tox.ini | 11 ++ 11 files changed, 486 insertions(+), 436 deletions(-) diff --git a/.travis.yml b/.travis.yml index 159d65fa..14b9f19a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,8 +4,12 @@ cache: pip matrix: include: - - env: TOXENV=flake8 - - env: TOXENV=isort + - python: "3.7" + env: TOXENV=black + - python: "3.7" + env: TOXENV=flake8 + - python: "3.7" + env: TOXENV=isort - python: "2.7" env: TOXENV=py27 - python: "3.4" diff --git a/html2text/__init__.py b/html2text/__init__.py index 648b7a2a..c16f2bf2 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -27,10 +27,10 @@ try: chr = unichr - nochr = unicode('') + nochr = unicode("") except NameError: # python3 uses chr - nochr = str('') + nochr = str("") __version__ = (2018, 1, 9) @@ -40,7 +40,7 @@ class HTML2Text(HTMLParser.HTMLParser): - def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): + def __init__(self, out=None, baseurl="", bodywidth=config.BODY_WIDTH): """ Input parameters: 
out: possible custom replacement for self.outtextf (which @@ -49,7 +49,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): """ kwargs = {} if sys.version_info >= (3, 4): - kwargs['convert_charrefs'] = False + kwargs["convert_charrefs"] = False HTMLParser.HTMLParser.__init__(self, **kwargs) # Config options @@ -73,9 +73,9 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.bypass_tables = config.BYPASS_TABLES # covered in cli self.ignore_tables = config.IGNORE_TABLES # covered in cli self.google_doc = False # covered in cli - self.ul_item_mark = '*' # covered in cli - self.emphasis_mark = '_' # covered in cli - self.strong_mark = '**' + self.ul_item_mark = "*" # covered in cli + self.emphasis_mark = "_" # covered in cli + self.strong_mark = "**" self.single_line_break = config.SINGLE_LINE_BREAK # covered in cli self.use_automatic_links = config.USE_AUTOMATIC_LINKS # covered in cli self.hide_strikethrough = False # covered in cli @@ -105,7 +105,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.astack = [] self.maybe_automatic_link = None self.empty_link = False - self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://') + self.absolute_url_matcher = re.compile(r"^[a-zA-Z+]+://") self.acount = 0 self.list = [] self.blockquote = 0 @@ -113,7 +113,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.startpre = 0 self.code = False self.quote = False - self.br_toggle = '' + self.br_toggle = "" self.lastWasNL = 0 self.lastWasList = False self.style = 0 @@ -132,10 +132,10 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): self.current_tag = None try: - del unifiable_n[name2cp('nbsp')] + del unifiable_n[name2cp("nbsp")] except KeyError: pass - config.UNIFIABLE['nbsp'] = ' _place_holder;' + config.UNIFIABLE["nbsp"] = " _place_holder;" def feed(self, data): data = data.replace("", "") @@ -153,24 +153,24 @@ def handle(self, data): def 
outtextf(self, s): self.outtextlist.append(s) if s: - self.lastWasNL = s[-1] == '\n' + self.lastWasNL = s[-1] == "\n" def close(self): HTMLParser.HTMLParser.close(self) self.pbr() - self.o('', 0, 'end') + self.o("", 0, "end") outtext = nochr.join(self.outtextlist) if self.unicode_snob: - nbsp = chr(name2cp('nbsp')) + nbsp = chr(name2cp("nbsp")) else: nbsp = chr(32) try: - outtext = outtext.replace(unicode(' _place_holder;'), nbsp) + outtext = outtext.replace(unicode(" _place_holder;"), nbsp) except NameError: - outtext = outtext.replace(' _place_holder;', nbsp) + outtext = outtext.replace(" _place_holder;", nbsp) # Clear self.outtextlist to avoid memory leak of its content to # the next handling. @@ -207,18 +207,20 @@ def previousIndex(self, attrs): self.a list. If the set of attributes is not found, returns None :rtype: int """ - if 'href' not in attrs: # pragma: no cover + if "href" not in attrs: # pragma: no cover return None i = -1 for a in self.a: i += 1 match = 0 - if 'href' in a and a['href'] == attrs['href']: - if 'title' in a or 'title' in attrs: - if 'title' in a and \ - 'title' in attrs and \ - a['title'] == attrs['title']: + if "href" in a and a["href"] == attrs["href"]: + if "title" in a or "title" in attrs: + if ( + "title" in a + and "title" in attrs + and a["title"] == attrs["title"] + ): match = True else: match = True @@ -234,20 +236,21 @@ def handle_emphasis(self, start, tag_style, parent_style): parent_emphasis = google_text_emphasis(parent_style) # handle Google's text emphasis - strikethrough = 'line-through' in \ - tag_emphasis and self.hide_strikethrough + strikethrough = "line-through" in tag_emphasis and self.hide_strikethrough # google and others may mark a font's weight as `bold` or `700` bold = False for bold_marker in config.BOLD_TEXT_STYLE_VALUES: - bold = (bold_marker in tag_emphasis - and bold_marker not in parent_emphasis) + bold = bold_marker in tag_emphasis and bold_marker not in parent_emphasis if bold: break - italic = 'italic' 
in tag_emphasis and 'italic' not in parent_emphasis - fixed = google_fixed_width_font(tag_style) and not \ - google_fixed_width_font(parent_style) and not self.pre + italic = "italic" in tag_emphasis and "italic" not in parent_emphasis + fixed = ( + google_fixed_width_font(tag_style) + and not google_fixed_width_font(parent_style) + and not self.pre + ) if start: # crossed-out text must be handled before other attributes @@ -263,7 +266,7 @@ def handle_emphasis(self, start, tag_style, parent_style): self.o(self.strong_mark) self.drop_white_space += 1 if fixed: - self.o('`') + self.o("`") self.drop_white_space += 1 self.code = True else: @@ -276,7 +279,7 @@ def handle_emphasis(self, start, tag_style, parent_style): # empty emphasis, drop it self.drop_white_space -= 1 else: - self.o('`') + self.o("`") self.code = False if bold: if self.drop_white_space: @@ -310,9 +313,12 @@ def handle_tag(self, tag, attrs, start): # first thing inside the anchor tag is another tag # that produces some output - if (start and self.maybe_automatic_link is not None and - tag not in ['p', 'div', 'style', 'dl', 'dt'] and - (tag != "img" or self.ignore_images)): + if ( + start + and self.maybe_automatic_link is not None + and tag not in ["p", "div", "style", "dl", "dt"] + and (tag != "img" or self.ignore_images) + ): self.o("[") self.maybe_automatic_link = None self.empty_link = False @@ -329,8 +335,9 @@ def handle_tag(self, tag, attrs, start): tag_style = element_style(attrs, self.style_def, parent_style) self.tag_stack.append((tag, attrs, tag_style)) else: - dummy, attrs, tag_style = self.tag_stack.pop() \ - if self.tag_stack else (None, {}, {}) + dummy, attrs, tag_style = ( + self.tag_stack.pop() if self.tag_stack else (None, {}, {}) + ) if self.tag_stack: parent_style = self.tag_stack[-1][2] @@ -338,18 +345,18 @@ def handle_tag(self, tag, attrs, start): self.p() if start: self.inheader = True - self.o(hn(tag) * "#" + ' ') + self.o(hn(tag) * "#" + " ") else: self.inheader = False return # 
prevent redundant emphasis marks on headers - if tag in ['p', 'div']: + if tag in ["p", "div"]: if self.google_doc: if start and google_has_height(tag_style): self.p() else: self.soft_br() - elif self.astack and tag == 'div': + elif self.astack and tag == "div": pass else: self.p() @@ -365,7 +372,7 @@ def handle_tag(self, tag, attrs, start): self.o("* * *") self.p() - if tag in ["head", "style", 'script']: + if tag in ["head", "style", "script"]: if start: self.quiet += 1 else: @@ -383,7 +390,7 @@ def handle_tag(self, tag, attrs, start): if tag == "blockquote": if start: self.p() - self.o('> ', 0, 1) + self.o("> ", 0, 1) self.start = 1 self.blockquote += 1 else: @@ -391,12 +398,11 @@ def handle_tag(self, tag, attrs, start): self.p() def no_preceding_space(self): - return (self.preceding_data - and re.match(r'[^\s]', self.preceding_data[-1])) + return self.preceding_data and re.match(r"[^\s]", self.preceding_data[-1]) - if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: + if tag in ["em", "i", "u"] and not self.ignore_emphasis: if start and no_preceding_space(self): - emphasis = ' ' + self.emphasis_mark + emphasis = " " + self.emphasis_mark else: emphasis = self.emphasis_mark @@ -404,9 +410,9 @@ def no_preceding_space(self): if start: self.stressed = True - if tag in ['strong', 'b'] and not self.ignore_emphasis: + if tag in ["strong", "b"] and not self.ignore_emphasis: if start and no_preceding_space(self): - strong = ' ' + self.strong_mark + strong = " " + self.strong_mark else: strong = self.strong_mark @@ -414,11 +420,11 @@ def no_preceding_space(self): if start: self.stressed = True - if tag in ['del', 'strike', 's']: + if tag in ["del", "strike", "s"]: if start and no_preceding_space(self): - strike = ' ~~' + strike = " ~~" else: - strike = '~~' + strike = "~~" self.o(strike) if start: @@ -430,20 +436,20 @@ def no_preceding_space(self): self.handle_emphasis(start, tag_style, parent_style) if tag in ["kbd", "code", "tt"] and not self.pre: - self.o('`') # 
TODO: `` `this` `` + self.o("`") # TODO: `` `this` `` self.code = not self.code if tag == "abbr": if start: self.abbr_title = None - self.abbr_data = '' - if ('title' in attrs): - self.abbr_title = attrs['title'] + self.abbr_data = "" + if "title" in attrs: + self.abbr_title = attrs["title"] else: if self.abbr_title is not None: self.abbr_list[self.abbr_data] = self.abbr_title self.abbr_title = None - self.abbr_data = '' + self.abbr_data = "" if tag == "q": if not self.quote: @@ -454,21 +460,21 @@ def no_preceding_space(self): def link_url(self, link, title=""): url = urlparse.urljoin(self.baseurl, link) - title = ' "{}"'.format(title) if title.strip() else '' - self.o(']({url}{title})'.format(url=escape_md(url), - title=title)) + title = ' "{}"'.format(title) if title.strip() else "" + self.o("]({url}{title})".format(url=escape_md(url), title=title)) if tag == "a" and not self.ignore_links: if start: - if 'href' in attrs and \ - attrs['href'] is not None and not \ - (self.skip_internal_links and - attrs['href'].startswith('#')): + if ( + "href" in attrs + and attrs["href"] is not None + and not (self.skip_internal_links and attrs["href"].startswith("#")) + ): self.astack.append(attrs) - self.maybe_automatic_link = attrs['href'] + self.maybe_automatic_link = attrs["href"] self.empty_link = True if self.protect_links: - attrs['href'] = '<' + attrs['href'] + '>' + attrs["href"] = "<" + attrs["href"] + ">" else: self.astack.append(None) else: @@ -483,34 +489,33 @@ def link_url(self, link, title=""): self.maybe_automatic_link = None if self.inline_links: try: - title = a['title'] if a['title'] else '' + title = a["title"] if a["title"] else "" title = escape_md(title) except KeyError: - link_url(self, a['href'], '') + link_url(self, a["href"], "") else: - link_url(self, a['href'], title) + link_url(self, a["href"], title) else: i = self.previousIndex(a) if i is not None: a = self.a[i] else: self.acount += 1 - a['count'] = self.acount - a['outcount'] = self.outcount + 
a["count"] = self.acount + a["outcount"] = self.outcount self.a.append(a) - self.o("][" + str(a['count']) + "]") + self.o("][" + str(a["count"]) + "]") if tag == "img" and start and not self.ignore_images: - if 'src' in attrs: + if "src" in attrs: if not self.images_to_alt: - attrs['href'] = attrs['src'] - alt = attrs.get('alt') or self.default_image_alt + attrs["href"] = attrs["src"] + alt = attrs.get("alt") or self.default_image_alt # If we have images_with_size, write raw html including width, # height, and alt attributes if self.images_as_html or ( - self.images_with_size and - ("width" in attrs or "height" in attrs) + self.images_with_size and ("width" in attrs or "height" in attrs) ): self.o("") self.empty_link = False return @@ -542,16 +550,9 @@ def link_url(self, link, title=""): else: self.o("![" + escape_md(alt) + "]") if self.inline_links: - href = attrs.get('href') or '' + href = attrs.get("href") or "" self.o( - "(" + - escape_md( - urlparse.urljoin( - self.baseurl, - href - ) - ) + - ")" + "(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")" ) else: i = self.previousIndex(attrs) @@ -559,18 +560,18 @@ def link_url(self, link, title=""): attrs = self.a[i] else: self.acount += 1 - attrs['count'] = self.acount - attrs['outcount'] = self.outcount + attrs["count"] = self.acount + attrs["outcount"] = self.outcount self.a.append(attrs) - self.o("[" + str(attrs['count']) + "]") + self.o("[" + str(attrs["count"]) + "]") - if tag == 'dl' and start: + if tag == "dl" and start: self.p() - if tag == 'dt' and not start: + if tag == "dt" and not start: self.pbr() - if tag == 'dd' and start: - self.o(' ') - if tag == 'dd' and not start: + if tag == "dd" and start: + self.o(" ") + if tag == "dd" and not start: self.pbr() if tag in ["ol", "ul"]: @@ -583,42 +584,39 @@ def link_url(self, link, title=""): else: list_style = tag numbering_start = list_numbering_start(attrs) - self.list.append({ - 'name': list_style, - 'num': numbering_start - }) + 
self.list.append({"name": list_style, "num": numbering_start}) else: if self.list: self.list.pop() if (not self.google_doc) and (not self.list): - self.o('\n') + self.o("\n") self.lastWasList = True else: self.lastWasList = False - if tag == 'li': + if tag == "li": self.pbr() if start: if self.list: li = self.list[-1] else: - li = {'name': 'ul', 'num': 0} + li = {"name": "ul", "num": 0} if self.google_doc: nest_count = self.google_nest_count(tag_style) else: nest_count = len(self.list) # TODO: line up
    1. s > 9 correctly. self.o(" " * nest_count) - if li['name'] == "ul": + if li["name"] == "ul": self.o(self.ul_item_mark + " ") - elif li['name'] == "ol": - li['num'] += 1 - self.o(str(li['num']) + ". ") + elif li["name"] == "ol": + li["num"] += 1 + self.o(str(li["num"]) + ". ") self.start = 1 if tag in ["table", "tr", "td", "th"]: if self.ignore_tables: - if tag == 'tr': + if tag == "tr": if start: pass else: @@ -631,14 +629,14 @@ def link_url(self, link, title=""): self.soft_br() if tag in ["td", "th"]: if start: - self.o('<{}>\n\n'.format(tag)) + self.o("<{}>\n\n".format(tag)) else: - self.o('\n'.format(tag)) + self.o("\n".format(tag)) else: if start: - self.o('<{}>'.format(tag)) + self.o("<{}>".format(tag)) else: - self.o(''.format(tag)) + self.o("".format(tag)) else: if tag == "table": @@ -692,7 +690,7 @@ def p(self): def soft_br(self): "Soft breaks" self.pbr() - self.br_toggle = ' ' + self.br_toggle = " " def o(self, data, puredata=0, force=0): """ @@ -708,15 +706,15 @@ def o(self, data, puredata=0, force=0): lstripped_data = data.lstrip() if self.drop_white_space and not (self.pre or self.code): data = lstripped_data - if lstripped_data != '': + if lstripped_data != "": self.drop_white_space = 0 if puredata and not self.pre: # This is a very dangerous call ... it could mess up # all handling of   when not handled properly # (see entityref) - data = re.sub(r'\s+', r' ', data) - if data and data[0] == ' ': + data = re.sub(r"\s+", r" ", data) + if data and data[0] == " ": self.space = 1 data = data[1:] if not data and not force: @@ -731,7 +729,7 @@ def o(self, data, puredata=0, force=0): self.out("\n[code]") self.p_p = 0 - bq = (">" * self.blockquote) + bq = ">" * self.blockquote if not (force and data and data[0] == ">") and self.blockquote: bq += " " @@ -754,34 +752,39 @@ def o(self, data, puredata=0, force=0): self.p_p = 0 self.start = 0 - if force == 'end': + if force == "end": # It's the end. 
self.p_p = 0 self.out("\n") self.space = 0 if self.p_p: - self.out((self.br_toggle + '\n' + bq) * self.p_p) + self.out((self.br_toggle + "\n" + bq) * self.p_p) self.space = 0 - self.br_toggle = '' + self.br_toggle = "" if self.space: if not self.lastWasNL: - self.out(' ') + self.out(" ") self.space = 0 - if self.a and ((self.p_p == 2 and self.links_each_paragraph) or - force == "end"): + if self.a and ( + (self.p_p == 2 and self.links_each_paragraph) or force == "end" + ): if force == "end": self.out("\n") newa = [] for link in self.a: - if self.outcount > link['outcount']: - self.out(" [" + str(link['count']) + "]: " + - urlparse.urljoin(self.baseurl, link['href'])) - if 'title' in link: - self.out(" (" + link['title'] + ")") + if self.outcount > link["outcount"]: + self.out( + " [" + + str(link["count"]) + + "]: " + + urlparse.urljoin(self.baseurl, link["href"]) + ) + if "title" in link: + self.out(" (" + link["title"] + ")") self.out("\n") else: newa.append(link) @@ -806,11 +809,13 @@ def handle_data(self, data, entity_char=False): self.stressed = False self.preceding_stressed = True elif self.preceding_stressed: - if (re.match(r'[^\s.!?]', data[0]) - and not hn(self.current_tag) - and self.current_tag not in ['a', 'code', 'pre']): + if ( + re.match(r"[^\s.!?]", data[0]) + and not hn(self.current_tag) + and self.current_tag not in ["a", "code", "pre"] + ): # should match a letter or common punctuation - data = ' ' + data + data = " " + data self.preceding_stressed = False if self.style: @@ -818,8 +823,11 @@ def handle_data(self, data, entity_char=False): if self.maybe_automatic_link is not None: href = self.maybe_automatic_link - if (href == data and self.absolute_url_matcher.match(href) and - self.use_automatic_links): + if ( + href == data + and self.absolute_url_matcher.match(href) + and self.use_automatic_links + ): self.o("<" + data + ">") self.empty_link = False return @@ -834,7 +842,7 @@ def handle_data(self, data, entity_char=False): self.o(data, 1) def 
charref(self, name): - if name[0] in ['x', 'X']: + if name[0] in ["x", "X"]: c = int(name[1:], 16) else: c = int(name) @@ -845,7 +853,7 @@ def charref(self, name): try: return chr(c) except ValueError: # invalid unicode - return '' + return "" def entityref(self, c): if not self.unicode_snob and c in config.UNIFIABLE: @@ -854,9 +862,9 @@ def entityref(self, c): try: name2cp(c) except KeyError: - return "&" + c + ';' + return "&" + c + ";" else: - if c == 'nbsp': + if c == "nbsp": return config.UNIFIABLE[c] else: return chr(name2cp(c)) @@ -870,9 +878,8 @@ def google_nest_count(self, style): :rtype: int """ nest_count = 0 - if 'margin-left' in style: - nest_count = int(style['margin-left'][:-2]) \ - // self.google_list_indent + if "margin-left" in style: + nest_count = int(style["margin-left"][:-2]) // self.google_list_indent return nest_count @@ -887,7 +894,7 @@ def optwrap(self, text): if not self.body_width: return text - result = '' + result = "" newlines = 0 # I cannot think of a better solution for now. # To avoid the non-wrap behaviour for entire paras @@ -897,14 +904,17 @@ def optwrap(self, text): for para in text.split("\n"): if len(para) > 0: if not skipwrap(para, self.wrap_links, self.wrap_list_items): - indent = '' - if para.startswith(' ' + self.ul_item_mark): - indent = ' ' # For list items. - wrapped = wrap(para, self.body_width, - break_long_words=False, - subsequent_indent=indent) + indent = "" + if para.startswith(" " + self.ul_item_mark): + indent = " " # For list items. 
+ wrapped = wrap( + para, + self.body_width, + break_long_words=False, + subsequent_indent=indent, + ) result += "\n".join(wrapped) - if indent or para.endswith(' '): + if indent or para.endswith(" "): result += " \n" newlines = 1 else: @@ -925,7 +935,7 @@ def optwrap(self, text): return result -def html2text(html, baseurl='', bodywidth=None): +def html2text(html, baseurl="", bodywidth=None): if bodywidth is None: bodywidth = config.BODY_WIDTH h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth) diff --git a/html2text/cli.py b/html2text/cli.py index 88de1476..6894136a 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -5,78 +5,80 @@ def main(): - baseurl = '' + baseurl = "" class bcolors: # pragma: no cover - HEADER = '\033[95m' - OKBLUE = '\033[94m' - OKGREEN = '\033[92m' - WARNING = '\033[93m' - FAIL = '\033[91m' - ENDC = '\033[0m' - BOLD = '\033[1m' - UNDERLINE = '\033[4m' + HEADER = "\033[95m" + OKBLUE = "\033[94m" + OKGREEN = "\033[92m" + WARNING = "\033[93m" + FAIL = "\033[91m" + ENDC = "\033[0m" + BOLD = "\033[1m" + UNDERLINE = "\033[4m" p = argparse.ArgumentParser() p.add_argument( "--default-image-alt", dest="default_image_alt", default=config.DEFAULT_IMAGE_ALT, - help="The default alt string for images with missing ones") + help="The default alt string for images with missing ones", + ) p.add_argument( "--pad-tables", dest="pad_tables", action="store_true", default=config.PAD_TABLES, - help="pad the cells to equal column width in tables" + help="pad the cells to equal column width in tables", ) p.add_argument( "--no-wrap-links", dest="wrap_links", action="store_false", default=config.WRAP_LINKS, - help="wrap links during conversion" + help="wrap links during conversion", ) p.add_argument( "--wrap-list-items", dest="wrap_list_items", action="store_true", default=config.WRAP_LIST_ITEMS, - help="wrap list items during conversion" + help="wrap list items during conversion", ) p.add_argument( "--ignore-emphasis", dest="ignore_emphasis", 
action="store_true", default=config.IGNORE_EMPHASIS, - help="don't include any formatting for emphasis" + help="don't include any formatting for emphasis", ) p.add_argument( "--reference-links", dest="inline_links", action="store_false", default=config.INLINE_LINKS, - help="use reference style links instead of inline links" + help="use reference style links instead of inline links", ) p.add_argument( "--ignore-links", dest="ignore_links", action="store_true", default=config.IGNORE_ANCHORS, - help="don't include any formatting for links") + help="don't include any formatting for links", + ) p.add_argument( "--protect-links", dest="protect_links", action="store_true", default=config.PROTECT_LINKS, - help=("protect links from line breaks surrounding them " + - "with angle brackets")) + help="protect links from line breaks surrounding them with angle brackets", + ) p.add_argument( "--ignore-images", dest="ignore_images", action="store_true", default=config.IGNORE_IMAGES, - help="don't include any formatting for images" + help="don't include any formatting for images", ) p.add_argument( "--images-as-html", @@ -84,90 +86,98 @@ class bcolors: # pragma: no cover action="store_true", default=config.IMAGES_AS_HTML, help=( - "Always write image tags as raw html; preserves `height`, `width` " - "and `alt` if possible." - ) + "Always write image tags as raw html; preserves `height`, `width` and " + "`alt` if possible." 
+ ), ) p.add_argument( "--images-to-alt", dest="images_to_alt", action="store_true", default=config.IMAGES_TO_ALT, - help="Discard image data, only keep alt text" + help="Discard image data, only keep alt text", ) p.add_argument( "--images-with-size", dest="images_with_size", action="store_true", default=config.IMAGES_WITH_SIZE, - help="Write image tags with height and width attrs as raw html to " - "retain dimensions" + help=( + "Write image tags with height and width attrs as raw html to retain " + "dimensions" + ), ) p.add_argument( - "-g", "--google-doc", + "-g", + "--google-doc", action="store_true", dest="google_doc", default=False, - help="convert an html-exported Google Document" + help="convert an html-exported Google Document", ) p.add_argument( - "-d", "--dash-unordered-list", + "-d", + "--dash-unordered-list", action="store_true", dest="ul_style_dash", default=False, - help="use a dash rather than a star for unordered list items" + help="use a dash rather than a star for unordered list items", ) p.add_argument( - "-e", "--asterisk-emphasis", + "-e", + "--asterisk-emphasis", action="store_true", dest="em_style_asterisk", default=False, - help="use an asterisk rather than an underscore for emphasized text" + help="use an asterisk rather than an underscore for emphasized text", ) p.add_argument( - "-b", "--body-width", + "-b", + "--body-width", dest="body_width", type=int, default=config.BODY_WIDTH, - help="number of characters per output line, 0 for no wrap" + help="number of characters per output line, 0 for no wrap", ) p.add_argument( - "-i", "--google-list-indent", + "-i", + "--google-list-indent", dest="list_indent", type=int, default=config.GOOGLE_LIST_INDENT, - help="number of pixels Google indents nested lists" + help="number of pixels Google indents nested lists", ) p.add_argument( - "-s", "--hide-strikethrough", + "-s", + "--hide-strikethrough", action="store_true", dest="hide_strikethrough", default=False, - help="hide strike-through text. 
only relevant when -g is " - "specified as well" + help="hide strike-through text. only relevant when -g is " "specified as well", ) p.add_argument( "--escape-all", action="store_true", dest="escape_snob", default=False, - help="Escape all special characters. Output is less readable, but " - "avoids corner case formatting issues." + help=( + "Escape all special characters. Output is less readable, but avoids " + "corner case formatting issues." + ), ) p.add_argument( "--bypass-tables", action="store_true", dest="bypass_tables", default=config.BYPASS_TABLES, - help="Format tables in HTML rather than Markdown syntax." + help="Format tables in HTML rather than Markdown syntax.", ) p.add_argument( "--ignore-tables", action="store_true", dest="ignore_tables", default=config.IGNORE_TABLES, - help="Ignore table-related tags (table, th, td, tr) " - "while keeping rows." + help="Ignore table-related tags (table, th, td, tr) " "while keeping rows.", ) p.add_argument( "--single-line-break", @@ -175,51 +185,53 @@ class bcolors: # pragma: no cover dest="single_line_break", default=config.SINGLE_LINE_BREAK, help=( - "Use a single line break after a block element rather than two " - "line breaks. NOTE: Requires --body-width=0" - ) + "Use a single line break after a block element rather than two line " + "breaks. 
NOTE: Requires --body-width=0" + ), ) p.add_argument( "--unicode-snob", action="store_true", dest="unicode_snob", default=config.UNICODE_SNOB, - help="Use unicode throughout document" + help="Use unicode throughout document", ) p.add_argument( "--no-automatic-links", action="store_false", dest="use_automatic_links", default=config.USE_AUTOMATIC_LINKS, - help="Do not use automatic links wherever applicable" + help="Do not use automatic links wherever applicable", ) p.add_argument( "--no-skip-internal-links", action="store_false", dest="skip_internal_links", default=config.SKIP_INTERNAL_LINKS, - help="Do not skip internal links" + help="Do not skip internal links", ) p.add_argument( "--links-after-para", action="store_true", dest="links_each_paragraph", default=config.LINKS_EACH_PARAGRAPH, - help="Put links after each paragraph instead of document" + help="Put links after each paragraph instead of document", ) p.add_argument( "--mark-code", action="store_true", dest="mark_code", default=config.MARK_CODE, - help="Mark program code blocks with [code]...[/code]" + help="Mark program code blocks with [code]...[/code]", ) p.add_argument( "--decode-errors", dest="decode_errors", default=config.DECODE_ERRORS, - help="What to do in case of decode errors.'ignore', 'strict' and " - "'replace' are acceptable values" + help=( + "What to do in case of decode errors.'ignore', 'strict' and 'replace' are " + "acceptable values" + ), ) p.add_argument( "--open-quote", @@ -234,16 +246,14 @@ class bcolors: # pragma: no cover help="The character used to close quotes", ) p.add_argument( - '--version', - action='version', - version='.'.join(map(str, __version__)) + "--version", action="version", version=".".join(map(str, __version__)) ) - p.add_argument('filename', nargs='?') - p.add_argument('encoding', nargs='?', default='utf-8') + p.add_argument("filename", nargs="?") + p.add_argument("encoding", nargs="?", default="utf-8") args = p.parse_args() - if args.filename and args.filename != 
'-': # pragma: no cover - with open(args.filename, 'rb') as fp: + if args.filename and args.filename != "-": # pragma: no cover + with open(args.filename, "rb") as fp: data = fp.read() else: data = wrap_read() @@ -252,18 +262,18 @@ class bcolors: # pragma: no cover data = data.decode(args.encoding, args.decode_errors) except UnicodeDecodeError as err: warning = bcolors.WARNING + "Warning:" + bcolors.ENDC - warning += ' Use the ' + bcolors.OKGREEN - warning += '--decode-errors=ignore' + bcolors.ENDC + ' flag.' + warning += " Use the " + bcolors.OKGREEN + warning += "--decode-errors=ignore" + bcolors.ENDC + " flag." print(warning) raise err h = HTML2Text(baseurl=baseurl) # handle options if args.ul_style_dash: - h.ul_item_mark = '-' + h.ul_item_mark = "-" if args.em_style_asterisk: - h.emphasis_mark = '*' - h.strong_mark = '__' + h.emphasis_mark = "*" + h.strong_mark = "__" h.body_width = args.body_width h.google_list_indent = args.list_indent diff --git a/html2text/compat.py b/html2text/compat.py index fde6c16c..a3226d5b 100644 --- a/html2text/compat.py +++ b/html2text/compat.py @@ -9,4 +9,4 @@ import html.entities as htmlentitydefs import html.parser as HTMLParser -__all__ = ['HTMLParser', 'htmlentitydefs', 'urlparse'] +__all__ = ["HTMLParser", "htmlentitydefs", "urlparse"] diff --git a/html2text/config.py b/html2text/config.py index e2a04f9d..ff96f719 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -37,7 +37,7 @@ GOOGLE_LIST_INDENT = 36 # Values Google and others may use to indicate bold text -BOLD_TEXT_STYLE_VALUES = ('bold', '700', '800', '900') +BOLD_TEXT_STYLE_VALUES = ("bold", "700", "800", "900") IGNORE_ANCHORS = False IGNORE_IMAGES = False @@ -46,8 +46,8 @@ IMAGES_WITH_SIZE = False IGNORE_EMPHASIS = False MARK_CODE = False -DECODE_ERRORS = 'strict' -DEFAULT_IMAGE_ALT = '' +DECODE_ERRORS = "strict" +DEFAULT_IMAGE_ALT = "" PAD_TABLES = False # Convert links with same href and text to format @@ -55,81 +55,93 @@ USE_AUTOMATIC_LINKS = True # For 
checking space-only lines on line 771 -RE_SPACE = re.compile(r'\s\+') +RE_SPACE = re.compile(r"\s\+") -RE_ORDERED_LIST_MATCHER = re.compile(r'\d+\.\s') -RE_UNORDERED_LIST_MATCHER = re.compile(r'[-\*\+]\s') +RE_ORDERED_LIST_MATCHER = re.compile(r"\d+\.\s") +RE_UNORDERED_LIST_MATCHER = re.compile(r"[-\*\+]\s") RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])") RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])") # to find links in the text RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)") -RE_MD_DOT_MATCHER = re.compile(r""" +RE_MD_DOT_MATCHER = re.compile( + r""" ^ # start of line (\s*\d+) # optional whitespace and a number (\.) # dot (?=\s) # lookahead assert whitespace - """, re.MULTILINE | re.VERBOSE) -RE_MD_PLUS_MATCHER = re.compile(r""" + """, + re.MULTILINE | re.VERBOSE, +) +RE_MD_PLUS_MATCHER = re.compile( + r""" ^ (\s*) (\+) (?=\s) - """, flags=re.MULTILINE | re.VERBOSE) -RE_MD_DASH_MATCHER = re.compile(r""" + """, + flags=re.MULTILINE | re.VERBOSE, +) +RE_MD_DASH_MATCHER = re.compile( + r""" ^ (\s*) (-) (?=\s|\-) # followed by whitespace (bullet list, or spaced out hr) # or another dash (header or hr) - """, flags=re.MULTILINE | re.VERBOSE) -RE_SLASH_CHARS = r'\`*_{}[]()#+-.!' -RE_MD_BACKSLASH_MATCHER = re.compile(r''' + """, + flags=re.MULTILINE | re.VERBOSE, +) +RE_SLASH_CHARS = r"\`*_{}[]()#+-.!" 
+RE_MD_BACKSLASH_MATCHER = re.compile( + r""" (\\) # match one slash (?=[%s]) # followed by a char that requires escaping - ''' % re.escape(RE_SLASH_CHARS), - flags=re.VERBOSE) + """ + % re.escape(RE_SLASH_CHARS), + flags=re.VERBOSE, +) UNIFIABLE = { - 'rsquo': "'", - 'lsquo': "'", - 'rdquo': '"', - 'ldquo': '"', - 'copy': '(C)', - 'mdash': '--', - 'nbsp': ' ', - 'rarr': '->', - 'larr': '<-', - 'middot': '*', - 'ndash': '-', - 'oelig': 'oe', - 'aelig': 'ae', - 'agrave': 'a', - 'aacute': 'a', - 'acirc': 'a', - 'atilde': 'a', - 'auml': 'a', - 'aring': 'a', - 'egrave': 'e', - 'eacute': 'e', - 'ecirc': 'e', - 'euml': 'e', - 'igrave': 'i', - 'iacute': 'i', - 'icirc': 'i', - 'iuml': 'i', - 'ograve': 'o', - 'oacute': 'o', - 'ocirc': 'o', - 'otilde': 'o', - 'ouml': 'o', - 'ugrave': 'u', - 'uacute': 'u', - 'ucirc': 'u', - 'uuml': 'u', - 'lrm': '', - 'rlm': '' + "rsquo": "'", + "lsquo": "'", + "rdquo": '"', + "ldquo": '"', + "copy": "(C)", + "mdash": "--", + "nbsp": " ", + "rarr": "->", + "larr": "<-", + "middot": "*", + "ndash": "-", + "oelig": "oe", + "aelig": "ae", + "agrave": "a", + "aacute": "a", + "acirc": "a", + "atilde": "a", + "auml": "a", + "aring": "a", + "egrave": "e", + "eacute": "e", + "ecirc": "e", + "euml": "e", + "igrave": "i", + "iacute": "i", + "icirc": "i", + "iuml": "i", + "ograve": "o", + "oacute": "o", + "ocirc": "o", + "otilde": "o", + "ouml": "o", + "ugrave": "u", + "uacute": "u", + "ucirc": "u", + "uuml": "u", + "lrm": "", + "rlm": "", } # Format tables in HTML rather than Markdown syntax diff --git a/html2text/utils.py b/html2text/utils.py index fb3f6fe5..4822dc2b 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -6,7 +6,7 @@ def name2cp(k): """Return sname to codepoint""" - if k == 'apos': + if k == "apos": return ord("'") return htmlentitydefs.name2codepoint[k] @@ -17,7 +17,7 @@ def name2cp(k): def hn(tag): - if tag[0] == 'h' and len(tag) == 2: + if tag[0] == "h" and len(tag) == 2: try: n = int(tag[1]) if n in range(1, 10): # pragma: no 
branch @@ -32,7 +32,7 @@ def dumb_property_dict(style): """ out = { x.strip().lower(): y.strip().lower() - for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z] + for x, y in [z.split(":", 1) for z in style.split(";") if ":" in z] } return out @@ -47,15 +47,15 @@ def dumb_css_parser(data): :rtype: dict """ # remove @import sentences - data += ';' - importIndex = data.find('@import') + data += ";" + importIndex = data.find("@import") while importIndex != -1: - data = data[0:importIndex] + data[data.find(';', importIndex) + 1:] - importIndex = data.find('@import') + data = data[0:importIndex] + data[data.find(";", importIndex) + 1 :] + importIndex = data.find("@import") # parse the css. reverted from dictionary comprehension in order to # support older pythons - elements = [x.split('{') for x in data.split('}') if '{' in x.strip()] + elements = [x.split("{") for x in data.split("}") if "{" in x.strip()] try: elements = {a.strip(): dumb_property_dict(b) for a, b in elements} except ValueError: # pragma: no cover @@ -74,12 +74,12 @@ def element_style(attrs, style_def, parent_style): :rtype: dict """ style = parent_style.copy() - if 'class' in attrs: - for css_class in attrs['class'].split(): - css_style = style_def.get('.' + css_class, {}) + if "class" in attrs: + for css_class in attrs["class"].split(): + css_style = style_def.get("." 
+ css_class, {}) style.update(css_style) - if 'style' in attrs: - immediate_style = dumb_property_dict(attrs['style']) + if "style" in attrs: + immediate_style = dumb_property_dict(attrs["style"]) style.update(immediate_style) return style @@ -93,12 +93,12 @@ def google_list_style(style): :rtype: str """ - if 'list-style-type' in style: - list_style = style['list-style-type'] - if list_style in ['disc', 'circle', 'square', 'none']: - return 'ul' + if "list-style-type" in style: + list_style = style["list-style-type"] + if list_style in ["disc", "circle", "square", "none"]: + return "ul" - return 'ol' + return "ol" def google_has_height(style): @@ -110,7 +110,7 @@ def google_has_height(style): :rtype: bool """ - if 'height' in style: + if "height" in style: return True return False @@ -124,12 +124,12 @@ def google_text_emphasis(style): :rtype: list """ emphasis = [] - if 'text-decoration' in style: - emphasis.append(style['text-decoration']) - if 'font-style' in style: - emphasis.append(style['font-style']) - if 'font-weight' in style: - emphasis.append(style['font-weight']) + if "text-decoration" in style: + emphasis.append(style["text-decoration"]) + if "font-style" in style: + emphasis.append(style["font-style"]) + if "font-weight" in style: + emphasis.append(style["font-weight"]) return emphasis @@ -142,10 +142,10 @@ def google_fixed_width_font(style): :rtype: bool """ - font_family = '' - if 'font-family' in style: - font_family = style['font-family'] - if 'courier new' == font_family or 'consolas' == font_family: + font_family = "" + if "font-family" in style: + font_family = style["font-family"] + if "courier new" == font_family or "consolas" == font_family: return True return False @@ -159,9 +159,9 @@ def list_numbering_start(attrs): :rtype: int or None """ - if 'start' in attrs: + if "start" in attrs: try: - return int(attrs['start']) - 1 + return int(attrs["start"]) - 1 except ValueError: pass @@ -175,7 +175,7 @@ def skipwrap(para, wrap_links, 
wrap_list_items): return True # If the text begins with four spaces or one tab, it's a code block; # don't wrap - if para[0:4] == ' ' or para[0] == '\t': + if para[0:4] == " " or para[0] == "\t": return True # If the text begins with only two "--", possibly preceded by @@ -187,21 +187,22 @@ def skipwrap(para, wrap_links, wrap_list_items): # I'm not sure what this is for; I thought it was to detect lists, # but there's a
      -inside- case in one of the tests that # also depends upon it. - if stripped[0:1] in ('-', '*') and not stripped[0:2] == '**': + if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**": return not wrap_list_items # If the text begins with a single -, *, or +, followed by a space, # or an integer, followed by a ., followed by a space (in either # case optionally proceeded by whitespace), it's a list; don't wrap. - if config.RE_ORDERED_LIST_MATCHER.match(stripped) or \ - config.RE_UNORDERED_LIST_MATCHER.match(stripped): + if config.RE_ORDERED_LIST_MATCHER.match( + stripped + ) or config.RE_UNORDERED_LIST_MATCHER.match(stripped): return True return False def wrapwrite(text): - text = text.encode('utf-8') + text = text.encode("utf-8") try: # Python3 sys.stdout.buffer.write(text) except AttributeError: @@ -248,38 +249,40 @@ def reformat_table(lines, right_margin): padds the cells and returns the new lines """ # find the maximum width of the columns - max_width = [len(x.rstrip()) + right_margin for x in lines[0].split('|')] + max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")] max_cols = len(max_width) for line in lines: - cols = [x.rstrip() for x in line.split('|')] + cols = [x.rstrip() for x in line.split("|")] num_cols = len(cols) # don't drop any data if colspan attributes result in unequal lengths if num_cols < max_cols: - cols += [''] * (max_cols - num_cols) + cols += [""] * (max_cols - num_cols) elif max_cols < num_cols: - max_width += [ - len(x) + right_margin for x in - cols[-(num_cols - max_cols):] - ] + max_width += [len(x) + right_margin for x in cols[-(num_cols - max_cols) :]] max_cols = num_cols - max_width = [max(len(x) + right_margin, old_len) - for x, old_len in zip(cols, max_width)] + max_width = [ + max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width) + ] # reformat new_lines = [] for line in lines: - cols = [x.rstrip() for x in line.split('|')] - if set(line.strip()) == set('-|'): - filler = '-' - 
new_cols = [x.rstrip() + (filler * (M - len(x.rstrip()))) - for x, M in zip(cols, max_width)] + cols = [x.rstrip() for x in line.split("|")] + if set(line.strip()) == set("-|"): + filler = "-" + new_cols = [ + x.rstrip() + (filler * (M - len(x.rstrip()))) + for x, M in zip(cols, max_width) + ] else: - filler = ' ' - new_cols = [x.rstrip() + (filler * (M - len(x.rstrip()))) - for x, M in zip(cols, max_width)] - new_lines.append('|'.join(new_cols)) + filler = " " + new_cols = [ + x.rstrip() + (filler * (M - len(x.rstrip()))) + for x, M in zip(cols, max_width) + ] + new_lines.append("|".join(new_cols)) return new_lines @@ -287,23 +290,23 @@ def pad_tables_in_text(text, right_margin=1): """ Provide padding for tables in the text """ - lines = text.split('\n') + lines = text.split("\n") table_buffer, table_started = [], False new_lines = [] for line in lines: # Toggle table started - if (config.TABLE_MARKER_FOR_PAD in line): + if config.TABLE_MARKER_FOR_PAD in line: table_started = not table_started if not table_started: table = reformat_table(table_buffer, right_margin) new_lines.extend(table) table_buffer = [] - new_lines.append('') + new_lines.append("") continue # Process lines if table_started: table_buffer.append(line) else: new_lines.append(line) - new_text = '\n'.join(new_lines) + new_text = "\n".join(new_lines) return new_text diff --git a/setup.cfg b/setup.cfg index 22315413..9596a786 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,12 @@ [bdist_wheel] universal = 1 +[flake8] +max_line_length = 88 +ignore = + E203 + W503 + [isort] combine_as_imports = True include_trailing_comma = True diff --git a/setup.py b/setup.py index ce3069c5..0326ce7f 100644 --- a/setup.py +++ b/setup.py @@ -9,38 +9,34 @@ def readall(f): setup( name="html2text", - version=".".join(map(str, __import__('html2text').__version__)), + version=".".join(map(str, __import__("html2text").__version__)), description="Turn HTML into equivalent Markdown-structured text.", - 
long_description=readall('README.md'), + long_description=readall("README.md"), long_description_content_type="text/markdown", author="Aaron Swartz", author_email="me@aaronsw.com", - maintainer='Alireza Savand', - maintainer_email='alireza.savand@gmail.com', - url='https://github.com/Alir3z4/html2text/', - platforms='OS Independent', + maintainer="Alireza Savand", + maintainer_email="alireza.savand@gmail.com", + url="https://github.com/Alir3z4/html2text/", + platforms="OS Independent", classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: GNU General Public License (GPL)', - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: Implementation :: CPython', - 'Programming Language :: Python :: Implementation :: PyPy', + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "License :: OSI Approved :: GNU General Public License (GPL)", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", ], python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*", - entry_points={ - 'console_scripts': [ - 'html2text = html2text.cli:main', - ] - }, - license='GNU GPL 3', - 
packages=['html2text'], + entry_points={"console_scripts": ["html2text = html2text.cli:main"]}, + license="GNU GPL 3", + packages=["html2text"], ) diff --git a/test/test_html2text.py b/test/test_html2text.py index 013ee179..1b41c2aa 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -12,12 +12,12 @@ def cleanup_eol(clean_str): - if os.name == 'nt' or sys.platform == 'cygwin': + if os.name == "nt" or sys.platform == "cygwin": # Fix the unwanted CR to CRCRLF replacement # during text pipelining on Windows/cygwin # on cygwin, os.name == 'posix', not nt - clean_str = re.sub(r'\r+', '\r', clean_str) - clean_str = clean_str.replace('\r\n', '\n') + clean_str = re.sub(r"\r+", "\r", clean_str) + clean_str = clean_str.replace("\r\n", "\n") return clean_str @@ -29,105 +29,105 @@ def generate_testdata(): func_args = {} base_fn = os.path.basename(fn).lower() - if base_fn.startswith('default_image_alt'): - module_args['default_image_alt'] = 'Image' - cmdline_args.append('--default-image-alt=Image') + if base_fn.startswith("default_image_alt"): + module_args["default_image_alt"] = "Image" + cmdline_args.append("--default-image-alt=Image") func_args = skip - if base_fn.startswith('google'): - module_args['google_doc'] = True - cmdline_args.append('--googledoc') + if base_fn.startswith("google"): + module_args["google_doc"] = True + cmdline_args.append("--googledoc") func_args = skip - if base_fn.find('unicode') >= 0: - module_args['unicode_snob'] = True + if base_fn.find("unicode") >= 0: + module_args["unicode_snob"] = True # There is no command-line option to control unicode_snob. 
cmdline_args = skip func_args = skip - if base_fn.find('flip_emphasis') >= 0: - module_args['emphasis_mark'] = '*' - module_args['strong_mark'] = '__' - cmdline_args.append('-e') + if base_fn.find("flip_emphasis") >= 0: + module_args["emphasis_mark"] = "*" + module_args["strong_mark"] = "__" + cmdline_args.append("-e") func_args = skip - if base_fn.find('escape_snob') >= 0: - module_args['escape_snob'] = True - cmdline_args.append('--escape-all') + if base_fn.find("escape_snob") >= 0: + module_args["escape_snob"] = True + cmdline_args.append("--escape-all") func_args = skip - if base_fn.find('table_bypass') >= 0: - module_args['bypass_tables'] = True - cmdline_args.append('--bypass-tables') + if base_fn.find("table_bypass") >= 0: + module_args["bypass_tables"] = True + cmdline_args.append("--bypass-tables") func_args = skip - if base_fn.startswith('table_ignore'): - module_args['ignore_tables'] = True - cmdline_args.append('--ignore-tables') + if base_fn.startswith("table_ignore"): + module_args["ignore_tables"] = True + cmdline_args.append("--ignore-tables") func_args = skip - if base_fn.startswith('bodywidth'): - module_args['body_width'] = 0 - cmdline_args.append('--body-width=0') - func_args['bodywidth'] = 0 + if base_fn.startswith("bodywidth"): + module_args["body_width"] = 0 + cmdline_args.append("--body-width=0") + func_args["bodywidth"] = 0 - if base_fn.startswith('protect_links'): - module_args['protect_links'] = True - cmdline_args.append('--protect-links') + if base_fn.startswith("protect_links"): + module_args["protect_links"] = True + cmdline_args.append("--protect-links") func_args = skip - if base_fn.startswith('images_as_html'): - module_args['images_as_html'] = True - cmdline_args.append('--images-as-html') + if base_fn.startswith("images_as_html"): + module_args["images_as_html"] = True + cmdline_args.append("--images-as-html") func_args = skip - if base_fn.startswith('images_to_alt'): - module_args['images_to_alt'] = True - 
cmdline_args.append('--images-to-alt') + if base_fn.startswith("images_to_alt"): + module_args["images_to_alt"] = True + cmdline_args.append("--images-to-alt") func_args = skip - if base_fn.startswith('images_with_size'): - module_args['images_with_size'] = True - cmdline_args.append('--images-with-size') + if base_fn.startswith("images_with_size"): + module_args["images_with_size"] = True + cmdline_args.append("--images-with-size") func_args = skip - if base_fn.startswith('single_line_break'): - module_args['body_width'] = 0 - cmdline_args.append('--body-width=0') - module_args['single_line_break'] = True - cmdline_args.append('--single-line-break') + if base_fn.startswith("single_line_break"): + module_args["body_width"] = 0 + cmdline_args.append("--body-width=0") + module_args["single_line_break"] = True + cmdline_args.append("--single-line-break") func_args = skip - if base_fn.startswith('no_inline_links'): - module_args['inline_links'] = False - cmdline_args.append('--reference-links') + if base_fn.startswith("no_inline_links"): + module_args["inline_links"] = False + cmdline_args.append("--reference-links") func_args = skip - if base_fn.startswith('no_wrap_links'): - module_args['wrap_links'] = False - cmdline_args.append('--no-wrap-links') + if base_fn.startswith("no_wrap_links"): + module_args["wrap_links"] = False + cmdline_args.append("--no-wrap-links") func_args = skip - if base_fn.startswith('mark_code'): - module_args['mark_code'] = True - cmdline_args.append('--mark-code') + if base_fn.startswith("mark_code"): + module_args["mark_code"] = True + cmdline_args.append("--mark-code") func_args = skip - if base_fn.startswith('pad_table'): - module_args['pad_tables'] = True - cmdline_args.append('--pad-tables') + if base_fn.startswith("pad_table"): + module_args["pad_tables"] = True + cmdline_args.append("--pad-tables") func_args = skip - if base_fn.startswith('wrap_list_items'): - module_args['wrap_list_items'] = True - 
cmdline_args.append('--wrap-list-items') + if base_fn.startswith("wrap_list_items"): + module_args["wrap_list_items"] = True + cmdline_args.append("--wrap-list-items") func_args = skip - if base_fn == 'inplace_baseurl_substitution.html': - module_args['baseurl'] = 'http://brettterpstra.com' - module_args['body_width'] = 0 - func_args['baseurl'] = 'http://brettterpstra.com' - func_args['bodywidth'] = 0 + if base_fn == "inplace_baseurl_substitution.html": + module_args["baseurl"] = "http://brettterpstra.com" + module_args["body_width"] = 0 + func_args["baseurl"] = "http://brettterpstra.com" + func_args["bodywidth"] = 0 # CLI doesn't support baseurl. cmdline_args = skip @@ -156,9 +156,9 @@ def test_module(fn, module_args): h = html2text.HTML2Text() h.fn = fn - if module_args.pop('google_doc', False): + if module_args.pop("google_doc", False): h.google_doc = True - h.ul_item_mark = '-' + h.ul_item_mark = "-" h.body_width = 0 h.hide_strikethrough = True @@ -175,11 +175,11 @@ def test_module(fn, module_args): @pytest.mark.parametrize("fn,cmdline_args", generate_command_testdata()) def test_command(fn, cmdline_args): args = list(cmdline_args) - cmd = [sys.executable, '-m', 'html2text'] + cmd = [sys.executable, "-m", "html2text"] - if '--googledoc' in args: - args.remove('--googledoc') - cmd += ['-g', '-d', '-b', '0', '-s'] + if "--googledoc" in args: + args.remove("--googledoc") + cmd += ["-g", "-d", "-b", "0", "-s"] if args: cmd.extend(args) @@ -189,7 +189,7 @@ def test_command(fn, cmdline_args): result = get_baseline(fn) out = subprocess.check_output(cmd) - actual = out.decode('utf8') + actual = out.decode("utf8") actual = cleanup_eol(actual) @@ -205,12 +205,12 @@ def test_function(fn, func_args): def get_baseline_name(fn): - return os.path.splitext(fn)[0] + '.md' + return os.path.splitext(fn)[0] + ".md" def get_baseline(fn): name = get_baseline_name(fn) - with codecs.open(name, mode='r', encoding='utf8') as f: + with codecs.open(name, mode="r", encoding="utf8") as f: 
out = f.read() out = cleanup_eol(out) return out @@ -218,15 +218,13 @@ def get_baseline(fn): def test_tag_callback(): def _skip_certain_tags(h2t, tag, attrs, start): - if tag == 'b': + if tag == "b": return True h = html2text.HTML2Text() h.tag_callback = _skip_certain_tags ret = h.handle( 'this is a txt and this is a with text and ' - 'some italics too.' - ) - assert ret == ( - 'this is a txt and this is a with text and some _italics_ too.\n\n' + "some italics too." ) + assert ret == ("this is a txt and this is a with text and some _italics_ too.\n\n") diff --git a/test/test_memleak.py b/test/test_memleak.py index 019b4e3f..caf0520a 100644 --- a/test/test_memleak.py +++ b/test/test_memleak.py @@ -2,7 +2,7 @@ # See https://github.com/Alir3z4/html2text/issues/13 for more information. -INSTR = 'miow ' +INSTR = "miow " def test_same_string(): @@ -16,4 +16,4 @@ def test_empty_string(): h2t = html2text.HTML2Text() h2t.handle(INSTR) # And even less when the input is empty. - assert h2t.handle('') == '\n\n' + assert h2t.handle("") == "\n\n" diff --git a/tox.ini b/tox.ini index be290b1d..3400cbcf 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,6 @@ [tox] envlist = + black flake8 isort py{27,34,35,36,37,py,py3} @@ -12,7 +13,16 @@ deps = pytest pytest-cov +[testenv:black] +basepython = python3 +commands = + black --check --diff . 
+deps = + black +skip_install = true + [testenv:flake8] +basepython = python3 commands = flake8 deps = @@ -20,6 +30,7 @@ deps = skip_install = true [testenv:isort] +basepython = python3 commands = isort --check-only --diff deps = From 0f5968fd6e0291c36d7568ca6d1d233fb1bc4811 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Wed, 6 Mar 2019 17:02:00 +0300 Subject: [PATCH 419/479] Fix the black formatting --- html2text/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 16cd50e4..d86fe234 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -207,7 +207,7 @@ def previousIndex(self, attrs): self.a list. If the set of attributes is not found, returns None :rtype: int """ - if 'href' not in attrs: + if "href" not in attrs: return None i = -1 for a in self.a: From c453f221fc2ecd1cc43f25c57c022718edab77d2 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Sat, 2 Mar 2019 11:56:49 -0800 Subject: [PATCH 420/479] Remove unnecessary coerce to unicode() As the file imports unicode_literals, all literal strings are unicode objects by default. The coerce is a noop. --- html2text/__init__.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index d86fe234..0ced97c7 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -167,10 +167,7 @@ def close(self): nbsp = chr(name2cp("nbsp")) else: nbsp = chr(32) - try: - outtext = outtext.replace(unicode(" _place_holder;"), nbsp) - except NameError: - outtext = outtext.replace(" _place_holder;", nbsp) + outtext = outtext.replace(" _place_holder;", nbsp) # Clear self.outtextlist to avoid memory leak of its content to # the next handling. 
From 788c52f29ac3df3df0f710509ef3a4995228d328 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Sat, 2 Mar 2019 12:00:42 -0800 Subject: [PATCH 421/479] Use dict comprehension to build unifiable_n data structure --- html2text/__init__.py | 4 ---- html2text/utils.py | 4 +--- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 0ced97c7..42d68b18 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -131,10 +131,6 @@ def __init__(self, out=None, baseurl="", bodywidth=config.BODY_WIDTH): self.preceding_data = None self.current_tag = None - try: - del unifiable_n[name2cp("nbsp")] - except KeyError: - pass config.UNIFIABLE["nbsp"] = " _place_holder;" def feed(self, data): diff --git a/html2text/utils.py b/html2text/utils.py index 192da204..829c4dd2 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -11,9 +11,7 @@ def name2cp(k): return htmlentitydefs.name2codepoint[k] -unifiable_n = {} -for k in config.UNIFIABLE: - unifiable_n[name2cp(k)] = config.UNIFIABLE[k] +unifiable_n = {name2cp(k): v for k, v in config.UNIFIABLE.items() if k != "nbsp"} def hn(tag): From 6cbe2b6c0af6a270ad0157e0aec7c0763c30905e Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Fri, 7 Jun 2019 17:16:58 +0200 Subject: [PATCH 422/479] Fix description of --no-wrap-links in help message --- html2text/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/cli.py b/html2text/cli.py index 02da153e..2088ef02 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -36,7 +36,7 @@ class bcolors: dest="wrap_links", action="store_false", default=config.WRAP_LINKS, - help="wrap links during conversion", + help="don't wrap links during conversion", ) p.add_argument( "--wrap-list-items", From 83e27aa5cf6bf7642fbb47b859dd0e86b30a5755 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Wed, 26 Jun 2019 16:50:46 -0700 Subject: [PATCH 423/479] Pass extra tox cli args to tox 
https://tox.readthedocs.io/en/latest/config.html#substitutions-for-positional-arguments-in-commands --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 3400cbcf..b8de5676 100644 --- a/tox.ini +++ b/tox.ini @@ -8,7 +8,7 @@ minversion = 1.9 [testenv] commands = - pytest --cov=html2text + pytest --cov=html2text {posargs} deps = pytest pytest-cov From 01f8db6bb53f782db7f49302d8b1b04a3961d735 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Wed, 31 Jul 2019 19:26:03 -0700 Subject: [PATCH 424/479] Use bool in HTML2Text.o() rather than int 0/1 The value is being used as a boolean for branching, so use a boolean rather than an int. Easier to understand for the reader. Additionally, always use a keyword argument rather than a bare 0/1 or False/True to provide context to the reader. --- html2text/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 42d68b18..021b7bc4 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -155,7 +155,7 @@ def close(self): HTMLParser.HTMLParser.close(self) self.pbr() - self.o("", 0, "end") + self.o("", force="end") outtext = nochr.join(self.outtextlist) @@ -383,7 +383,7 @@ def handle_tag(self, tag, attrs, start): if tag == "blockquote": if start: self.p() - self.o("> ", 0, 1) + self.o("> ", force=True) self.start = True self.blockquote += 1 else: @@ -685,7 +685,7 @@ def soft_br(self): self.pbr() self.br_toggle = " " - def o(self, data, puredata=0, force=0): + def o(self, data, puredata=False, force=False): """ Deal with indentation and whitespace """ @@ -837,7 +837,7 @@ def handle_data(self, data, entity_char=False): if not self.code and not self.pre and not entity_char: data = escape_md_section(data, snob=self.escape_snob) self.preceding_data = data - self.o(data, 1) + self.o(data, puredata=True) def charref(self, name): if name[0] in ["x", "X"]: From 41d4de84f3a270fe4468033d14785cfa365c099e Mon 
Sep 17 00:00:00 2001 From: Jon Dufresne Date: Wed, 31 Jul 2019 18:36:00 -0700 Subject: [PATCH 425/479] Simplify the return of boolean expressions Replace pattern: if : return True return False With return --- html2text/utils.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/html2text/utils.py b/html2text/utils.py index 829c4dd2..947e0c56 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -108,10 +108,7 @@ def google_has_height(style): :rtype: bool """ - if "height" in style: - return True - - return False + return "height" in style def google_text_emphasis(style): @@ -143,10 +140,7 @@ def google_fixed_width_font(style): font_family = "" if "font-family" in style: font_family = style["font-family"] - if "courier new" == font_family or "consolas" == font_family: - return True - - return False + return "courier new" == font_family or "consolas" == font_family def list_numbering_start(attrs): @@ -191,12 +185,10 @@ def skipwrap(para, wrap_links, wrap_list_items): # If the text begins with a single -, *, or +, followed by a space, # or an integer, followed by a ., followed by a space (in either # case optionally proceeded by whitespace), it's a list; don't wrap. 
- if config.RE_ORDERED_LIST_MATCHER.match( - stripped - ) or config.RE_UNORDERED_LIST_MATCHER.match(stripped): - return True - - return False + return bool( + config.RE_ORDERED_LIST_MATCHER.match(stripped) + or config.RE_UNORDERED_LIST_MATCHER.match(stripped) + ) def wrapwrite(text): From 34e34acb5257bb2fe9addcdefd8fcb0e09341541 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Wed, 31 Jul 2019 18:21:21 -0700 Subject: [PATCH 426/479] Remove unused temporary variables that are immediately returned --- html2text/utils.py | 7 ++----- test/test_html2text.py | 3 +-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/html2text/utils.py b/html2text/utils.py index 947e0c56..00542926 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -28,13 +28,11 @@ def dumb_property_dict(style): """ :returns: A hash of css attributes """ - out = { + return { x.strip().lower(): y.strip().lower() for x, y in [z.split(":", 1) for z in style.split(";") if ":" in z] } - return out - def dumb_css_parser(data): """ @@ -298,5 +296,4 @@ def pad_tables_in_text(text, right_margin=1): table_buffer.append(line) else: new_lines.append(line) - new_text = "\n".join(new_lines) - return new_text + return "\n".join(new_lines) diff --git a/test/test_html2text.py b/test/test_html2text.py index 1b41c2aa..69c2ab9e 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -212,8 +212,7 @@ def get_baseline(fn): name = get_baseline_name(fn) with codecs.open(name, mode="r", encoding="utf8") as f: out = f.read() - out = cleanup_eol(out) - return out + return cleanup_eol(out) def test_tag_callback(): From f43c6725f5747b2ef012a888e70cb3a441c3630e Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Wed, 31 Jul 2019 18:08:48 -0700 Subject: [PATCH 427/479] Simplify hn() function The new form: - Avoids the unnecessary range() call. On Python 2 this avoids building the data structure in memory. - Avoids the try/except block. The comparison is required either way, so just do that from the start. 
- Always explicitly return as a value. The old form sometimes fell through and returned None. The new version is also slightly faster. Running the function through Python's timeit showed the following results: old: 0.6434267910080962 new: 0.40665965899825096 --- html2text/utils.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/html2text/utils.py b/html2text/utils.py index 00542926..620e48b5 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -16,12 +16,10 @@ def name2cp(k): def hn(tag): if tag[0] == "h" and len(tag) == 2: - try: - n = int(tag[1]) - if n in range(1, 10): - return n - except ValueError: - return 0 + n = tag[1] + if "0" < n <= "9": + return int(n) + return 0 def dumb_property_dict(style): From c4bc42f27551ac910014b663fac1894e4bb56bdd Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Tue, 30 Jul 2019 21:53:26 -0700 Subject: [PATCH 428/479] Remove unnecessary duplicate call to name2cp() Reuse the previous value instead. --- html2text/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 021b7bc4..d2d219e1 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -858,14 +858,14 @@ def entityref(self, c): return config.UNIFIABLE[c] else: try: - name2cp(c) + cp = name2cp(c) except KeyError: return "&" + c + ";" else: if c == "nbsp": return config.UNIFIABLE[c] else: - return chr(name2cp(c)) + return chr(cp) def google_nest_count(self, style): """ From 1d21d5f5ed2b4ee8c3ddff2cb14aef09fb37d102 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 11 Aug 2019 22:37:58 +0300 Subject: [PATCH 429/479] Update version to 2019.8.11 --- ChangeLog.rst | 4 ++-- html2text/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 08907ded..7575d9bf 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,5 +1,5 @@ -0000.0.0 -======== +2019.8.11 +========= ---- * Add support for wrapping 
list items. diff --git a/html2text/__init__.py b/html2text/__init__.py index d2d219e1..ab008c9c 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -32,7 +32,7 @@ # python3 uses chr nochr = str("") -__version__ = (2018, 1, 9) +__version__ = (2019, 8, 11) # TODO: From 5f895e90b51deae15b901a3950548dcb8cdfd7ec Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 11 Aug 2019 22:41:47 +0300 Subject: [PATCH 430/479] Leave blank place for new changelog --- ChangeLog.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 7575d9bf..4be93538 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,10 @@ +0000.0.00 +========= +---- + + + + 2019.8.11 ========= ---- From da9e86ffd41ddfb0a2084823f084c49b2db62ca0 Mon Sep 17 00:00:00 2001 From: Hugo Smett Date: Sun, 11 Aug 2019 21:42:52 +0200 Subject: [PATCH 431/479] Fix long blockquotes wrapping (#272) * Fix long blockquotes wrapping * Add comments * Remove the trailing whitespaces that were added after wrapping list items & blockquotes --- ChangeLog.rst | 3 +++ html2text/__init__.py | 13 +++++++++++-- test/blockquote_example.html | 2 +- test/blockquote_example.md | 4 +++- test/wrap_list_items_example.md | 8 ++++---- 5 files changed, 22 insertions(+), 8 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 4be93538..4e45400a 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -23,6 +23,9 @@ * ``html2text.HTML2Text.unescape()`` * ``html2text.unescape()`` * Fix #208: handle LEFT-TO-RIGHT MARK after a stressed tag. 
+* Fix long blockquotes wrapping +* Remove the trailing whitespaces that were added after wrapping list items & blockquotes + 2018.1.9 ======== diff --git a/html2text/__init__.py b/html2text/__init__.py index ab008c9c..a50dc30e 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -904,7 +904,13 @@ def optwrap(self, text): if not skipwrap(para, self.wrap_links, self.wrap_list_items): indent = "" if para.startswith(" " + self.ul_item_mark): - indent = " " # For list items. + # list item continuation: add a double indent to the + # new lines + indent = " " + elif para.startswith("> "): + # blockquote continuation: add the greater than symbol + # to the new lines + indent = "> " wrapped = wrap( para, self.body_width, @@ -912,9 +918,12 @@ def optwrap(self, text): subsequent_indent=indent, ) result += "\n".join(wrapped) - if indent or para.endswith(" "): + if para.endswith(" "): result += " \n" newlines = 1 + elif indent: + result += "\n" + newlines = 1 else: result += "\n\n" newlines = 2 diff --git a/test/blockquote_example.html b/test/blockquote_example.html index 1749fb09..ccb3e449 100644 --- a/test/blockquote_example.html +++ b/test/blockquote_example.html @@ -1,3 +1,3 @@
      -The time has come, the Walrus said, to speak of many things. +"The time has come", the Walrus said, "To talk of many things: Of shoes - and ships - and sealing wax - Of cabbages - and kings- And why the sea is boiling hot - And whether pigs have wings."
      diff --git a/test/blockquote_example.md b/test/blockquote_example.md index 69859650..ae264797 100644 --- a/test/blockquote_example.md +++ b/test/blockquote_example.md @@ -1,2 +1,4 @@ -> The time has come, the Walrus said, to speak of many things. +> "The time has come", the Walrus said, "To talk of many things: Of shoes - +> and ships - and sealing wax - Of cabbages - and kings- And why the sea is +> boiling hot - And whether pigs have wings." diff --git a/test/wrap_list_items_example.md b/test/wrap_list_items_example.md index 0708078d..cf80e161 100644 --- a/test/wrap_list_items_example.md +++ b/test/wrap_list_items_example.md @@ -1,14 +1,14 @@ * One two three four five six seven eight nine ten eleven twelve thirteen - fourteen fifteen sixteen seventeen eighteen nineteen twenty. + fourteen fifteen sixteen seventeen eighteen nineteen twenty. * One two three four five six seven eight nine ten eleven twelve thirteen - fourteen fifteen sixteen seventeen eighteen nineteen twenty. + fourteen fifteen sixteen seventeen eighteen nineteen twenty. Text between lists. * One two three four five six seven eight nine ten eleven twelve thirteen - fourteen fifteen sixteen seventeen eighteen nineteen twenty. + fourteen fifteen sixteen seventeen eighteen nineteen twenty. * One two three four five six seven eight nine ten eleven twelve thirteen - fourteen fifteen sixteen seventeen eighteen nineteen twenty. + fourteen fifteen sixteen seventeen eighteen nineteen twenty. Text after list. From e9c2615a0c4886c153793bc69ae7e9a1d4690c23 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Sun, 11 Aug 2019 22:43:50 +0300 Subject: [PATCH 432/479] Update changelog for #272 https://github.com/Alir3z4/html2text/pull/272 --- ChangeLog.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 4e45400a..036fcf2e 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -2,7 +2,8 @@ ========= ---- - +* Fix long blockquotes wrapping. 
+* Remove the trailing whitespaces that were added after wrapping list items & blockquotes. 2019.8.11 @@ -23,8 +24,6 @@ * ``html2text.HTML2Text.unescape()`` * ``html2text.unescape()`` * Fix #208: handle LEFT-TO-RIGHT MARK after a stressed tag. -* Fix long blockquotes wrapping -* Remove the trailing whitespaces that were added after wrapping list items & blockquotes 2018.1.9 From b361467894fb277563b4547ec9d4df49f5e0c6e3 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Tue, 30 Jul 2019 17:37:25 -0700 Subject: [PATCH 433/479] =?UTF-8?q?Remove=20support=20for=20Python=20?= =?UTF-8?q?=E2=89=A4=203.4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Python 2 support is nearing end of life. At the end of this year, it will no longer be supported, including for security issues. For more details, see https://pythonclock.org/ All code changes and clean ups that were noticed were done. Changes include: - Remove str/unicode compatibility shims. - Remove encoding cookies from source files. Python 3 interprets all files as utf-8 by default. - Remove unnecessary __future__ imports. The features are now built-in. - Use modern super() syntax. - Update the wheel configuration to reflect it is no longer universal. - Update trove classifiers. 
Refs #129 --- .travis.yml | 6 ------ ChangeLog.rst | 1 + docs/how_it_works.md | 2 -- html2text/__init__.py | 36 ++++++++++++------------------------ html2text/cli.py | 6 +++--- html2text/compat.py | 12 ------------ html2text/config.py | 2 -- html2text/utils.py | 35 ++++++----------------------------- setup.cfg | 3 --- setup.py | 7 ++----- test/test_html2text.py | 5 ++--- tox.ini | 2 +- 12 files changed, 27 insertions(+), 90 deletions(-) delete mode 100644 html2text/compat.py diff --git a/.travis.yml b/.travis.yml index 14b9f19a..8e1904e7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,18 +10,12 @@ matrix: env: TOXENV=flake8 - python: "3.7" env: TOXENV=isort - - python: "2.7" - env: TOXENV=py27 - - python: "3.4" - env: TOXENV=py34 - python: "3.5" env: TOXENV=py35 - python: "3.6" env: TOXENV=py36 - python: "3.7" env: TOXENV=py37 - - python: "pypy2.7-6.0" - env: TOXENV=pypy - python: "pypy3.5-6.0" env: TOXENV=pypy3 diff --git a/ChangeLog.rst b/ChangeLog.rst index 036fcf2e..2bb8af44 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -4,6 +4,7 @@ * Fix long blockquotes wrapping. * Remove the trailing whitespaces that were added after wrapping list items & blockquotes. +* Remove support for Python ≤ 3.4. Now requires Python 3.5+. 2019.8.11 diff --git a/docs/how_it_works.md b/docs/how_it_works.md index eed042b1..3602610d 100644 --- a/docs/how_it_works.md +++ b/docs/how_it_works.md @@ -65,8 +65,6 @@ Some functions are: - google_fixed_width_font :check for fixed width font - list_numbering_start :extract numbering from list elem attrs - skipwrap :skip wrap for give para or not? 
- - wrapwrite :write to buffer - - wrap_read :read from buffer - escape_md :escape md sensitive within other md - escape_md_section :escape md sensitive across whole doc diff --git a/html2text/__init__.py b/html2text/__init__.py index a50dc30e..0fd64b2c 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -1,13 +1,12 @@ -# coding: utf-8 """html2text: Turn HTML into equivalent Markdown-structured text.""" -from __future__ import division, unicode_literals +import html.entities +import html.parser import re -import sys +import urllib.parse as urlparse from textwrap import wrap from html2text import config -from html2text.compat import HTMLParser, urlparse from html2text.utils import ( dumb_css_parser, element_style, @@ -19,19 +18,11 @@ google_text_emphasis, hn, list_numbering_start, - name2cp, pad_tables_in_text, skipwrap, unifiable_n, ) -try: - chr = unichr - nochr = unicode("") -except NameError: - # python3 uses chr - nochr = str("") - __version__ = (2019, 8, 11) @@ -39,7 +30,7 @@ # Support decoded entities with UNIFIABLE. -class HTML2Text(HTMLParser.HTMLParser): +class HTML2Text(html.parser.HTMLParser): def __init__(self, out=None, baseurl="", bodywidth=config.BODY_WIDTH): """ Input parameters: @@ -47,10 +38,7 @@ def __init__(self, out=None, baseurl="", bodywidth=config.BODY_WIDTH): appends lines of text). 
baseurl: base URL of the document we process """ - kwargs = {} - if sys.version_info >= (3, 4): - kwargs["convert_charrefs"] = False - HTMLParser.HTMLParser.__init__(self, **kwargs) + super().__init__(convert_charrefs=False) # Config options self.split_next_td = False @@ -135,7 +123,7 @@ def __init__(self, out=None, baseurl="", bodywidth=config.BODY_WIDTH): def feed(self, data): data = data.replace("", "") - HTMLParser.HTMLParser.feed(self, data) + super().feed(data) def handle(self, data): self.feed(data) @@ -152,17 +140,17 @@ def outtextf(self, s): self.lastWasNL = s[-1] == "\n" def close(self): - HTMLParser.HTMLParser.close(self) + super().close() self.pbr() self.o("", force="end") - outtext = nochr.join(self.outtextlist) + outtext = "".join(self.outtextlist) if self.unicode_snob: - nbsp = chr(name2cp("nbsp")) + nbsp = html.entities.html5["nbsp;"] else: - nbsp = chr(32) + nbsp = " " outtext = outtext.replace(" _place_holder;", nbsp) # Clear self.outtextlist to avoid memory leak of its content to @@ -858,14 +846,14 @@ def entityref(self, c): return config.UNIFIABLE[c] else: try: - cp = name2cp(c) + ch = html.entities.html5[c + ";"] except KeyError: return "&" + c + ";" else: if c == "nbsp": return config.UNIFIABLE[c] else: - return chr(cp) + return ch def google_nest_count(self, style): """ diff --git a/html2text/cli.py b/html2text/cli.py index 2088ef02..15a91f68 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -1,7 +1,7 @@ import argparse +import sys from html2text import HTML2Text, __version__, config -from html2text.utils import wrap_read, wrapwrite def main(): @@ -256,7 +256,7 @@ class bcolors: with open(args.filename, "rb") as fp: data = fp.read() else: - data = wrap_read() + data = sys.stdin.buffer.read() try: data = data.decode(args.encoding, args.decode_errors) @@ -303,4 +303,4 @@ class bcolors: h.open_quote = args.open_quote h.close_quote = args.close_quote - wrapwrite(h.handle(data)) + sys.stdout.write(h.handle(data)) diff --git 
a/html2text/compat.py b/html2text/compat.py deleted file mode 100644 index a3226d5b..00000000 --- a/html2text/compat.py +++ /dev/null @@ -1,12 +0,0 @@ -import sys - -if sys.version_info[0] == 2: - import htmlentitydefs - import urlparse - import HTMLParser -else: - import urllib.parse as urlparse - import html.entities as htmlentitydefs - import html.parser as HTMLParser - -__all__ = ["HTMLParser", "htmlentitydefs", "urlparse"] diff --git a/html2text/config.py b/html2text/config.py index ff96f719..2bb38b6f 100644 --- a/html2text/config.py +++ b/html2text/config.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re # Use Unicode characters instead of their ascii pseudo-replacements diff --git a/html2text/utils.py b/html2text/utils.py index 620e48b5..f9823214 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -1,17 +1,12 @@ -import sys +import html.entities from html2text import config -from html2text.compat import htmlentitydefs - -def name2cp(k): - """Return sname to codepoint""" - if k == "apos": - return ord("'") - return htmlentitydefs.name2codepoint[k] - - -unifiable_n = {name2cp(k): v for k, v in config.UNIFIABLE.items() if k != "nbsp"} +unifiable_n = { + html.entities.name2codepoint[k]: v + for k, v in config.UNIFIABLE.items() + if k != "nbsp" +} def hn(tag): @@ -187,24 +182,6 @@ def skipwrap(para, wrap_links, wrap_list_items): ) -def wrapwrite(text): - text = text.encode("utf-8") - try: # Python3 - sys.stdout.buffer.write(text) - except AttributeError: - sys.stdout.write(text) - - -def wrap_read(): - """ - :rtype: str - """ - try: - return sys.stdin.read() - except AttributeError: - return sys.stdin.buffer.read() - - def escape_md(text): """ Escapes markdown-sensitive characters within other markdown diff --git a/setup.cfg b/setup.cfg index 9596a786..eea7375e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,3 @@ -[bdist_wheel] -universal = 1 - [flake8] max_line_length = 88 ignore = diff --git a/setup.py b/setup.py index 
0326ce7f..6c934cad 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,3 @@ -# coding: utf-8 from setuptools import setup @@ -25,17 +24,15 @@ def readall(f): "License :: OSI Approved :: GNU General Public License (GPL)", "Operating System :: OS Independent", "Programming Language :: Python", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3 :: Only", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ], - python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*", + python_requires=">=3.5", entry_points={"console_scripts": ["html2text = html2text.cli:main"]}, license="GNU GPL 3", packages=["html2text"], diff --git a/test/test_html2text.py b/test/test_html2text.py index 69c2ab9e..7bdd6797 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -1,4 +1,3 @@ -import codecs import glob import os import re @@ -189,7 +188,7 @@ def test_command(fn, cmdline_args): result = get_baseline(fn) out = subprocess.check_output(cmd) - actual = out.decode("utf8") + actual = out.decode() actual = cleanup_eol(actual) @@ -210,7 +209,7 @@ def get_baseline_name(fn): def get_baseline(fn): name = get_baseline_name(fn) - with codecs.open(name, mode="r", encoding="utf8") as f: + with open(name, encoding="utf-8") as f: out = f.read() return cleanup_eol(out) diff --git a/tox.ini b/tox.ini index b8de5676..094115cd 100644 --- a/tox.ini +++ b/tox.ini @@ -3,7 +3,7 @@ envlist = black flake8 isort - py{27,34,35,36,37,py,py3} + py{35,36,37,py3} minversion = 1.9 [testenv] From 23429aee1ba3d6a463a4db1f1664877186e43f59 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Sun, 11 Aug 2019 14:34:00 -0700 Subject: [PATCH 434/479] Simplify previousIndex by
using enumerate() Always return an explicit value. --- html2text/__init__.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 0fd64b2c..826efca5 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -190,11 +190,9 @@ def previousIndex(self, attrs): """ if "href" not in attrs: return None - i = -1 - for a in self.a: - i += 1 - match = False + match = False + for i, a in enumerate(self.a): if "href" in a and a["href"] == attrs["href"]: if "title" in a or "title" in attrs: if ( @@ -208,6 +206,7 @@ def previousIndex(self, attrs): if match: return i + return None def handle_emphasis(self, start, tag_style, parent_style): """ From f7c0dcc02e603cab9fab9b6b9222e38192be233b Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Sun, 11 Aug 2019 15:48:41 -0700 Subject: [PATCH 435/479] Simplify entityref() Can remove the `else` after the return to dedent the block. --- html2text/__init__.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 0fd64b2c..46f63197 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -844,16 +844,11 @@ def charref(self, name): def entityref(self, c): if not self.unicode_snob and c in config.UNIFIABLE: return config.UNIFIABLE[c] - else: - try: - ch = html.entities.html5[c + ";"] - except KeyError: - return "&" + c + ";" - else: - if c == "nbsp": - return config.UNIFIABLE[c] - else: - return ch + try: + ch = html.entities.html5[c + ";"] + except KeyError: + return "&" + c + ";" + return config.UNIFIABLE[c] if c == "nbsp" else ch def google_nest_count(self, style): """ From 30cf8823dbad7c45161861b70c0155f4427158c8 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Sun, 11 Aug 2019 19:25:50 -0700 Subject: [PATCH 436/479] Use bool for handle_tag() start argument --- html2text/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/html2text/__init__.py 
b/html2text/__init__.py index 46f63197..cfb5f301 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -175,10 +175,10 @@ def handle_entityref(self, c): self.handle_data(ref, True) def handle_starttag(self, tag, attrs): - self.handle_tag(tag, attrs, 1) + self.handle_tag(tag, attrs, start=True) def handle_endtag(self, tag): - self.handle_tag(tag, None, 0) + self.handle_tag(tag, None, start=False) def previousIndex(self, attrs): """ From a2dbf25f78087900b955cdc7f8c7c98b40a0041a Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Mon, 12 Aug 2019 20:00:32 -0700 Subject: [PATCH 437/479] Remove unnecessary parenthesis --- html2text/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index cfb5f301..1ef44e9d 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -557,7 +557,7 @@ def link_url(self, link, title=""): if tag in ["ol", "ul"]: # Google Docs create sub lists as top level lists - if (not self.list) and (not self.lastWasList): + if not self.list and not self.lastWasList: self.p() if start: if self.google_doc: @@ -569,7 +569,7 @@ def link_url(self, link, title=""): else: if self.list: self.list.pop() - if (not self.google_doc) and (not self.list): + if not self.google_doc and not self.list: self.o("\n") self.lastWasList = True else: From 0a2b456f1c383adb93c7a2e42b1bdbe0c7424f89 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Mon, 12 Aug 2019 18:00:50 -0700 Subject: [PATCH 438/479] Specify a target version for black in tox.ini --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 094115cd..e4711dac 100644 --- a/tox.ini +++ b/tox.ini @@ -16,7 +16,7 @@ deps = [testenv:black] basepython = python3 commands = - black --check --diff . + black --target-version py35 --check --diff . 
deps = black skip_install = true From 7e31444392d1e1ed790ba0e8afecf3a8074d1190 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Mon, 12 Aug 2019 17:49:34 -0700 Subject: [PATCH 439/479] Fix memory leak when processing an abbr tag Previously, after the first abbr tag, the abbr_data attribute continued to grow when processing additional data. Now set the attribute to None to avoid appending the unused data. --- ChangeLog.rst | 1 + html2text/__init__.py | 2 +- test/test_memleak.py | 7 +++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 2bb8af44..8fa4eda8 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -5,6 +5,7 @@ * Fix long blockquotes wrapping. * Remove the trailing whitespaces that were added after wrapping list items & blockquotes. * Remove support for Python ≤ 3.4. Now requires Python 3.5+. +* Fix memory link when processing a document containing a ```` tag. 2019.8.11 diff --git a/html2text/__init__.py b/html2text/__init__.py index 1ef44e9d..f67430f1 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -430,7 +430,7 @@ def no_preceding_space(self): if self.abbr_title is not None: self.abbr_list[self.abbr_data] = self.abbr_title self.abbr_title = None - self.abbr_data = "" + self.abbr_data = None if tag == "q": if not self.quote: diff --git a/test/test_memleak.py b/test/test_memleak.py index caf0520a..45a3ff68 100644 --- a/test/test_memleak.py +++ b/test/test_memleak.py @@ -17,3 +17,10 @@ def test_empty_string(): h2t.handle(INSTR) # And even less when the input is empty. assert h2t.handle("") == "\n\n" + + +def test_abbr_data(): + h2t = html2text.HTML2Text() + result = h2t.handle('

      foo TLA bar

      ') + assert result == "foo TLA bar\n\n *[TLA]: Three Letter Acronym\n\n" + assert h2t.abbr_data is None From 4377506e792ff81dce5e3eb758f56086b1c0c45a Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Wed, 25 Sep 2019 00:36:01 -0700 Subject: [PATCH 440/479] Correct nested list syntax in changelog Need a space before and after the indented list. --- ChangeLog.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 8fa4eda8..769a1b0b 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -20,11 +20,13 @@ * Add ``__main__.py`` module to allow running the CLI using ``python -m html2text ...``. * Fix #238: correct spacing when a HTML entity follows a non-stressed tags which follow a stressed tag. * Remove unused or deprecated: + * ``html2text.compat.escape()`` * ``html2text.config.RE_UNESCAPE`` * ``html2text.HTML2Text.replaceEntities()`` * ``html2text.HTML2Text.unescape()`` * ``html2text.unescape()`` + * Fix #208: handle LEFT-TO-RIGHT MARK after a stressed tag. From 30402ae4a0c305c713613751765d76e1186da327 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Wed, 25 Sep 2019 00:49:00 -0700 Subject: [PATCH 441/479] =?UTF-8?q?Correct=20typo=20in=20changelog:=20"mem?= =?UTF-8?q?ory=20link"=20=E2=86=92=20"memory=20leak"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 769a1b0b..8bc92547 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -5,7 +5,7 @@ * Fix long blockquotes wrapping. * Remove the trailing whitespaces that were added after wrapping list items & blockquotes. * Remove support for Python ≤ 3.4. Now requires Python 3.5+. -* Fix memory link when processing a document containing a ```` tag. +* Fix memory leak when processing a document containing a ```` tag. 
2019.8.11 From db793dc0db46a235351e9e6e96bde4bc79112141 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Thu, 26 Sep 2019 03:23:47 -0700 Subject: [PATCH 442/479] Document bug fixes in upcoming release --- ChangeLog.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 8bc92547..5e41f16c 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -6,6 +6,8 @@ * Remove the trailing whitespaces that were added after wrapping list items & blockquotes. * Remove support for Python ≤ 3.4. Now requires Python 3.5+. * Fix memory leak when processing a document containing a ```` tag. +* Fix ``AttributeError`` when reading text from stdin. +* Fix ``UnicodeEncodeError`` when writing output to stdout. 2019.8.11 From ff4b98c715e8fc36cea96538921436c35bfcac84 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Thu, 26 Sep 2019 03:25:08 -0700 Subject: [PATCH 443/479] Update version to 2019.9.26 --- ChangeLog.rst | 2 +- html2text/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 5e41f16c..7e631c4d 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,4 +1,4 @@ -0000.0.00 +2019.9.26 ========= ---- diff --git a/html2text/__init__.py b/html2text/__init__.py index bf3ce0f5..f95b8412 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -23,7 +23,7 @@ unifiable_n, ) -__version__ = (2019, 8, 11) +__version__ = (2019, 9, 26) # TODO: From 831136636eab882b1ef267d4eede14a19a7cdb99 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Thu, 26 Sep 2019 03:59:11 -0700 Subject: [PATCH 444/479] Remove unused and unmaintained requirements-dev.txt file --- requirements-dev.txt | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 requirements-dev.txt diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index 488eb7de..00000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,4 +0,0 @@ -coverage==4.4.2 -pypandoc==1.4 -wheel==0.30.0 -flake8==3.5.0 From 
0005069dcae2ba4812f52cd16b8d27d89fb3352b Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Tue, 8 Oct 2019 18:51:11 -0700 Subject: [PATCH 445/479] Simplify expression to avoid unnecessary loop --- html2text/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index f95b8412..37f38e7d 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -717,8 +717,7 @@ def o(self, data, puredata=False, force=False): if not self.list: bq += " " # else: list content is already partially indented - for i in range(len(self.list)): - bq += " " + bq += " " * len(self.list) data = data.replace("\n", "\n" + bq) if self.startpre: From a80aad87149962d52b9135ca6daf6182f40224cd Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Tue, 8 Oct 2019 20:09:35 -0700 Subject: [PATCH 446/479] Use relative imports internally --- html2text/__init__.py | 4 ++-- html2text/__main__.py | 2 +- html2text/cli.py | 2 +- html2text/utils.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 37f38e7d..00b6d78b 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -6,8 +6,8 @@ import urllib.parse as urlparse from textwrap import wrap -from html2text import config -from html2text.utils import ( +from . import config +from .utils import ( dumb_css_parser, element_style, escape_md, diff --git a/html2text/__main__.py b/html2text/__main__.py index 6a5a4f85..4e28416e 100644 --- a/html2text/__main__.py +++ b/html2text/__main__.py @@ -1,3 +1,3 @@ -from html2text.cli import main +from .cli import main main() diff --git a/html2text/cli.py b/html2text/cli.py index 15a91f68..b86e9027 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -1,7 +1,7 @@ import argparse import sys -from html2text import HTML2Text, __version__, config +from . 
import HTML2Text, __version__, config def main(): diff --git a/html2text/utils.py b/html2text/utils.py index f9823214..8866bd55 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -1,6 +1,6 @@ import html.entities -from html2text import config +from . import config unifiable_n = { html.entities.name2codepoint[k]: v From 10171cfdbc99cafc29f65db1fdaa4f6df87f6518 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Mon, 12 Aug 2019 18:22:41 -0700 Subject: [PATCH 447/479] Use mypy to do static type checking of html2text As code is updated and refactored, helps provide confidence in correctness. Adjust the code a bit to allow for stricter type usage. Include the marker file py.typed in the distribution. https://mypy.readthedocs.io/en/latest/index.html --- .gitignore | 1 + .travis.yml | 2 + ChangeLog.rst | 6 ++ html2text/__init__.py | 174 +++++++++++++++++++++++------------------- html2text/cli.py | 6 +- html2text/elements.py | 18 +++++ html2text/py.typed | 0 html2text/typing.py | 3 + html2text/utils.py | 43 ++++++----- setup.cfg | 3 + setup.py | 2 + tox.ini | 6 ++ 12 files changed, 165 insertions(+), 99 deletions(-) create mode 100644 html2text/elements.py create mode 100644 html2text/py.typed create mode 100644 html2text/typing.py diff --git a/.gitignore b/.gitignore index a219c128..047a2c10 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ env/ .vscode .tox/ htmlcov/ +.mypy_cache/ diff --git a/.travis.yml b/.travis.yml index 8e1904e7..634f342b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,6 +8,8 @@ matrix: env: TOXENV=black - python: "3.7" env: TOXENV=flake8 + - python: "3.7" + env: TOXENV=mypy - python: "3.7" env: TOXENV=isort - python: "3.5" diff --git a/ChangeLog.rst b/ChangeLog.rst index 7e631c4d..1ecf5673 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,9 @@ +UNRELEASED +========== +---- + +* Add type annotations. 
+ 2019.9.26 ========= ---- diff --git a/html2text/__init__.py b/html2text/__init__.py index 00b6d78b..d8e41a1d 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -5,8 +5,11 @@ import re import urllib.parse as urlparse from textwrap import wrap +from typing import Dict, List, Optional, Tuple, Union from . import config +from .elements import AnchorElement, ListElement +from .typing import OutCallback from .utils import ( dumb_css_parser, element_style, @@ -31,7 +34,12 @@ class HTML2Text(html.parser.HTMLParser): - def __init__(self, out=None, baseurl="", bodywidth=config.BODY_WIDTH): + def __init__( + self, + out: Optional[OutCallback] = None, + baseurl: str = "", + bodywidth: int = config.BODY_WIDTH, + ) -> None: """ Input parameters: out: possible custom replacement for self.outtextf (which @@ -82,20 +90,20 @@ def __init__(self, out=None, baseurl="", bodywidth=config.BODY_WIDTH): self.out = out # empty list to store output characters before they are "joined" - self.outtextlist = [] + self.outtextlist = [] # type: List[str] self.quiet = 0 self.p_p = 0 # number of newline character to print before next output self.outcount = 0 self.start = True self.space = False - self.a = [] - self.astack = [] - self.maybe_automatic_link = None + self.a = [] # type: List[AnchorElement] + self.astack = [] # type: List[Optional[Dict[str, Optional[str]]]] + self.maybe_automatic_link = None # type: Optional[str] self.empty_link = False self.absolute_url_matcher = re.compile(r"^[a-zA-Z+]+://") self.acount = 0 - self.list = [] + self.list = [] # type: List[ListElement] self.blockquote = 0 self.pre = False self.startpre = False @@ -105,42 +113,47 @@ def __init__(self, out=None, baseurl="", bodywidth=config.BODY_WIDTH): self.lastWasNL = False self.lastWasList = False self.style = 0 - self.style_def = {} - self.tag_stack = [] + self.style_def = {} # type: Dict[str, Dict[str, str]] + self.tag_stack = ( + [] + ) # type: List[Tuple[str, Dict[str, Optional[str]], Dict[str, str]]] 
self.emphasis = 0 self.drop_white_space = 0 self.inheader = False - self.abbr_title = None # current abbreviation definition - self.abbr_data = None # last inner HTML (for abbr being defined) - self.abbr_list = {} # stack of abbreviations to write later + # Current abbreviation definition + self.abbr_title = None # type: Optional[str] + # Last inner HTML (for abbr being defined) + self.abbr_data = None # type: Optional[str] + # Stack of abbreviations to write later + self.abbr_list = {} # type: Dict[str, str] self.baseurl = baseurl self.stressed = False self.preceding_stressed = False - self.preceding_data = None - self.current_tag = None + self.preceding_data = "" + self.current_tag = "" config.UNIFIABLE["nbsp"] = " _place_holder;" - def feed(self, data): + def feed(self, data: str) -> None: data = data.replace("", "") super().feed(data) - def handle(self, data): + def handle(self, data: str) -> str: self.feed(data) self.feed("") - markdown = self.optwrap(self.close()) + markdown = self.optwrap(self.finish()) if self.pad_tables: return pad_tables_in_text(markdown) else: return markdown - def outtextf(self, s): + def outtextf(self, s: str) -> None: self.outtextlist.append(s) if s: self.lastWasNL = s[-1] == "\n" - def close(self): - super().close() + def finish(self) -> str: + self.close() self.pbr() self.o("", force="end") @@ -159,10 +172,10 @@ def close(self): return outtext - def handle_charref(self, c): + def handle_charref(self, c: str) -> None: self.handle_data(self.charref(c), True) - def handle_entityref(self, c): + def handle_entityref(self, c: str) -> None: ref = self.entityref(c) # ref may be an empty string (e.g. 
for ‎/‏ markers that should @@ -174,13 +187,13 @@ def handle_entityref(self, c): if ref: self.handle_data(ref, True) - def handle_starttag(self, tag, attrs): - self.handle_tag(tag, attrs, start=True) + def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None: + self.handle_tag(tag, dict(attrs), start=True) - def handle_endtag(self, tag): - self.handle_tag(tag, None, start=False) + def handle_endtag(self, tag: str) -> None: + self.handle_tag(tag, {}, start=False) - def previousIndex(self, attrs): + def previousIndex(self, attrs: Dict[str, Optional[str]]) -> Optional[int]: """ :type attrs: dict @@ -193,12 +206,12 @@ def previousIndex(self, attrs): match = False for i, a in enumerate(self.a): - if "href" in a and a["href"] == attrs["href"]: - if "title" in a or "title" in attrs: + if "href" in a.attrs and a.attrs["href"] == attrs["href"]: + if "title" in a.attrs or "title" in attrs: if ( - "title" in a + "title" in a.attrs and "title" in attrs - and a["title"] == attrs["title"] + and a.attrs["title"] == attrs["title"] ): match = True else: @@ -208,7 +221,9 @@ def previousIndex(self, attrs): return i return None - def handle_emphasis(self, start, tag_style, parent_style): + def handle_emphasis( + self, start: bool, tag_style: Dict[str, str], parent_style: Dict[str, str] + ) -> None: """ Handles various text emphases """ @@ -279,13 +294,10 @@ def handle_emphasis(self, start, tag_style, parent_style): if strikethrough: self.quiet -= 1 - def handle_tag(self, tag, attrs, start): + def handle_tag( + self, tag: str, attrs: Dict[str, Optional[str]], start: bool + ) -> None: self.current_tag = tag - # attrs is None for endtags - if attrs is None: - attrs = {} - else: - attrs = dict(attrs) if self.tag_callback is not None: if self.tag_callback(self, tag, attrs, start) is True: @@ -308,7 +320,7 @@ def handle_tag(self, tag, attrs, start): # need the attributes of the parent nodes in order to get a # complete style description for the current element. 
we assume # that google docs export well formed html. - parent_style = {} + parent_style = {} # type: Dict[str, str] if start: if self.tag_stack: parent_style = self.tag_stack[-1][2] @@ -377,8 +389,10 @@ def handle_tag(self, tag, attrs, start): self.blockquote -= 1 self.p() - def no_preceding_space(self): - return self.preceding_data and re.match(r"[^\s]", self.preceding_data[-1]) + def no_preceding_space(self: HTML2Text) -> bool: + return bool( + self.preceding_data and re.match(r"[^\s]", self.preceding_data[-1]) + ) if tag in ["em", "i", "u"] and not self.ignore_emphasis: if start and no_preceding_space(self): @@ -427,6 +441,7 @@ def no_preceding_space(self): self.abbr_title = attrs["title"] else: if self.abbr_title is not None: + assert self.abbr_data is not None self.abbr_list[self.abbr_data] = self.abbr_title self.abbr_title = None self.abbr_data = None @@ -438,7 +453,7 @@ def no_preceding_space(self): self.o(self.close_quote) self.quote = not self.quote - def link_url(self, link, title=""): + def link_url(self: HTML2Text, link: str, title: str = "") -> None: url = urlparse.urljoin(self.baseurl, link) title = ' "{}"'.format(title) if title.strip() else "" self.o("]({url}{title})".format(url=escape_md(url), title=title)) @@ -463,31 +478,28 @@ def link_url(self, link, title=""): if self.maybe_automatic_link and not self.empty_link: self.maybe_automatic_link = None elif a: + assert a["href"] is not None if self.empty_link: self.o("[") self.empty_link = False self.maybe_automatic_link = None if self.inline_links: - try: - title = a["title"] if a["title"] else "" - title = escape_md(title) - except KeyError: - link_url(self, a["href"], "") - else: - link_url(self, a["href"], title) + title = a.get("title") or "" + title = escape_md(title) + link_url(self, a["href"], title) else: i = self.previousIndex(a) if i is not None: - a = self.a[i] + a_props = self.a[i] else: self.acount += 1 - a["count"] = self.acount - a["outcount"] = self.outcount - self.a.append(a) - 
self.o("][" + str(a["count"]) + "]") + a_props = AnchorElement(a, self.acount, self.outcount) + self.a.append(a_props) + self.o("][" + str(a_props.count) + "]") if tag == "img" and start and not self.ignore_images: if "src" in attrs: + assert attrs["src"] is not None if not self.images_to_alt: attrs["href"] = attrs["src"] alt = attrs.get("alt") or self.default_image_alt @@ -499,8 +511,10 @@ def link_url(self, link, title=""): ): self.o("
    2. s > 9 correctly. self.o(" " * nest_count) - if li["name"] == "ul": + if li.name == "ul": self.o(self.ul_item_mark + " ") - elif li["name"] == "ol": - li["num"] += 1 - self.o(str(li["num"]) + ". ") + elif li.name == "ol": + li.num += 1 + self.o(str(li.num) + ". ") self.start = True if tag in ["table", "tr", "td", "th"]: @@ -658,21 +671,23 @@ def link_url(self, link, title=""): self.p() # TODO: Add docstring for these one letter functions - def pbr(self): + def pbr(self) -> None: "Pretty print has a line break" if self.p_p == 0: self.p_p = 1 - def p(self): + def p(self) -> None: "Set pretty print to 1 or 2 lines" self.p_p = 1 if self.single_line_break else 2 - def soft_br(self): + def soft_br(self) -> None: "Soft breaks" self.pbr() self.br_toggle = " " - def o(self, data, puredata=False, force=False): + def o( + self, data: str, puredata: bool = False, force: Union[bool, str] = False + ) -> None: """ Deal with indentation and whitespace """ @@ -755,15 +770,16 @@ def o(self, data, puredata=False, force=False): newa = [] for link in self.a: - if self.outcount > link["outcount"]: + if self.outcount > link.outcount: self.out( " [" - + str(link["count"]) + + str(link.count) + "]: " - + urlparse.urljoin(self.baseurl, link["href"]) + + urlparse.urljoin(self.baseurl, link.attrs["href"]) ) - if "title" in link: - self.out(" (" + link["title"] + ")") + if "title" in link.attrs: + assert link.attrs["title"] is not None + self.out(" (" + link.attrs["title"] + ")") self.out("\n") else: newa.append(link) @@ -782,7 +798,7 @@ def o(self, data, puredata=False, force=False): self.out(data) self.outcount += 1 - def handle_data(self, data, entity_char=False): + def handle_data(self, data: str, entity_char: bool = False) -> None: if not data: # Data may be empty for some HTML entities. For example, # LEFT-TO-RIGHT MARK. 
@@ -825,7 +841,7 @@ def handle_data(self, data, entity_char=False): self.preceding_data = data self.o(data, puredata=True) - def charref(self, name): + def charref(self, name: str) -> str: if name[0] in ["x", "X"]: c = int(name[1:], 16) else: @@ -839,7 +855,7 @@ def charref(self, name): except ValueError: # invalid unicode return "" - def entityref(self, c): + def entityref(self, c: str) -> str: if not self.unicode_snob and c in config.UNIFIABLE: return config.UNIFIABLE[c] try: @@ -848,7 +864,7 @@ def entityref(self, c): return "&" + c + ";" return config.UNIFIABLE[c] if c == "nbsp" else ch - def google_nest_count(self, style): + def google_nest_count(self, style: Dict[str, str]) -> int: """ Calculate the nesting count of google doc lists @@ -862,7 +878,7 @@ def google_nest_count(self, style): return nest_count - def optwrap(self, text): + def optwrap(self, text: str) -> str: """ Wrap all paragraphs in the provided text. @@ -923,7 +939,7 @@ def optwrap(self, text): return result -def html2text(html, baseurl="", bodywidth=None): +def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str: if bodywidth is None: bodywidth = config.BODY_WIDTH h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth) diff --git a/html2text/cli.py b/html2text/cli.py index b86e9027..30a362e4 100644 --- a/html2text/cli.py +++ b/html2text/cli.py @@ -4,7 +4,7 @@ from . 
import HTML2Text, __version__, config -def main(): +def main() -> None: baseurl = "" class bcolors: @@ -259,7 +259,7 @@ class bcolors: data = sys.stdin.buffer.read() try: - data = data.decode(args.encoding, args.decode_errors) + html = data.decode(args.encoding, args.decode_errors) except UnicodeDecodeError as err: warning = bcolors.WARNING + "Warning:" + bcolors.ENDC warning += " Use the " + bcolors.OKGREEN @@ -303,4 +303,4 @@ class bcolors: h.open_quote = args.open_quote h.close_quote = args.close_quote - sys.stdout.write(h.handle(data)) + sys.stdout.write(h.handle(html)) diff --git a/html2text/elements.py b/html2text/elements.py new file mode 100644 index 00000000..2533ec08 --- /dev/null +++ b/html2text/elements.py @@ -0,0 +1,18 @@ +from typing import Dict, Optional + + +class AnchorElement: + __slots__ = ["attrs", "count", "outcount"] + + def __init__(self, attrs: Dict[str, Optional[str]], count: int, outcount: int): + self.attrs = attrs + self.count = count + self.outcount = outcount + + +class ListElement: + __slots__ = ["name", "num"] + + def __init__(self, name: str, num: int): + self.name = name + self.num = num diff --git a/html2text/py.typed b/html2text/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/html2text/typing.py b/html2text/typing.py new file mode 100644 index 00000000..6e17fed2 --- /dev/null +++ b/html2text/typing.py @@ -0,0 +1,3 @@ +class OutCallback: + def __call__(self, s: str) -> None: + ... diff --git a/html2text/utils.py b/html2text/utils.py index 8866bd55..e6256d1e 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -1,4 +1,5 @@ import html.entities +from typing import Dict, List, Optional from . 
import config @@ -9,7 +10,7 @@ } -def hn(tag): +def hn(tag: str) -> int: if tag[0] == "h" and len(tag) == 2: n = tag[1] if "0" < n <= "9": @@ -17,7 +18,7 @@ def hn(tag): return 0 -def dumb_property_dict(style): +def dumb_property_dict(style: str) -> Dict[str, str]: """ :returns: A hash of css attributes """ @@ -27,7 +28,7 @@ def dumb_property_dict(style): } -def dumb_css_parser(data): +def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]: """ :type data: str @@ -44,16 +45,20 @@ def dumb_css_parser(data): # parse the css. reverted from dictionary comprehension in order to # support older pythons - elements = [x.split("{") for x in data.split("}") if "{" in x.strip()] + pairs = [x.split("{") for x in data.split("}") if "{" in x.strip()] try: - elements = {a.strip(): dumb_property_dict(b) for a, b in elements} + elements = {a.strip(): dumb_property_dict(b) for a, b in pairs} except ValueError: elements = {} # not that important return elements -def element_style(attrs, style_def, parent_style): +def element_style( + attrs: Dict[str, Optional[str]], + style_def: Dict[str, Dict[str, str]], + parent_style: Dict[str, str], +) -> Dict[str, str]: """ :type attrs: dict :type style_def: dict @@ -64,17 +69,19 @@ def element_style(attrs, style_def, parent_style): """ style = parent_style.copy() if "class" in attrs: + assert attrs["class"] is not None for css_class in attrs["class"].split(): css_style = style_def.get("." 
+ css_class, {}) style.update(css_style) if "style" in attrs: + assert attrs["style"] is not None immediate_style = dumb_property_dict(attrs["style"]) style.update(immediate_style) return style -def google_list_style(style): +def google_list_style(style: Dict[str, str]) -> str: """ Finds out whether this is an ordered or unordered list @@ -90,7 +97,7 @@ def google_list_style(style): return "ol" -def google_has_height(style): +def google_has_height(style: Dict[str, str]) -> bool: """ Check if the style of the element has the 'height' attribute explicitly defined @@ -102,7 +109,7 @@ def google_has_height(style): return "height" in style -def google_text_emphasis(style): +def google_text_emphasis(style: Dict[str, str]) -> List[str]: """ :type style: dict @@ -120,7 +127,7 @@ def google_text_emphasis(style): return emphasis -def google_fixed_width_font(style): +def google_fixed_width_font(style: Dict[str, str]) -> bool: """ Check if the css of the current element defines a fixed width font @@ -134,7 +141,7 @@ def google_fixed_width_font(style): return "courier new" == font_family or "consolas" == font_family -def list_numbering_start(attrs): +def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int: """ Extract numbering from list element attributes @@ -143,6 +150,7 @@ def list_numbering_start(attrs): :rtype: int or None """ if "start" in attrs: + assert attrs["start"] is not None try: return int(attrs["start"]) - 1 except ValueError: @@ -151,7 +159,7 @@ def list_numbering_start(attrs): return 0 -def skipwrap(para, wrap_links, wrap_list_items): +def skipwrap(para: str, wrap_links: bool, wrap_list_items: bool) -> bool: # If it appears to contain a link # don't wrap if (len(config.RE_LINK.findall(para)) > 0) and not wrap_links: @@ -182,7 +190,7 @@ def skipwrap(para, wrap_links, wrap_list_items): ) -def escape_md(text): +def escape_md(text: str) -> str: """ Escapes markdown-sensitive characters within other markdown constructs. 
@@ -190,7 +198,7 @@ def escape_md(text): return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text) -def escape_md_section(text, snob=False): +def escape_md_section(text: str, snob: bool = False) -> str: """ Escapes markdown-sensitive characters across whole document sections. """ @@ -206,7 +214,7 @@ def escape_md_section(text, snob=False): return text -def reformat_table(lines, right_margin): +def reformat_table(lines: List[str], right_margin: int) -> List[str]: """ Given the lines of a table padds the cells and returns the new lines @@ -249,12 +257,13 @@ def reformat_table(lines, right_margin): return new_lines -def pad_tables_in_text(text, right_margin=1): +def pad_tables_in_text(text: str, right_margin: int = 1) -> str: """ Provide padding for tables in the text """ lines = text.split("\n") - table_buffer, table_started = [], False + table_buffer = [] # type: List[str] + table_started = False new_lines = [] for line in lines: # Toggle table started diff --git a/setup.cfg b/setup.cfg index eea7375e..844abdeb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -9,3 +9,6 @@ combine_as_imports = True include_trailing_comma = True line_length = 88 multi_line_output = 3 + +[mypy] +python_version = 3.5 diff --git a/setup.py b/setup.py index 6c934cad..5a8cd699 100644 --- a/setup.py +++ b/setup.py @@ -36,4 +36,6 @@ def readall(f): entry_points={"console_scripts": ["html2text = html2text.cli:main"]}, license="GNU GPL 3", packages=["html2text"], + package_data={"html2text": ["py.typed"]}, + zip_safe=False, ) diff --git a/tox.ini b/tox.ini index e4711dac..84114d0f 100644 --- a/tox.ini +++ b/tox.ini @@ -3,6 +3,7 @@ envlist = black flake8 isort + mypy py{35,36,37,py3} minversion = 1.9 @@ -36,3 +37,8 @@ commands = deps = isort skip_install = true + +[testenv:mypy] +commands = mypy --strict html2text +deps = mypy +skip_install = true From 951363496f1bfe55cc341292a461f76213daf5bb Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Sat, 12 Oct 2019 09:20:14 -0700 Subject: [PATCH 448/479] Xenial 
is now the default on Travis --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 634f342b..894966bb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,3 @@ -dist: xenial language: python cache: pip From c5017c4651e726d8313f077943896ffabbf3b6dc Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Sat, 12 Oct 2019 09:19:33 -0700 Subject: [PATCH 449/479] Update PyPy3 on Travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 894966bb..5161ec00 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,7 +17,7 @@ matrix: env: TOXENV=py36 - python: "3.7" env: TOXENV=py37 - - python: "pypy3.5-6.0" + - python: "pypy3" env: TOXENV=pypy3 install: From bbaf73742898ef0ae620a35fee4db2af408c8737 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Sat, 12 Oct 2019 09:24:44 -0700 Subject: [PATCH 450/479] Test on upcoming Python 3.8 --- .travis.yml | 2 ++ setup.py | 1 + tox.ini | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 5161ec00..fdf6d514 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,6 +17,8 @@ matrix: env: TOXENV=py36 - python: "3.7" env: TOXENV=py37 + - python: 3.8-dev + env: TOXENV=py38 - python: "pypy3" env: TOXENV=pypy3 diff --git a/setup.py b/setup.py index 5a8cd699..8d8a49bb 100644 --- a/setup.py +++ b/setup.py @@ -28,6 +28,7 @@ def readall(f): "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3 :: Only", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", diff --git a/tox.ini b/tox.ini index 84114d0f..fb4403cf 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ envlist = flake8 isort mypy - py{35,36,37,py3} + py{35,36,37,38,py3} minversion = 1.9 [testenv] From 
3af173803d23dad8dd5369602dab19a656757d00 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Mon, 28 Oct 2019 17:16:36 -0700 Subject: [PATCH 451/479] Test on Python 3.8 final --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index fdf6d514..10b062eb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,7 +17,7 @@ matrix: env: TOXENV=py36 - python: "3.7" env: TOXENV=py37 - - python: 3.8-dev + - python: 3.8 env: TOXENV=py38 - python: "pypy3" env: TOXENV=pypy3 From 2d2c7023e6498611e567fb68727ca4628c187b77 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Mon, 28 Oct 2019 17:38:28 -0700 Subject: [PATCH 452/479] Use setuptools declarative syntax through setup.cfg https://setuptools.readthedocs.io/en/latest/setuptools.html#configuring-setup-using-setup-cfg-files --- setup.cfg | 40 ++++++++++++++++++++++++++++++++++++++++ setup.py | 41 +---------------------------------------- 2 files changed, 41 insertions(+), 40 deletions(-) diff --git a/setup.cfg b/setup.cfg index 844abdeb..3efd0479 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,43 @@ +[metadata] +name = html2text +version = attr: html2text.__version__ +description = Turn HTML into equivalent Markdown-structured text. 
+long_description = file: README.md +long_description_content_type = text/markdown +url = https://github.com/Alir3z4/html2text/ +author = Aaron Swartz +author_email = me@aaronsw.com +maintainer = Alireza Savand +maintainer_email = alireza.savand@gmail.com +license = GNU GPL 3 +classifiers = + Development Status :: 5 - Production/Stable + Intended Audience :: Developers + License :: OSI Approved :: GNU General Public License (GPL) + Operating System :: OS Independent + Programming Language :: Python + Programming Language :: Python :: 3 + Programming Language :: Python :: 3.5 + Programming Language :: Python :: 3.6 + Programming Language :: Python :: 3.7 + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3 :: Only + Programming Language :: Python :: Implementation :: CPython + Programming Language :: Python :: Implementation :: PyPy +platform = OS Independent + +[options] +zip_safe = False +packages = html2text +python_requires = >=3.5 + +[options.entry_points] +console_scripts = + html2text = html2text.cli:main + +[options.package_data] +html2text = py.typed + [flake8] max_line_length = 88 ignore = diff --git a/setup.py b/setup.py index 8d8a49bb..60684932 100644 --- a/setup.py +++ b/setup.py @@ -1,42 +1,3 @@ from setuptools import setup - -def readall(f): - with open(f) as fp: - return fp.read() - - -setup( - name="html2text", - version=".".join(map(str, __import__("html2text").__version__)), - description="Turn HTML into equivalent Markdown-structured text.", - long_description=readall("README.md"), - long_description_content_type="text/markdown", - author="Aaron Swartz", - author_email="me@aaronsw.com", - maintainer="Alireza Savand", - maintainer_email="alireza.savand@gmail.com", - url="https://github.com/Alir3z4/html2text/", - platforms="OS Independent", - classifiers=[ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "License :: OSI Approved :: GNU General Public License (GPL)", - "Operating System 
:: OS Independent", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", - ], - python_requires=">=3.5", - entry_points={"console_scripts": ["html2text = html2text.cli:main"]}, - license="GNU GPL 3", - packages=["html2text"], - package_data={"html2text": ["py.typed"]}, - zip_safe=False, -) +setup() From bd982bd31bc6ab6b4f03b5ca56b426c50c975725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Freitag?= Date: Thu, 16 Jan 2020 14:03:01 +0100 Subject: [PATCH 453/479] Speed up skipwrap function Short-circuit evaluation when `wrap_links` is True (the default). Evaluating a boolean expression is much cheaper than searching a paragraph for a match. Any link present in the paragraph causes the wrapping to be skipped. Instead of searching for all matches, stop at the first match. 
--- html2text/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text/utils.py b/html2text/utils.py index e6256d1e..0c7e6c54 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -162,7 +162,7 @@ def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int: def skipwrap(para: str, wrap_links: bool, wrap_list_items: bool) -> bool: # If it appears to contain a link # don't wrap - if (len(config.RE_LINK.findall(para)) > 0) and not wrap_links: + if not wrap_links and config.RE_LINK.search(para): return True # If the text begins with four spaces or one tab, it's a code block; # don't wrap From 15b308d0da9a9fb9a63970d847bbfe7bc5be05b5 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Sun, 12 Jan 2020 10:39:22 -0800 Subject: [PATCH 454/479] Test the --unicode-snob CLI argument --- test/test_html2text.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_html2text.py b/test/test_html2text.py index 7bdd6797..39fa37e3 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -40,8 +40,7 @@ def generate_testdata(): if base_fn.find("unicode") >= 0: module_args["unicode_snob"] = True - # There is no command-line option to control unicode_snob. - cmdline_args = skip + cmdline_args.append("--unicode-snob") func_args = skip if base_fn.find("flip_emphasis") >= 0: From c06817644790d52a462159d7a5e4b4218308e248 Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Thu, 16 Jan 2020 06:14:40 -0800 Subject: [PATCH 455/479] Prepare for the next release --- ChangeLog.rst | 8 ++++++-- html2text/__init__.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 1ecf5673..85144358 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,8 +1,12 @@ -UNRELEASED -========== +2020.1.16 +========= ---- * Add type annotations. +* Add support for Python 3.8. +* Performance improvements when ``wrap_links`` is ``False`` (the default). +* Configure setuptools using setup.cfg. 
+ 2019.9.26 ========= diff --git a/html2text/__init__.py b/html2text/__init__.py index d8e41a1d..07a3dab1 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -26,7 +26,7 @@ unifiable_n, ) -__version__ = (2019, 9, 26) +__version__ = (2020, 1, 16) # TODO: From 058d6df53214e4455978b789a8608664356d1b7a Mon Sep 17 00:00:00 2001 From: Ludovic VAUGEOIS PEPIN Date: Tue, 3 Mar 2020 19:34:39 +0100 Subject: [PATCH 456/479] Make padded tables more similar to pandoc's pipe_tables I've been looking for a pure python replacement to pypandoc to convert HTML to markdown. I can't seem to find any configuration that would have pandoc and this package format tables in the same manner. This PR would make this project padded table look a lot like pandoc's pipe tables https://pandoc.org/MANUAL.html#extension-pipe_tables --- html2text/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/html2text/utils.py b/html2text/utils.py index 0c7e6c54..2051d23e 100644 --- a/html2text/utils.py +++ b/html2text/utils.py @@ -247,13 +247,14 @@ def reformat_table(lines: List[str], right_margin: int) -> List[str]: x.rstrip() + (filler * (M - len(x.rstrip()))) for x, M in zip(cols, max_width) ] + new_lines.append("|-" + "|".join(new_cols) + "|") else: filler = " " new_cols = [ x.rstrip() + (filler * (M - len(x.rstrip()))) for x, M in zip(cols, max_width) ] - new_lines.append("|".join(new_cols)) + new_lines.append("| " + "|".join(new_cols) + "|") return new_lines From 61df23569c00a1dd6f789ecea244cd666a93bb76 Mon Sep 17 00:00:00 2001 From: Ludovic VAUGEOIS PEPIN Date: Tue, 3 Mar 2020 19:53:28 +0100 Subject: [PATCH 457/479] Update pad_table.md --- test/pad_table.md | 56 +++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/test/pad_table.md b/test/pad_table.md index 167fc6ec..81e8fcb0 100644 --- a/test/pad_table.md +++ b/test/pad_table.md @@ -6,37 +6,37 @@ With some text, `code`, **bolds** and _italics_. 
Displaynone text -Header 1 | Header 2 | Header 3 ------------------|-----------|---------------------------------------------- -Content 1 | 2 | ![200](http://lorempixel.com/200/200) Image! -Content 1 longer | Content 2 | blah -Content | Content 2 | blah -t | Content 2 | blah blah blah - -H1 | H2 | H3 ------|-----------|----- -C1 | Content 2 | x -C123 | Content 2 | xyz +| Header 1 | Header 2 | Header 3 | +|------------------|-----------|----------------------------------------------| +| Content 1 | 2 | ![200](http://lorempixel.com/200/200) Image! | +| Content 1 longer | Content 2 | blah | +| Content | Content 2 | blah | +| t | Content 2 | blah blah blah | + +| H1 | H2 | H3 | +|------|-----------|-----| +| C1 | Content 2 | x | +| C123 | Content 2 | xyz | some content between the tables -Header 1 | Header 2 | Header 3 -----------|------------------|---------------------------------------------- -Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! -Content 1 | Content 2 longer | ![200](http://lorempixel.com/200/200) Image! +| Header 1 | Header 2 | Header 3 | +|-----------|------------------|----------------------------------------------| +| Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! | +| Content 1 | Content 2 longer | ![200](http://lorempixel.com/200/200) Image! 
| something else entirely -One | Two | Three -------|-----|------- -A | B | C -A | B+C -A+B | C -A+B+C - -One+Two | Three ---------|------- -A | B | C -A | B+C -A+B | C -A+B+C +| One | Two | Three | +|-------|-----|-------| +| A | B | C | +| A | B+C | +| A+B | C | +| A+B+C | + +| One+Two | Three | +|---------|-------| +| A | B | C | +| A | B+C | +| A+B | C | +| A+B+C | From 8c847bf654ba3e479e99dfd7ec68561ff979a97e Mon Sep 17 00:00:00 2001 From: Alireza Savand <591113+Alir3z4@users.noreply.github.com> Date: Wed, 4 Mar 2020 17:57:57 +0300 Subject: [PATCH 458/479] Update ChangeLog.rst --- ChangeLog.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ChangeLog.rst b/ChangeLog.rst index 85144358..a4246543 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,9 @@ +UNRELEASED +========== +---- + + + 2020.1.16 ========= ---- From 24aac8629f6a50ab262727815107384e0682cf42 Mon Sep 17 00:00:00 2001 From: Alireza Savand <591113+Alir3z4@users.noreply.github.com> Date: Wed, 4 Mar 2020 18:00:25 +0300 Subject: [PATCH 459/479] Update changelog with pull request #318 --- ChangeLog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index a4246543..61a16aeb 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -2,7 +2,7 @@ UNRELEASED ========== ---- - +* Feature #318: Make padded tables more similar to pandoc's pipe_tables. 2020.1.16 ========= ---- From f6ffd74b49d1662fb062cecb8072a64093519a6a Mon Sep 17 00:00:00 2001 From: Edward Ross Date: Tue, 4 Aug 2020 23:38:02 +1000 Subject: [PATCH 460/479] Tests for emphasis around whitespace Multiple kinds of tags should behave similarly whether they are preceded by whitespace or not. 
--- test/emphasis_whitespace.html | 31 +++++++++++++++++++++++++++++++ test/emphasis_whitespace.md | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 test/emphasis_whitespace.html create mode 100644 test/emphasis_whitespace.md diff --git a/test/emphasis_whitespace.html b/test/emphasis_whitespace.html new file mode 100644 index 00000000..e1a2d628 --- /dev/null +++ b/test/emphasis_whitespace.html @@ -0,0 +1,31 @@ +

      ib

      + +

      .ib

      + +

      bi

      + +

      .bi

      + +

      is

      + +

      .is

      + +

      si

      + +

      .si

      + +

      bs

      + +

      .bs

      + +

      sb

      + +

      .sb

      + +

      sbi

      + +

      .sbi

      + +

      bis

      + +

      .bis

      diff --git a/test/emphasis_whitespace.md b/test/emphasis_whitespace.md new file mode 100644 index 00000000..4b9b2e6b --- /dev/null +++ b/test/emphasis_whitespace.md @@ -0,0 +1,32 @@ +_**ib**_ + +. _**ib**_ + +**_bi_** + +. **_bi_** + +_~~is~~_ + +. _~~is~~_ + +_~~si~~_ + +. _~~si~~_ + +**~~bs~~** + +. **~~bs~~** + +~~**sb**~~ + +. ~~**sb**~~ + +~~**_sbi_**~~ + +. ~~**_sbi_**~~ + +**_~~bis~~_** + +. **_~~bis~~_** + From 0641b1508a3c548b280ecc180c924235fc994fc3 Mon Sep 17 00:00:00 2001 From: Edward Ross Date: Tue, 4 Aug 2020 23:39:48 +1000 Subject: [PATCH 461/479] Only insert one space when multiple emphasis Whitespace needs to be inserted when it's not preceding a type of emphasis. When there are multiple types of emphasis this should only be added once. --- html2text/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/html2text/__init__.py b/html2text/__init__.py index 07a3dab1..fc6b8e89 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -397,6 +397,7 @@ def no_preceding_space(self: HTML2Text) -> bool: if tag in ["em", "i", "u"] and not self.ignore_emphasis: if start and no_preceding_space(self): emphasis = " " + self.emphasis_mark + self.preceding_data = " " else: emphasis = self.emphasis_mark @@ -407,6 +408,7 @@ def no_preceding_space(self: HTML2Text) -> bool: if tag in ["strong", "b"] and not self.ignore_emphasis: if start and no_preceding_space(self): strong = " " + self.strong_mark + self.preceding_data = " " else: strong = self.strong_mark @@ -417,6 +419,7 @@ def no_preceding_space(self: HTML2Text) -> bool: if tag in ["del", "strike", "s"]: if start and no_preceding_space(self): strike = " ~~" + self.preceding_data = " " else: strike = "~~" From 2fea4b74b879d711d330b873d126c11e2e6f9ef0 Mon Sep 17 00:00:00 2001 From: Edward Ross Date: Tue, 4 Aug 2020 23:44:47 +1000 Subject: [PATCH 462/479] Update changelog.rst --- AUTHORS.rst | 1 + ChangeLog.rst | 1 + 2 files changed, 2 insertions(+) diff --git a/AUTHORS.rst b/AUTHORS.rst 
index 3ec70c5b..8d31daf7 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -31,6 +31,7 @@ The AUTHORS/Contributors are (and/or have been): * Jacek Kołodziej * Jonathan Vanasco * Jon Dufresne +* Edward Ross Maintainer: diff --git a/ChangeLog.rst b/ChangeLog.rst index 61a16aeb..f57edb39 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -2,6 +2,7 @@ UNRELEASED ========== ---- +* Fix #332: Insert at most one space for multiple emphasis * Feature #318: Make padded tables more similar to pandoc's pipe_tables. 2020.1.16 From c18484a08aae0317e6d338b999563c4e8e4053d4 Mon Sep 17 00:00:00 2001 From: Edward Ross Date: Tue, 4 Aug 2020 23:54:43 +1000 Subject: [PATCH 463/479] Append extra whitespace to preceding data when output. When an extra space is output for an emphasis tag then there is effectively an extra space in the preceding data. Append this space instead of clobbering the existing data. --- html2text/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index fc6b8e89..ea08153d 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -397,7 +397,7 @@ def no_preceding_space(self: HTML2Text) -> bool: if tag in ["em", "i", "u"] and not self.ignore_emphasis: if start and no_preceding_space(self): emphasis = " " + self.emphasis_mark - self.preceding_data = " " + self.preceding_data += " " else: emphasis = self.emphasis_mark @@ -408,7 +408,7 @@ def no_preceding_space(self: HTML2Text) -> bool: if tag in ["strong", "b"] and not self.ignore_emphasis: if start and no_preceding_space(self): strong = " " + self.strong_mark - self.preceding_data = " " + self.preceding_data += " " else: strong = self.strong_mark @@ -419,7 +419,7 @@ def no_preceding_space(self: HTML2Text) -> bool: if tag in ["del", "strike", "s"]: if start and no_preceding_space(self): strike = " ~~" - self.preceding_data = " " + self.preceding_data += " " else: strike = "~~" From d21a550758edefec577b043d95439fb813225314 Mon Sep 17 
00:00:00 2001 From: Edward Ross Date: Wed, 5 Aug 2020 00:06:07 +1000 Subject: [PATCH 464/479] Add path for isort in tox Having no path in isort was leading to errors when running tests. Add isort change in existing code. --- test/test_html2text.py | 3 ++- tox.ini | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_html2text.py b/test/test_html2text.py index 39fa37e3..f724fc8a 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -4,9 +4,10 @@ import subprocess import sys -import html2text import pytest +import html2text + skip = object() diff --git a/tox.ini b/tox.ini index fb4403cf..9f54b3a2 100644 --- a/tox.ini +++ b/tox.ini @@ -33,7 +33,7 @@ skip_install = true [testenv:isort] basepython = python3 commands = - isort --check-only --diff + isort --check-only --diff . deps = isort skip_install = true From f76d71d0f3c2ba463c5cd38c1a58998e302d09ff Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Mon, 31 Aug 2020 18:21:38 -0700 Subject: [PATCH 465/479] Simplify and update lint configurations - flake8 ignores W503 by default - Use isort profile to simplify configuration - Update isort CLI invocation for 5.* series --- setup.cfg | 8 ++------ test/test_html2text.py | 3 ++- tox.ini | 4 ++-- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/setup.cfg b/setup.cfg index 3efd0479..22875af7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -40,15 +40,11 @@ html2text = py.typed [flake8] max_line_length = 88 -ignore = - E203 - W503 +extend-ignore = E203 [isort] combine_as_imports = True -include_trailing_comma = True -line_length = 88 -multi_line_output = 3 +profile = black [mypy] python_version = 3.5 diff --git a/test/test_html2text.py b/test/test_html2text.py index 39fa37e3..f724fc8a 100644 --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -4,9 +4,10 @@ import subprocess import sys -import html2text import pytest +import html2text + skip = object() diff --git a/tox.ini b/tox.ini index fb4403cf..9737bbc8 100644 --- 
a/tox.ini +++ b/tox.ini @@ -33,9 +33,9 @@ skip_install = true [testenv:isort] basepython = python3 commands = - isort --check-only --diff + isort --check --diff . deps = - isort + isort >= 5.0.1 skip_install = true [testenv:mypy] From c09d1c132a888846f38bae5d46b3d0cc87192cc4 Mon Sep 17 00:00:00 2001 From: mborsetti Date: Sun, 1 Nov 2020 17:00:37 +0800 Subject: [PATCH 466/479] Don't add line breaks inside link names --- AUTHORS.rst | 1 + ChangeLog.rst | 1 + html2text/__init__.py | 3 ++- test/empty-link.md | 4 +--- test/link_titles.html | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 3ec70c5b..d4889ab7 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -31,6 +31,7 @@ The AUTHORS/Contributors are (and/or have been): * Jacek Kołodziej * Jonathan Vanasco * Jon Dufresne +* Mike Borsetti Maintainer: diff --git a/ChangeLog.rst b/ChangeLog.rst index 61a16aeb..f3644c1e 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -3,6 +3,7 @@ UNRELEASED ---- * Feature #318: Make padded tables more similar to pandoc's pipe_tables. +* Fix extra line breaks inside html link text (between '[' and ']') 2020.1.16 ========= diff --git a/html2text/__init__.py b/html2text/__init__.py index 07a3dab1..ec9b61a3 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -348,7 +348,7 @@ def handle_tag( self.p() else: self.soft_br() - elif self.astack and tag == "div": + elif self.astack: pass else: self.p() @@ -484,6 +484,7 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None: self.empty_link = False self.maybe_automatic_link = None if self.inline_links: + self.p_p = 0 title = a.get("title") or "" title = escape_md(title) link_url(self, a["href"], title) diff --git a/test/empty-link.md b/test/empty-link.md index 7a38588f..77bada00 100644 --- a/test/empty-link.md +++ b/test/empty-link.md @@ -2,7 +2,5 @@ This test checks whether empty hyperlinks still appear in the markdown result. 
-[](http://some.link) - -[](http://some.link) +[](http://some.link) [](http://some.link) diff --git a/test/link_titles.html b/test/link_titles.html index ceba0d52..17b9f538 100644 --- a/test/link_titles.html +++ b/test/link_titles.html @@ -1,3 +1,3 @@ first example
      - second example +

      second example

      From bad60517c46b4e1794a933f3a2a01a29e38e2cdd Mon Sep 17 00:00:00 2001 From: mborsetti Date: Sun, 1 Nov 2020 17:16:44 +0800 Subject: [PATCH 467/479] Support Python 3.9 --- .travis.yml | 2 ++ ChangeLog.rst | 1 + setup.cfg | 1 + 3 files changed, 4 insertions(+) diff --git a/.travis.yml b/.travis.yml index 10b062eb..26c3b4c4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,6 +19,8 @@ matrix: env: TOXENV=py37 - python: 3.8 env: TOXENV=py38 + - python: 3.9 + env: TOXENV=py39 - python: "pypy3" env: TOXENV=pypy3 diff --git a/ChangeLog.rst b/ChangeLog.rst index 61a16aeb..1a1bfb52 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -3,6 +3,7 @@ UNRELEASED ---- * Feature #318: Make padded tables more similar to pandoc's pipe_tables. +* Add support for Python 3.9. 2020.1.16 ========= diff --git a/setup.cfg b/setup.cfg index 22875af7..6ba62eb7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -21,6 +21,7 @@ classifiers = Programming Language :: Python :: 3.6 Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 Programming Language :: Python :: 3 :: Only Programming Language :: Python :: Implementation :: CPython Programming Language :: Python :: Implementation :: PyPy From 8bbd2fcba431d26f2984bd8dc235aee6364a5459 Mon Sep 17 00:00:00 2001 From: mborsetti Date: Sun, 15 Nov 2020 10:04:12 +0800 Subject: [PATCH 468/479] Capture instance of tags inside links (found in the wild) --- html2text/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index ec9b61a3..4abee6d7 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -334,7 +334,8 @@ def handle_tag( parent_style = self.tag_stack[-1][2] if hn(tag): - self.p() + if self.outtextlist[-1] != '[': + self.p() if start: self.inheader = True self.o(hn(tag) * "#" + " ") From 27db55f86fc0823e8c3e4f477a0495729d57eefe Mon Sep 17 00:00:00 2001 From: mborsetti Date: Sun, 15 Nov 2020 11:19:01 +0800 
Subject: [PATCH 469/479] Correctly handle tags inside href links (as found in the wild) --- html2text/__init__.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 4abee6d7..533af898 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -334,14 +334,28 @@ def handle_tag( parent_style = self.tag_stack[-1][2] if hn(tag): - if self.outtextlist[-1] != '[': - self.p() - if start: - self.inheader = True - self.o(hn(tag) * "#" + " ") + # check if nh is inside of an 'a' tag (incorrect but found in the wild) + if self.astack: + if start: + self.inheader = True + # are inside link name, so only add '#' if it can appear before '[' + if self.outtextlist and self.outtextlist[-1] == '[': + self.outtextlist.pop() + self.space = False + self.o(hn(tag) * "#" + " ") + self.o('[') + else: + self.p_p = 0 # don't break up link name + self.inheader = False + return # prevent redundant emphasis marks on headers else: - self.inheader = False - return # prevent redundant emphasis marks on headers + self.p() + if start: + self.inheader = True + self.o(hn(tag) * "#" + " ") + else: + self.inheader = False + return # prevent redundant emphasis marks on headers if tag in ["p", "div"]: if self.google_doc: From 37504a0565206ff9d5715c81df040a3de25bc69b Mon Sep 17 00:00:00 2001 From: mborsetti Date: Sun, 15 Nov 2020 12:08:03 +0800 Subject: [PATCH 470/479] Replace '' with "" to satisfy black in Travis CI --- html2text/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/html2text/__init__.py b/html2text/__init__.py index 533af898..7afe4c07 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -339,11 +339,11 @@ def handle_tag( if start: self.inheader = True # are inside link name, so only add '#' if it can appear before '[' - if self.outtextlist and self.outtextlist[-1] == '[': + if self.outtextlist and self.outtextlist[-1] == "[": 
self.outtextlist.pop() self.space = False self.o(hn(tag) * "#" + " ") - self.o('[') + self.o("[") else: self.p_p = 0 # don't break up link name self.inheader = False From 8c8f5ecba701b340d9979bba29b70213e698408f Mon Sep 17 00:00:00 2001 From: Ryan Barrett Date: Sun, 6 Dec 2020 08:44:46 -0800 Subject: [PATCH 471/479] indent