Source distribution:
-
-http://pypi.python.org/pypi/pdfminer/
+
+http://pypi.python.org/pypi/pdfminer_six/
github:
-
-https://github.com/euske/pdfminer/
+
+https://github.com/goulu/pdfminer/
setup.py to install:
-C
@@ -373,82 +370,10 @@ PDFDocument.initialize() method is removed and no longer needed.
- A password is given as an argument of a PDFDocument constructor.
- PDFDocument class is moved to pdfdocument.py.
- PDFDocument class now takes a PDFParser object as an argument.
- PDFDocument.set_parser() and PDFParser.set_document() is removed.
- PDFPage class is moved to pdfpage.py.
- process_pdf function is implemented as PDFPage.get_pages.
-LTText.get_text() is added.
-Sorry, an error has occurred: %s' % q(repr(e)))
self.logger.error('convert: %r: path=%r: %s' % (e, traceback.format_exc()))
finally:
diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
index 5eb24bfd..3111c5c8 100755
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@@ -1,116 +1,127 @@
#!/usr/bin/env python
+
+"""
+Converts PDF text content (though not images containing text) to plain text, html, xml or "tags".
+"""
import sys
-from pdfminer.pdfdocument import PDFDocument
-from pdfminer.pdfparser import PDFParser
-from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
-from pdfminer.pdfdevice import PDFDevice, TagExtractor
-from pdfminer.pdfpage import PDFPage
-from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
-from pdfminer.cmapdb import CMapDB
-from pdfminer.layout import LAParams
+import logging
+import six
+import pdfminer.settings
+pdfminer.settings.STRICT = False
+import pdfminer.high_level
+import pdfminer.layout
from pdfminer.image import ImageWriter
-# main
-def main(argv):
- import getopt
- def usage():
- print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
- ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
- ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]'
- ' [-t text|html|xml|tag] [-c codec] [-s scale]'
- ' file ...' % argv[0])
- return 100
- try:
- (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:')
- except getopt.GetoptError:
- return usage()
- if not args: return usage()
- # debug option
- debug = 0
- # input option
- password = ''
- pagenos = set()
- maxpages = 0
- # output option
- outfile = None
- outtype = None
+
+def extract_text(files=[], outfile='-',
+ _py2_no_more_posargs=None, # Bloody Python2 needs a shim
+ no_laparams=False, all_texts=None, detect_vertical=None, # LAParams
+ word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams
+ output_type='text', codec='utf-8', strip_control=False,
+ maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
+ layoutmode='normal', output_dir=None, debug=False,
+ disable_caching=False, **other):
+ if _py2_no_more_posargs is not None:
+ raise ValueError("Too many positional arguments passed.")
+ if not files:
+ raise ValueError("Must provide files to work upon!")
+
+ # Unless no_laparams is set, create an LAParams object and override its
+ # defaults with any LAParams arguments given. Otherwise, set laparams to None.
+ if not no_laparams:
+ laparams = pdfminer.layout.LAParams()
+ for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
+ paramv = locals().get(param, None)
+ if paramv is not None:
+ setattr(laparams, param, paramv)
+ else:
+ laparams = None
+
imagewriter = None
- rotation = 0
- stripcontrol = False
- layoutmode = 'normal'
- codec = 'utf-8'
- pageno = 1
- scale = 1
- caching = True
- showpageno = True
- laparams = LAParams()
- for (k, v) in opts:
- if k == '-d': debug += 1
- elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
- elif k == '-m': maxpages = int(v)
- elif k == '-P': password = v
- elif k == '-o': outfile = v
- elif k == '-C': caching = False
- elif k == '-n': laparams = None
- elif k == '-A': laparams.all_texts = True
- elif k == '-V': laparams.detect_vertical = True
- elif k == '-M': laparams.char_margin = float(v)
- elif k == '-L': laparams.line_margin = float(v)
- elif k == '-W': laparams.word_margin = float(v)
- elif k == '-F': laparams.boxes_flow = float(v)
- elif k == '-Y': layoutmode = v
- elif k == '-O': imagewriter = ImageWriter(v)
- elif k == '-R': rotation = int(v)
- elif k == '-S': stripcontrol = True
- elif k == '-t': outtype = v
- elif k == '-c': codec = v
- elif k == '-s': scale = float(v)
- #
- PDFDocument.debug = debug
- PDFParser.debug = debug
- CMapDB.debug = debug
- PDFPageInterpreter.debug = debug
- #
- rsrcmgr = PDFResourceManager(caching=caching)
- if not outtype:
- outtype = 'text'
- if outfile:
- if outfile.endswith('.htm') or outfile.endswith('.html'):
- outtype = 'html'
- elif outfile.endswith('.xml'):
- outtype = 'xml'
- elif outfile.endswith('.tag'):
- outtype = 'tag'
- if outfile:
- outfp = file(outfile, 'w')
+ if output_dir:
+ imagewriter = ImageWriter(output_dir)
+
+ if output_type == "text" and outfile != "-":
+ for override, alttype in ( (".htm", "html"),
+ (".html", "html"),
+ (".xml", "xml"),
+ (".tag", "tag") ):
+ if outfile.endswith(override):
+ output_type = alttype
+
+ if outfile == "-":
+ outfp = sys.stdout
+ if outfp.encoding is not None:
+ codec = 'utf-8'
else:
+ outfp = open(outfile, "wb")
+
+
+ for fname in files:
+ with open(fname, "rb") as fp:
+ pdfminer.high_level.extract_text_to_fp(fp, **locals())
+ return outfp
+
+# main
+def main(args=None):
+ import argparse
+ P = argparse.ArgumentParser(description=__doc__)
+ P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.")
+ P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
+ P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.")
+ P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
+ P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
+ P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF")
+ P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
+ P.add_argument("-t", "--output_type", type=str, default="text", help = "Output type: text|html|xml|tag (default is text)")
+ P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding")
+ P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale")
+ P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
+ P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
+ P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin")
+ P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin")
+ P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin")
+ P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow")
+ P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
+ P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams")
+ P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation")
+ P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
+ P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
+ P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
+ A = P.parse_args(args=args)
+
+ if A.page_numbers:
+ A.page_numbers = set([x-1 for x in A.page_numbers])
+ if A.pagenos:
+ A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])
+
+ imagewriter = None
+ if A.output_dir:
+ imagewriter = ImageWriter(A.output_dir)
+
+ if six.PY2 and sys.stdin.encoding:
+ A.password = A.password.decode(sys.stdin.encoding)
+
+ if A.output_type == "text" and A.outfile != "-":
+ for override, alttype in ( (".htm", "html"),
+ (".html", "html"),
+ (".xml", "xml" ),
+ (".tag", "tag" ) ):
+ if A.outfile.endswith(override):
+ A.output_type = alttype
+
+ if A.outfile == "-":
outfp = sys.stdout
- if outtype == 'text':
- device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
- imagewriter=imagewriter)
- elif outtype == 'xml':
- device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
- imagewriter=imagewriter,
- stripcontrol=stripcontrol)
- elif outtype == 'html':
- device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
- layoutmode=layoutmode, laparams=laparams,
- imagewriter=imagewriter, debug=debug)
- elif outtype == 'tag':
- device = TagExtractor(rsrcmgr, outfp, codec=codec)
+ if outfp.encoding is not None:
+ # NOTE(review): outfp.encoding is ignored here and utf-8 is forced; the reason is unclear - confirm.
+ A.codec = 'utf-8'
else:
- return usage()
- for fname in args:
- fp = file(fname, 'rb')
- interpreter = PDFPageInterpreter(rsrcmgr, device)
- for page in PDFPage.get_pages(fp, pagenos,
- maxpages=maxpages, password=password,
- caching=caching, check_extractable=True):
- page.rotate = (page.rotate+rotation) % 360
- interpreter.process_page(page)
- fp.close()
- device.close()
+ outfp = open(A.outfile, "wb")
+
+ ## Test Code
+ outfp = extract_text(**vars(A))
outfp.close()
- return
+ return 0
+
-if __name__ == '__main__': sys.exit(main(sys.argv))
+if __name__ == '__main__': sys.exit(main())
diff --git a/tools/pdf2txt.spec b/tools/pdf2txt.spec
new file mode 100644
index 00000000..8baeb77f
--- /dev/null
+++ b/tools/pdf2txt.spec
@@ -0,0 +1,30 @@
+# -*- mode: python -*-
+
+block_cipher = None
+
+
+a = Analysis(['pdf2txt.py'],
+ pathex=['C:\\Dev\\Python\\pdfminer.six\\tools'],
+ binaries=[],
+ datas=[],
+ hiddenimports=[],
+ hookspath=[],
+ runtime_hooks=[],
+ excludes=['django','matplotlib','PIL','numpy','qt5'],
+ win_no_prefer_redirects=False,
+ win_private_assemblies=False,
+ cipher=block_cipher)
+
+pyz = PYZ(a.pure, a.zipped_data,
+ cipher=block_cipher)
+exe = EXE(pyz,
+ a.scripts,
+ a.binaries,
+ a.zipfiles,
+ a.datas,
+ name='pdf2txt',
+ debug=False,
+ strip=False,
+ upx=True,
+ runtime_tmpdir=None,
+ console=True )
diff --git a/tools/pdfdiff.py b/tools/pdfdiff.py
new file mode 100644
index 00000000..b01e2f4e
--- /dev/null
+++ b/tools/pdfdiff.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+
+"""
+Compares two PDF files.
+"""
+import sys
+import logging
+import six
+import pdfminer.settings
+pdfminer.settings.STRICT = False
+import pdfminer.high_level
+import pdfminer.layout
+
+def compare(file1,file2,**args):
+ if args.get('_py2_no_more_posargs',None) is not None:
+ raise ValueError("Too many positional arguments passed.")
+
+
+ # If no 'laparams' argument was supplied, create an LAParams object, populate
+ # it with any LAParams arguments given, and store it in args['laparams'].
+ if args.get('laparams',None) is None:
+ laparams = pdfminer.layout.LAParams()
+ for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
+ paramv = args.get(param, None)
+ if paramv is not None:
+ laparams[param]=paramv
+ args['laparams']=laparams
+
+ s1=six.StringIO()
+ with open(file1, "rb") as fp:
+ pdfminer.high_level.extract_text_to_fp(fp,s1, **args)
+
+ s2=six.StringIO()
+ with open(file2, "rb") as fp:
+ pdfminer.high_level.extract_text_to_fp(fp,s2, **args)
+
+ import difflib
+ s1.seek(0)
+ s2.seek(0)
+ s1,s2=s1.readlines(), s2.readlines()
+
+ import os.path
+ try:
+ extension = os.path.splitext(args['outfile'])[1][1:4]
+ if extension.lower()=='htm':
+ return difflib.HtmlDiff().make_file(s1,s2)
+ except KeyError:
+ pass
+ return difflib.unified_diff(s1,s2,n=args['context_lines'])
+
+
+# main
+def main(args=None):
+ import argparse
+ P = argparse.ArgumentParser(description=__doc__)
+ P.add_argument("file1", type=str, default=None, help="File 1 to compare.")
+ P.add_argument("file2", type=str, default=None, help="File 2 to compare.")
+ P.add_argument("-o", "--outfile", type=str, default="-",
+ help="Output file (default/'-' is stdout) \
+ if .htm or .html, create an HTML table (or a complete HTML file containing the table) \
+ showing a side by side, line by line comparison of text with inter-line \
+ and intra-line change highlights. \
+ The table can be generated in either full or contextual difference mode."
+ )
+ P.add_argument("-N", "--context-lines", default=3, type=int, help = "context lines shown")
+ P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
+
+ # params for pdf2txt
+ P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.")
+ P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
+ P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
+ P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for both PDFs")
+ P.add_argument("-t", "--output_type", type=str, default="text", help = "pdf2txt type: text|html|xml|tag (default is text)")
+ P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding")
+ P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale")
+ P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
+ P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
+ P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin")
+ P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin")
+ P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin")
+ P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow")
+ P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
+ P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams")
+ P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation")
+ P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
+ P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
+ P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
+
+
+ A = P.parse_args(args=args)
+
+ if A.page_numbers:
+ A.page_numbers = set([x-1 for x in A.page_numbers])
+ if A.pagenos:
+ A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])
+
+ if six.PY2 and sys.stdin.encoding:
+ A.password = A.password.decode(sys.stdin.encoding)
+
+ if A.output_type == "text" and A.outfile != "-":
+ for override, alttype in ( (".htm", "html"),
+ (".html", "html"),
+ (".xml", "xml" ),
+ (".tag", "tag" ) ):
+ if A.outfile.endswith(override):
+ A.output_type = alttype
+
+ if A.outfile == "-":
+ outfp = sys.stdout
+ else:
+ outfp = open(A.outfile, "w", encoding='utf-8')
+ outfp.writelines(compare(**vars(A)))
+ outfp.close()
+ return 0
+
+
+if __name__ == '__main__': sys.exit(main())
diff --git a/tools/pdfdiff.spec b/tools/pdfdiff.spec
new file mode 100644
index 00000000..e90a37f5
--- /dev/null
+++ b/tools/pdfdiff.spec
@@ -0,0 +1,29 @@
+# -*- mode: python -*-
+
+block_cipher = None
+
+
+a = Analysis(['pdfdiff.py'],
+ pathex=['C:\\Dev\\Python\\pdfminer.six\\tools'],
+ binaries=[],
+ datas=[],
+ hiddenimports=[],
+ hookspath=[],
+ runtime_hooks=[],
+ excludes=['django','matplotlib','PIL','numpy','qt5'],
+ win_no_prefer_redirects=False,
+ win_private_assemblies=False,
+ cipher=block_cipher)
+pyz = PYZ(a.pure, a.zipped_data,
+ cipher=block_cipher)
+exe = EXE(pyz,
+ a.scripts,
+ a.binaries,
+ a.zipfiles,
+ a.datas,
+ name='pdfdiff',
+ debug=False,
+ strip=False,
+ upx=True,
+ runtime_tmpdir=None,
+ console=True )
diff --git a/tools/pdfstats.py b/tools/pdfstats.py
new file mode 100755
index 00000000..f3ecbbe7
--- /dev/null
+++ b/tools/pdfstats.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+
+# Exercise pdfminer by looking deeply into a PDF document and printing some stats to stdout.
+# Usage: pdfstats.py