From cd277ed493fbd815a7349f4ac70acae11cf88bd0 Mon Sep 17 00:00:00 2001
From: Peter Brittain
Date: Mon, 31 Oct 2016 21:48:19 +0000
Subject: [PATCH] Make parser more resilient to bugged files

---
 pdfminer/pdfparser.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py
index 61eb1dc8..1c5d3530 100644
--- a/pdfminer/pdfparser.py
+++ b/pdfminer/pdfparser.py
@@ -52,6 +52,7 @@ def set_document(self, doc):
 
     KEYWORD_R = KWD(b'R')
     KEYWORD_NULL = KWD(b'null')
+    KEYWORD_OBJ = KWD(b'obj')
     KEYWORD_ENDOBJ = KWD(b'endobj')
     KEYWORD_STREAM = KWD(b'stream')
     KEYWORD_XREF = KWD(b'xref')
@@ -59,10 +60,17 @@ def set_document(self, doc):
 
     def do_keyword(self, pos, token):
         """Handles PDF-related keywords."""
-
         if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
             self.add_results(*self.pop(1))
 
+        elif token is self.KEYWORD_OBJ:
+            # Handle implicit endobj in some bugged PDFs
+            if len(self.curstack) > 2:
+                stack = self.popall()
+                self.add_results(*stack[0:-2])
+                self.push(*stack[-2:])
+            self.push((pos, token))
+
         elif token is self.KEYWORD_ENDOBJ:
             self.add_results(*self.pop(4))
 