From 008f189ad2b88cb733121e9809815a69eed4320e Mon Sep 17 00:00:00 2001 From: Florent Heyworth Date: Sun, 5 Jun 2022 17:20:05 +0200 Subject: [PATCH 1/6] fix reading past 'endstream' token when parsing use bytes.Buffer instead of string concatenation in readToken closes #53 --- reader.go | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/reader.go b/reader.go index 6d70d4e..bda5923 100644 --- a/reader.go +++ b/reader.go @@ -125,8 +125,8 @@ func (this *PdfReader) skipComments(r *bufio.Reader) error { return nil } -// Advance reader so that whitespace is ignored -func (this *PdfReader) skipWhitespace(r *bufio.Reader) error { +// Advance reader so that skipBytes are ignored +func (this *PdfReader) skip(r *bufio.Reader, skipBytes []byte) error { var err error var b byte @@ -139,7 +139,7 @@ func (this *PdfReader) skipWhitespace(r *bufio.Reader) error { return errors.Wrap(err, "Failed to read byte") } - if b == ' ' || b == '\n' || b == '\r' || b == '\t' { + if bytes.IndexByte(skipBytes, b) > -1 { continue } else { r.UnreadByte() @@ -154,6 +154,7 @@ func (this *PdfReader) skipWhitespace(r *bufio.Reader) error { func (this *PdfReader) readToken(r *bufio.Reader) (string, error) { var err error + var buffer bytes.Buffer // If there is a token available on the stack, pop it out and return it. if len(this.stack) > 0 { var popped string @@ -161,7 +162,8 @@ func (this *PdfReader) readToken(r *bufio.Reader) (string, error) { return popped, nil } - err = this.skipWhitespace(r) + whitespace := getWhitespaceBytes() + err = this.skip(r, whitespace) if err != nil { return "", errors.Wrap(err, "Failed to skip whitespace") } @@ -201,9 +203,7 @@ func (this *PdfReader) readToken(r *bufio.Reader) (string, error) { return this.readToken(r) default: - // FIXME this may not be performant to create new strings for each byte - // Is it probably better to create a buffer and then convert to a string at the end. - str := string(b) + buffer.WriteByte(b) loop: for { @@ -216,10 +216,10 @@ func (this *PdfReader) readToken(r *bufio.Reader) (string, error) { r.UnreadByte() break loop default: - str += string(b) + buffer.WriteByte(b) } } - return str, nil + return buffer.String(), nil } return "", nil @@ -668,7 +668,8 @@ func (this *PdfReader) resolveObject(objSpec *PdfValue) (*PdfValue, error) { if token == "stream" { result.Type = PDF_TYPE_STREAM - err = this.skipWhitespace(r) + // we just want to skip until after first CRLF + err = this.skip(r, getWhitespaceBytes()) if err != nil { return nil, errors.Wrap(err, "Failed to skip whitespace") } @@ -808,6 +809,7 @@ func (this *PdfReader) findXref() error { func (this *PdfReader) readXref() error { var err error + whitespace := getWhitespaceBytes() // Create new bufio.Reader r := bufio.NewReader(this.f) @@ -908,7 +910,7 @@ func (this *PdfReader) readXref() error { startObject := index[0] - err = this.skipWhitespace(r) + err = this.skip(r, whitespace) if err != nil { return errors.Wrap(err, "Failed to skip whitespace") } @@ -939,7 +941,7 @@ func (this *PdfReader) readXref() error { return errors.New("Expected next token to be: stream, got: " + t) } - err = this.skipWhitespace(r) + err = this.skip(r, whitespace) if err != nil { return errors.Wrap(err, "Failed to skip whitespace") } @@ -1630,3 +1632,11 @@ func (this *PdfReader) read() error { return nil } + +func getWhitespaceBytes() []byte { + return []byte{0x20, 0x0A, 0x0C, 0x0D, 0x09, 0x00} +} + +func getNewLineBytes() []byte { + return []byte{'\r', '\n'} +} From 7325470319b6165a44464672cbfb42984b2692f8 Mon Sep 17 00:00:00 2001 From: Florent Heyworth Date: Sun, 5 Jun 2022 17:25:05 +0200 Subject: [PATCH 2/6] skip up to first CRLF when parsing pdf stream --- reader.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reader.go b/reader.go index bda5923..d4234b8 100644 --- a/reader.go +++ b/reader.go @@ -669,7 +669,7 @@ func (this *PdfReader) resolveObject(objSpec *PdfValue) (*PdfValue, error) { result.Type = PDF_TYPE_STREAM // we just want to skip until after first CRLF - err = this.skip(r, getWhitespaceBytes()) + err = this.skip(r, getNewLineBytes()) if err != nil { return nil, errors.Wrap(err, "Failed to skip whitespace") } From e04c52743d901b1dd8ef2eb2bb5d5f682adce5ab Mon Sep 17 00:00:00 2001 From: Florent Heyworth Date: Sun, 5 Jun 2022 18:31:24 +0200 Subject: [PATCH 3/6] simplify code --- reader.go | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/reader.go b/reader.go index d4234b8..1a83425 100644 --- a/reader.go +++ b/reader.go @@ -125,8 +125,8 @@ func (this *PdfReader) skipComments(r *bufio.Reader) error { return nil } -// Advance reader so that skipBytes are ignored -func (this *PdfReader) skip(r *bufio.Reader, skipBytes []byte) error { +// Advance reader so that whitespace is ignored +func (this *PdfReader) skipWhitespace(r *bufio.Reader) error { var err error var b byte @@ -139,7 +139,7 @@ func (this *PdfReader) skip(r *bufio.Reader, skipBytes []byte) error { return errors.Wrap(err, "Failed to read byte") } - if bytes.IndexByte(skipBytes, b) > -1 { + if b == 0x20 || b == 0x0A || b == 0x0C || b == 0x0D || b == 0x09 || b == 0x00 { continue } else { r.UnreadByte() @@ -162,8 +162,7 @@ func (this *PdfReader) readToken(r *bufio.Reader) (string, error) { return popped, nil } - whitespace := getWhitespaceBytes() - err = this.skip(r, whitespace) + err = this.skipWhitespace(r) if err != nil { return "", errors.Wrap(err, "Failed to skip whitespace") } @@ -668,10 +667,13 @@ func (this *PdfReader) resolveObject(objSpec *PdfValue) (*PdfValue, error) { if token == "stream" { result.Type = PDF_TYPE_STREAM - // we just want to skip until after first CRLF - err = this.skip(r, getNewLineBytes()) + var leadingBytes = make([]byte, 2) + _, err = r.Read(leadingBytes) if err != nil { - return nil, errors.Wrap(err, "Failed to skip whitespace") + return nil, errors.Wrap(err, "Failed to skip leading CRLF") + } + if leadingBytes[0] != '\r' || leadingBytes[1] != '\n' { + return nil, errors.Wrap(err, "Missing leading CRLF at stream start") } // Get stream length dictionary @@ -809,7 +811,6 @@ func (this *PdfReader) findXref() error { func (this *PdfReader) readXref() error { var err error - whitespace := getWhitespaceBytes() // Create new bufio.Reader r := bufio.NewReader(this.f) @@ -910,7 +911,7 @@ func (this *PdfReader) readXref() error { startObject := index[0] - err = this.skip(r, whitespace) + err = this.skipWhitespace(r) if err != nil { return errors.Wrap(err, "Failed to skip whitespace") } @@ -941,7 +942,7 @@ func (this *PdfReader) readXref() error { return errors.New("Expected next token to be: stream, got: " + t) } - err = this.skip(r, whitespace) + err = this.skipWhitespace(r) if err != nil { return errors.Wrap(err, "Failed to skip whitespace") } @@ -1632,11 +1633,3 @@ func (this *PdfReader) read() error { return nil } - -func getWhitespaceBytes() []byte { - return []byte{0x20, 0x0A, 0x0C, 0x0D, 0x09, 0x00} -} - -func getNewLineBytes() []byte { - return []byte{'\r', '\n'} -} From b9b893141fe4c255aff65e661267855f08fcb2e2 Mon Sep 17 00:00:00 2001 From: Florent Heyworth Date: Sun, 5 Jun 2022 19:07:46 +0200 Subject: [PATCH 4/6] avoid skipping non-leading skip bytes --- reader.go | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/reader.go b/reader.go index 1a83425..ddd63f1 100644 --- a/reader.go +++ b/reader.go @@ -125,8 +125,8 @@ func (this *PdfReader) skipComments(r *bufio.Reader) error { return nil } -// Advance reader so that whitespace is ignored -func (this *PdfReader) skipWhitespace(r *bufio.Reader) error { +// Advance reader so that skipBytes are ignored +func (this *PdfReader) skip(r *bufio.Reader, skipBytes []byte) error { var err error var b byte @@ -139,12 +139,20 @@ func (this *PdfReader) skipWhitespace(r *bufio.Reader) error { return errors.Wrap(err, "Failed to read byte") } - if b == 0x20 || b == 0x0A || b == 0x0C || b == 0x0D || b == 0x09 || b == 0x00 { + skipFound := false + for _, skipByte := range skipBytes { + if skipByte == b { + skipFound = true + break + } + } + + if skipFound { continue - } else { - r.UnreadByte() - break } + + r.UnreadByte() + break } return nil @@ -162,7 +170,8 @@ func (this *PdfReader) readToken(r *bufio.Reader) (string, error) { return popped, nil } - err = this.skipWhitespace(r) + whitespace := getWhitespaceBytes() + err = this.skip(r, whitespace) if err != nil { return "", errors.Wrap(err, "Failed to skip whitespace") } @@ -667,13 +676,10 @@ func (this *PdfReader) resolveObject(objSpec *PdfValue) (*PdfValue, error) { if token == "stream" { result.Type = PDF_TYPE_STREAM - var leadingBytes = make([]byte, 2) - _, err = r.Read(leadingBytes) + // we just want to skip until after first CRLF + err = this.skip(r, getNewLineBytes()) if err != nil { - return nil, errors.Wrap(err, "Failed to skip leading CRLF") - } - if leadingBytes[0] != '\r' || leadingBytes[1] != '\n' { - return nil, errors.Wrap(err, "Missing leading CRLF at stream start") + return nil, errors.Wrap(err, "Failed to skip whitespace") } // Get stream length dictionary @@ -811,6 +817,7 @@ func (this *PdfReader) findXref() error { func (this *PdfReader) readXref() error { var err error + whitespace := getWhitespaceBytes() // Create new bufio.Reader r := bufio.NewReader(this.f) @@ -911,7 +918,7 @@ func (this *PdfReader) readXref() error { startObject := index[0] - err = this.skipWhitespace(r) + err = this.skip(r, whitespace) if err != nil { return errors.Wrap(err, "Failed to skip whitespace") } @@ -942,7 +949,7 @@ func (this *PdfReader) readXref() error { return errors.New("Expected next token to be: stream, got: " + t) } - err = this.skipWhitespace(r) + err = this.skip(r, whitespace) if err != nil { return errors.Wrap(err, "Failed to skip whitespace") } @@ -1633,3 +1640,11 @@ func (this *PdfReader) read() error { return nil } + +func getWhitespaceBytes() []byte { + return []byte{0x20, 0x0A, 0x0C, 0x0D, 0x09, 0x00} +} + +func getNewLineBytes() []byte { + return []byte{'\r', '\n'} +} From 52677f0152545243835785652ff5c661dee99d60 Mon Sep 17 00:00:00 2001 From: Florent Heyworth Date: Wed, 15 Jun 2022 07:37:50 +0200 Subject: [PATCH 5/6] use more idiomatic function names --- reader.go | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/reader.go b/reader.go index ddd63f1..efedbc5 100644 --- a/reader.go +++ b/reader.go @@ -170,8 +170,7 @@ func (this *PdfReader) readToken(r *bufio.Reader) (string, error) { return popped, nil } - whitespace := getWhitespaceBytes() - err = this.skip(r, whitespace) + err = this.skip(r, whitespaceBytes()) if err != nil { return "", errors.Wrap(err, "Failed to skip whitespace") } @@ -677,7 +676,7 @@ func (this *PdfReader) resolveObject(objSpec *PdfValue) (*PdfValue, error) { result.Type = PDF_TYPE_STREAM // we just want to skip until after first CRLF - err = this.skip(r, getNewLineBytes()) + err = this.skip(r, newLineBytes()) if err != nil { return nil, errors.Wrap(err, "Failed to skip whitespace") } @@ -817,7 +816,7 @@ func (this *PdfReader) findXref() error { func (this *PdfReader) readXref() error { var err error - whitespace := getWhitespaceBytes() + whitespace := whitespaceBytes() // Create new bufio.Reader r := bufio.NewReader(this.f) @@ -1641,10 +1640,10 @@ func (this *PdfReader) read() error { return nil } -func getWhitespaceBytes() []byte { +func whitespaceBytes() []byte { return []byte{0x20, 0x0A, 0x0C, 0x0D, 0x09, 0x00} } -func getNewLineBytes() []byte { +func newLineBytes() []byte { return []byte{'\r', '\n'} } From 71a07e1b98469a237a0a27a6137f816a76d347a7 Mon Sep 17 00:00:00 2001 From: Florent Heyworth Date: Wed, 15 Jun 2022 07:39:07 +0200 Subject: [PATCH 6/6] consistent naming --- reader.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reader.go b/reader.go index efedbc5..ad60d4f 100644 --- a/reader.go +++ b/reader.go @@ -676,7 +676,7 @@ func (this *PdfReader) resolveObject(objSpec *PdfValue) (*PdfValue, error) { result.Type = PDF_TYPE_STREAM // we just want to skip until after first CRLF - err = this.skip(r, newLineBytes()) + err = this.skip(r, newlineBytes()) if err != nil { return nil, errors.Wrap(err, "Failed to skip whitespace") } @@ -1644,6 +1644,6 @@ func whitespaceBytes() []byte { return []byte{0x20, 0x0A, 0x0C, 0x0D, 0x09, 0x00} } -func newLineBytes() []byte { +func newlineBytes() []byte { return []byte{'\r', '\n'} }