From 56bd0c65c48ff0efdd884e731fd0fa5b04c5beef Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Sun, 12 Mar 2017 17:19:49 +0700 Subject: [PATCH 01/17] Update the purpose of the fork --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 902a7e1..9742584 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ -go get rsc.io/pdf +# Purpose of the fork -http://godoc.org/rsc.io/pdf +This fork of rsc.io/pdf extends the package API with: + + - Implement the method GetPlainText() from object Page. Use to get plain text content (without format) From 612c19099809c99bd4595ee711560b748bd0ae2a Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Sun, 12 Mar 2017 20:33:21 +0700 Subject: [PATCH 02/17] Change the import path --- pdfpasswd/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfpasswd/main.go b/pdfpasswd/main.go index 53c8ef1..57fa88f 100644 --- a/pdfpasswd/main.go +++ b/pdfpasswd/main.go @@ -12,7 +12,7 @@ import ( "log" "os" - "rsc.io/pdf" + "github.com/ledongthuc/pdf" ) var ( From b95967f4ea5d295d627d5767e70b1b4ddbd05f7d Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Sun, 12 Mar 2017 21:50:42 +0700 Subject: [PATCH 03/17] Add function to get plain text from Page --- page.go | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/page.go b/page.go index 9c7d688..c7a80ac 100644 --- a/page.go +++ b/page.go @@ -5,6 +5,7 @@ package pdf import ( + "bytes" "fmt" "strings" ) @@ -401,6 +402,61 @@ type gstate struct { CTM matrix } +// GetPlainText returns the page's all text without format. +// - seperator parameter used to add chars to split part not at the same paragraphs. "\n" is good way to try. +func (p Page) GetPlainText(seperator string) string { + strm := p.V.Key("Contents") + + var textBuilder bytes.Buffer + showText := func(s string) { + _, err := textBuilder.WriteString(s) + if err != nil { + panic(err) + } + } + + Interpret(strm, func(stk *Stack, op string) { + n := stk.Len() + args := make([]Value, n) + for i := n - 1; i >= 0; i-- { + args[i] = stk.Pop() + } + + switch op { + default: + return + case "T*": // move to start of next line + showText(seperator) + case "\"": // set spacing, move to next line, and show text + if len(args) != 3 { + panic("bad \" operator") + } + fallthrough + case "'": // move to next line and show text + if len(args) != 1 { + panic("bad ' operator") + } + fallthrough + case "Tj": // show text + if len(args) != 1 { + panic("bad Tj operator") + } + showText(args[0].RawString()) + showText(seperator) + case "TJ": // show text, allowing individual glyph positioning + v := args[0] + for i := 0; i < v.Len(); i++ { + x := v.Index(i) + if x.Kind() == String { + showText(x.RawString()) + showText(seperator) + } + } + } + }) + return textBuilder.String() +} + // Content returns the page's content. func (p Page) Content() Content { strm := p.V.Key("Contents") From daace13046a1d74935e3d38767d83943d9f63155 Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Mon, 13 Mar 2017 15:53:10 +0700 Subject: [PATCH 04/17] Remove comment of rsc --- read.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/read.go b/read.go index eb8b9aa..f3ad3ed 100644 --- a/read.go +++ b/read.go @@ -44,7 +44,7 @@ // the package. Equally important, traversal of other PDF data structures can be implemented // in other packages as needed. // -package pdf // import "rsc.io/pdf" +package pdf // BUG(rsc): The package is incomplete, although it has been used successfully on some // large real-world PDF files. From 0e30ba212a76647ebf25d15ff1e0a50ca5f44971 Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Mon, 13 Mar 2017 16:58:03 +0700 Subject: [PATCH 05/17] Update --- README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/README.md b/README.md index 9742584..cfa0d9c 100644 --- a/README.md +++ b/README.md @@ -3,3 +3,27 @@ This fork of rsc.io/pdf extends the package API with: - Implement the method GetPlainText() from object Page. Use to get plain text content (without format) + +## How to read all text from PDF: + +I write an example function to read file from PATH and return the content of PDF + + ```golang + func readPdf(path string) (string, error) { + r, err := pdf.Open(path) + if err != nil { + return "", err + } + totalPage := r.NumPage() + + var textBuilder bytes.Buffer + for pageIndex := 1; pageIndex <= totalPage; pageIndex++ { + p := r.Page(pageIndex) + if p.V.IsNull() { + continue + } + textBuilder.WriteString(p.GetPlainText("\n")) + } + return textBuilder.String(), nil + } + ``` From ffbf376ba4dfa5945fd52fd1dbce9468a6f98342 Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Mon, 13 Mar 2017 16:58:42 +0700 Subject: [PATCH 06/17] Correct the language of code block example --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cfa0d9c..97860b7 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ This fork of rsc.io/pdf extends the package API with: I write an example function to read file from PATH and return the content of PDF - ```golang + ```go func readPdf(path string) (string, error) { r, err := pdf.Open(path) if err != nil { From f8f8fe4f600c77e16df2d1121cec055de53192c7 Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Mon, 13 Mar 2017 16:59:16 +0700 Subject: [PATCH 07/17] Update --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 97860b7..7894a4a 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ This fork of rsc.io/pdf extends the package API with: I write an example function to read file from PATH and return the content of PDF - ```go + ``` func readPdf(path string) (string, error) { r, err := pdf.Open(path) if err != nil { From d6cc51520d9495c45daa3ca4b69af29f67ce53bb Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Mon, 13 Mar 2017 17:00:37 +0700 Subject: [PATCH 08/17] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7894a4a..587e6e6 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ This fork of rsc.io/pdf extends the package API with: I write an example function to read file from PATH and return the content of PDF - ``` +```golang func readPdf(path string) (string, error) { r, err := pdf.Open(path) if err != nil { @@ -26,4 +26,4 @@ I write an example function to read file from PATH and return the content of PDF } return textBuilder.String(), nil } - ``` +``` From f3eb144855fbc8a739170bd4f661c75e8979f5f9 Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Tue, 14 Mar 2017 10:46:20 +0700 Subject: [PATCH 09/17] Update README.md --- README.md | 55 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 587e6e6..167133b 100644 --- a/README.md +++ b/README.md @@ -9,21 +9,42 @@ This fork of rsc.io/pdf extends the package API with: I write an example function to read file from PATH and return the content of PDF ```golang - func readPdf(path string) (string, error) { - r, err := pdf.Open(path) - if err != nil { - return "", err - } - totalPage := r.NumPage() - - var textBuilder bytes.Buffer - for pageIndex := 1; pageIndex <= totalPage; pageIndex++ { - p := r.Page(pageIndex) - if p.V.IsNull() { - continue - } - textBuilder.WriteString(p.GetPlainText("\n")) - } - return textBuilder.String(), nil - } +package main + +import ( + "bytes" + "fmt" + + "github.com/ledongthuc/pdf" +) + +func main() { + content, err := readPdf("test.pdf") // Read local pdf file + if err != nil { + panic(err) + } + fmt.Println(content) + return +} + +func readPdf(path string) (string, error) { + r, err := pdf.Open(path) + if err != nil { + return "", err + } + totalPage := r.NumPage() + + var textBuilder bytes.Buffer + for pageIndex := 1; pageIndex <= totalPage; pageIndex++ { + p := r.Page(pageIndex) + if p.V.IsNull() { + continue + } + textBuilder.WriteString(p.GetPlainText("\n")) + } + return textBuilder.String(), nil +} ``` + +## Demo +![Run example](https://i.gyazo.com/01fbc539e9872593e0ff6bac7e954e6d.gif) From 66da04eb56a952ee1e98370590103cee2c61b3ff Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Tue, 14 Mar 2017 10:51:04 +0700 Subject: [PATCH 10/17] Update README.md --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 167133b..7916174 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,10 @@ This fork of rsc.io/pdf extends the package API with: ## How to read all text from PDF: -I write an example function to read file from PATH and return the content of PDF +1. Get the library with command `go get -u github.com/ledongthuc/pdf` + + +2. I write an example function to read file from PATH and return the content of PDF ```golang package main From 4ff10c65aed6fff2bf0eda2611c97e990caa6377 Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Mon, 19 Jun 2017 07:13:38 +0700 Subject: [PATCH 11/17] Add space when get text from Content() Based on pull request of https://github.com/rsc/pdf/pull/8 but never merged. So I need it :( --- page.go | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/page.go b/page.go index c7a80ac..7e34a8e 100644 --- a/page.go +++ b/page.go @@ -474,17 +474,14 @@ func (p Page) Content() Content { Trm := matrix{{g.Tfs * g.Th, 0, 0}, {0, g.Tfs, 0}, {0, g.Trise, 1}}.mul(g.Tm).mul(g.CTM) w0 := g.Tf.Width(int(s[n])) n++ - if ch != ' ' { - f := g.Tf.BaseFont() - if i := strings.Index(f, "+"); i >= 0 { - f = f[i+1:] - } - text = append(text, Text{f, Trm[0][0], Trm[2][0], Trm[2][1], w0 / 1000 * Trm[0][0], string(ch)}) + + f := g.Tf.BaseFont() + if i := strings.Index(f, "+"); i >= 0 { + f = f[i+1:] } + text = append(text, Text{f, Trm[0][0], Trm[2][0], Trm[2][1], w0 / 1000 * Trm[0][0], string(ch)}) + tx := w0/1000*g.Tfs + g.Tc - if ch == ' ' { - tx += g.Tw - } tx *= g.Th g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm) } From 1e8ebfa8c2834dd64e93fd39717a7d8d9edeb897 Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Mon, 19 Jun 2017 07:28:55 +0700 Subject: [PATCH 12/17] Add readme content to get texts with style --- README.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/README.md b/README.md index 7916174..b30005c 100644 --- a/README.md +++ b/README.md @@ -49,5 +49,35 @@ func readPdf(path string) (string, error) { } ``` +## How to read all text with styles from PDF + +```golang +func readPdf2(path string) (string, error) { + r, err := pdf.Open(path) + if err != nil { + return "", err + } + totalPage := r.NumPage() + + for pageIndex := 1; pageIndex <= totalPage; pageIndex++ { + p := r.Page(pageIndex) + if p.V.IsNull() { + continue + } + var lastTextStyle pdf.Text + texts := p.Content().Text + for _, text := range texts { + if isSameSentence(text, lastTextStyle) { + lastTextStyle.S = lastTextStyle.S + text.S + } else { + fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n", lastTextStyle.Font, lastTextStyle.FontSize, lastTextStyle.X, lastTextStyle.Y, lastTextStyle.S) + lastTextStyle = text + } + } + } + return "", nil +} +``` + ## Demo ![Run example](https://i.gyazo.com/01fbc539e9872593e0ff6bac7e954e6d.gif) From fbd875511ef56a0e84d6a779ad105687e86707c7 Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Sun, 2 Jul 2017 14:37:26 +0700 Subject: [PATCH 13/17] Update --- page.go | 1 + 1 file changed, 1 insertion(+) diff --git a/page.go b/page.go index 7e34a8e..7bb3d43 100644 --- a/page.go +++ b/page.go @@ -620,6 +620,7 @@ func (p Page) Content() Content { g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm) } } + showText("\n") case "TL": // set text leading if len(args) != 1 { From 11f580bd1d786f4d02ee0696a966dc2c687b718e Mon Sep 17 00:00:00 2001 From: Rob Archibald Date: Thu, 17 Aug 2017 19:29:14 -0700 Subject: [PATCH 14/17] Add GetPlainText to Reader. Fix Encoder method --- README.md | 29 +++++------ page.go | 152 +++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 123 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index b30005c..76f33e1 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,17 @@ -# Purpose of the fork +# PDF Reader -This fork of rsc.io/pdf extends the package API with: +A simple Go library which enables reading PDF files. Forked from https://github.com/rsc/pdf - - Implement the method GetPlainText() from object Page. Use to get plain text content (without format) +Features + - Get plain text content (without format) + - Get Content (including all font and formatting information) -## How to read all text from PDF: +## Install: -1. Get the library with command `go get -u github.com/ledongthuc/pdf` +`go get -u github.com/ledongthuc/pdf` -2. I write an example function to read file from PATH and return the content of PDF +## Read plain text ```golang package main @@ -35,21 +37,14 @@ func readPdf(path string) (string, error) { if err != nil { return "", err } - totalPage := r.NumPage() - var textBuilder bytes.Buffer - for pageIndex := 1; pageIndex <= totalPage; pageIndex++ { - p := r.Page(pageIndex) - if p.V.IsNull() { - continue - } - textBuilder.WriteString(p.GetPlainText("\n")) - } - return textBuilder.String(), nil + var buf bytes.Buffer + buf.ReadFrom(p.GetPlainText()) + return buf.String(), nil } ``` -## How to read all text with styles from PDF +## Read all text with styles from PDF ```golang func readPdf2(path string) (string, error) { diff --git a/page.go b/page.go index 7bb3d43..e330bc4 100644 --- a/page.go +++ b/page.go @@ -7,6 +7,7 @@ package pdf import ( "bytes" "fmt" + "io" "strings" ) @@ -56,6 +57,24 @@ func (r *Reader) NumPage() int { return int(r.Trailer().Key("Root").Key("Pages").Key("Count").Int64()) } +// GetPlainText returns all the text in the PDF file +func (r *Reader) GetPlainText() io.Reader { + pages := r.NumPage() + var buf bytes.Buffer + fonts := make(map[string]*Font) + for i := 1; i < pages; i++ { + p := r.Page(i) + for _, name := range p.Fonts() { // cache fonts so we don't continually parse charmap + if _, ok := fonts[name]; !ok { + f := p.Font(name) + fonts[name] = &f + } + } + buf.WriteString(p.GetPlainText(fonts)) + } + return &buf +} + func (p Page) findInherited(key string) Value { for v := p.V; !v.IsNull(); v = v.Key("Parent") { if r := v.Key(key); !r.IsNull() { @@ -87,13 +106,14 @@ func (p Page) Fonts() []string { // Font returns the font with the given name associated with the page. func (p Page) Font(name string) Font { - return Font{p.Resources().Key("Font").Key(name)} + return Font{p.Resources().Key("Font").Key(name), nil} } // A Font represent a font in a PDF file. // The methods interpret a Font dictionary stored in V. type Font struct { - V Value + V Value + enc TextEncoding } // BaseFont returns the font's name (BaseFont property). @@ -134,6 +154,13 @@ func (f Font) Width(code int) float64 { // Encoder returns the encoding between font code point sequences and UTF-8. func (f Font) Encoder() TextEncoding { + if f.enc == nil { // caching the Encoder so we don't have to continually parse charmap + f.enc = f.getEncoder() + } + return f.enc +} + +func (f Font) getEncoder() TextEncoding { enc := f.V.Key("Encoding") switch enc.Kind() { case Name: @@ -143,8 +170,7 @@ func (f Font) Encoder() TextEncoding { case "MacRomanEncoding": return &byteEncoder{&macRomanEncoding} case "Identity-H": - // TODO: Should be big-endian UCS-2 decoder - return &nopEncoder{} + return f.charmapEncoding() default: println("unknown encoding", enc.Name()) return &nopEncoder{} @@ -152,14 +178,16 @@ func (f Font) Encoder() TextEncoding { case Dict: return &dictEncoder{enc.Key("Differences")} case Null: - // ok, try ToUnicode + return f.charmapEncoding() default: println("unexpected encoding", enc.String()) return &nopEncoder{} } +} +func (f *Font) charmapEncoding() TextEncoding { toUnicode := f.V.Key("ToUnicode") - if toUnicode.Kind() == Dict { + if toUnicode.Kind() == Stream { m := readCmap(toUnicode) if m == nil { return &nopEncoder{} @@ -228,42 +256,64 @@ func (e *byteEncoder) Decode(raw string) (text string) { return string(r) } +type byteRange struct { + low string + high string +} + +type bfchar struct { + orig string + repl string +} + +type bfrange struct { + lo string + hi string + dst Value +} + type cmap struct { - space [4][][2]string + space [4][]byteRange // codespace range bfrange []bfrange + bfchar []bfchar } func (m *cmap) Decode(raw string) (text string) { var r []rune Parse: for len(raw) > 0 { - for n := 1; n <= 4 && n <= len(raw); n++ { - for _, space := range m.space[n-1] { - if space[0] <= raw[:n] && raw[:n] <= space[1] { + for n := 1; n <= 4 && n <= len(raw); n++ { // number of digits in character replacement (1-4 possible) + for _, space := range m.space[n-1] { // find matching codespace Ranges for number of digits + if space.low <= raw[:n] && raw[:n] <= space.high { // see if value is in range text := raw[:n] raw = raw[n:] - for _, bf := range m.bfrange { - if len(bf.lo) == n && bf.lo <= text && text <= bf.hi { - if bf.dst.Kind() == String { - s := bf.dst.RawString() - if bf.lo != text { + for _, bfchar := range m.bfchar { // check for matching bfchar + if len(bfchar.orig) == n && bfchar.orig == text { + r = append(r, []rune(utf16Decode(bfchar.repl))...) + continue Parse + } + } + for _, bfrange := range m.bfrange { // check for matching bfrange + if len(bfrange.lo) == n && bfrange.lo <= text && text <= bfrange.hi { + if bfrange.dst.Kind() == String { + s := bfrange.dst.RawString() + if bfrange.lo != text { // value isn't at the beginning of the range so scale result b := []byte(s) - b[len(b)-1] += text[len(text)-1] - bf.lo[len(bf.lo)-1] + b[len(b)-1] += text[len(text)-1] - bfrange.lo[len(bfrange.lo)-1] // increment last byte by difference s = string(b) } r = append(r, []rune(utf16Decode(s))...) continue Parse } - if bf.dst.Kind() == Array { - fmt.Printf("array %v\n", bf.dst) + if bfrange.dst.Kind() == Array { + fmt.Printf("array %v\n", bfrange.dst) } else { - fmt.Printf("unknown dst %v\n", bf.dst) + fmt.Printf("unknown dst %v\n", bfrange.dst) } r = append(r, noRune) continue Parse } } - fmt.Printf("no text for %q", text) r = append(r, noRune) continue Parse } @@ -276,12 +326,6 @@ Parse: return string(r) } -type bfrange struct { - lo string - hi string - dst Value -} - func readCmap(toUnicode Value) *cmap { n := -1 var m cmap @@ -292,9 +336,8 @@ func readCmap(toUnicode Value) *cmap { } switch op { case "findresource": - category := stk.Pop() - key := stk.Pop() - fmt.Println("findresource", key, category) + stk.Pop() // category + stk.Pop() // key stk.Push(newDict()) case "begincmap": stk.Push(newDict()) @@ -315,9 +358,19 @@ func readCmap(toUnicode Value) *cmap { ok = false return } - m.space[len(lo)-1] = append(m.space[len(lo)-1], [2]string{lo, hi}) + m.space[len(lo)-1] = append(m.space[len(lo)-1], byteRange{lo, hi}) } n = -1 + case "beginbfchar": + n = int(stk.Pop().Int64()) + case "endbfchar": + if n < 0 { + panic("missing beginbfchar") + } + for i := 0; i < n; i++ { + repl, orig := stk.Pop().RawString(), stk.Pop().RawString() + m.bfchar = append(m.bfchar, bfchar{orig, repl}) + } case "beginbfrange": n = int(stk.Pop().Int64()) case "endbfrange": @@ -329,10 +382,9 @@ func readCmap(toUnicode Value) *cmap { m.bfrange = append(m.bfrange, bfrange{srcLo, srcHi, dst}) } case "defineresource": - category := stk.Pop().Name() + stk.Pop().Name() // category value := stk.Pop() - key := stk.Pop().Name() - fmt.Println("defineresource", key, value, category) + stk.Pop().Name() // key stk.Push(value) default: println("interp\t", op) @@ -403,15 +455,26 @@ type gstate struct { } // GetPlainText returns the page's all text without format. -// - seperator parameter used to add chars to split part not at the same paragraphs. "\n" is good way to try. -func (p Page) GetPlainText(seperator string) string { +// fonts can be passed in (to improve parsing performance) or left nil +func (p Page) GetPlainText(fonts map[string]*Font) string { strm := p.V.Key("Contents") + var enc TextEncoding = &nopEncoder{} + + if fonts == nil { + fonts = make(map[string]*Font) + for _, font := range p.Fonts() { + f := p.Font(font) + fonts[font] = &f + } + } var textBuilder bytes.Buffer showText := func(s string) { - _, err := textBuilder.WriteString(s) - if err != nil { - panic(err) + for _, ch := range enc.Decode(s) { + _, err := textBuilder.WriteRune(ch) + if err != nil { + panic(err) + } } } @@ -426,7 +489,16 @@ func (p Page) GetPlainText(seperator string) string { default: return case "T*": // move to start of next line - showText(seperator) + showText("\n") + case "Tf": // set text font and size + if len(args) != 2 { + panic("bad TL") + } + if font, ok := fonts[args[0].Name()]; ok { + enc = font.Encoder() + } else { + enc = &nopEncoder{} + } case "\"": // set spacing, move to next line, and show text if len(args) != 3 { panic("bad \" operator") @@ -442,14 +514,12 @@ func (p Page) GetPlainText(seperator string) string { panic("bad Tj operator") } showText(args[0].RawString()) - showText(seperator) case "TJ": // show text, allowing individual glyph positioning v := args[0] for i := 0; i < v.Len(); i++ { x := v.Index(i) if x.Kind() == String { showText(x.RawString()) - showText(seperator) } } } From 4e83a7495507eca29a8fb782f1b5dabcfd66eaef Mon Sep 17 00:00:00 2001 From: Ivin Polo Sony Date: Wed, 30 Aug 2017 14:06:18 +0100 Subject: [PATCH 15/17] Update README.md p variable in undefined in the function readPdf --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 76f33e1..fc2078f 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ func readPdf(path string) (string, error) { } var buf bytes.Buffer - buf.ReadFrom(p.GetPlainText()) + buf.ReadFrom(r.GetPlainText()) return buf.String(), nil } ``` From 48f0c0bb4aeb6c118f470f6624cbb7b708e80289 Mon Sep 17 00:00:00 2001 From: Rik Vanmechelen Date: Tue, 30 Jan 2018 13:37:34 -0500 Subject: [PATCH 16/17] page index starts at 1 --- page.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/page.go b/page.go index e330bc4..9f7cd0e 100644 --- a/page.go +++ b/page.go @@ -62,7 +62,7 @@ func (r *Reader) GetPlainText() io.Reader { pages := r.NumPage() var buf bytes.Buffer fonts := make(map[string]*Font) - for i := 1; i < pages; i++ { + for i := 1; i <= pages; i++ { p := r.Page(i) for _, name := range p.Fonts() { // cache fonts so we don't continually parse charmap if _, ok := fonts[name]; !ok { From 2deaee226449b1ea6447410a5351e1f6001c0e44 Mon Sep 17 00:00:00 2001 From: Peter Longyear Date: Mon, 26 Mar 2018 14:54:18 -0400 Subject: [PATCH 17/17] Recover from panics when getting plain text --- page.go | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/page.go b/page.go index 9f7cd0e..29de1d6 100644 --- a/page.go +++ b/page.go @@ -6,6 +6,7 @@ package pdf import ( "bytes" + "errors" "fmt" "io" "strings" @@ -58,7 +59,7 @@ func (r *Reader) NumPage() int { } // GetPlainText returns all the text in the PDF file -func (r *Reader) GetPlainText() io.Reader { +func (r *Reader) GetPlainText() (reader io.Reader, err error) { pages := r.NumPage() var buf bytes.Buffer fonts := make(map[string]*Font) @@ -70,9 +71,13 @@ func (r *Reader) GetPlainText() io.Reader { fonts[name] = &f } } - buf.WriteString(p.GetPlainText(fonts)) + text, err := p.GetPlainText(fonts) + if err != nil { + return &bytes.Buffer{}, err + } + buf.WriteString(text) } - return &buf + return &buf, nil } func (p Page) findInherited(key string) Value { @@ -456,7 +461,14 @@ type gstate struct { // GetPlainText returns the page's all text without format. // fonts can be passed in (to improve parsing performance) or left nil -func (p Page) GetPlainText(fonts map[string]*Font) string { +func (p Page) GetPlainText(fonts map[string]*Font) (result string, err error) { + defer func() { + if r := recover(); r != nil { + result = "" + err = errors.New(fmt.Sprint(r)) + } + }() + strm := p.V.Key("Contents") var enc TextEncoding = &nopEncoder{} @@ -524,7 +536,7 @@ func (p Page) GetPlainText(fonts map[string]*Font) string { } } }) - return textBuilder.String() + return textBuilder.String(), nil } // Content returns the page's content.