diff --git a/README.md b/README.md index 902a7e1..fc2078f 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,82 @@ -go get rsc.io/pdf +# PDF Reader -http://godoc.org/rsc.io/pdf +A simple Go library which enables reading PDF files. Forked from https://github.com/rsc/pdf + +Features + - Get plain text content (without format) + - Get Content (including all font and formatting information) + +## Install: + +`go get -u github.com/ledongthuc/pdf` + + +## Read plain text + +```golang +package main + +import ( + "bytes" + "fmt" + + "github.com/ledongthuc/pdf" +) + +func main() { + content, err := readPdf("test.pdf") // Read local pdf file + if err != nil { + panic(err) + } + fmt.Println(content) + return +} + +func readPdf(path string) (string, error) { + r, err := pdf.Open(path) + if err != nil { + return "", err + } + + text, err := r.GetPlainText() // returns (io.Reader, error) + if err != nil { + return "", err + } + var buf bytes.Buffer + buf.ReadFrom(text) + return buf.String(), nil +} +``` + +## Read all text with styles from PDF + +```golang +func readPdf2(path string) (string, error) { + r, err := pdf.Open(path) + if err != nil { + return "", err + } + totalPage := r.NumPage() + + for pageIndex := 1; pageIndex <= totalPage; pageIndex++ { + p := r.Page(pageIndex) + if p.V.IsNull() { + continue + } + var lastTextStyle pdf.Text + texts := p.Content().Text + for _, text := range texts { + if isSameSentence(text, lastTextStyle) { + lastTextStyle.S = lastTextStyle.S + text.S + } else { + fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n", lastTextStyle.Font, lastTextStyle.FontSize, lastTextStyle.X, lastTextStyle.Y, lastTextStyle.S) + lastTextStyle = text + } + } + } + return "", nil +} +``` + +## Demo +![Run example](https://i.gyazo.com/01fbc539e9872593e0ff6bac7e954e6d.gif) diff --git a/page.go b/page.go index 9c7d688..29de1d6 100644 --- a/page.go +++ b/page.go @@ -5,7 +5,10 @@ package pdf import ( + "bytes" + "errors" "fmt" + "io" "strings" ) @@ -55,6 +58,28 @@ func (r *Reader) NumPage() int { return 
int(r.Trailer().Key("Root").Key("Pages").Key("Count").Int64()) } +// GetPlainText returns all the text in the PDF file +func (r *Reader) GetPlainText() (reader io.Reader, err error) { + pages := r.NumPage() + var buf bytes.Buffer + fonts := make(map[string]*Font) + for i := 1; i <= pages; i++ { + p := r.Page(i) + for _, name := range p.Fonts() { // cache fonts so we don't continually parse charmap + if _, ok := fonts[name]; !ok { + f := p.Font(name) + fonts[name] = &f + } + } + text, err := p.GetPlainText(fonts) + if err != nil { + return &bytes.Buffer{}, err + } + buf.WriteString(text) + } + return &buf, nil +} + func (p Page) findInherited(key string) Value { for v := p.V; !v.IsNull(); v = v.Key("Parent") { if r := v.Key(key); !r.IsNull() { @@ -86,13 +111,14 @@ func (p Page) Fonts() []string { // Font returns the font with the given name associated with the page. func (p Page) Font(name string) Font { - return Font{p.Resources().Key("Font").Key(name)} + return Font{p.Resources().Key("Font").Key(name), nil} } // A Font represent a font in a PDF file. // The methods interpret a Font dictionary stored in V. type Font struct { - V Value + V Value + enc TextEncoding } // BaseFont returns the font's name (BaseFont property). @@ -133,6 +159,13 @@ func (f Font) Width(code int) float64 { // Encoder returns the encoding between font code point sequences and UTF-8. 
func (f Font) Encoder() TextEncoding { + if f.enc == nil { // caching the Encoder so we don't have to continually parse charmap + f.enc = f.getEncoder() + } + return f.enc +} + +func (f Font) getEncoder() TextEncoding { enc := f.V.Key("Encoding") switch enc.Kind() { case Name: @@ -142,8 +175,7 @@ func (f Font) Encoder() TextEncoding { case "MacRomanEncoding": return &byteEncoder{&macRomanEncoding} case "Identity-H": - // TODO: Should be big-endian UCS-2 decoder - return &nopEncoder{} + return f.charmapEncoding() default: println("unknown encoding", enc.Name()) return &nopEncoder{} @@ -151,14 +183,16 @@ func (f Font) Encoder() TextEncoding { case Dict: return &dictEncoder{enc.Key("Differences")} case Null: - // ok, try ToUnicode + return f.charmapEncoding() default: println("unexpected encoding", enc.String()) return &nopEncoder{} } +} +func (f *Font) charmapEncoding() TextEncoding { toUnicode := f.V.Key("ToUnicode") - if toUnicode.Kind() == Dict { + if toUnicode.Kind() == Stream { m := readCmap(toUnicode) if m == nil { return &nopEncoder{} @@ -227,42 +261,64 @@ func (e *byteEncoder) Decode(raw string) (text string) { return string(r) } +type byteRange struct { + low string + high string +} + +type bfchar struct { + orig string + repl string +} + +type bfrange struct { + lo string + hi string + dst Value +} + type cmap struct { - space [4][][2]string + space [4][]byteRange // codespace range bfrange []bfrange + bfchar []bfchar } func (m *cmap) Decode(raw string) (text string) { var r []rune Parse: for len(raw) > 0 { - for n := 1; n <= 4 && n <= len(raw); n++ { - for _, space := range m.space[n-1] { - if space[0] <= raw[:n] && raw[:n] <= space[1] { + for n := 1; n <= 4 && n <= len(raw); n++ { // number of digits in character replacement (1-4 possible) + for _, space := range m.space[n-1] { // find matching codespace Ranges for number of digits + if space.low <= raw[:n] && raw[:n] <= space.high { // see if value is in range text := raw[:n] raw = raw[n:] - for _, bf := 
range m.bfrange { - if len(bf.lo) == n && bf.lo <= text && text <= bf.hi { - if bf.dst.Kind() == String { - s := bf.dst.RawString() - if bf.lo != text { + for _, bfchar := range m.bfchar { // check for matching bfchar + if len(bfchar.orig) == n && bfchar.orig == text { + r = append(r, []rune(utf16Decode(bfchar.repl))...) + continue Parse + } + } + for _, bfrange := range m.bfrange { // check for matching bfrange + if len(bfrange.lo) == n && bfrange.lo <= text && text <= bfrange.hi { + if bfrange.dst.Kind() == String { + s := bfrange.dst.RawString() + if bfrange.lo != text { // value isn't at the beginning of the range so scale result b := []byte(s) - b[len(b)-1] += text[len(text)-1] - bf.lo[len(bf.lo)-1] + b[len(b)-1] += text[len(text)-1] - bfrange.lo[len(bfrange.lo)-1] // increment last byte by difference s = string(b) } r = append(r, []rune(utf16Decode(s))...) continue Parse } - if bf.dst.Kind() == Array { - fmt.Printf("array %v\n", bf.dst) + if bfrange.dst.Kind() == Array { + fmt.Printf("array %v\n", bfrange.dst) } else { - fmt.Printf("unknown dst %v\n", bf.dst) + fmt.Printf("unknown dst %v\n", bfrange.dst) } r = append(r, noRune) continue Parse } } - fmt.Printf("no text for %q", text) r = append(r, noRune) continue Parse } @@ -275,12 +331,6 @@ Parse: return string(r) } -type bfrange struct { - lo string - hi string - dst Value -} - func readCmap(toUnicode Value) *cmap { n := -1 var m cmap @@ -291,9 +341,8 @@ func readCmap(toUnicode Value) *cmap { } switch op { case "findresource": - category := stk.Pop() - key := stk.Pop() - fmt.Println("findresource", key, category) + stk.Pop() // category + stk.Pop() // key stk.Push(newDict()) case "begincmap": stk.Push(newDict()) @@ -314,9 +363,19 @@ func readCmap(toUnicode Value) *cmap { ok = false return } - m.space[len(lo)-1] = append(m.space[len(lo)-1], [2]string{lo, hi}) + m.space[len(lo)-1] = append(m.space[len(lo)-1], byteRange{lo, hi}) } n = -1 + case "beginbfchar": + n = int(stk.Pop().Int64()) + case "endbfchar": + 
if n < 0 { + panic("missing beginbfchar") + } + for i := 0; i < n; i++ { + repl, orig := stk.Pop().RawString(), stk.Pop().RawString() + m.bfchar = append(m.bfchar, bfchar{orig, repl}) + } case "beginbfrange": n = int(stk.Pop().Int64()) case "endbfrange": @@ -328,10 +387,9 @@ func readCmap(toUnicode Value) *cmap { m.bfrange = append(m.bfrange, bfrange{srcLo, srcHi, dst}) } case "defineresource": - category := stk.Pop().Name() + stk.Pop().Name() // category value := stk.Pop() - key := stk.Pop().Name() - fmt.Println("defineresource", key, value, category) + stk.Pop().Name() // key stk.Push(value) default: println("interp\t", op) @@ -401,6 +459,86 @@ type gstate struct { CTM matrix } +// GetPlainText returns all text shown on the page as one unformatted string; a panic raised while interpreting the content stream is recovered and returned as err. +// fonts maps font resource names to fonts and may be nil, in which case the map is built from the page's own Fonts(). +func (p Page) GetPlainText(fonts map[string]*Font) (result string, err error) { + defer func() { + if r := recover(); r != nil { + result = "" + err = errors.New(fmt.Sprint(r)) + } + }() + + strm := p.V.Key("Contents") + var enc TextEncoding = &nopEncoder{} + + if fonts == nil { + fonts = make(map[string]*Font) + for _, font := range p.Fonts() { + f := p.Font(font) + fonts[font] = &f + } + } + + var textBuilder bytes.Buffer + showText := func(s string) { + for _, ch := range enc.Decode(s) { + _, err := textBuilder.WriteRune(ch) + if err != nil { + panic(err) + } + } + } + + Interpret(strm, func(stk *Stack, op string) { + n := stk.Len() + args := make([]Value, n) + for i := n - 1; i >= 0; i-- { + args[i] = stk.Pop() + } + + switch op { + default: + return + case "T*": // move to start of next line + showText("\n") + case "Tf": // set text font and size + if len(args) != 2 { + panic("bad Tf") + } + if font, ok := fonts[args[0].Name()]; ok { + enc = font.Encoder() + } else { + enc = &nopEncoder{} + } + case "\"": // set spacing, move to next line, and show text + if len(args) != 3 { + panic("bad \" operator") + } + showText(args[2].RawString()) // operands are aw ac string; falling through would hit the one-operand check below and always panic + case "'": // move to next line 
and show text + if len(args) != 1 { + panic("bad ' operator") + } + fallthrough + case "Tj": // show text + if len(args) != 1 { + panic("bad Tj operator") + } + showText(args[0].RawString()) + case "TJ": // show text, allowing individual glyph positioning + v := args[0] + for i := 0; i < v.Len(); i++ { + x := v.Index(i) + if x.Kind() == String { + showText(x.RawString()) + } + } + } + }) + return textBuilder.String(), nil +} + // Content returns the page's content. func (p Page) Content() Content { strm := p.V.Key("Contents") @@ -418,17 +556,14 @@ func (p Page) Content() Content { Trm := matrix{{g.Tfs * g.Th, 0, 0}, {0, g.Tfs, 0}, {0, g.Trise, 1}}.mul(g.Tm).mul(g.CTM) w0 := g.Tf.Width(int(s[n])) n++ - if ch != ' ' { - f := g.Tf.BaseFont() - if i := strings.Index(f, "+"); i >= 0 { - f = f[i+1:] - } - text = append(text, Text{f, Trm[0][0], Trm[2][0], Trm[2][1], w0 / 1000 * Trm[0][0], string(ch)}) + + f := g.Tf.BaseFont() + if i := strings.Index(f, "+"); i >= 0 { + f = f[i+1:] } + text = append(text, Text{f, Trm[0][0], Trm[2][0], Trm[2][1], w0 / 1000 * Trm[0][0], string(ch)}) + tx := w0/1000*g.Tfs + g.Tc - if ch == ' ' { - tx += g.Tw - } tx *= g.Th g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm) } @@ -567,6 +702,7 @@ func (p Page) Content() Content { g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm) } } + showText("\n") case "TL": // set text leading if len(args) != 1 { diff --git a/pdfpasswd/main.go b/pdfpasswd/main.go index 53c8ef1..57fa88f 100644 --- a/pdfpasswd/main.go +++ b/pdfpasswd/main.go @@ -12,7 +12,7 @@ import ( "log" "os" - "rsc.io/pdf" + "github.com/ledongthuc/pdf" ) var ( diff --git a/read.go b/read.go index eb8b9aa..f3ad3ed 100644 --- a/read.go +++ b/read.go @@ -44,7 +44,7 @@ // the package. Equally important, traversal of other PDF data structures can be implemented // in other packages as needed. 
// -package pdf // import "rsc.io/pdf" +package pdf // BUG(rsc): The package is incomplete, although it has been used successfully on some // large real-world PDF files.