From 56bd0c65c48ff0efdd884e731fd0fa5b04c5beef Mon Sep 17 00:00:00 2001
From: Thuc Le <ledongthuc@gmail.com>
Date: Sun, 12 Mar 2017 17:19:49 +0700
Subject: [PATCH 01/17] Update the purpose of the fork

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 902a7e1..9742584 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
-go get rsc.io/pdf
+# Purpose of the fork
 
-http://godoc.org/rsc.io/pdf
+This fork of rsc.io/pdf extends the package API with:
+
+  - Implement the method GetPlainText() from object Page. Use to get plain text content (without format)

From 612c19099809c99bd4595ee711560b748bd0ae2a Mon Sep 17 00:00:00 2001
From: Thuc Le <ledongthuc@gmail.com>
Date: Sun, 12 Mar 2017 20:33:21 +0700
Subject: [PATCH 02/17] Change the import path

---
 pdfpasswd/main.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pdfpasswd/main.go b/pdfpasswd/main.go
index 53c8ef1..57fa88f 100644
--- a/pdfpasswd/main.go
+++ b/pdfpasswd/main.go
@@ -12,7 +12,7 @@ import (
 	"log"
 	"os"
 
-	"rsc.io/pdf"
+	"github.com/ledongthuc/pdf"
 )
 
 var (

From b95967f4ea5d295d627d5767e70b1b4ddbd05f7d Mon Sep 17 00:00:00 2001
From: Thuc Le <ledongthuc@gmail.com>
Date: Sun, 12 Mar 2017 21:50:42 +0700
Subject: [PATCH 03/17] Add function to get plain text from Page

---
 page.go | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/page.go b/page.go
index 9c7d688..c7a80ac 100644
--- a/page.go
+++ b/page.go
@@ -5,6 +5,7 @@
 package pdf
 
 import (
+	"bytes"
 	"fmt"
 	"strings"
 )
@@ -401,6 +402,61 @@ type gstate struct {
 	CTM   matrix
 }
 
+// GetPlainText returns the page's all text without format.
+//  - seperator parameter used to add chars to split part not at the same paragraphs. "\n" is good way to try.
+func (p Page) GetPlainText(seperator string) string {
+	strm := p.V.Key("Contents")
+
+	var textBuilder bytes.Buffer
+	showText := func(s string) {
+		_, err := textBuilder.WriteString(s)
+		if err != nil {
+			panic(err)
+		}
+	}
+
+	Interpret(strm, func(stk *Stack, op string) {
+		n := stk.Len()
+		args := make([]Value, n)
+		for i := n - 1; i >= 0; i-- {
+			args[i] = stk.Pop()
+		}
+
+		switch op {
+		default:
+			return
+		case "T*": // move to start of next line
+			showText(seperator)
+		case "\"": // set spacing, move to next line, and show text
+			if len(args) != 3 {
+				panic("bad \" operator")
+			}
+			fallthrough
+		case "'": // move to next line and show text
+			if len(args) != 1 {
+				panic("bad ' operator")
+			}
+			fallthrough
+		case "Tj": // show text
+			if len(args) != 1 {
+				panic("bad Tj operator")
+			}
+			showText(args[0].RawString())
+			showText(seperator)
+		case "TJ": // show text, allowing individual glyph positioning
+			v := args[0]
+			for i := 0; i < v.Len(); i++ {
+				x := v.Index(i)
+				if x.Kind() == String {
+					showText(x.RawString())
+					showText(seperator)
+				}
+			}
+		}
+	})
+	return textBuilder.String()
+}
+
 // Content returns the page's content.
 func (p Page) Content() Content {
 	strm := p.V.Key("Contents")

From daace13046a1d74935e3d38767d83943d9f63155 Mon Sep 17 00:00:00 2001
From: Thuc Le <ledongthuc@gmail.com>
Date: Mon, 13 Mar 2017 15:53:10 +0700
Subject: [PATCH 04/17] Remove comment of rsc

---
 read.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/read.go b/read.go
index eb8b9aa..f3ad3ed 100644
--- a/read.go
+++ b/read.go
@@ -44,7 +44,7 @@
 // the package. Equally important, traversal of other PDF data structures can be implemented
 // in other packages as needed.
 //
-package pdf // import "rsc.io/pdf"
+package pdf
 
 // BUG(rsc): The package is incomplete, although it has been used successfully on some
 // large real-world PDF files.

From 0e30ba212a76647ebf25d15ff1e0a50ca5f44971 Mon Sep 17 00:00:00 2001
From: Thuc Le <ledongthuc@gmail.com>
Date: Mon, 13 Mar 2017 16:58:03 +0700
Subject: [PATCH 05/17] Add README example for reading all text from a PDF

---
 README.md | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/README.md b/README.md
index 9742584..cfa0d9c 100644
--- a/README.md
+++ b/README.md
@@ -3,3 +3,27 @@
 This fork of rsc.io/pdf extends the package API with:
 
   - Implement the method GetPlainText() from object Page. Use to get plain text content (without format)
+
+## How to read all text from PDF:
+
+I write an example function to read file from PATH and return the content of PDF
+
+    ```golang
+    func readPdf(path string) (string, error) {
+      r, err := pdf.Open(path)
+      if err != nil {
+        return "", err
+      }
+      totalPage := r.NumPage()
+
+      var textBuilder bytes.Buffer
+      for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
+        p := r.Page(pageIndex)
+        if p.V.IsNull() {
+          continue
+        }
+        textBuilder.WriteString(p.GetPlainText("\n"))
+      }
+      return textBuilder.String(), nil
+    }
+    ```

From ffbf376ba4dfa5945fd52fd1dbce9468a6f98342 Mon Sep 17 00:00:00 2001
From: Thuc Le <ledongthuc@gmail.com>
Date: Mon, 13 Mar 2017 16:58:42 +0700
Subject: [PATCH 06/17] Correct the language of code block example

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index cfa0d9c..97860b7 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ This fork of rsc.io/pdf extends the package API with:
 
 I write an example function to read file from PATH and return the content of PDF
 
-    ```golang
+    ```go
     func readPdf(path string) (string, error) {
       r, err := pdf.Open(path)
       if err != nil {

From f8f8fe4f600c77e16df2d1121cec055de53192c7 Mon Sep 17 00:00:00 2001
From: Thuc Le <ledongthuc@gmail.com>
Date: Mon, 13 Mar 2017 16:59:16 +0700
Subject: [PATCH 07/17] Remove the language tag from the README code fence

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 97860b7..7894a4a 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ This fork of rsc.io/pdf extends the package API with:
 
 I write an example function to read file from PATH and return the content of PDF
 
-    ```go
+    ```
     func readPdf(path string) (string, error) {
       r, err := pdf.Open(path)
       if err != nil {

From d6cc51520d9495c45daa3ca4b69af29f67ce53bb Mon Sep 17 00:00:00 2001
From: Thuc Le <ledongthuc@gmail.com>
Date: Mon, 13 Mar 2017 17:00:37 +0700
Subject: [PATCH 08/17] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 7894a4a..587e6e6 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ This fork of rsc.io/pdf extends the package API with:
 
 I write an example function to read file from PATH and return the content of PDF
 
-    ```
+```golang
     func readPdf(path string) (string, error) {
       r, err := pdf.Open(path)
       if err != nil {
@@ -26,4 +26,4 @@ I write an example function to read file from PATH and return the content of PDF
       }
       return textBuilder.String(), nil
     }
-    ```
+```

From f3eb144855fbc8a739170bd4f661c75e8979f5f9 Mon Sep 17 00:00:00 2001
From: Thuc Le <ledongthuc@gmail.com>
Date: Tue, 14 Mar 2017 10:46:20 +0700
Subject: [PATCH 09/17] Update README.md

---
 README.md | 55 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 587e6e6..167133b 100644
--- a/README.md
+++ b/README.md
@@ -9,21 +9,42 @@ This fork of rsc.io/pdf extends the package API with:
 I write an example function to read file from PATH and return the content of PDF
 
 ```golang
-    func readPdf(path string) (string, error) {
-      r, err := pdf.Open(path)
-      if err != nil {
-        return "", err
-      }
-      totalPage := r.NumPage()
-
-      var textBuilder bytes.Buffer
-      for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
-        p := r.Page(pageIndex)
-        if p.V.IsNull() {
-          continue
-        }
-        textBuilder.WriteString(p.GetPlainText("\n"))
-      }
-      return textBuilder.String(), nil
-    }
+package main
+
+import (
+	"bytes"
+	"fmt"
+
+	"github.com/ledongthuc/pdf"
+)
+
+func main() {
+	content, err := readPdf("test.pdf") // Read local pdf file
+	if err != nil {
+		panic(err)
+	}
+	fmt.Println(content)
+	return
+}
+
+func readPdf(path string) (string, error) {
+	r, err := pdf.Open(path)
+	if err != nil {
+		return "", err
+	}
+	totalPage := r.NumPage()
+
+	var textBuilder bytes.Buffer
+	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
+		p := r.Page(pageIndex)
+		if p.V.IsNull() {
+			continue
+		}
+		textBuilder.WriteString(p.GetPlainText("\n"))
+	}
+	return textBuilder.String(), nil
+}
 ```
+
+## Demo
+![Run example](https://i.gyazo.com/01fbc539e9872593e0ff6bac7e954e6d.gif)

From 66da04eb56a952ee1e98370590103cee2c61b3ff Mon Sep 17 00:00:00 2001
From: Thuc Le <ledongthuc@gmail.com>
Date: Tue, 14 Mar 2017 10:51:04 +0700
Subject: [PATCH 10/17] Update README.md

---
 README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 167133b..7916174 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,10 @@ This fork of rsc.io/pdf extends the package API with:
 
 ## How to read all text from PDF:
 
-I write an example function to read file from PATH and return the content of PDF
+1. Get the library with command `go get -u github.com/ledongthuc/pdf`
+
+
+2. I write an example function to read file from PATH and return the content of PDF
 
 ```golang
 package main

From 4ff10c65aed6fff2bf0eda2611c97e990caa6377 Mon Sep 17 00:00:00 2001
From: Thuc Le <ledongthuc@gmail.com>
Date: Mon, 19 Jun 2017 07:13:38 +0700
Subject: [PATCH 11/17] Add space when get text from Content()

    Based on the pull request https://github.com/rsc/pdf/pull/8,
    which was never merged upstream, so it is applied in this fork.
---
 page.go | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/page.go b/page.go
index c7a80ac..7e34a8e 100644
--- a/page.go
+++ b/page.go
@@ -474,17 +474,14 @@ func (p Page) Content() Content {
 			Trm := matrix{{g.Tfs * g.Th, 0, 0}, {0, g.Tfs, 0}, {0, g.Trise, 1}}.mul(g.Tm).mul(g.CTM)
 			w0 := g.Tf.Width(int(s[n]))
 			n++
-			if ch != ' ' {
-				f := g.Tf.BaseFont()
-				if i := strings.Index(f, "+"); i >= 0 {
-					f = f[i+1:]
-				}
-				text = append(text, Text{f, Trm[0][0], Trm[2][0], Trm[2][1], w0 / 1000 * Trm[0][0], string(ch)})
+
+			f := g.Tf.BaseFont()
+			if i := strings.Index(f, "+"); i >= 0 {
+				f = f[i+1:]
 			}
+			text = append(text, Text{f, Trm[0][0], Trm[2][0], Trm[2][1], w0 / 1000 * Trm[0][0], string(ch)})
+
 			tx := w0/1000*g.Tfs + g.Tc
-			if ch == ' ' {
-				tx += g.Tw
-			}
 			tx *= g.Th
 			g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm)
 		}

From 1e8ebfa8c2834dd64e93fd39717a7d8d9edeb897 Mon Sep 17 00:00:00 2001
From: Thuc Le <ledongthuc@gmail.com>
Date: Mon, 19 Jun 2017 07:28:55 +0700
Subject: [PATCH 12/17] Add readme content to get texts with style

---
 README.md | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/README.md b/README.md
index 7916174..b30005c 100644
--- a/README.md
+++ b/README.md
@@ -49,5 +49,35 @@ func readPdf(path string) (string, error) {
 }
 ```
 
+## How to read all text with styles from PDF
+
+```golang
+func readPdf2(path string) (string, error) {
+	r, err := pdf.Open(path)
+	if err != nil {
+		return "", err
+	}
+	totalPage := r.NumPage()
+
+	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
+		p := r.Page(pageIndex)
+		if p.V.IsNull() {
+			continue
+		}
+		var lastTextStyle pdf.Text
+		texts := p.Content().Text
+		for _, text := range texts {
+			if isSameSentence(text, lastTextStyle) {
+				lastTextStyle.S = lastTextStyle.S + text.S
+			} else {
+				fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n", lastTextStyle.Font, lastTextStyle.FontSize, lastTextStyle.X, lastTextStyle.Y, lastTextStyle.S)
+				lastTextStyle = text
+			}
+		}
+	}
+	return "", nil
+}
+```
+
 ## Demo
 ![Run example](https://i.gyazo.com/01fbc539e9872593e0ff6bac7e954e6d.gif)

From fbd875511ef56a0e84d6a779ad105687e86707c7 Mon Sep 17 00:00:00 2001
From: Thuc Le <ledongthuc@gmail.com>
Date: Sun, 2 Jul 2017 14:37:26 +0700
Subject: [PATCH 13/17] Emit a newline via showText in Content()

---
 page.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/page.go b/page.go
index 7e34a8e..7bb3d43 100644
--- a/page.go
+++ b/page.go
@@ -620,6 +620,7 @@ func (p Page) Content() Content {
 					g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm)
 				}
 			}
+			showText("\n")
 
 		case "TL": // set text leading
 			if len(args) != 1 {

From 11f580bd1d786f4d02ee0696a966dc2c687b718e Mon Sep 17 00:00:00 2001
From: Rob Archibald <rob.archibald@endfirst.com>
Date: Thu, 17 Aug 2017 19:29:14 -0700
Subject: [PATCH 14/17] Add GetPlainText to Reader. Fix Encoder method

---
 README.md |  29 +++++------
 page.go   | 152 +++++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 123 insertions(+), 58 deletions(-)

diff --git a/README.md b/README.md
index b30005c..76f33e1 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,17 @@
-# Purpose of the fork
+# PDF Reader
 
-This fork of rsc.io/pdf extends the package API with:
+A simple Go library which enables reading PDF files. Forked from https://github.com/rsc/pdf
 
-  - Implement the method GetPlainText() from object Page. Use to get plain text content (without format)
+Features
+  - Get plain text content (without format)
+  - Get Content (including all font and formatting information)
 
-## How to read all text from PDF:
+## Install:
 
-1. Get the library with command `go get -u github.com/ledongthuc/pdf`
+`go get -u github.com/ledongthuc/pdf`
 
 
-2. I write an example function to read file from PATH and return the content of PDF
+## Read plain text
 
 ```golang
 package main
@@ -35,21 +37,14 @@ func readPdf(path string) (string, error) {
 	if err != nil {
 		return "", err
 	}
-	totalPage := r.NumPage()
 
-	var textBuilder bytes.Buffer
-	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
-		p := r.Page(pageIndex)
-		if p.V.IsNull() {
-			continue
-		}
-		textBuilder.WriteString(p.GetPlainText("\n"))
-	}
-	return textBuilder.String(), nil
+	var buf bytes.Buffer
+	buf.ReadFrom(p.GetPlainText())
+	return buf.String(), nil
 }
 ```
 
-## How to read all text with styles from PDF
+## Read all text with styles from PDF
 
 ```golang
 func readPdf2(path string) (string, error) {
diff --git a/page.go b/page.go
index 7bb3d43..e330bc4 100644
--- a/page.go
+++ b/page.go
@@ -7,6 +7,7 @@ package pdf
 import (
 	"bytes"
 	"fmt"
+	"io"
 	"strings"
 )
 
@@ -56,6 +57,24 @@ func (r *Reader) NumPage() int {
 	return int(r.Trailer().Key("Root").Key("Pages").Key("Count").Int64())
 }
 
+// GetPlainText returns all the text in the PDF file
+func (r *Reader) GetPlainText() io.Reader {
+	pages := r.NumPage()
+	var buf bytes.Buffer
+	fonts := make(map[string]*Font)
+	for i := 1; i < pages; i++ {
+		p := r.Page(i)
+		for _, name := range p.Fonts() { // cache fonts so we don't continually parse charmap
+			if _, ok := fonts[name]; !ok {
+				f := p.Font(name)
+				fonts[name] = &f
+			}
+		}
+		buf.WriteString(p.GetPlainText(fonts))
+	}
+	return &buf
+}
+
 func (p Page) findInherited(key string) Value {
 	for v := p.V; !v.IsNull(); v = v.Key("Parent") {
 		if r := v.Key(key); !r.IsNull() {
@@ -87,13 +106,14 @@ func (p Page) Fonts() []string {
 
 // Font returns the font with the given name associated with the page.
 func (p Page) Font(name string) Font {
-	return Font{p.Resources().Key("Font").Key(name)}
+	return Font{p.Resources().Key("Font").Key(name), nil}
 }
 
 // A Font represent a font in a PDF file.
 // The methods interpret a Font dictionary stored in V.
 type Font struct {
-	V Value
+	V   Value
+	enc TextEncoding
 }
 
 // BaseFont returns the font's name (BaseFont property).
@@ -134,6 +154,13 @@ func (f Font) Width(code int) float64 {
 
 // Encoder returns the encoding between font code point sequences and UTF-8.
 func (f Font) Encoder() TextEncoding {
+	if f.enc == nil { // caching the Encoder so we don't have to continually parse charmap
+		f.enc = f.getEncoder()
+	}
+	return f.enc
+}
+
+func (f Font) getEncoder() TextEncoding {
 	enc := f.V.Key("Encoding")
 	switch enc.Kind() {
 	case Name:
@@ -143,8 +170,7 @@ func (f Font) Encoder() TextEncoding {
 		case "MacRomanEncoding":
 			return &byteEncoder{&macRomanEncoding}
 		case "Identity-H":
-			// TODO: Should be big-endian UCS-2 decoder
-			return &nopEncoder{}
+			return f.charmapEncoding()
 		default:
 			println("unknown encoding", enc.Name())
 			return &nopEncoder{}
@@ -152,14 +178,16 @@ func (f Font) Encoder() TextEncoding {
 	case Dict:
 		return &dictEncoder{enc.Key("Differences")}
 	case Null:
-		// ok, try ToUnicode
+		return f.charmapEncoding()
 	default:
 		println("unexpected encoding", enc.String())
 		return &nopEncoder{}
 	}
+}
 
+func (f *Font) charmapEncoding() TextEncoding {
 	toUnicode := f.V.Key("ToUnicode")
-	if toUnicode.Kind() == Dict {
+	if toUnicode.Kind() == Stream {
 		m := readCmap(toUnicode)
 		if m == nil {
 			return &nopEncoder{}
@@ -228,42 +256,64 @@ func (e *byteEncoder) Decode(raw string) (text string) {
 	return string(r)
 }
 
+type byteRange struct {
+	low  string
+	high string
+}
+
+type bfchar struct {
+	orig string
+	repl string
+}
+
+type bfrange struct {
+	lo  string
+	hi  string
+	dst Value
+}
+
 type cmap struct {
-	space   [4][][2]string
+	space   [4][]byteRange // codespace range
 	bfrange []bfrange
+	bfchar  []bfchar
 }
 
 func (m *cmap) Decode(raw string) (text string) {
 	var r []rune
 Parse:
 	for len(raw) > 0 {
-		for n := 1; n <= 4 && n <= len(raw); n++ {
-			for _, space := range m.space[n-1] {
-				if space[0] <= raw[:n] && raw[:n] <= space[1] {
+		for n := 1; n <= 4 && n <= len(raw); n++ { // number of digits in character replacement (1-4 possible)
+			for _, space := range m.space[n-1] { // find matching codespace Ranges for number of digits
+				if space.low <= raw[:n] && raw[:n] <= space.high { // see if value is in range
 					text := raw[:n]
 					raw = raw[n:]
-					for _, bf := range m.bfrange {
-						if len(bf.lo) == n && bf.lo <= text && text <= bf.hi {
-							if bf.dst.Kind() == String {
-								s := bf.dst.RawString()
-								if bf.lo != text {
+					for _, bfchar := range m.bfchar { // check for matching bfchar
+						if len(bfchar.orig) == n && bfchar.orig == text {
+							r = append(r, []rune(utf16Decode(bfchar.repl))...)
+							continue Parse
+						}
+					}
+					for _, bfrange := range m.bfrange { // check for matching bfrange
+						if len(bfrange.lo) == n && bfrange.lo <= text && text <= bfrange.hi {
+							if bfrange.dst.Kind() == String {
+								s := bfrange.dst.RawString()
+								if bfrange.lo != text { // value isn't at the beginning of the range so scale result
 									b := []byte(s)
-									b[len(b)-1] += text[len(text)-1] - bf.lo[len(bf.lo)-1]
+									b[len(b)-1] += text[len(text)-1] - bfrange.lo[len(bfrange.lo)-1] // increment last byte by difference
 									s = string(b)
 								}
 								r = append(r, []rune(utf16Decode(s))...)
 								continue Parse
 							}
-							if bf.dst.Kind() == Array {
-								fmt.Printf("array %v\n", bf.dst)
+							if bfrange.dst.Kind() == Array {
+								fmt.Printf("array %v\n", bfrange.dst)
 							} else {
-								fmt.Printf("unknown dst %v\n", bf.dst)
+								fmt.Printf("unknown dst %v\n", bfrange.dst)
 							}
 							r = append(r, noRune)
 							continue Parse
 						}
 					}
-					fmt.Printf("no text for %q", text)
 					r = append(r, noRune)
 					continue Parse
 				}
@@ -276,12 +326,6 @@ Parse:
 	return string(r)
 }
 
-type bfrange struct {
-	lo  string
-	hi  string
-	dst Value
-}
-
 func readCmap(toUnicode Value) *cmap {
 	n := -1
 	var m cmap
@@ -292,9 +336,8 @@ func readCmap(toUnicode Value) *cmap {
 		}
 		switch op {
 		case "findresource":
-			category := stk.Pop()
-			key := stk.Pop()
-			fmt.Println("findresource", key, category)
+			stk.Pop() // category
+			stk.Pop() // key
 			stk.Push(newDict())
 		case "begincmap":
 			stk.Push(newDict())
@@ -315,9 +358,19 @@ func readCmap(toUnicode Value) *cmap {
 					ok = false
 					return
 				}
-				m.space[len(lo)-1] = append(m.space[len(lo)-1], [2]string{lo, hi})
+				m.space[len(lo)-1] = append(m.space[len(lo)-1], byteRange{lo, hi})
 			}
 			n = -1
+		case "beginbfchar":
+			n = int(stk.Pop().Int64())
+		case "endbfchar":
+			if n < 0 {
+				panic("missing beginbfchar")
+			}
+			for i := 0; i < n; i++ {
+				repl, orig := stk.Pop().RawString(), stk.Pop().RawString()
+				m.bfchar = append(m.bfchar, bfchar{orig, repl})
+			}
 		case "beginbfrange":
 			n = int(stk.Pop().Int64())
 		case "endbfrange":
@@ -329,10 +382,9 @@ func readCmap(toUnicode Value) *cmap {
 				m.bfrange = append(m.bfrange, bfrange{srcLo, srcHi, dst})
 			}
 		case "defineresource":
-			category := stk.Pop().Name()
+			stk.Pop().Name() // category
 			value := stk.Pop()
-			key := stk.Pop().Name()
-			fmt.Println("defineresource", key, value, category)
+			stk.Pop().Name() // key
 			stk.Push(value)
 		default:
 			println("interp\t", op)
@@ -403,15 +455,26 @@ type gstate struct {
 }
 
 // GetPlainText returns the page's all text without format.
-//  - seperator parameter used to add chars to split part not at the same paragraphs. "\n" is good way to try.
-func (p Page) GetPlainText(seperator string) string {
+// fonts can be passed in (to improve parsing performance) or left nil
+func (p Page) GetPlainText(fonts map[string]*Font) string {
 	strm := p.V.Key("Contents")
+	var enc TextEncoding = &nopEncoder{}
+
+	if fonts == nil {
+		fonts = make(map[string]*Font)
+		for _, font := range p.Fonts() {
+			f := p.Font(font)
+			fonts[font] = &f
+		}
+	}
 
 	var textBuilder bytes.Buffer
 	showText := func(s string) {
-		_, err := textBuilder.WriteString(s)
-		if err != nil {
-			panic(err)
+		for _, ch := range enc.Decode(s) {
+			_, err := textBuilder.WriteRune(ch)
+			if err != nil {
+				panic(err)
+			}
 		}
 	}
 
@@ -426,7 +489,16 @@ func (p Page) GetPlainText(seperator string) string {
 		default:
 			return
 		case "T*": // move to start of next line
-			showText(seperator)
+			showText("\n")
+		case "Tf": // set text font and size
+			if len(args) != 2 {
+				panic("bad TL")
+			}
+			if font, ok := fonts[args[0].Name()]; ok {
+				enc = font.Encoder()
+			} else {
+				enc = &nopEncoder{}
+			}
 		case "\"": // set spacing, move to next line, and show text
 			if len(args) != 3 {
 				panic("bad \" operator")
@@ -442,14 +514,12 @@ func (p Page) GetPlainText(seperator string) string {
 				panic("bad Tj operator")
 			}
 			showText(args[0].RawString())
-			showText(seperator)
 		case "TJ": // show text, allowing individual glyph positioning
 			v := args[0]
 			for i := 0; i < v.Len(); i++ {
 				x := v.Index(i)
 				if x.Kind() == String {
 					showText(x.RawString())
-					showText(seperator)
 				}
 			}
 		}

From 4e83a7495507eca29a8fb782f1b5dabcfd66eaef Mon Sep 17 00:00:00 2001
From: Ivin Polo Sony <ivinpolosony@gmail.com>
Date: Wed, 30 Aug 2017 14:06:18 +0100
Subject: [PATCH 15/17] Update README.md

The p variable is undefined in the function readPdf
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 76f33e1..fc2078f 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,7 @@ func readPdf(path string) (string, error) {
 	}
 
 	var buf bytes.Buffer
-	buf.ReadFrom(p.GetPlainText())
+	buf.ReadFrom(r.GetPlainText())
 	return buf.String(), nil
 }
 ```

From 48f0c0bb4aeb6c118f470f6624cbb7b708e80289 Mon Sep 17 00:00:00 2001
From: Rik Vanmechelen <rik.vanmechelen@gmail.com>
Date: Tue, 30 Jan 2018 13:37:34 -0500
Subject: [PATCH 16/17] page index starts at 1

---
 page.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/page.go b/page.go
index e330bc4..9f7cd0e 100644
--- a/page.go
+++ b/page.go
@@ -62,7 +62,7 @@ func (r *Reader) GetPlainText() io.Reader {
 	pages := r.NumPage()
 	var buf bytes.Buffer
 	fonts := make(map[string]*Font)
-	for i := 1; i < pages; i++ {
+	for i := 1; i <= pages; i++ {
 		p := r.Page(i)
 		for _, name := range p.Fonts() { // cache fonts so we don't continually parse charmap
 			if _, ok := fonts[name]; !ok {

From 2deaee226449b1ea6447410a5351e1f6001c0e44 Mon Sep 17 00:00:00 2001
From: Peter Longyear <peter_longyear@moma.org>
Date: Mon, 26 Mar 2018 14:54:18 -0400
Subject: [PATCH 17/17] Recover from panics when getting plain text

---
 page.go | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/page.go b/page.go
index 9f7cd0e..29de1d6 100644
--- a/page.go
+++ b/page.go
@@ -6,6 +6,7 @@ package pdf
 
 import (
 	"bytes"
+	"errors"
 	"fmt"
 	"io"
 	"strings"
@@ -58,7 +59,7 @@ func (r *Reader) NumPage() int {
 }
 
 // GetPlainText returns all the text in the PDF file
-func (r *Reader) GetPlainText() io.Reader {
+func (r *Reader) GetPlainText() (reader io.Reader, err error) {
 	pages := r.NumPage()
 	var buf bytes.Buffer
 	fonts := make(map[string]*Font)
@@ -70,9 +71,13 @@ func (r *Reader) GetPlainText() io.Reader {
 				fonts[name] = &f
 			}
 		}
-		buf.WriteString(p.GetPlainText(fonts))
+		text, err := p.GetPlainText(fonts)
+		if err != nil {
+			return &bytes.Buffer{}, err
+		}
+		buf.WriteString(text)
 	}
-	return &buf
+	return &buf, nil
 }
 
 func (p Page) findInherited(key string) Value {
@@ -456,7 +461,14 @@ type gstate struct {
 
 // GetPlainText returns the page's all text without format.
 // fonts can be passed in (to improve parsing performance) or left nil
-func (p Page) GetPlainText(fonts map[string]*Font) string {
+func (p Page) GetPlainText(fonts map[string]*Font) (result string, err error) {
+	defer func() {
+		if r := recover(); r != nil {
+			result = ""
+			err = errors.New(fmt.Sprint(r))
+		}
+	}()
+
 	strm := p.V.Key("Contents")
 	var enc TextEncoding = &nopEncoder{}
 
@@ -524,7 +536,7 @@ func (p Page) GetPlainText(fonts map[string]*Font) string {
 			}
 		}
 	})
-	return textBuilder.String()
+	return textBuilder.String(), nil
 }
 
 // Content returns the page's content.