From 0b366c361d3733177ad5f524a2b0839331dd0975 Mon Sep 17 00:00:00 2001
From: Simon Willison <swillison@gmail.com>
Date: Sat, 9 Sep 2023 12:44:20 -0700
Subject: [PATCH 1/2] Support for hf tokenizers, refs #8

---
 ttok/cli.py | 42 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/ttok/cli.py b/ttok/cli.py
index 693843e..3da981c 100644
--- a/ttok/cli.py
+++ b/ttok/cli.py
@@ -59,10 +59,7 @@ def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens)
         raise click.ClickException("Cannot use --decode with --encode")
     if as_tokens and not decode_tokens and not encode_tokens:
         encode_tokens = True
-    try:
-        encoding = tiktoken.encoding_for_model(model)
-    except KeyError as e:
-        raise click.ClickException(f"Invalid model: {model}") from e
+
     if not prompt and input is None:
         input = sys.stdin
     text = " ".join(prompt)
@@ -73,6 +70,43 @@ def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens)
         else:
             text = input_text
 
+    if model.startswith("hf:"):
+        # We use Hugging Face tokenizers instead
+        try:
+            import tokenizers
+        except ImportError:
+            raise click.ClickException("Hugging Face tokenizers is not installed")
+
+        hf_tokenizer = tokenizers.Tokenizer.from_pretrained(model[3:])
+        if decode_tokens:
+            tokens = [int(token) for token in re.findall(r"\d+", text)]
+            if as_tokens:
+                click.echo(hf_tokenizer.decode(tokens))
+            else:
+                click.echo(hf_tokenizer.decode(tokens))
+            return
+        else:
+            tokens = hf_tokenizer.encode(text).ids
+            if truncate:
+                tokens = tokens[:truncate]
+
+            if encode_tokens:
+                if as_tokens:
+                    click.echo(hf_tokenizer.decode(tokens))
+                else:
+                    click.echo(" ".join(str(t) for t in tokens))
+            elif truncate:
+                click.echo(hf_tokenizer.decode(tokens), nl=False)
+            else:
+                click.echo(len(tokens))
+            return
+
+    # Use tiktoken for OpenAI tokenizers instead
+    try:
+        encoding = tiktoken.encoding_for_model(model)
+    except KeyError as e:
+        raise click.ClickException(f"Invalid model: {model}") from e
+
     if decode_tokens:
         tokens = [int(token) for token in re.findall(r"\d+", text)]
         if as_tokens:

From 552983f741acb1b046152f373187260333c6b386 Mon Sep 17 00:00:00 2001
From: Simon Willison <swillison@gmail.com>
Date: Thu, 2 May 2024 16:26:36 -0700
Subject: [PATCH 2/2] --allow-special option, closes #13

---
 README.md          |  1 +
 tests/test_ttok.py | 13 +++++++++++++
 ttok/cli.py        | 31 +++++++++++++++++++++++++++++--
 3 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 7644dd3..2b34f30 100644
--- a/README.md
+++ b/README.md
@@ -217,6 +217,7 @@ Options:
   --encode, --tokens      Output token integers
   --decode                Convert token integers to text
   --tokens                Output full tokens
+  --allow-special         Do not error on special tokens
   --help                  Show this message and exit.
 
 ```
diff --git a/tests/test_ttok.py b/tests/test_ttok.py
index d980ea0..1db5fd6 100644
--- a/tests/test_ttok.py
+++ b/tests/test_ttok.py
@@ -94,3 +94,16 @@ def test_ttok_file(use_stdin, use_extra_args):
         result = runner.invoke(cli, args, **kwargs)
         assert result.exit_code == 0
         assert result.output.strip() == str(expected_count)
+
+
+def test_ttok_special_tokens():
+    # https://github.com/simonw/ttok/issues/13
+    runner = CliRunner()
+    # Without --allow-special raises an error
+    result = runner.invoke(cli, ["<|endoftext|>", "--encode"])
+    assert result.exit_code != 0
+    assert "Use --allow-special to allow special tokens" in result.output
+    # With --allow-special it works
+    result = runner.invoke(cli, ["<|endoftext|>", "--encode", "--allow-special"])
+    assert result.exit_code == 0
+    assert result.output.strip() == "100257"
diff --git a/ttok/cli.py b/ttok/cli.py
index 3da981c..4739679 100644
--- a/ttok/cli.py
+++ b/ttok/cli.py
@@ -19,7 +19,17 @@
     "decode_tokens", "--decode", is_flag=True, help="Convert token integers to text"
 )
 @click.option("as_tokens", "--tokens", is_flag=True, help="Output full tokens")
-def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens):
+@click.option("--allow-special", is_flag=True, help="Do not error on special tokens")
+def cli(
+    prompt,
+    input,
+    truncate,
+    model,
+    encode_tokens,
+    decode_tokens,
+    as_tokens,
+    allow_special,
+):
     """
     Count and truncate text based on tokens
 
@@ -57,6 +67,10 @@ def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens)
     """
     if decode_tokens and encode_tokens:
         raise click.ClickException("Cannot use --decode with --encode")
+    if allow_special and not (encode_tokens or as_tokens):
+        raise click.ClickException(
+            "Cannot use --allow-special without --encode or --tokens"
+        )
     if as_tokens and not decode_tokens and not encode_tokens:
         encode_tokens = True
 
@@ -116,7 +130,20 @@ def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens)
         return
 
     # Tokenize it
-    tokens = encoding.encode(text)
+    kwargs = {}
+    if allow_special:
+        kwargs["allowed_special"] = "all"
+    try:
+        tokens = encoding.encode(text, **kwargs)
+    except ValueError as ex:
+        ex_str = str(ex)
+        if "disallowed special token" in ex_str and not allow_special:
+            # Just the first line, then add a hint
+            ex_str = (
+                ex_str.split("\n")[0]
+                + "\n\nUse --allow-special to allow special tokens"
+            )
+        raise click.ClickException(ex_str)
     if truncate:
         tokens = tokens[:truncate]