From 0b366c361d3733177ad5f524a2b0839331dd0975 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 9 Sep 2023 12:44:20 -0700 Subject: [PATCH 1/2] Support for hf tokenizers, refs #8 --- ttok/cli.py | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/ttok/cli.py b/ttok/cli.py index 693843e..3da981c 100644 --- a/ttok/cli.py +++ b/ttok/cli.py @@ -59,10 +59,7 @@ def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens) raise click.ClickException("Cannot use --decode with --encode") if as_tokens and not decode_tokens and not encode_tokens: encode_tokens = True - try: - encoding = tiktoken.encoding_for_model(model) - except KeyError as e: - raise click.ClickException(f"Invalid model: {model}") from e + if not prompt and input is None: input = sys.stdin text = " ".join(prompt) @@ -73,6 +70,43 @@ def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens) else: text = input_text + if model.startswith("hf:"): + # We use Hugging Face tokenizers instead + try: + import tokenizers + except ImportError: + raise click.ClickException("Hugging Face tokenizers is not installed") + + hf_tokenizer = tokenizers.Tokenizer.from_pretrained(model[3:]) + if decode_tokens: + tokens = [int(token) for token in re.findall(r"\d+", text)] + if as_tokens: + click.echo(hf_tokenizer.decode(tokens)) + else: + click.echo(hf_tokenizer.decode(tokens)) + return + else: + tokens = hf_tokenizer.encode(text).ids + if truncate: + tokens = tokens[:truncate] + + if encode_tokens: + if as_tokens: + click.echo(hf_tokenizer.decode(tokens)) + else: + click.echo(" ".join(str(t) for t in tokens)) + elif truncate: + click.echo(hf_tokenizer.decode(tokens), nl=False) + else: + click.echo(len(tokens)) + return + + # Use tiktoken for OpenAI tokenizers instead + try: + encoding = tiktoken.encoding_for_model(model) + except KeyError as e: + raise click.ClickException(f"Invalid model: {model}") from e + if decode_tokens: tokens = [int(token) for token in re.findall(r"\d+", text)] if as_tokens: From 552983f741acb1b046152f373187260333c6b386 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 2 May 2024 16:26:36 -0700 Subject: [PATCH 2/2] --allow-special option, closes #13 --- README.md | 1 + tests/test_ttok.py | 13 +++++++++++++ ttok/cli.py | 31 +++++++++++++++++++++++++++++-- 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7644dd3..2b34f30 100644 --- a/README.md +++ b/README.md @@ -217,6 +217,7 @@ Options: --encode, --tokens Output token integers --decode Convert token integers to text --tokens Output full tokens + --allow-special Do not error on special tokens --help Show this message and exit. ``` diff --git a/tests/test_ttok.py b/tests/test_ttok.py index d980ea0..1db5fd6 100644 --- a/tests/test_ttok.py +++ b/tests/test_ttok.py @@ -94,3 +94,16 @@ def test_ttok_file(use_stdin, use_extra_args): result = runner.invoke(cli, args, **kwargs) assert result.exit_code == 0 assert result.output.strip() == str(expected_count) + + +def test_ttok_special_tokens(): + # https://github.com/simonw/ttok/issues/13 + runner = CliRunner() + # Without --allow-special raises an error + result = runner.invoke(cli, ["<|endoftext|>", "--encode"]) + assert result.exit_code != 0 + assert "Use --allow-special to allow special tokens" in result.output + # With --allow-special it works + result = runner.invoke(cli, ["<|endoftext|>", "--encode", "--allow-special"]) + assert result.exit_code == 0 + assert result.output.strip() == "100257" diff --git a/ttok/cli.py b/ttok/cli.py index 3da981c..4739679 100644 --- a/ttok/cli.py +++ b/ttok/cli.py @@ -19,7 +19,17 @@ "decode_tokens", "--decode", is_flag=True, help="Convert token integers to text" ) @click.option("as_tokens", "--tokens", is_flag=True, help="Output full tokens") -def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens): +@click.option("--allow-special", is_flag=True, help="Do not error on special tokens") +def cli( + prompt, + input, + truncate, + model, + encode_tokens, + decode_tokens, + as_tokens, + allow_special, +): """ Count and truncate text based on tokens @@ -57,6 +67,10 @@ def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens) """ if decode_tokens and encode_tokens: raise click.ClickException("Cannot use --decode with --encode") + if allow_special and not (encode_tokens or as_tokens): + raise click.ClickException( + "Cannot use --allow-special without --encode or --tokens" + ) if as_tokens and not decode_tokens and not encode_tokens: encode_tokens = True @@ -116,7 +130,20 @@ def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens) return # Tokenize it - tokens = encoding.encode(text) + kwargs = {} + if allow_special: + kwargs["allowed_special"] = "all" + try: + tokens = encoding.encode(text, **kwargs) + except ValueError as ex: + ex_str = str(ex) + if "disallowed special token" in ex_str and not allow_special: + # Just the first line, then add a hint + ex_str = ( + ex_str.split("\n")[0] + + "\n\nUse --allow-special to allow special tokens" + ) + raise click.ClickException(ex_str) if truncate: tokens = tokens[:truncate]