Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ Options:
--encode, --tokens Output token integers
--decode Convert token integers to text
--tokens Output full tokens
--allow-special Do not error on special tokens
--help Show this message and exit.

```
Expand Down
13 changes: 13 additions & 0 deletions tests/test_ttok.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,16 @@ def test_ttok_file(use_stdin, use_extra_args):
result = runner.invoke(cli, args, **kwargs)
assert result.exit_code == 0
assert result.output.strip() == str(expected_count)


def test_ttok_special_tokens():
    """Special tokens are rejected unless --allow-special is passed.

    Regression test for https://github.com/simonw/ttok/issues/13
    """
    special = "<|endoftext|>"
    runner = CliRunner()

    # Default behaviour: the special token is refused and the error
    # output carries a hint pointing at --allow-special.
    rejected = runner.invoke(cli, [special, "--encode"])
    assert rejected.exit_code != 0
    assert "Use --allow-special to allow special tokens" in rejected.output

    # Opting in: the command succeeds and prints the token integer.
    accepted = runner.invoke(cli, [special, "--encode", "--allow-special"])
    assert accepted.exit_code == 0
    assert accepted.output.strip() == "100257"
73 changes: 67 additions & 6 deletions ttok/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,17 @@
"decode_tokens", "--decode", is_flag=True, help="Convert token integers to text"
)
@click.option("as_tokens", "--tokens", is_flag=True, help="Output full tokens")
def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens):
@click.option("--allow-special", is_flag=True, help="Do not error on special tokens")
def cli(
prompt,
input,
truncate,
model,
encode_tokens,
decode_tokens,
as_tokens,
allow_special,
):
"""
Count and truncate text based on tokens

Expand Down Expand Up @@ -57,12 +67,13 @@ def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens)
"""
if decode_tokens and encode_tokens:
raise click.ClickException("Cannot use --decode with --encode")
if allow_special and not (encode_tokens or as_tokens):
raise click.ClickException(
"Cannot use --allow-special without --encode or --tokens"
)
if as_tokens and not decode_tokens and not encode_tokens:
encode_tokens = True
try:
encoding = tiktoken.encoding_for_model(model)
except KeyError as e:
raise click.ClickException(f"Invalid model: {model}") from e

if not prompt and input is None:
input = sys.stdin
text = " ".join(prompt)
Expand All @@ -73,6 +84,43 @@ def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens)
else:
text = input_text

if model.startswith("hf:"):
# We use Hugging Face tokenizers instead
try:
import tokenizers
except ImportError:
raise click.ClickException("Hugging Face tokenizers is not installed")

hf_tokenizer = tokenizers.Tokenizer.from_pretrained(model[3:])
if decode_tokens:
tokens = [int(token) for token in re.findall(r"\d+", text)]
if as_tokens:
click.echo(hf_tokenizer.decode(tokens))
else:
click.echo(hf_tokenizer.decode(tokens))
return
else:
tokens = hf_tokenizer.encode(text).ids
if truncate:
tokens = tokens[:truncate]

if encode_tokens:
if as_tokens:
click.echo(hf_tokenizer.decode(tokens))
else:
click.echo(" ".join(str(t) for t in tokens))
elif truncate:
click.echo(hf_tokenizer.decode(tokens), nl=False)
else:
click.echo(len(tokens))
return

# Use tiktoken for OpenAI tokenizers instead
try:
encoding = tiktoken.encoding_for_model(model)
except KeyError as e:
raise click.ClickException(f"Invalid model: {model}") from e

if decode_tokens:
tokens = [int(token) for token in re.findall(r"\d+", text)]
if as_tokens:
Expand All @@ -82,7 +130,20 @@ def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens)
return

# Tokenize it
tokens = encoding.encode(text)
kwargs = {}
if allow_special:
kwargs["allowed_special"] = "all"
try:
tokens = encoding.encode(text, **kwargs)
except ValueError as ex:
ex_str = str(ex)
if "disallowed special token" in ex_str and not allow_special:
# Just the first line, then add a hint
ex_str = (
ex_str.split("\n")[0]
+ "\n\nUse --allow-special to allow special tokens"
)
raise click.ClickException(ex_str)
if truncate:
tokens = tokens[:truncate]

Expand Down