diff --git a/README.md b/README.md index 5ca28e1..7c4b6be 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,6 @@ [![Contributors](https://img.shields.io/github/contributors/dmitry-brazhenko/SharpToken.svg)](https://github.com/dmitry-brazhenko/SharpToken/graphs/contributors) [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE) - - SharpToken is a C# library that serves as a port of the Python [tiktoken](https://github.com/openai/tiktoken) library. It provides functionality for encoding and decoding tokens using GPT-based encodings. This library is built for .NET 6, .NET 8 and .NET Standard 2.0, making it compatible with a wide range of frameworks. @@ -74,11 +72,12 @@ var count = encoding.CountTokens("Hello, world!"); // Output: 4 SharpToken currently supports the following models: -* `r50k_base` -* `p50k_base` -* `p50k_edit` -* `cl100k_base` -* `o200k_base` +- `r50k_base` +- `p50k_base` +- `p50k_edit` +- `cl100k_base` +- `o200k_base` +- `o200k_harmony` You can use any of these models when creating an instance of GptEncoding: @@ -88,6 +87,7 @@ var p50kBaseEncoding = GptEncoding.GetEncoding("p50k_base"); var p50kEditEncoding = GptEncoding.GetEncoding("p50k_edit"); var cl100kBaseEncoding = GptEncoding.GetEncoding("cl100k_base"); var o200kBaseEncoding = GptEncoding.GetEncoding("o200k_base"); +var o200kHarmonyEncoding = GptEncoding.GetEncoding("o200k_harmony"); ``` ### Model Prefix Matching @@ -96,14 +96,17 @@ Apart from specifying direct model names, SharpToken also provides functionality Here are the current supported prefixes and their corresponding encodings: -| Model Prefix | Encoding | -|---------------------|------------| -| `gpt-4o` | `o200k_base` | -| `gpt-4-` | `cl100k_base` | -| `gpt-3.5-turbo-` | `cl100k_base` | -| `gpt-35-turbo` | `cl100k_base` | +| Model Prefix | Encoding | +| ---------------- | ------------- | +| `gpt-5` | `o200k_base` | +| `gpt-4o` | `o200k_base` | +| `gpt-4-` | `cl100k_base` | +| `gpt-3.5-turbo-` | `cl100k_base` | +| 
`gpt-35-turbo` | `cl100k_base` | Examples of model names that fall under these prefixes include: + +- For the prefix `gpt-5`: `gpt-5`, `gpt-5-mini`, `gpt-5-nano`, `gpt-5-pro`, `gpt-5-thinking`, `gpt-5-2024-08-07`, `gpt-5-chat-latest`, etc. - For the prefix `gpt-4o`: `gpt-4o`, `gpt-4o-2024-05-13`, etc. - For the prefix `gpt-4-`: `gpt-4-0314`, `gpt-4-32k`, etc. - For the prefix `gpt-3.5-turbo-`: `gpt-3.5-turbo-0301`, `gpt-3.5-turbo-0401`, etc. @@ -117,9 +120,6 @@ string encodingName = Model.GetEncodingNameForModel("gpt-4-0314"); // This will If the provided model name doesn't match any direct model names or prefixes, the method will return `null`. - - - ## Understanding Encoded Values When you encode a string using the Encode method, the returned value is a list of integers that represent tokens in the @@ -289,23 +289,23 @@ BenchmarkDotNet v0.13.9+228a464e8be6c580ad9408e98f18813f6407fb5a, Windows 11 (10 .NET Framework 4.7.1 : .NET Framework 4.8.1 (4.8.9181.0), X64 RyuJIT VectorSize=256 ``` -| Method | Job | Runtime | Mean | Error | StdDev | Median | Gen0 | Gen1 | Allocated | -|------------------ |--------------------- |--------------------- |----------:|---------:|----------:|----------:|-----------:|----------:|----------:| -| **MLTokenizers** | .NET 8.0 | .NET 8.0 | 60.55 ms | 1.143 ms | 1.123 ms | 60.45 ms | 1000.0000 | - | 13.12 MB | -| **MLTokenizers** | .NET 6.0 | .NET 6.0 | 95.75 ms | 1.374 ms | 1.147 ms | 95.54 ms | 10500.0000 | - | 126.19 MB | -| **MLTokenizers** | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 291.77 ms | 5.811 ms | 11.195 ms | 291.64 ms | 21000.0000 | - | 127.33 MB | -| | | | | | | | | | | -| *SharpToken* | .NET 8.0 | .NET 8.0 | 87.78 ms | 1.700 ms | 1.590 ms | 87.34 ms | 1000.0000 | - | 22.13 MB | -| *SharpToken* | .NET 6.0 | .NET 6.0 | 128.84 ms | 1.718 ms | 1.607 ms | 128.17 ms | 16250.0000 | 500.0000 | 196.31 MB | -| *SharpToken* | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 356.21 ms | 6.843 ms | 10.854 ms | 355.09 ms | 34000.0000 | 
1000.0000 | 204.39 MB | -| | | | | | | | | | | -| *TokenizerLib* | .NET 8.0 | .NET 8.0 | 109.26 ms | 2.082 ms | 4.482 ms | 107.90 ms | 18200.0000 | 600.0000 | 217.82 MB | -| *TokenizerLib* | .NET 6.0 | .NET 6.0 | 126.16 ms | 2.959 ms | 8.630 ms | 122.34 ms | 18000.0000 | 500.0000 | 217.82 MB | -| *TokenizerLib* | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 374.71 ms | 7.374 ms | 16.794 ms | 370.12 ms | 40000.0000 | 1000.0000 | 243.79 MB | -| | | | | | | | | | | -| *TiktokenSharp* | .NET 8.0 | .NET 8.0 | 177.34 ms | 3.506 ms | 8.797 ms | 174.98 ms | 28000.0000 | 1000.0000 | 338.98 MB | -| *TiktokenSharp* | .NET 6.0 | .NET 6.0 | 196.17 ms | 3.912 ms | 8.422 ms | 195.52 ms | 26000.0000 | 666.6667 | 313.26 MB | -| *TiktokenSharp* | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 488.22 ms | 9.696 ms | 15.931 ms | 487.17 ms | 63000.0000 | 1000.0000 | 378.31 MB | +| Method | Job | Runtime | Mean | Error | StdDev | Median | Gen0 | Gen1 | Allocated | +| ---------------- | -------------------- | -------------------- | --------: | -------: | --------: | --------: | ---------: | --------: | --------: | +| **MLTokenizers** | .NET 8.0 | .NET 8.0 | 60.55 ms | 1.143 ms | 1.123 ms | 60.45 ms | 1000.0000 | - | 13.12 MB | +| **MLTokenizers** | .NET 6.0 | .NET 6.0 | 95.75 ms | 1.374 ms | 1.147 ms | 95.54 ms | 10500.0000 | - | 126.19 MB | +| **MLTokenizers** | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 291.77 ms | 5.811 ms | 11.195 ms | 291.64 ms | 21000.0000 | - | 127.33 MB | +| | | | | | | | | | | +| _SharpToken_ | .NET 8.0 | .NET 8.0 | 87.78 ms | 1.700 ms | 1.590 ms | 87.34 ms | 1000.0000 | - | 22.13 MB | +| _SharpToken_ | .NET 6.0 | .NET 6.0 | 128.84 ms | 1.718 ms | 1.607 ms | 128.17 ms | 16250.0000 | 500.0000 | 196.31 MB | +| _SharpToken_ | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 356.21 ms | 6.843 ms | 10.854 ms | 355.09 ms | 34000.0000 | 1000.0000 | 204.39 MB | +| | | | | | | | | | | +| _TokenizerLib_ | .NET 8.0 | .NET 8.0 | 109.26 ms | 2.082 ms | 4.482 ms | 107.90 ms | 
18200.0000 | 600.0000 | 217.82 MB | +| _TokenizerLib_ | .NET 6.0 | .NET 6.0 | 126.16 ms | 2.959 ms | 8.630 ms | 122.34 ms | 18000.0000 | 500.0000 | 217.82 MB | +| _TokenizerLib_ | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 374.71 ms | 7.374 ms | 16.794 ms | 370.12 ms | 40000.0000 | 1000.0000 | 243.79 MB | +| | | | | | | | | | | +| _TiktokenSharp_ | .NET 8.0 | .NET 8.0 | 177.34 ms | 3.506 ms | 8.797 ms | 174.98 ms | 28000.0000 | 1000.0000 | 338.98 MB | +| _TiktokenSharp_ | .NET 6.0 | .NET 6.0 | 196.17 ms | 3.912 ms | 8.422 ms | 195.52 ms | 26000.0000 | 666.6667 | 313.26 MB | +| _TiktokenSharp_ | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 488.22 ms | 9.696 ms | 15.931 ms | 487.17 ms | 63000.0000 | 1000.0000 | 378.31 MB | ## Performance @@ -315,15 +315,16 @@ It uses modern multibyte CPU instructions and almost no heap allocations. All core methods have been tested on a large and a small input text. **Inputs:** + - `SmallText`: 453 B (text/plain) - `LargeText`: 51 KB (text/html) **Methods:** + - `Encode`: text to tokens - `Decode`: tokens to text - `CountTokens`: high performance API to count tokens of text - ``` BenchmarkDotNet v0.13.12, Windows 11 (10.0.22631.3296/23H2/2023Update/SunValley3) AMD Ryzen 9 3900X, 1 CPU, 24 logical and 12 physical cores @@ -334,8 +335,8 @@ AMD Ryzen 9 3900X, 1 CPU, 24 logical and 12 physical cores .NET Framework 4.7.1 : .NET Framework 4.8.1 (4.8.9181.0), X64 RyuJIT VectorSize=256 ``` -| Method | Mean | Error | StdDev | Ratio | RatioSD | Allocated | Alloc Ratio | -|------------------------- |--------------:|------------:|------------:|------:|--------:|----------:|------------:| +| Method | Mean | Error | StdDev | Ratio | RatioSD | Allocated | Alloc Ratio | +| ------------------------ | ------------: | ----------: | ----------: | ----: | ------: | --------: | ----------: | | **.NET 8.0** | | | | | | | | | Encode_SmallText | 22.649 us | 0.4244 us | 0.4359 us | 0.28 | 0.01 | 696 B | 0.02 | | Encode_LargeText | 4,542.505 us | 87.7988 
us | 104.5182 us | 0.24 | 0.01 | 155547 B | 0.03 | diff --git a/SharpToken.Benchmark/SharpToken.Benchmark.csproj b/SharpToken.Benchmark/SharpToken.Benchmark.csproj index dac861b..cbaa71e 100644 --- a/SharpToken.Benchmark/SharpToken.Benchmark.csproj +++ b/SharpToken.Benchmark/SharpToken.Benchmark.csproj @@ -3,6 +3,7 @@ Exe net471;net6.0;net8.0 + net6.0;net8.0 true diff --git a/SharpToken.Tests/SharpToken.Tests.cs b/SharpToken.Tests/SharpToken.Tests.cs index 8c00011..e215a12 100644 --- a/SharpToken.Tests/SharpToken.Tests.cs +++ b/SharpToken.Tests/SharpToken.Tests.cs @@ -1,12 +1,13 @@ using System.Net.Http; using System.Text; +using System.Linq; using NUnit.Framework; namespace SharpToken.Tests; public class Tests { - private static readonly List ModelsList = new() { "p50k_base", "r50k_base", "cl100k_base", "o200k_base" }; + private static readonly List ModelsList = new() { "p50k_base", "r50k_base", "cl100k_base", "o200k_base", "o200k_harmony" }; private static readonly List>> TestData = TestHelpers.ReadTestPlans("SharpToken.Tests.data.TestPlans.txt"); @@ -23,7 +24,19 @@ public void TestEncodingAndDecoding(Tuple> resource) var (encodingName, textToEncode, expectedEncoded) = resource; var encoding = GptEncoding.GetEncoding(encodingName); - var encoded = encoding.Encode(textToEncode); + + // Detect if the text contains special tokens + var allowedSpecial = new HashSet(); + var specialTokens = GetSpecialTokensForEncoding(encodingName); + foreach (var token in specialTokens) + { + if (textToEncode.Contains(token)) + { + allowedSpecial.Add(token); + } + } + + var encoded = encoding.Encode(textToEncode, allowedSpecial); var decodedText = encoding.Decode(encoded); Assert.Multiple(() => { @@ -39,7 +52,19 @@ public void TestTokensLength(Tuple> resource) var (encodingName, textToEncode, expectedEncoded) = resource; var encoding = GptEncoding.GetEncoding(encodingName); - var tokenLength = encoding.CountTokens(textToEncode); + + // Detect if the text contains special tokens + var 
allowedSpecial = new HashSet(); + var specialTokens = GetSpecialTokensForEncoding(encodingName); + foreach (var token in specialTokens) + { + if (textToEncode.Contains(token)) + { + allowedSpecial.Add(token); + } + } + + var tokenLength = encoding.CountTokens(textToEncode, allowedSpecial); Assert.Multiple(() => { Assert.That(tokenLength, Is.EqualTo(expectedEncoded.Count)); @@ -53,7 +78,19 @@ public async Task TestEncodingAndDecodingInParallel() { var (encodingName, textToEncode, expectedEncoded) = _; var encoding = GptEncoding.GetEncoding(encodingName); - var encoded = encoding.Encode(textToEncode); + + // Detect if the text contains special tokens + var allowedSpecial = new HashSet(); + var specialTokens = GetSpecialTokensForEncoding(encodingName); + foreach (var token in specialTokens) + { + if (textToEncode.Contains(token)) + { + allowedSpecial.Add(token); + } + } + + var encoded = encoding.Encode(textToEncode, allowedSpecial); var decodedText = encoding.Decode(encoded); return (textToEncode, encoded, expectedEncoded, decodedText); })); @@ -162,6 +199,13 @@ static void TestModelPrefixMappingFailsAction() [TestCaseSource(nameof(ModelsList))] public async Task TestLocalResourceMatchesRemoteResource(string modelName) { + // Skip o200k_harmony as it reuses o200k_base.tiktoken and doesn't have its own remote file + if (modelName == "o200k_harmony") + { + Assert.Pass("o200k_harmony reuses o200k_base.tiktoken file and doesn't have its own remote file"); + return; + } + var embeddedResourceName = $"SharpToken.data.{modelName}.tiktoken"; var remoteResourceUrl = $"https://openaipublic.blob.core.windows.net/encodings/{modelName}.tiktoken"; @@ -199,4 +243,63 @@ public void TestEncodingForModel() Assert.That(decodedText, Is.EqualTo(inputText)); }); } + + [Test] + public void TestO200KHarmonySpecialTokens() + { + var encoding = GptEncoding.GetEncoding("o200k_harmony"); + const string inputText = "Hello, world!"; + + // Test basic encoding/decoding + var encoded = 
encoding.Encode(inputText); + var decodedText = encoding.Decode(encoded); + Assert.That(decodedText, Is.EqualTo(inputText)); + + // Test that o200k_harmony has more special tokens than o200k_base + var baseEncoding = GptEncoding.GetEncoding("o200k_base"); + + // Test encoding with special tokens + var textWithSpecialTokens = "Hello <|startoftext|> world <|call|> test <|reserved_200020|>"; + var encodedSpecial = encoding.Encode(textWithSpecialTokens, allowedSpecial: new HashSet { "<|startoftext|>", "<|call|>", "<|reserved_200020|>" }); + var decodedSpecial = encoding.Decode(encodedSpecial); + + Assert.That(decodedSpecial, Is.EqualTo(textWithSpecialTokens)); + + // Verify specific special token IDs + Assert.That(encoding.Encode("<|startoftext|>", allowedSpecial: new HashSet { "<|startoftext|>" }), Is.EqualTo(new List { 199998 })); + Assert.That(encoding.Encode("<|call|>", allowedSpecial: new HashSet { "<|call|>" }), Is.EqualTo(new List { 200012 })); + Assert.That(encoding.Encode("<|reserved_200020|>", allowedSpecial: new HashSet { "<|reserved_200020|>" }), Is.EqualTo(new List { 200020 })); + } + + [Test] + public void TestGPT5ModelMappings() + { + // Test that GPT-5 models map to the correct encodings + Assert.That(Model.GetEncodingNameForModel("gpt-5"), Is.EqualTo("o200k_base")); + Assert.That(Model.GetEncodingNameForModel("gpt-5-mini"), Is.EqualTo("o200k_base")); + Assert.That(Model.GetEncodingNameForModel("gpt-5-nano"), Is.EqualTo("o200k_base")); + Assert.That(Model.GetEncodingNameForModel("gpt-5-pro"), Is.EqualTo("o200k_base")); + Assert.That(Model.GetEncodingNameForModel("gpt-5-thinking"), Is.EqualTo("o200k_base")); + + // Test prefix matching for GPT-5 variants + Assert.That(Model.GetEncodingNameForModel("gpt-5-2024-08-07"), Is.EqualTo("o200k_base")); + Assert.That(Model.GetEncodingNameForModel("gpt-5-chat-latest"), Is.EqualTo("o200k_base")); + } + + private static HashSet GetSpecialTokensForEncoding(string encodingName) + { + return encodingName switch + { + 
"r50k_base" or "p50k_base" => new HashSet { "<|endoftext|>" }, + "p50k_edit" => new HashSet { "<|endoftext|>", "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>" }, + "cl100k_base" => new HashSet { "<|endoftext|>", "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>", "<|endofprompt|>" }, + "o200k_base" => new HashSet { "<|endoftext|>", "<|endofprompt|>" }, + "o200k_harmony" => new HashSet(new HashSet + { + "<|endoftext|>", "<|endofprompt|>", "<|startoftext|>", "<|return|>", "<|constrain|>", + "<|channel|>", "<|start|>", "<|end|>", "<|message|>", "<|call|>" + }.Union(Enumerable.Range(200000, 1088).Select(i => $"<|reserved_{i}|>"))), + _ => new HashSet() + }; + } } diff --git a/SharpToken.Tests/SharpToken.Tests.csproj b/SharpToken.Tests/SharpToken.Tests.csproj index a2c9814..80bb383 100644 --- a/SharpToken.Tests/SharpToken.Tests.csproj +++ b/SharpToken.Tests/SharpToken.Tests.csproj @@ -2,6 +2,7 @@ net471;netcoreapp3.1;net6.0;net8.0 + net6.0;net8.0 preview enable enable diff --git a/SharpToken.Tests/data/TestPlanGenerator.py b/SharpToken.Tests/data/TestPlanGenerator.py index c9a1b1b..c6c49d6 100644 --- a/SharpToken.Tests/data/TestPlanGenerator.py +++ b/SharpToken.Tests/data/TestPlanGenerator.py @@ -7,13 +7,84 @@ def read_test_samples(filename): return test_samples +def get_special_token_samples(encoding): + """Generate special token test samples for the given encoding""" + special_token_samples = [] + + # Get all special tokens for this encoding + special_tokens = list(encoding._special_tokens.keys()) + + if not special_tokens: + return special_token_samples + + # Add individual special tokens + for token in special_tokens[:10]: # Limit to first 10 to avoid too many tests + special_token_samples.append(token) + + # Add special tokens with text + if "<|endoftext|>" in special_tokens: + special_token_samples.extend([ + "Hello <|endoftext|> World", + "<|endoftext|>This is a test<|endoftext|>" + ]) + + # Add fill-in-the-middle combinations for encodings that support it 
+ if all(token in special_tokens for token in ["<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>"]): + special_token_samples.extend([ + "<|fim_prefix|>def hello():<|fim_suffix|> print('world')<|fim_middle|>", + "<|fim_prefix|>function test() {<|fim_suffix|>}<|fim_middle|> return true;" + ]) + + # Add endofprompt combinations + if "<|endofprompt|>" in special_tokens: + special_token_samples.extend([ + "User: Hello<|endofprompt|>Assistant: Hi there!", + "Question<|endofprompt|>Answer" + ]) + + # Add o200k_harmony specific combinations + if "<|startoftext|>" in special_tokens and "<|call|>" in special_tokens: + special_token_samples.extend([ + "<|startoftext|>Hello World<|endoftext|>", + "<|call|>function_name<|return|>result", + "<|message|>user<|constrain|>safe<|channel|>text", + "<|start|>conversation<|message|>content<|end|>" + ]) + + # Add some reserved tokens for o200k_harmony + reserved_tokens = [token for token in special_tokens if token.startswith("<|reserved_")] + if reserved_tokens: + special_token_samples.extend([ + reserved_tokens[0], # First reserved token + f"Text with {reserved_tokens[0]} reserved token" if len(reserved_tokens) > 0 else "" + ]) + + return special_token_samples + + def generate_test_plans(test_samples, encodings): test_plans = [] for encoding in encodings: + # Process regular test samples (disallow special tokens) for sample in test_samples: - encoded = encoding.encode(sample, allowed_special={""}) - test_plans.append((encoding.name, sample, encoded)) + if sample.strip(): # Skip empty lines + encoded = encoding.encode(sample, allowed_special=set()) + test_plans.append((encoding.name, sample, encoded)) + + # Process special token samples (allow special tokens) + special_samples = get_special_token_samples(encoding) + for sample in special_samples: + if sample.strip(): # Skip empty samples + try: + # Allow all special tokens for this encoding + all_special_tokens = set(encoding._special_tokens.keys()) + encoded = encoding.encode(sample, 
allowed_special=all_special_tokens) + test_plans.append((encoding.name, sample, encoded)) + except Exception as e: + # Skip samples that cause encoding errors + print(f"Skipping sample '{sample}' for {encoding.name}: {e}") + continue return test_plans @@ -36,6 +107,7 @@ def save_test_plans(test_plans, filename): tiktoken.get_encoding("p50k_edit"), tiktoken.get_encoding("cl100k_base"), tiktoken.get_encoding("o200k_base"), + tiktoken.get_encoding("o200k_harmony"), ] test_samples = read_test_samples(samples_filename) diff --git a/SharpToken.Tests/data/TestPlans.txt b/SharpToken.Tests/data/TestPlans.txt index c9574e8..a911278 100644 --- a/SharpToken.Tests/data/TestPlans.txt +++ b/SharpToken.Tests/data/TestPlans.txt @@ -1,7 +1,3 @@ -EncodingName: r50k_base -Sample: -Encoded: [] - EncodingName: r50k_base Sample: a Encoded: [64] @@ -258,9 +254,17 @@ EncodingName: r50k_base Sample: В цепочках поставок кейс-стадии, когда называются одна или несколько сторон, страдают от серьезных конфликтов интересов. Компании и их поддерживающие поставщики (программное обеспечение, консалтинг) имеют заинтересованность в представлении результата в положительном свете. Кроме того, фактические цепочки поставок обычно получают пользу или пострадают от случайных условий, которые никак не связаны с качеством их исполнения. Персонажи цепочки поставок - это методологический ответ на эти проблемы. 
Encoded: [140, 240, 220, 141, 228, 16843, 140, 123, 15166, 141, 229, 31583, 16142, 141, 227, 12466, 123, 15166, 21727, 20375, 16142, 38857, 25443, 118, 12466, 118, 16843, 140, 117, 21727, 12, 21727, 20375, 16142, 43666, 18849, 18849, 11, 12466, 118, 25443, 111, 43666, 16142, 12466, 121, 16142, 140, 115, 45035, 38857, 16142, 141, 236, 20375, 21727, 40623, 12466, 122, 43666, 22177, 16142, 12466, 116, 30143, 18849, 12466, 121, 16843, 21727, 31583, 25443, 119, 45367, 31583, 15166, 220, 21727, 20375, 15166, 21169, 15166, 22177, 11, 220, 21727, 20375, 21169, 16142, 43666, 16142, 141, 236, 20375, 12466, 122, 20375, 220, 21727, 16843, 21169, 45367, 16843, 140, 115, 22177, 45035, 141, 227, 12466, 118, 15166, 22177, 141, 226, 30143, 18849, 31583, 20375, 25443, 110, 12466, 116, 22177, 20375, 16843, 21169, 16843, 21727, 25443, 110, 13, 12466, 248, 25443, 120, 140, 123, 16142, 22177, 18849, 18849, 12466, 116, 12466, 116, 141, 227, 12466, 123, 25443, 112, 43666, 16843, 21169, 140, 114, 18849, 38857, 16142, 141, 236, 141, 231, 18849, 16843, 12466, 123, 15166, 21727, 20375, 16142, 38857, 141, 231, 18849, 31583, 18849, 357, 140, 123, 21169, 25443, 111, 21169, 16142, 43108, 43108, 22177, 15166, 16843, 12466, 122, 140, 109, 16843, 21727, 140, 123, 16843, 141, 229, 16843, 22177, 18849, 16843, 11, 12466, 118, 15166, 22177, 21727, 16142, 30143, 20375, 18849, 22177, 140, 111, 8, 12466, 116, 43108, 16843, 141, 236, 20375, 12466, 115, 16142, 18849, 22177, 20375, 16843, 21169, 16843, 21727, 25443, 110, 16142, 22177, 22177, 15166, 21727, 20375, 45367, 12466, 110, 12466, 123, 21169, 16843, 43666, 21727, 20375, 16142, 38857, 30143, 16843, 22177, 18849, 18849, 220, 21169, 16843, 140, 115, 35072, 30143, 45367, 20375, 16142, 20375, 16142, 12466, 110, 12466, 123, 25443, 119, 25443, 114, 18849, 20375, 16843, 30143, 45367, 22177, 25443, 120, 220, 21727, 38857, 16843, 20375, 16843, 13, 12466, 248, 21169, 25443, 120, 16843, 220, 20375, 25443, 111, 15166, 11, 220, 141, 226, 16142, 31583, 20375, 18849, 
141, 229, 16843, 21727, 31583, 18849, 16843, 220, 141, 228, 16843, 140, 123, 15166, 141, 229, 31583, 18849, 12466, 123, 15166, 21727, 20375, 16142, 38857, 25443, 118, 12466, 122, 140, 109, 45035, 141, 229, 22177, 15166, 12466, 123, 25443, 119, 35072, 141, 229, 16142, 141, 236, 20375, 12466, 123, 25443, 119, 45367, 140, 115, 35072, 12466, 116, 30143, 18849, 12466, 123, 15166, 21727, 20375, 21169, 16142, 43666, 16142, 141, 236, 20375, 12466, 122, 20375, 220, 21727, 30143, 35072, 141, 229, 16142, 140, 117, 22177, 45035, 141, 227, 220, 35072, 21727, 30143, 25443, 110, 18849, 140, 117, 11, 12466, 118, 15166, 20375, 15166, 21169, 45035, 16843, 12466, 121, 18849, 31583, 16142, 31583, 12466, 121, 16843, 220, 21727, 38857, 40623, 140, 115, 16142, 22177, 45035, 220, 21727, 12466, 118, 16142, 141, 229, 16843, 21727, 20375, 38857, 25443, 120, 12466, 116, 141, 227, 12466, 116, 21727, 140, 123, 25443, 119, 22177, 16843, 22177, 18849, 40623, 13, 12466, 253, 16843, 21169, 21727, 15166, 22177, 16142, 140, 114, 18849, 220, 141, 228, 16843, 140, 123, 15166, 141, 229, 31583, 18849, 12466, 123, 15166, 21727, 20375, 16142, 38857, 25443, 118, 532, 220, 141, 235, 20375, 15166, 12466, 120, 16843, 20375, 25443, 112, 25443, 119, 25443, 111, 18849, 141, 229, 16843, 21727, 31583, 18849, 140, 117, 12466, 122, 20375, 38857, 16843, 20375, 12466, 121, 16142, 220, 141, 235, 20375, 18849, 12466, 123, 21169, 25443, 109, 30143, 16843, 43108, 45035, 13] -EncodingName: p50k_base -Sample: -Encoded: [] +EncodingName: r50k_base +Sample: <|endoftext|> +Encoded: [50256] + +EncodingName: r50k_base +Sample: Hello <|endoftext|> World +Encoded: [15496, 220, 50256, 2159] + +EncodingName: r50k_base +Sample: <|endoftext|>This is a test<|endoftext|> +Encoded: [50256, 1212, 318, 257, 1332, 50256] EncodingName: p50k_base Sample: a @@ -518,9 +522,17 @@ EncodingName: p50k_base Sample: В цепочках поставок кейс-стадии, когда называются одна или несколько сторон, страдают от серьезных конфликтов интересов. 
Компании и их поддерживающие поставщики (программное обеспечение, консалтинг) имеют заинтересованность в представлении результата в положительном свете. Кроме того, фактические цепочки поставок обычно получают пользу или пострадают от случайных условий, которые никак не связаны с качеством их исполнения. Персонажи цепочки поставок - это методологический ответ на эти проблемы. Encoded: [140, 240, 220, 141, 228, 16843, 140, 123, 15166, 141, 229, 31583, 16142, 141, 227, 12466, 123, 15166, 21727, 20375, 16142, 38857, 25443, 118, 12466, 118, 16843, 140, 117, 21727, 12, 21727, 20375, 16142, 43666, 18849, 18849, 11, 12466, 118, 25443, 111, 43666, 16142, 12466, 121, 16142, 140, 115, 45035, 38857, 16142, 141, 236, 20375, 21727, 40623, 12466, 122, 43666, 22177, 16142, 12466, 116, 30143, 18849, 12466, 121, 16843, 21727, 31583, 25443, 119, 45367, 31583, 15166, 220, 21727, 20375, 15166, 21169, 15166, 22177, 11, 220, 21727, 20375, 21169, 16142, 43666, 16142, 141, 236, 20375, 12466, 122, 20375, 220, 21727, 16843, 21169, 45367, 16843, 140, 115, 22177, 45035, 141, 227, 12466, 118, 15166, 22177, 141, 226, 30143, 18849, 31583, 20375, 25443, 110, 12466, 116, 22177, 20375, 16843, 21169, 16843, 21727, 25443, 110, 13, 12466, 248, 25443, 120, 140, 123, 16142, 22177, 18849, 18849, 12466, 116, 12466, 116, 141, 227, 12466, 123, 25443, 112, 43666, 16843, 21169, 140, 114, 18849, 38857, 16142, 141, 236, 141, 231, 18849, 16843, 12466, 123, 15166, 21727, 20375, 16142, 38857, 141, 231, 18849, 31583, 18849, 357, 140, 123, 21169, 25443, 111, 21169, 16142, 43108, 43108, 22177, 15166, 16843, 12466, 122, 140, 109, 16843, 21727, 140, 123, 16843, 141, 229, 16843, 22177, 18849, 16843, 11, 12466, 118, 15166, 22177, 21727, 16142, 30143, 20375, 18849, 22177, 140, 111, 8, 12466, 116, 43108, 16843, 141, 236, 20375, 12466, 115, 16142, 18849, 22177, 20375, 16843, 21169, 16843, 21727, 25443, 110, 16142, 22177, 22177, 15166, 21727, 20375, 45367, 12466, 110, 12466, 123, 21169, 16843, 43666, 21727, 20375, 16142, 
38857, 30143, 16843, 22177, 18849, 18849, 220, 21169, 16843, 140, 115, 35072, 30143, 45367, 20375, 16142, 20375, 16142, 12466, 110, 12466, 123, 25443, 119, 25443, 114, 18849, 20375, 16843, 30143, 45367, 22177, 25443, 120, 220, 21727, 38857, 16843, 20375, 16843, 13, 12466, 248, 21169, 25443, 120, 16843, 220, 20375, 25443, 111, 15166, 11, 220, 141, 226, 16142, 31583, 20375, 18849, 141, 229, 16843, 21727, 31583, 18849, 16843, 220, 141, 228, 16843, 140, 123, 15166, 141, 229, 31583, 18849, 12466, 123, 15166, 21727, 20375, 16142, 38857, 25443, 118, 12466, 122, 140, 109, 45035, 141, 229, 22177, 15166, 12466, 123, 25443, 119, 35072, 141, 229, 16142, 141, 236, 20375, 12466, 123, 25443, 119, 45367, 140, 115, 35072, 12466, 116, 30143, 18849, 12466, 123, 15166, 21727, 20375, 21169, 16142, 43666, 16142, 141, 236, 20375, 12466, 122, 20375, 220, 21727, 30143, 35072, 141, 229, 16142, 140, 117, 22177, 45035, 141, 227, 220, 35072, 21727, 30143, 25443, 110, 18849, 140, 117, 11, 12466, 118, 15166, 20375, 15166, 21169, 45035, 16843, 12466, 121, 18849, 31583, 16142, 31583, 12466, 121, 16843, 220, 21727, 38857, 40623, 140, 115, 16142, 22177, 45035, 220, 21727, 12466, 118, 16142, 141, 229, 16843, 21727, 20375, 38857, 25443, 120, 12466, 116, 141, 227, 12466, 116, 21727, 140, 123, 25443, 119, 22177, 16843, 22177, 18849, 40623, 13, 12466, 253, 16843, 21169, 21727, 15166, 22177, 16142, 140, 114, 18849, 220, 141, 228, 16843, 140, 123, 15166, 141, 229, 31583, 18849, 12466, 123, 15166, 21727, 20375, 16142, 38857, 25443, 118, 532, 220, 141, 235, 20375, 15166, 12466, 120, 16843, 20375, 25443, 112, 25443, 119, 25443, 111, 18849, 141, 229, 16843, 21727, 31583, 18849, 140, 117, 12466, 122, 20375, 38857, 16843, 20375, 12466, 121, 16142, 220, 141, 235, 20375, 18849, 12466, 123, 21169, 25443, 109, 30143, 16843, 43108, 45035, 13] -EncodingName: p50k_edit -Sample: -Encoded: [] +EncodingName: p50k_base +Sample: <|endoftext|> +Encoded: [50256] + +EncodingName: p50k_base +Sample: Hello <|endoftext|> World 
+Encoded: [15496, 220, 50256, 2159] + +EncodingName: p50k_base +Sample: <|endoftext|>This is a test<|endoftext|> +Encoded: [50256, 1212, 318, 257, 1332, 50256] EncodingName: p50k_edit Sample: a @@ -778,9 +790,37 @@ EncodingName: p50k_edit Sample: В цепочках поставок кейс-стадии, когда называются одна или несколько сторон, страдают от серьезных конфликтов интересов. Компании и их поддерживающие поставщики (программное обеспечение, консалтинг) имеют заинтересованность в представлении результата в положительном свете. Кроме того, фактические цепочки поставок обычно получают пользу или пострадают от случайных условий, которые никак не связаны с качеством их исполнения. Персонажи цепочки поставок - это методологический ответ на эти проблемы. Encoded: [140, 240, 220, 141, 228, 16843, 140, 123, 15166, 141, 229, 31583, 16142, 141, 227, 12466, 123, 15166, 21727, 20375, 16142, 38857, 25443, 118, 12466, 118, 16843, 140, 117, 21727, 12, 21727, 20375, 16142, 43666, 18849, 18849, 11, 12466, 118, 25443, 111, 43666, 16142, 12466, 121, 16142, 140, 115, 45035, 38857, 16142, 141, 236, 20375, 21727, 40623, 12466, 122, 43666, 22177, 16142, 12466, 116, 30143, 18849, 12466, 121, 16843, 21727, 31583, 25443, 119, 45367, 31583, 15166, 220, 21727, 20375, 15166, 21169, 15166, 22177, 11, 220, 21727, 20375, 21169, 16142, 43666, 16142, 141, 236, 20375, 12466, 122, 20375, 220, 21727, 16843, 21169, 45367, 16843, 140, 115, 22177, 45035, 141, 227, 12466, 118, 15166, 22177, 141, 226, 30143, 18849, 31583, 20375, 25443, 110, 12466, 116, 22177, 20375, 16843, 21169, 16843, 21727, 25443, 110, 13, 12466, 248, 25443, 120, 140, 123, 16142, 22177, 18849, 18849, 12466, 116, 12466, 116, 141, 227, 12466, 123, 25443, 112, 43666, 16843, 21169, 140, 114, 18849, 38857, 16142, 141, 236, 141, 231, 18849, 16843, 12466, 123, 15166, 21727, 20375, 16142, 38857, 141, 231, 18849, 31583, 18849, 357, 140, 123, 21169, 25443, 111, 21169, 16142, 43108, 43108, 22177, 15166, 16843, 12466, 122, 140, 109, 16843, 21727, 140, 123, 
16843, 141, 229, 16843, 22177, 18849, 16843, 11, 12466, 118, 15166, 22177, 21727, 16142, 30143, 20375, 18849, 22177, 140, 111, 8, 12466, 116, 43108, 16843, 141, 236, 20375, 12466, 115, 16142, 18849, 22177, 20375, 16843, 21169, 16843, 21727, 25443, 110, 16142, 22177, 22177, 15166, 21727, 20375, 45367, 12466, 110, 12466, 123, 21169, 16843, 43666, 21727, 20375, 16142, 38857, 30143, 16843, 22177, 18849, 18849, 220, 21169, 16843, 140, 115, 35072, 30143, 45367, 20375, 16142, 20375, 16142, 12466, 110, 12466, 123, 25443, 119, 25443, 114, 18849, 20375, 16843, 30143, 45367, 22177, 25443, 120, 220, 21727, 38857, 16843, 20375, 16843, 13, 12466, 248, 21169, 25443, 120, 16843, 220, 20375, 25443, 111, 15166, 11, 220, 141, 226, 16142, 31583, 20375, 18849, 141, 229, 16843, 21727, 31583, 18849, 16843, 220, 141, 228, 16843, 140, 123, 15166, 141, 229, 31583, 18849, 12466, 123, 15166, 21727, 20375, 16142, 38857, 25443, 118, 12466, 122, 140, 109, 45035, 141, 229, 22177, 15166, 12466, 123, 25443, 119, 35072, 141, 229, 16142, 141, 236, 20375, 12466, 123, 25443, 119, 45367, 140, 115, 35072, 12466, 116, 30143, 18849, 12466, 123, 15166, 21727, 20375, 21169, 16142, 43666, 16142, 141, 236, 20375, 12466, 122, 20375, 220, 21727, 30143, 35072, 141, 229, 16142, 140, 117, 22177, 45035, 141, 227, 220, 35072, 21727, 30143, 25443, 110, 18849, 140, 117, 11, 12466, 118, 15166, 20375, 15166, 21169, 45035, 16843, 12466, 121, 18849, 31583, 16142, 31583, 12466, 121, 16843, 220, 21727, 38857, 40623, 140, 115, 16142, 22177, 45035, 220, 21727, 12466, 118, 16142, 141, 229, 16843, 21727, 20375, 38857, 25443, 120, 12466, 116, 141, 227, 12466, 116, 21727, 140, 123, 25443, 119, 22177, 16843, 22177, 18849, 40623, 13, 12466, 253, 16843, 21169, 21727, 15166, 22177, 16142, 140, 114, 18849, 220, 141, 228, 16843, 140, 123, 15166, 141, 229, 31583, 18849, 12466, 123, 15166, 21727, 20375, 16142, 38857, 25443, 118, 532, 220, 141, 235, 20375, 15166, 12466, 120, 16843, 20375, 25443, 112, 25443, 119, 25443, 111, 18849, 141, 
229, 16843, 21727, 31583, 18849, 140, 117, 12466, 122, 20375, 38857, 16843, 20375, 12466, 121, 16142, 220, 141, 235, 20375, 18849, 12466, 123, 21169, 25443, 109, 30143, 16843, 43108, 45035, 13] -EncodingName: cl100k_base -Sample: -Encoded: [] +EncodingName: p50k_edit +Sample: <|endoftext|> +Encoded: [50256] + +EncodingName: p50k_edit +Sample: <|fim_prefix|> +Encoded: [50281] + +EncodingName: p50k_edit +Sample: <|fim_middle|> +Encoded: [50282] + +EncodingName: p50k_edit +Sample: <|fim_suffix|> +Encoded: [50283] + +EncodingName: p50k_edit +Sample: Hello <|endoftext|> World +Encoded: [15496, 220, 50256, 2159] + +EncodingName: p50k_edit +Sample: <|endoftext|>This is a test<|endoftext|> +Encoded: [50256, 1212, 318, 257, 1332, 50256] + +EncodingName: p50k_edit +Sample: <|fim_prefix|>def hello():<|fim_suffix|> print('world')<|fim_middle|> +Encoded: [50281, 4299, 23748, 33529, 50283, 50258, 3601, 10786, 6894, 11537, 50282] + +EncodingName: p50k_edit +Sample: <|fim_prefix|>function test() {<|fim_suffix|>}<|fim_middle|> return true; +Encoded: [50281, 8818, 1332, 3419, 1391, 50283, 92, 50282, 50258, 1441, 2081, 26] EncodingName: cl100k_base Sample: a @@ -1038,9 +1078,49 @@ EncodingName: cl100k_base Sample: В цепочках поставок кейс-стадии, когда называются одна или несколько сторон, страдают от серьезных конфликтов интересов. Компании и их поддерживающие поставщики (программное обеспечение, консалтинг) имеют заинтересованность в представлении результата в положительном свете. Кроме того, фактические цепочки поставок обычно получают пользу или пострадают от случайных условий, которые никак не связаны с качеством их исполнения. Персонажи цепочки поставок - это методологический ответ на эти проблемы. 
Encoded: [16604, 39233, 67124, 38438, 13433, 10693, 18154, 68374, 15088, 7820, 21708, 2297, 12, 6735, 23680, 47273, 11, 7820, 14837, 40590, 6850, 19188, 35667, 1506, 70129, 21204, 53419, 14525, 46177, 19175, 66144, 54570, 18868, 9239, 17766, 11, 18868, 11001, 112, 1506, 70129, 20879, 66386, 4929, 32985, 44786, 38098, 19916, 11320, 4898, 1830, 6856, 98051, 23297, 6856, 13, 36479, 12507, 8164, 14332, 1840, 7740, 7740, 10693, 40842, 7094, 58669, 28089, 1506, 90457, 50306, 18154, 68374, 14476, 1840, 17165, 320, 8164, 2233, 69954, 6578, 48150, 21923, 23297, 8164, 56857, 17618, 11, 38098, 2297, 16331, 1830, 19479, 15752, 8, 48835, 1532, 70129, 44946, 19479, 58060, 23297, 6856, 7486, 61379, 5927, 57925, 68374, 3114, 84198, 18600, 71889, 43203, 5927, 29619, 21956, 56008, 94962, 45916, 8341, 1532, 13, 36479, 2233, 12507, 1532, 11047, 22885, 11, 18034, 16248, 23311, 6148, 23297, 17165, 1532, 39233, 67124, 38438, 17165, 18154, 68374, 15088, 21923, 4655, 6148, 13999, 52432, 1506, 70129, 5173, 28647, 3865, 46177, 18154, 6735, 11001, 112, 1506, 70129, 20879, 90226, 19039, 44786, 97991, 6856, 44938, 11, 38153, 51736, 6850, 38822, 16248, 19175, 45916, 64084, 7486, 4655, 5524, 7820, 50223, 35034, 12507, 7740, 10693, 53687, 47680, 18999, 13, 23227, 7753, 2297, 17766, 38657, 1840, 39233, 67124, 38438, 17165, 18154, 68374, 15088, 482, 68979, 56729, 7975, 14837, 81301, 17165, 12415, 93747, 13373, 21599, 23311, 12561, 14082, 37131, 4655, 13] -EncodingName: o200k_base -Sample: -Encoded: [] +EncodingName: cl100k_base +Sample: <|endoftext|> +Encoded: [100257] + +EncodingName: cl100k_base +Sample: <|fim_prefix|> +Encoded: [100258] + +EncodingName: cl100k_base +Sample: <|fim_middle|> +Encoded: [100259] + +EncodingName: cl100k_base +Sample: <|fim_suffix|> +Encoded: [100260] + +EncodingName: cl100k_base +Sample: <|endofprompt|> +Encoded: [100276] + +EncodingName: cl100k_base +Sample: Hello <|endoftext|> World +Encoded: [9906, 220, 100257, 4435] + +EncodingName: cl100k_base +Sample: 
<|endoftext|>This is a test<|endoftext|> +Encoded: [100257, 2028, 374, 264, 1296, 100257] + +EncodingName: cl100k_base +Sample: <|fim_prefix|>def hello():<|fim_suffix|> print('world')<|fim_middle|> +Encoded: [100258, 755, 24748, 4658, 100260, 262, 1194, 493, 14957, 873, 100259] + +EncodingName: cl100k_base +Sample: <|fim_prefix|>function test() {<|fim_suffix|>}<|fim_middle|> return true; +Encoded: [100258, 1723, 1296, 368, 314, 100260, 92, 100259, 262, 471, 837, 26] + +EncodingName: cl100k_base +Sample: User: Hello<|endofprompt|>Assistant: Hi there! +Encoded: [1502, 25, 22691, 100276, 72803, 25, 21694, 1070, 0] + +EncodingName: cl100k_base +Sample: Question<|endofprompt|>Answer +Encoded: [14924, 100276, 16533] EncodingName: o200k_base Sample: a @@ -1297,3 +1377,364 @@ Encoded: [102415, 237, 102415, 236, 102415, 238, 102415, 232, 102415, 233, 10241 EncodingName: o200k_base Sample: В цепочках поставок кейс-стадии, когда называются одна или несколько сторон, страдают от серьезных конфликтов интересов. Компании и их поддерживающие поставщики (программное обеспечение, консалтинг) имеют заинтересованность в представлении результата в положительном свете. Кроме того, фактические цепочки поставок обычно получают пользу или пострадают от случайных условий, которые никак не связаны с качеством их исполнения. Персонажи цепочки поставок - это методологический ответ на эти проблемы. 
Encoded: [3540, 121615, 4554, 20948, 39823, 2533, 36916, 369, 12, 749, 4260, 2779, 11, 21029, 56615, 10937, 39735, 7388, 31039, 89304, 11, 15356, 72215, 2814, 75580, 3959, 182012, 938, 29740, 938, 13, 130022, 20548, 816, 11229, 39471, 12590, 34491, 39823, 166426, 350, 9501, 162886, 7444, 181078, 11, 52047, 62242, 26951, 8, 59002, 133671, 5705, 8209, 743, 20311, 44678, 192243, 743, 35984, 198122, 4706, 11094, 13, 71337, 19182, 11, 26563, 79261, 13298, 121615, 51074, 39823, 2533, 69654, 172161, 140433, 7388, 15073, 714, 72215, 2814, 74971, 3959, 115462, 11, 16087, 121974, 1967, 26790, 8276, 669, 29268, 35963, 11229, 186377, 13, 60026, 106365, 62928, 121615, 51074, 39823, 2533, 533, 8577, 39009, 12062, 44920, 30168, 1235, 25671, 64524, 13] + +EncodingName: o200k_base +Sample: <|endoftext|> +Encoded: [199999] + +EncodingName: o200k_base +Sample: <|endofprompt|> +Encoded: [200018] + +EncodingName: o200k_base +Sample: Hello <|endoftext|> World +Encoded: [13225, 220, 199999, 5922] + +EncodingName: o200k_base +Sample: <|endoftext|>This is a test<|endoftext|> +Encoded: [199999, 2500, 382, 261, 1746, 199999] + +EncodingName: o200k_base +Sample: User: Hello<|endofprompt|>Assistant: Hi there! +Encoded: [1844, 25, 32949, 200018, 91655, 25, 19260, 1354, 0] + +EncodingName: o200k_base +Sample: Question<|endofprompt|>Answer +Encoded: [15143, 200018, 17045] + +EncodingName: o200k_harmony +Sample: a +Encoded: [64] + +EncodingName: o200k_harmony +Sample: 1 +Encoded: [16] + +EncodingName: o200k_harmony +Sample: a a +Encoded: [64, 261] + +EncodingName: o200k_harmony +Sample: hello +Encoded: [24912] + +EncodingName: o200k_harmony +Sample: Hello, World! How are you today? 🌍 +Encoded: [13225, 11, 5922, 0, 3253, 553, 481, 4044, 30, 130321, 235] + +EncodingName: o200k_harmony +Sample: こんにちは、世界!お元気ですか? +Encoded: [95839, 1395, 28428, 3393, 8930, 6753, 25717, 15121, 7128, 4802] + +EncodingName: o200k_harmony +Sample: Hola, mundo! ¿Cómo estás hoy? 
🇪🇸 +Encoded: [49864, 11, 10225, 0, 12873, 46515, 58166, 20502, 30, 173468, 103, 55506, 116] + +EncodingName: o200k_harmony +Sample: Привет, мир! Как дела? +Encoded: [23881, 131903, 11, 37934, 0, 26029, 78857, 30] + +EncodingName: o200k_harmony +Sample: 안녕하세요, 세상! 오늘 기분이 어때요? 🇰🇷 +Encoded: [14307, 171731, 11, 28126, 8612, 0, 106820, 11061, 15567, 2186, 21252, 41856, 7952, 30, 173468, 108, 55506, 115] + +EncodingName: o200k_harmony +Sample: Bonjour, le monde ! Comment ça va aujourd'hui ? 🇫🇷 +Encoded: [45751, 11, 505, 15807, 1073, 15406, 13590, 3423, 32226, 43820, 1423, 173468, 104, 55506, 115] + +EncodingName: o200k_harmony +Sample: The quick brown fox jumps over 13 lazy dogs. 😺 +Encoded: [976, 4853, 19705, 68347, 65613, 1072, 220, 1311, 29082, 16798, 13, 22861, 118] + +EncodingName: o200k_harmony +Sample: 1234567890!@#$%^&*()-=_+[]{};:'",.<>?/|`~ 🎉 +Encoded: [7633, 19354, 29338, 15, 0, 31, 108156, 108254, 5, 9, 31850, 51761, 10, 1951, 12083, 26, 9311, 672, 13, 28052, 153468, 91, 63, 93, 139786, 231] + +EncodingName: o200k_harmony +Sample: C# is a great programming language for building apps. +Encoded: [34, 2, 382, 261, 2212, 23238, 6439, 395, 6282, 12881, 13] + +EncodingName: o200k_harmony +Sample: El área de un triángulo es (base * altura) / 2. +Encoded: [4422, 20482, 334, 537, 12665, 30671, 5953, 878, 350, 5423, 425, 40031, 8, 820, 220, 17, 13] + +EncodingName: o200k_harmony +Sample: Здравствуйте, это мой первый раз здесь. Что мне делать? +Encoded: [182298, 11, 8577, 65733, 62134, 4702, 44039, 13, 53319, 27934, 45321, 30] + +EncodingName: o200k_harmony +Sample: હેલો, વિશ્વ! તમે આજે કેમ છો? 
🇮🇳 +Encoded: [6094, 187761, 11, 95706, 0, 52040, 59999, 104493, 72756, 30, 173468, 106, 55506, 111] + +EncodingName: o200k_harmony +Sample: ความรักและการเป็นกันเองเป็นสิ่งสำคัญที่สุดในโลก 🇹🇭 +Encoded: [26224, 151737, 45798, 11855, 34044, 54361, 121316, 34044, 66688, 15177, 75160, 5131, 61134, 81833, 28208, 93469, 173468, 117, 55506, 255] + +EncodingName: o200k_harmony +Sample: Python vs Java: Which programming language should you learn first? +Encoded: [60502, 10217, 13114, 25, 21580, 23238, 6439, 1757, 481, 4484, 1577, 30] + +EncodingName: o200k_harmony +Sample: A journey of a thousand miles begins with a single step. - Lao Tzu +Encoded: [32, 12647, 328, 261, 26791, 10753, 18015, 483, 261, 4590, 5983, 13, 533, 144616, 353, 7846] + +EncodingName: o200k_harmony +Sample: Die Grenzen meiner Sprache bedeuten die Grenzen meiner Welt. 🇩🇪 +Encoded: [8796, 111745, 39103, 89476, 93295, 9627, 1076, 111745, 39103, 23079, 13, 173468, 102, 55506, 103] + +EncodingName: o200k_harmony +Sample: יש לי כמה שאלות בנוגע לפרויקט החדש שלך. 🇮🇱 +Encoded: [7899, 42151, 60962, 129852, 2433, 34083, 110495, 108591, 181894, 162562, 69019, 13, 173468, 106, 55506, 109] + +EncodingName: o200k_harmony +Sample: Det är en vacker dag i Sverige. 🇸🇪 +Encoded: [3639, 7706, 469, 323, 17798, 8724, 575, 64714, 13, 173468, 116, 55506, 103] + +EncodingName: o200k_harmony +Sample: A ∀ x (P(x) → Q(x)) ∧ (∃x P(x)) → ∃x Q(x) +Encoded: [32, 35353, 222, 1215, 350, 47, 4061, 8, 15155, 1486, 4061, 915, 35353, 100, 350, 18085, 225, 87, 398, 4061, 915, 15155, 35353, 225, 87, 1486, 4061, 8] + +EncodingName: o200k_harmony +Sample: O Brasil é o maior país da América do Sul. 🇧🇷 +Encoded: [46, 15278, 1212, 293, 15966, 11106, 1033, 45086, 621, 27109, 13, 173468, 100, 55506, 115] + +EncodingName: o200k_harmony +Sample: L'amore è una forza potente che unisce le persone. 
🇮🇹 +Encoded: [43, 30344, 510, 6272, 1969, 125511, 111848, 1378, 537, 48541, 505, 40144, 13, 173468, 106, 55506, 117] + +EncodingName: o200k_harmony +Sample: Είναι μια ηλιόλουστη ημέρα στην Ελλάδα. 🇬🇷 +Encoded: [10303, 16239, 33246, 13115, 57330, 2097, 85087, 42851, 122278, 7648, 21399, 112618, 13, 173468, 105, 55506, 115] + +EncodingName: o200k_harmony +Sample: Teslim tarihi yaklaşıyor, projeyi zamanında bitirmemiz gerekiyor. 🇹🇷 +Encoded: [110176, 5406, 162005, 16000, 148409, 17368, 11, 16022, 33468, 30355, 10884, 3546, 2835, 347, 482, 195151, 13, 173468, 117, 55506, 115] + +EncodingName: o200k_harmony +Sample: Det finnes ingen bedre tid enn nå for å starte noe nytt. 🇳🇴 +Encoded: [3639, 145817, 30430, 56755, 8692, 23075, 19937, 395, 7086, 167203, 49921, 66369, 13, 173468, 111, 55506, 112] + +EncodingName: o200k_harmony +Sample: Aanvaard de uitdagingen van het leven met moed en vastberadenheid. 🇳🇱 +Encoded: [68832, 84482, 334, 180964, 1164, 1448, 21987, 1421, 137256, 469, 11332, 718, 9519, 7157, 13, 173468, 111, 55506, 109] + +EncodingName: o200k_harmony +Sample: Chào mừng bạn đến với thế giới của lập trình. 🇻🇳 +Encoded: [1205, 35134, 284, 75104, 22673, 27528, 18019, 46773, 69217, 12153, 96352, 49051, 13, 173468, 119, 55506, 111] + +EncodingName: o200k_harmony +Sample: Dlaczego warto uczyć się języków obcych? 🇵🇱 +Encoded: [136923, 182265, 82074, 337, 150478, 9721, 140914, 3705, 87043, 1067, 55175, 30, 173468, 113, 55506, 109] + +EncodingName: o200k_harmony +Sample: E = mc², uma equação famosa na física. 🇵🇹 +Encoded: [36, 314, 36958, 13848, 11, 3030, 2801, 3890, 96317, 898, 50251, 13, 173468, 113, 55506, 117] + +EncodingName: o200k_harmony +Sample: 你今天遇到什么有趣的事情了吗?🇨🇳 +Encoded: [12370, 47256, 57127, 6946, 10555, 3666, 57922, 1616, 162913, 112451, 4802, 55506, 101, 55506, 111] + +EncodingName: o200k_harmony +Sample: Nå er det tid for å feire med familie og venner. 
🇳🇴 +Encoded: [45, 592, 1111, 1476, 8692, 395, 7086, 1193, 594, 1475, 39603, 2085, 131786, 13, 173468, 111, 55506, 112] + +EncodingName: o200k_harmony +Sample: Þetta er góður dagur til að læra eitthvað nýtt. 🇮🇸 +Encoded: [7860, 20476, 1111, 91455, 17041, 8724, 330, 3453, 5993, 29333, 614, 180350, 49697, 1037, 13, 173468, 106, 55506, 116] + +EncodingName: o200k_harmony +Sample: გამარჯობა! როგორ ხართ დღეს? 🇬🇪 +Encoded: [165502, 69106, 24045, 0, 57298, 10892, 10875, 55856, 30, 173468, 105, 55506, 103] + +EncodingName: o200k_harmony +Sample: Mā te whakawhiti kōrero e whai hua ai tātou. 🇳🇿 +Encoded: [44, 2485, 729, 145047, 174352, 92760, 41643, 319, 101354, 76899, 8440, 260, 36813, 283, 13, 173468, 111, 55506, 123] + +EncodingName: o200k_harmony +Sample: Это был незабываемый опыт, который я буду помнить всегда. +Encoded: [63250, 11066, 37028, 66181, 42684, 6770, 67711, 11, 21903, 3277, 61571, 179329, 34056, 13] + +EncodingName: o200k_harmony +Sample: Διαβάζοντας βιβλία, εμπλουτίζουμε τον εαυτό μας με γνώσεις. +Encoded: [16611, 5690, 63324, 9153, 92025, 164613, 113428, 11, 109925, 85087, 30711, 9153, 33850, 20894, 4278, 727, 75653, 35170, 9173, 8558, 954, 92830, 13] + +EncodingName: o200k_harmony +Sample: A számítástechnika világa tele van izgalmas lehetőségekkel. 🇭🇺 +Encoded: [32, 70578, 5348, 449, 168649, 3113, 11748, 449, 2225, 5443, 1164, 4297, 8298, 4227, 51215, 53922, 95521, 108844, 13, 173468, 255, 55506, 118] + +EncodingName: o200k_harmony +Sample: Vždy je dobré mít plán B, pokud něco nevyjde. 🇨🇿 +Encoded: [53, 99728, 1264, 54560, 377, 98517, 192660, 418, 11, 118907, 134570, 453, 16670, 56244, 13, 173468, 101, 55506, 123] + +EncodingName: o200k_harmony +Sample: Dragostea e un sentiment minunat care ne unește pe toți. 🇷🇴 +Encoded: [25765, 564, 12932, 319, 537, 39160, 182050, 266, 2631, 453, 2463, 74495, 1045, 316, 20660, 13, 173468, 115, 55506, 112] + +EncodingName: o200k_harmony +Sample: دیکھو، آسمان میں کتنی تارے ہیں! 
🇵🇰 +Encoded: [547, 55459, 417, 1368, 3382, 11248, 1195, 6431, 144008, 14148, 112711, 1531, 12406, 0, 173468, 113, 55506, 108] + +EncodingName: o200k_harmony +Sample: Nenda polepole na ujifunze kila siku. 🇹🇿 +Encoded: [45, 5968, 25059, 112657, 898, 62112, 366, 119365, 52237, 54647, 13, 173468, 117, 55506, 123] + +EncodingName: o200k_harmony +Sample: Каква е твоята любима храна? 🇧🇬 +Encoded: [29831, 2224, 2404, 70888, 8886, 2734, 13230, 27621, 2442, 73698, 30, 173468, 100, 55506, 105] + +EncodingName: o200k_harmony +Sample: Sträva alltid efter att bli en bättre version av dig själv. +Encoded: [3504, 450, 2873, 63479, 22852, 1927, 27757, 469, 100580, 3926, 1452, 3807, 71554, 13] + +EncodingName: o200k_harmony +Sample: Філософія - це наука про знання. 🇺🇦 +Encoded: [10334, 17058, 107824, 30929, 533, 54543, 1235, 59929, 4964, 41072, 17561, 13, 173468, 118, 55506, 99] + +EncodingName: o200k_harmony +Sample: Το πρόγραμμα αυτό είναι πολύ ενδιαφέρον. 🇬🇷 +Encoded: [63423, 198704, 43845, 17278, 60896, 162904, 171319, 13, 173468, 105, 55506, 115] + +EncodingName: o200k_harmony +Sample: ^$%#*@!&)(_+=}{|:;"?><,~`'-./][ +Encoded: [61, 3, 4, 2, 154736, 0, 5, 168849, 18287, 29124, 91, 175979, 156569, 11, 93, 63, 44302, 2956, 2696] + +EncodingName: o200k_harmony +Sample: 4gH@!0sT*#(9^%$[x{}j+|Yz6;Q]~8 +Encoded: [19, 70, 39, 31, 0, 15, 82, 51, 9, 2, 7, 24, 61, 4, 3, 58, 87, 12083, 73, 10, 91, 56, 89, 21, 26, 48, 60, 93, 23] + +EncodingName: o200k_harmony +Sample: wNb)I<>#:i^P]*cR8ytUx1Q`6O@z/ +Encoded: [86, 67111, 8, 40, 28052, 97210, 72, 61, 47, 18579, 66, 49, 23, 5240, 182325, 16, 48, 63, 21, 46, 31, 89, 14] + +EncodingName: o200k_harmony +Sample: ÄÜö¿¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ +Encoded: [12921, 8858, 573, 11986, 20407, 61242, 18943, 43470, 43625, 41468, 18596, 64259, 19742, 25661, 4244, 74285, 8980, 98049, 6793, 32438, 13848, 45681, 14737, 39621, 69022, 5366, 68284, 84125, 11006, 1924, 43439, 27124, 75174, 11986] + +EncodingName: o200k_harmony +Sample: 
ƒšŠŒŽƒšŠŒŽƒšŠŒŽƒšŠŒŽƒšŠŒŽƒšŠŒŽ +Encoded: [99760, 812, 7490, 189136, 12915, 99760, 812, 7490, 189136, 12915, 99760, 812, 7490, 189136, 12915, 99760, 812, 7490, 189136, 12915, 99760, 812, 7490, 189136, 12915, 99760, 812, 7490, 189136, 12915] + +EncodingName: o200k_harmony +Sample: 5ħÅŸēýïūē$%#^*()_+{[ö&!@#?>|,.<> +Encoded: [20, 5762, 13631, 198355, 6238, 1840, 9954, 7637, 6238, 3, 4, 2, 61, 9, 416, 62, 10, 90, 58, 573, 5, 0, 31, 2, 10730, 91, 26887, 28052] + +EncodingName: o200k_harmony +Sample: 1B4t#%&*()_+dF5g^hJk7LmN0pQrS<>? +Encoded: [16, 33, 19, 83, 2, 4, 5, 9, 416, 62, 10, 67, 37, 20, 70, 61, 71, 41, 74, 22, 196093, 45, 15, 79, 135047, 50, 28052, 30] + +EncodingName: o200k_harmony +Sample: ¬§±²³µ¶·¹ºª«»¦©¯°±!@#$%^&*()_+ +Encoded: [74285, 18596, 32438, 13848, 45681, 39621, 69022, 5366, 84125, 11006, 25661, 4244, 1924, 41468, 19742, 98049, 6793, 32438, 0, 31, 108156, 108254, 5, 9, 416, 62, 10] + +EncodingName: o200k_harmony +Sample: 8mR5*w7^a$!F(0%#J9@X6vZ1)nU3]_Y/ +Encoded: [23, 76, 49, 20, 147727, 22, 61, 64, 3, 0, 37, 7, 15, 4, 2, 41, 24, 31, 55, 21, 85, 57, 16, 143612, 52, 18, 167793, 56, 14] + +EncodingName: o200k_harmony +Sample: 😊😀😁😂🤣😃😄😅😆😉😊😋😎😍😘😗😙😚☺️🙂🤗🤔 +Encoded: [102630, 84083, 156437, 41736, 92916, 13865, 225, 13865, 226, 13865, 227, 13865, 228, 72041, 102630, 13865, 233, 13865, 236, 74762, 122588, 13865, 245, 13865, 247, 13865, 248, 155014, 15148, 37459, 50378, 245, 50378, 242] + +EncodingName: o200k_harmony +Sample: 🤨😐😑😶🙄😏😣😥😮🤐😯😪😫😴😌🤓😛😜😝🤤 +Encoded: [50378, 101, 13865, 238, 13865, 239, 13865, 114, 70125, 226, 13865, 237, 13865, 96, 13865, 98, 13865, 106, 50378, 238, 13865, 107, 13865, 103, 13865, 104, 13865, 112, 13865, 234, 50378, 241, 13865, 249, 13865, 250, 13865, 251, 50378, 97] + +EncodingName: o200k_harmony +Sample: 😒😓😔😕🙃🤑😲😷🤒🤕🤢🤧😈👿👹👺💀☠️ +Encoded: [13865, 240, 13865, 241, 13865, 242, 13865, 243, 70125, 225, 4103, 11566, 13865, 110, 13865, 115, 50378, 240, 50378, 243, 50378, 95, 50378, 100, 13865, 230, 28823, 123, 28823, 117, 28823, 118, 31446, 222, 
8434, 254, 15148] + +EncodingName: o200k_harmony +Sample: 😾😿🙀😽😼😻🙈🙉🙊👶👦👧👨👩👴👵👨‍⚕️👩‍⚕️ +Encoded: [13865, 122, 13865, 123, 70125, 222, 13865, 121, 13865, 120, 13865, 119, 70125, 230, 70125, 231, 70125, 232, 28823, 114, 28823, 99, 28823, 100, 28823, 101, 28823, 102, 28823, 112, 28823, 113, 28823, 101, 2524, 84396, 243, 15148, 28823, 102, 2524, 84396, 243, 15148] + +EncodingName: o200k_harmony +Sample: 🌞🌝🌚🌛🌜🌙⭐️🌟💫✨🔥💥☄️🌈☀️🌤️⛅️🌥️ +Encoded: [64364, 252, 64364, 251, 64364, 248, 64364, 249, 64364, 250, 64364, 247, 62160, 15148, 64364, 253, 31446, 104, 97375, 96606, 31446, 98, 8434, 226, 15148, 64364, 230, 8434, 222, 15148, 64364, 97, 15148, 158, 249, 227, 15148, 64364, 98, 15148] + +EncodingName: o200k_harmony +Sample: 🍏🍎🍐🍊🍋🍌🍉🍇🍓🍈🍒🍑 +Encoded: [102415, 237, 102415, 236, 102415, 238, 102415, 232, 102415, 233, 102415, 234, 102415, 231, 102415, 229, 102415, 241, 102415, 230, 102415, 240, 102415, 239] + +EncodingName: o200k_harmony +Sample: В цепочках поставок кейс-стадии, когда называются одна или несколько сторон, страдают от серьезных конфликтов интересов. Компании и их поддерживающие поставщики (программное обеспечение, консалтинг) имеют заинтересованность в представлении результата в положительном свете. Кроме того, фактические цепочки поставок обычно получают пользу или пострадают от случайных условий, которые никак не связаны с качеством их исполнения. Персонажи цепочки поставок - это методологический ответ на эти проблемы. 
+Encoded: [3540, 121615, 4554, 20948, 39823, 2533, 36916, 369, 12, 749, 4260, 2779, 11, 21029, 56615, 10937, 39735, 7388, 31039, 89304, 11, 15356, 72215, 2814, 75580, 3959, 182012, 938, 29740, 938, 13, 130022, 20548, 816, 11229, 39471, 12590, 34491, 39823, 166426, 350, 9501, 162886, 7444, 181078, 11, 52047, 62242, 26951, 8, 59002, 133671, 5705, 8209, 743, 20311, 44678, 192243, 743, 35984, 198122, 4706, 11094, 13, 71337, 19182, 11, 26563, 79261, 13298, 121615, 51074, 39823, 2533, 69654, 172161, 140433, 7388, 15073, 714, 72215, 2814, 74971, 3959, 115462, 11, 16087, 121974, 1967, 26790, 8276, 669, 29268, 35963, 11229, 186377, 13, 60026, 106365, 62928, 121615, 51074, 39823, 2533, 533, 8577, 39009, 12062, 44920, 30168, 1235, 25671, 64524, 13] + +EncodingName: o200k_harmony +Sample: <|endoftext|> +Encoded: [199999] + +EncodingName: o200k_harmony +Sample: <|endofprompt|> +Encoded: [200018] + +EncodingName: o200k_harmony +Sample: <|startoftext|> +Encoded: [199998] + +EncodingName: o200k_harmony +Sample: <|reserved_200000|> +Encoded: [200000] + +EncodingName: o200k_harmony +Sample: <|reserved_200001|> +Encoded: [200001] + +EncodingName: o200k_harmony +Sample: <|return|> +Encoded: [200002] + +EncodingName: o200k_harmony +Sample: <|constrain|> +Encoded: [200003] + +EncodingName: o200k_harmony +Sample: <|reserved_200004|> +Encoded: [200004] + +EncodingName: o200k_harmony +Sample: <|channel|> +Encoded: [200005] + +EncodingName: o200k_harmony +Sample: <|start|> +Encoded: [200006] + +EncodingName: o200k_harmony +Sample: Hello <|endoftext|> World +Encoded: [13225, 220, 199999, 5922] + +EncodingName: o200k_harmony +Sample: <|endoftext|>This is a test<|endoftext|> +Encoded: [199999, 2500, 382, 261, 1746, 199999] + +EncodingName: o200k_harmony +Sample: User: Hello<|endofprompt|>Assistant: Hi there! 
+Encoded: [1844, 25, 32949, 200018, 91655, 25, 19260, 1354, 0] + +EncodingName: o200k_harmony +Sample: Question<|endofprompt|>Answer +Encoded: [15143, 200018, 17045] + +EncodingName: o200k_harmony +Sample: <|startoftext|>Hello World<|endoftext|> +Encoded: [199998, 13225, 5922, 199999] + +EncodingName: o200k_harmony +Sample: <|call|>function_name<|return|>result +Encoded: [200012, 2706, 2483, 200002, 2521] + +EncodingName: o200k_harmony +Sample: <|message|>user<|constrain|>safe<|channel|>text +Encoded: [200008, 1428, 200003, 46891, 200005, 919] + +EncodingName: o200k_harmony +Sample: <|start|>conversation<|message|>content<|end|> +Encoded: [200006, 129279, 200008, 3252, 200007] + +EncodingName: o200k_harmony +Sample: <|reserved_200000|> +Encoded: [200000] + +EncodingName: o200k_harmony +Sample: Text with <|reserved_200000|> reserved token +Encoded: [1279, 483, 220, 200000, 9924, 6602] + diff --git a/SharpToken/Lib/Internals/ModelParamsGenerator.cs b/SharpToken/Lib/Internals/ModelParamsGenerator.cs index e8e090f..15eefc8 100644 --- a/SharpToken/Lib/Internals/ModelParamsGenerator.cs +++ b/SharpToken/Lib/Internals/ModelParamsGenerator.cs @@ -56,6 +56,9 @@ public static ModelParams GetModelParams(string encodingName) case "o200k_base": return O200KBase(); + case "o200k_harmony": + return O200KHarmony(); + default: throw new ArgumentException($"Unknown encoding name: {encodingName}"); } @@ -140,6 +143,56 @@ private static ModelParams O200KBase() specialTokens: specialTokens ); } + + private static ModelParams O200KHarmony() + { + // O200K Harmony reuses the same mergeable ranks as O200K Base but has extended special tokens + var mergeableRanks = EmbeddedResourceReader.LoadTokenBytePairEncoding("SharpToken.data.o200k_base.tiktoken"); + + var specialTokens = new Dictionary<string, int> + { + // Base O200K special tokens (from o200k_base) + { EndOfText, 199999 }, + { EndOfPrompt, 200018 }, // Keeps id 200018 for <|endofprompt|>; <|reserved_200018|> is intentionally not registered below + + // Additional O200K 
Harmony special tokens + { "<|startoftext|>", 199998 }, + { "<|reserved_200000|>", 200000 }, + { "<|reserved_200001|>", 200001 }, + { "<|return|>", 200002 }, + { "<|constrain|>", 200003 }, + { "<|reserved_200004|>", 200004 }, + { "<|channel|>", 200005 }, + { "<|start|>", 200006 }, + { "<|end|>", 200007 }, + { "<|message|>", 200008 }, + { "<|reserved_200009|>", 200009 }, + { "<|reserved_200010|>", 200010 }, + { "<|reserved_200011|>", 200011 }, + { "<|call|>", 200012 } + }; + + // Add reserved tokens from 200013 to 201087 + // Note: We skip 200018 because that id is already assigned to EndOfPrompt + for (int i = 200013; i < 201088; i++) + { + if (i == 200018) + { + // Skip 200018: the keys differ, so this avoids a duplicate id, not a duplicate key + // Both <|endofprompt|> and <|reserved_200018|> would map to 200018 + // We keep <|endofprompt|> for compatibility + continue; + } + specialTokens[$"<|reserved_{i}|>"] = i; + } + + return new ModelParams + ( + tokenizerRegex: ModelParamsGeneratorRegex.RegexO200KBase(), + mergeableRanks: mergeableRanks, + specialTokens: specialTokens + ); + } } internal sealed partial class ModelParamsGeneratorRegex diff --git a/SharpToken/Lib/Model.cs b/SharpToken/Lib/Model.cs index 2d0f75f..99b2fee 100644 --- a/SharpToken/Lib/Model.cs +++ b/SharpToken/Lib/Model.cs @@ -9,6 +9,11 @@ public static class Model { // chat { "gpt-4o", "o200k_base" }, + { "gpt-5", "o200k_base" }, + { "gpt-5-mini", "o200k_base" }, + { "gpt-5-nano", "o200k_base" }, + { "gpt-5-pro", "o200k_base" }, + { "gpt-5-thinking", "o200k_base" }, { "gpt-4", "cl100k_base" }, { "gpt-3.5-turbo-16k", "cl100k_base" }, { "gpt-35-turbo-16k", "cl100k_base" }, // Azure deployment name @@ -54,6 +59,7 @@ public static class Model private static readonly Dictionary<string, string> ModelPrefixToEncodingMapping = new Dictionary<string, string> { + { "gpt-5", "o200k_base" }, // e.g., gpt-5-2024-08-07, gpt-5-chat-latest, etc. { "gpt-4o", "o200k_base" }, // (NOTE: no trailing dash, on purpose). 
E.g., gpt-4o, gpt-4o-2024-05-13, etc., { "gpt-4-", "cl100k_base" }, // e.g., gpt-4-0314, etc., plus gpt-4-32k { "gpt-3.5-turbo-", "cl100k_base" }, // e.g, gpt-3.5-turbo-0301, -0401, etc.