diff --git a/README.md b/README.md
index 5ca28e1..7c4b6be 100644
--- a/README.md
+++ b/README.md
@@ -8,8 +8,6 @@
[](https://github.com/dmitry-brazhenko/SharpToken/graphs/contributors)
[](LICENSE)
-
-
SharpToken is a C# library that serves as a port of the Python [tiktoken](https://github.com/openai/tiktoken) library.
It provides functionality for encoding and decoding tokens using GPT-based encodings. This library is built for .NET 6, .NET 8
and .NET Standard 2.0, making it compatible with a wide range of frameworks.
@@ -74,11 +72,12 @@ var count = encoding.CountTokens("Hello, world!"); // Output: 4
SharpToken currently supports the following models:
-* `r50k_base`
-* `p50k_base`
-* `p50k_edit`
-* `cl100k_base`
-* `o200k_base`
+- `r50k_base`
+- `p50k_base`
+- `p50k_edit`
+- `cl100k_base`
+- `o200k_base`
+- `o200k_harmony`
You can use any of these models when creating an instance of GptEncoding:
@@ -88,6 +87,7 @@ var p50kBaseEncoding = GptEncoding.GetEncoding("p50k_base");
var p50kEditEncoding = GptEncoding.GetEncoding("p50k_edit");
var cl100kBaseEncoding = GptEncoding.GetEncoding("cl100k_base");
var o200kBaseEncoding = GptEncoding.GetEncoding("o200k_base");
+var o200kHarmonyEncoding = GptEncoding.GetEncoding("o200k_harmony");
```
### Model Prefix Matching
@@ -96,14 +96,17 @@ Apart from specifying direct model names, SharpToken also provides functionality
Here are the current supported prefixes and their corresponding encodings:
-| Model Prefix | Encoding |
-|---------------------|------------|
-| `gpt-4o` | `o200k_base` |
-| `gpt-4-` | `cl100k_base` |
-| `gpt-3.5-turbo-` | `cl100k_base` |
-| `gpt-35-turbo` | `cl100k_base` |
+| Model Prefix | Encoding |
+| ---------------- | ------------- |
+| `gpt-5` | `o200k_base` |
+| `gpt-4o` | `o200k_base` |
+| `gpt-4-` | `cl100k_base` |
+| `gpt-3.5-turbo-` | `cl100k_base` |
+| `gpt-35-turbo` | `cl100k_base` |
Examples of model names that fall under these prefixes include:
+
+- For the prefix `gpt-5`: `gpt-5`, `gpt-5-mini`, `gpt-5-nano`, `gpt-5-pro`, `gpt-5-thinking`, `gpt-5-2025-08-07`, `gpt-5-chat-latest`, etc.
- For the prefix `gpt-4o`: `gpt-4o`, `gpt-4o-2024-05-13`, etc.
- For the prefix `gpt-4-`: `gpt-4-0314`, `gpt-4-32k`, etc.
- For the prefix `gpt-3.5-turbo-`: `gpt-3.5-turbo-0301`, `gpt-3.5-turbo-0401`, etc.
@@ -117,9 +120,6 @@ string encodingName = Model.GetEncodingNameForModel("gpt-4-0314"); // This will
If the provided model name doesn't match any direct model names or prefixes, the method will return `null`.
-
-
-
## Understanding Encoded Values
When you encode a string using the Encode method, the returned value is a list of integers that represent tokens in the
@@ -289,23 +289,23 @@ BenchmarkDotNet v0.13.9+228a464e8be6c580ad9408e98f18813f6407fb5a, Windows 11 (10
.NET Framework 4.7.1 : .NET Framework 4.8.1 (4.8.9181.0), X64 RyuJIT VectorSize=256
```
-| Method | Job | Runtime | Mean | Error | StdDev | Median | Gen0 | Gen1 | Allocated |
-|------------------ |--------------------- |--------------------- |----------:|---------:|----------:|----------:|-----------:|----------:|----------:|
-| **MLTokenizers** | .NET 8.0 | .NET 8.0 | 60.55 ms | 1.143 ms | 1.123 ms | 60.45 ms | 1000.0000 | - | 13.12 MB |
-| **MLTokenizers** | .NET 6.0 | .NET 6.0 | 95.75 ms | 1.374 ms | 1.147 ms | 95.54 ms | 10500.0000 | - | 126.19 MB |
-| **MLTokenizers** | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 291.77 ms | 5.811 ms | 11.195 ms | 291.64 ms | 21000.0000 | - | 127.33 MB |
-| | | | | | | | | | |
-| *SharpToken* | .NET 8.0 | .NET 8.0 | 87.78 ms | 1.700 ms | 1.590 ms | 87.34 ms | 1000.0000 | - | 22.13 MB |
-| *SharpToken* | .NET 6.0 | .NET 6.0 | 128.84 ms | 1.718 ms | 1.607 ms | 128.17 ms | 16250.0000 | 500.0000 | 196.31 MB |
-| *SharpToken* | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 356.21 ms | 6.843 ms | 10.854 ms | 355.09 ms | 34000.0000 | 1000.0000 | 204.39 MB |
-| | | | | | | | | | |
-| *TokenizerLib* | .NET 8.0 | .NET 8.0 | 109.26 ms | 2.082 ms | 4.482 ms | 107.90 ms | 18200.0000 | 600.0000 | 217.82 MB |
-| *TokenizerLib* | .NET 6.0 | .NET 6.0 | 126.16 ms | 2.959 ms | 8.630 ms | 122.34 ms | 18000.0000 | 500.0000 | 217.82 MB |
-| *TokenizerLib* | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 374.71 ms | 7.374 ms | 16.794 ms | 370.12 ms | 40000.0000 | 1000.0000 | 243.79 MB |
-| | | | | | | | | | |
-| *TiktokenSharp* | .NET 8.0 | .NET 8.0 | 177.34 ms | 3.506 ms | 8.797 ms | 174.98 ms | 28000.0000 | 1000.0000 | 338.98 MB |
-| *TiktokenSharp* | .NET 6.0 | .NET 6.0 | 196.17 ms | 3.912 ms | 8.422 ms | 195.52 ms | 26000.0000 | 666.6667 | 313.26 MB |
-| *TiktokenSharp* | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 488.22 ms | 9.696 ms | 15.931 ms | 487.17 ms | 63000.0000 | 1000.0000 | 378.31 MB |
+| Method | Job | Runtime | Mean | Error | StdDev | Median | Gen0 | Gen1 | Allocated |
+| ---------------- | -------------------- | -------------------- | --------: | -------: | --------: | --------: | ---------: | --------: | --------: |
+| **MLTokenizers** | .NET 8.0 | .NET 8.0 | 60.55 ms | 1.143 ms | 1.123 ms | 60.45 ms | 1000.0000 | - | 13.12 MB |
+| **MLTokenizers** | .NET 6.0 | .NET 6.0 | 95.75 ms | 1.374 ms | 1.147 ms | 95.54 ms | 10500.0000 | - | 126.19 MB |
+| **MLTokenizers** | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 291.77 ms | 5.811 ms | 11.195 ms | 291.64 ms | 21000.0000 | - | 127.33 MB |
+| | | | | | | | | | |
+| _SharpToken_ | .NET 8.0 | .NET 8.0 | 87.78 ms | 1.700 ms | 1.590 ms | 87.34 ms | 1000.0000 | - | 22.13 MB |
+| _SharpToken_ | .NET 6.0 | .NET 6.0 | 128.84 ms | 1.718 ms | 1.607 ms | 128.17 ms | 16250.0000 | 500.0000 | 196.31 MB |
+| _SharpToken_ | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 356.21 ms | 6.843 ms | 10.854 ms | 355.09 ms | 34000.0000 | 1000.0000 | 204.39 MB |
+| | | | | | | | | | |
+| _TokenizerLib_ | .NET 8.0 | .NET 8.0 | 109.26 ms | 2.082 ms | 4.482 ms | 107.90 ms | 18200.0000 | 600.0000 | 217.82 MB |
+| _TokenizerLib_ | .NET 6.0 | .NET 6.0 | 126.16 ms | 2.959 ms | 8.630 ms | 122.34 ms | 18000.0000 | 500.0000 | 217.82 MB |
+| _TokenizerLib_ | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 374.71 ms | 7.374 ms | 16.794 ms | 370.12 ms | 40000.0000 | 1000.0000 | 243.79 MB |
+| | | | | | | | | | |
+| _TiktokenSharp_ | .NET 8.0 | .NET 8.0 | 177.34 ms | 3.506 ms | 8.797 ms | 174.98 ms | 28000.0000 | 1000.0000 | 338.98 MB |
+| _TiktokenSharp_ | .NET 6.0 | .NET 6.0 | 196.17 ms | 3.912 ms | 8.422 ms | 195.52 ms | 26000.0000 | 666.6667 | 313.26 MB |
+| _TiktokenSharp_ | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 488.22 ms | 9.696 ms | 15.931 ms | 487.17 ms | 63000.0000 | 1000.0000 | 378.31 MB |
## Performance
@@ -315,15 +315,16 @@ It uses modern multibyte CPU instructions and almost no heap allocations.
All core methods have been tested on a large and a small input text.
**Inputs:**
+
- `SmallText`: 453 B (text/plain)
- `LargeText`: 51 KB (text/html)
**Methods:**
+
- `Encode`: text to tokens
- `Decode`: tokens to text
- `CountTokens`: high performance API to count tokens of text
-
```
BenchmarkDotNet v0.13.12, Windows 11 (10.0.22631.3296/23H2/2023Update/SunValley3)
AMD Ryzen 9 3900X, 1 CPU, 24 logical and 12 physical cores
@@ -334,8 +335,8 @@ AMD Ryzen 9 3900X, 1 CPU, 24 logical and 12 physical cores
.NET Framework 4.7.1 : .NET Framework 4.8.1 (4.8.9181.0), X64 RyuJIT VectorSize=256
```
-| Method | Mean | Error | StdDev | Ratio | RatioSD | Allocated | Alloc Ratio |
-|------------------------- |--------------:|------------:|------------:|------:|--------:|----------:|------------:|
+| Method | Mean | Error | StdDev | Ratio | RatioSD | Allocated | Alloc Ratio |
+| ------------------------ | ------------: | ----------: | ----------: | ----: | ------: | --------: | ----------: |
| **.NET 8.0** | | | | | | | |
| Encode_SmallText | 22.649 us | 0.4244 us | 0.4359 us | 0.28 | 0.01 | 696 B | 0.02 |
| Encode_LargeText | 4,542.505 us | 87.7988 us | 104.5182 us | 0.24 | 0.01 | 155547 B | 0.03 |
diff --git a/SharpToken.Benchmark/SharpToken.Benchmark.csproj b/SharpToken.Benchmark/SharpToken.Benchmark.csproj
index dac861b..cbaa71e 100644
--- a/SharpToken.Benchmark/SharpToken.Benchmark.csproj
+++ b/SharpToken.Benchmark/SharpToken.Benchmark.csproj
@@ -3,6 +3,7 @@
Exe
net471;net6.0;net8.0
+ net6.0;net8.0
true
diff --git a/SharpToken.Tests/SharpToken.Tests.cs b/SharpToken.Tests/SharpToken.Tests.cs
index 8c00011..e215a12 100644
--- a/SharpToken.Tests/SharpToken.Tests.cs
+++ b/SharpToken.Tests/SharpToken.Tests.cs
@@ -1,12 +1,13 @@
using System.Net.Http;
using System.Text;
+using System.Linq;
using NUnit.Framework;
namespace SharpToken.Tests;
public class Tests
{
- private static readonly List ModelsList = new() { "p50k_base", "r50k_base", "cl100k_base", "o200k_base" };
+ private static readonly List ModelsList = new() { "p50k_base", "r50k_base", "cl100k_base", "o200k_base", "o200k_harmony" };
private static readonly List>> TestData =
TestHelpers.ReadTestPlans("SharpToken.Tests.data.TestPlans.txt");
@@ -23,7 +24,19 @@ public void TestEncodingAndDecoding(Tuple> resource)
var (encodingName, textToEncode, expectedEncoded) = resource;
var encoding = GptEncoding.GetEncoding(encodingName);
- var encoded = encoding.Encode(textToEncode);
+
+ // Detect if the text contains special tokens
+ var allowedSpecial = new HashSet();
+ var specialTokens = GetSpecialTokensForEncoding(encodingName);
+ foreach (var token in specialTokens)
+ {
+ if (textToEncode.Contains(token))
+ {
+ allowedSpecial.Add(token);
+ }
+ }
+
+ var encoded = encoding.Encode(textToEncode, allowedSpecial);
var decodedText = encoding.Decode(encoded);
Assert.Multiple(() =>
{
@@ -39,7 +52,19 @@ public void TestTokensLength(Tuple> resource)
var (encodingName, textToEncode, expectedEncoded) = resource;
var encoding = GptEncoding.GetEncoding(encodingName);
- var tokenLength = encoding.CountTokens(textToEncode);
+
+ // Detect if the text contains special tokens
+ var allowedSpecial = new HashSet();
+ var specialTokens = GetSpecialTokensForEncoding(encodingName);
+ foreach (var token in specialTokens)
+ {
+ if (textToEncode.Contains(token))
+ {
+ allowedSpecial.Add(token);
+ }
+ }
+
+ var tokenLength = encoding.CountTokens(textToEncode, allowedSpecial);
Assert.Multiple(() =>
{
Assert.That(tokenLength, Is.EqualTo(expectedEncoded.Count));
@@ -53,7 +78,19 @@ public async Task TestEncodingAndDecodingInParallel()
{
var (encodingName, textToEncode, expectedEncoded) = _;
var encoding = GptEncoding.GetEncoding(encodingName);
- var encoded = encoding.Encode(textToEncode);
+
+ // Detect if the text contains special tokens
+ var allowedSpecial = new HashSet();
+ var specialTokens = GetSpecialTokensForEncoding(encodingName);
+ foreach (var token in specialTokens)
+ {
+ if (textToEncode.Contains(token))
+ {
+ allowedSpecial.Add(token);
+ }
+ }
+
+ var encoded = encoding.Encode(textToEncode, allowedSpecial);
var decodedText = encoding.Decode(encoded);
return (textToEncode, encoded, expectedEncoded, decodedText);
}));
@@ -162,6 +199,13 @@ static void TestModelPrefixMappingFailsAction()
[TestCaseSource(nameof(ModelsList))]
public async Task TestLocalResourceMatchesRemoteResource(string modelName)
{
+ // Skip o200k_harmony as it reuses o200k_base.tiktoken and doesn't have its own remote file
+ if (modelName == "o200k_harmony")
+ {
+ Assert.Pass("o200k_harmony reuses o200k_base.tiktoken file and doesn't have its own remote file");
+ return;
+ }
+
var embeddedResourceName = $"SharpToken.data.{modelName}.tiktoken";
var remoteResourceUrl = $"https://openaipublic.blob.core.windows.net/encodings/{modelName}.tiktoken";
@@ -199,4 +243,63 @@ public void TestEncodingForModel()
Assert.That(decodedText, Is.EqualTo(inputText));
});
}
+
+    /// <summary>
+    /// Verifies that the <c>o200k_harmony</c> encoding round-trips plain text and text
+    /// containing explicitly allowed special tokens, and that selected special tokens
+    /// encode to their expected single token ids.
+    /// </summary>
+    [Test]
+    public void TestO200KHarmonySpecialTokens()
+    {
+        var encoding = GptEncoding.GetEncoding("o200k_harmony");
+        const string inputText = "Hello, world!";
+
+        // Plain text must survive an encode/decode round trip.
+        var encoded = encoding.Encode(inputText);
+        var decodedText = encoding.Decode(encoded);
+        Assert.That(decodedText, Is.EqualTo(inputText));
+
+        // Text containing special tokens must round-trip when those tokens are allowed.
+        const string textWithSpecialTokens = "Hello <|startoftext|> world <|call|> test <|reserved_200020|>";
+        var allowedSpecial = new HashSet<string> { "<|startoftext|>", "<|call|>", "<|reserved_200020|>" };
+        var encodedSpecial = encoding.Encode(textWithSpecialTokens, allowedSpecial: allowedSpecial);
+        var decodedSpecial = encoding.Decode(encodedSpecial);
+
+        Assert.That(decodedSpecial, Is.EqualTo(textWithSpecialTokens));
+
+        // Each special token on its own must encode to exactly one, fixed token id.
+        Assert.That(encoding.Encode("<|startoftext|>", allowedSpecial: new HashSet<string> { "<|startoftext|>" }), Is.EqualTo(new List<int> { 199998 }));
+        Assert.That(encoding.Encode("<|call|>", allowedSpecial: new HashSet<string> { "<|call|>" }), Is.EqualTo(new List<int> { 200012 }));
+        Assert.That(encoding.Encode("<|reserved_200020|>", allowedSpecial: new HashSet<string> { "<|reserved_200020|>" }), Is.EqualTo(new List<int> { 200020 }));
+    }
+
+    /// <summary>
+    /// Checks that GPT-5 model names, both the listed family names and suffixed
+    /// variants, resolve to the <c>o200k_base</c> encoding.
+    /// </summary>
+    [Test]
+    public void TestGPT5ModelMappings()
+    {
+        var gpt5Models = new[]
+        {
+            // GPT-5 model names
+            "gpt-5",
+            "gpt-5-mini",
+            "gpt-5-nano",
+            "gpt-5-pro",
+            "gpt-5-thinking",
+
+            // Variants that exercise prefix matching
+            "gpt-5-2024-08-07",
+            "gpt-5-chat-latest",
+        };
+
+        foreach (var modelName in gpt5Models)
+        {
+            // The message names the model so a failure identifies which mapping broke.
+            Assert.That(
+                Model.GetEncodingNameForModel(modelName),
+                Is.EqualTo("o200k_base"),
+                $"Unexpected encoding for model '{modelName}'");
+        }
+    }
+
+    /// <summary>
+    /// Returns the special-token strings that the tests search for in sample text
+    /// for the given encoding.
+    /// </summary>
+    /// <param name="encodingName">Encoding name, for example <c>cl100k_base</c>.</param>
+    /// <returns>
+    /// The candidate special tokens for the encoding; an empty set when the name is not recognised.
+    /// </returns>
+    private static HashSet<string> GetSpecialTokensForEncoding(string encodingName)
+    {
+        return encodingName switch
+        {
+            "r50k_base" or "p50k_base" => new HashSet<string> { "<|endoftext|>" },
+            "p50k_edit" => new HashSet<string> { "<|endoftext|>", "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>" },
+            "cl100k_base" => new HashSet<string> { "<|endoftext|>", "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>", "<|endofprompt|>" },
+            "o200k_base" => new HashSet<string> { "<|endoftext|>", "<|endofprompt|>" },
+
+            // Named tokens plus generated "<|reserved_N|>" names for N = 200000..201087.
+            // NOTE(review): the range also yields names such as "<|reserved_200012|>", an id the
+            // tests in this file expect for "<|call|>"; presumably those names are not real tokens.
+            // That looks harmless because the set is only used to detect tokens in sample text - confirm.
+            "o200k_harmony" => new HashSet<string>(
+                new[]
+                {
+                    "<|endoftext|>", "<|endofprompt|>", "<|startoftext|>", "<|return|>", "<|constrain|>",
+                    "<|channel|>", "<|start|>", "<|end|>", "<|message|>", "<|call|>"
+                }.Concat(Enumerable.Range(200000, 1088).Select(i => $"<|reserved_{i}|>"))),
+
+            _ => new HashSet<string>()
+        };
+    }
}
diff --git a/SharpToken.Tests/SharpToken.Tests.csproj b/SharpToken.Tests/SharpToken.Tests.csproj
index a2c9814..80bb383 100644
--- a/SharpToken.Tests/SharpToken.Tests.csproj
+++ b/SharpToken.Tests/SharpToken.Tests.csproj
@@ -2,6 +2,7 @@
net471;netcoreapp3.1;net6.0;net8.0
+ net6.0;net8.0
preview
enable
enable
diff --git a/SharpToken.Tests/data/TestPlanGenerator.py b/SharpToken.Tests/data/TestPlanGenerator.py
index c9a1b1b..c6c49d6 100644
--- a/SharpToken.Tests/data/TestPlanGenerator.py
+++ b/SharpToken.Tests/data/TestPlanGenerator.py
@@ -7,13 +7,84 @@ def read_test_samples(filename):
return test_samples
+def get_special_token_samples(encoding):
+    """Generate special token test samples for the given encoding.
+
+    Args:
+        encoding: Object exposing a ``_special_tokens`` mapping keyed by
+            special-token string (presumably a tiktoken encoding, as built
+            by this script's caller - confirm).
+
+    Returns:
+        list[str]: Sample strings in generation order with duplicates
+        removed; empty when the encoding defines no special tokens.
+    """
+    # Key order of the mapping determines which tokens land in the [:10] slice below.
+    special_tokens = list(encoding._special_tokens.keys())
+
+    if not special_tokens:
+        return []
+
+    # Add individual special tokens (first 10 only, to avoid too many tests)
+    samples = list(special_tokens[:10])
+
+    # Add special tokens with text
+    if "<|endoftext|>" in special_tokens:
+        samples.extend([
+            "Hello <|endoftext|> World",
+            "<|endoftext|>This is a test<|endoftext|>"
+        ])
+
+    # Add fill-in-the-middle combinations for encodings that support it
+    if all(token in special_tokens for token in ["<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>"]):
+        samples.extend([
+            "<|fim_prefix|>def hello():<|fim_suffix|> print('world')<|fim_middle|>",
+            "<|fim_prefix|>function test() {<|fim_suffix|>}<|fim_middle|> return true;"
+        ])
+
+    # Add endofprompt combinations
+    if "<|endofprompt|>" in special_tokens:
+        samples.extend([
+            "User: Hello<|endofprompt|>Assistant: Hi there!",
+            "Question<|endofprompt|>Answer"
+        ])
+
+    # Add o200k_harmony specific combinations
+    if "<|startoftext|>" in special_tokens and "<|call|>" in special_tokens:
+        samples.extend([
+            "<|startoftext|>Hello World<|endoftext|>",
+            "<|call|>function_name<|return|>result",
+            "<|message|>user<|constrain|>safe<|channel|>text",
+            "<|start|>conversation<|message|>content<|end|>"
+        ])
+
+    # Add some reserved tokens for o200k_harmony
+    reserved_tokens = [token for token in special_tokens if token.startswith("<|reserved_")]
+    if reserved_tokens:
+        first_reserved = reserved_tokens[0]
+        samples.extend([
+            first_reserved,
+            f"Text with {first_reserved} reserved token"
+        ])
+
+    # The first reserved token may already be among the first 10 tokens above;
+    # drop repeats while keeping first-seen order so no test case is emitted twice.
+    return list(dict.fromkeys(samples))
+
+
def generate_test_plans(test_samples, encodings):
test_plans = []
for encoding in encodings:
+ # Process regular test samples (disallow special tokens)
for sample in test_samples:
- encoded = encoding.encode(sample, allowed_special={""})
- test_plans.append((encoding.name, sample, encoded))
+ if sample.strip(): # Skip empty lines
+ encoded = encoding.encode(sample, allowed_special=set())
+ test_plans.append((encoding.name, sample, encoded))
+
+ # Process special token samples (allow special tokens)
+ special_samples = get_special_token_samples(encoding)
+ for sample in special_samples:
+ if sample.strip(): # Skip empty samples
+ try:
+ # Allow all special tokens for this encoding
+ all_special_tokens = set(encoding._special_tokens.keys())
+ encoded = encoding.encode(sample, allowed_special=all_special_tokens)
+ test_plans.append((encoding.name, sample, encoded))
+ except Exception as e:
+ # Skip samples that cause encoding errors
+ print(f"Skipping sample '{sample}' for {encoding.name}: {e}")
+ continue
return test_plans
@@ -36,6 +107,7 @@ def save_test_plans(test_plans, filename):
tiktoken.get_encoding("p50k_edit"),
tiktoken.get_encoding("cl100k_base"),
tiktoken.get_encoding("o200k_base"),
+ tiktoken.get_encoding("o200k_harmony"),
]
test_samples = read_test_samples(samples_filename)
diff --git a/SharpToken.Tests/data/TestPlans.txt b/SharpToken.Tests/data/TestPlans.txt
index c9574e8..a911278 100644
--- a/SharpToken.Tests/data/TestPlans.txt
+++ b/SharpToken.Tests/data/TestPlans.txt
@@ -1,7 +1,3 @@
-EncodingName: r50k_base
-Sample:
-Encoded: []
-
EncodingName: r50k_base
Sample: a
Encoded: [64]
@@ -258,9 +254,17 @@ EncodingName: r50k_base
Sample: В цепочках поставок кейс-стадии, когда называются одна или несколько сторон, страдают от серьезных конфликтов интересов. Компании и их поддерживающие поставщики (программное обеспечение, консалтинг) имеют заинтересованность в представлении результата в положительном свете. Кроме того, фактические цепочки поставок обычно получают пользу или пострадают от случайных условий, которые никак не связаны с качеством их исполнения. Персонажи цепочки поставок - это методологический ответ на эти проблемы.
Encoded: [140, 240, 220, 141, 228, 16843, 140, 123, 15166, 141, 229, 31583, 16142, 141, 227, 12466, 123, 15166, 21727, 20375, 16142, 38857, 25443, 118, 12466, 118, 16843, 140, 117, 21727, 12, 21727, 20375, 16142, 43666, 18849, 18849, 11, 12466, 118, 25443, 111, 43666, 16142, 12466, 121, 16142, 140, 115, 45035, 38857, 16142, 141, 236, 20375, 21727, 40623, 12466, 122, 43666, 22177, 16142, 12466, 116, 30143, 18849, 12466, 121, 16843, 21727, 31583, 25443, 119, 45367, 31583, 15166, 220, 21727, 20375, 15166, 21169, 15166, 22177, 11, 220, 21727, 20375, 21169, 16142, 43666, 16142, 141, 236, 20375, 12466, 122, 20375, 220, 21727, 16843, 21169, 45367, 16843, 140, 115, 22177, 45035, 141, 227, 12466, 118, 15166, 22177, 141, 226, 30143, 18849, 31583, 20375, 25443, 110, 12466, 116, 22177, 20375, 16843, 21169, 16843, 21727, 25443, 110, 13, 12466, 248, 25443, 120, 140, 123, 16142, 22177, 18849, 18849, 12466, 116, 12466, 116, 141, 227, 12466, 123, 25443, 112, 43666, 16843, 21169, 140, 114, 18849, 38857, 16142, 141, 236, 141, 231, 18849, 16843, 12466, 123, 15166, 21727, 20375, 16142, 38857, 141, 231, 18849, 31583, 18849, 357, 140, 123, 21169, 25443, 111, 21169, 16142, 43108, 43108, 22177, 15166, 16843, 12466, 122, 140, 109, 16843, 21727, 140, 123, 16843, 141, 229, 16843, 22177, 18849, 16843, 11, 12466, 118, 15166, 22177, 21727, 16142, 30143, 20375, 18849, 22177, 140, 111, 8, 12466, 116, 43108, 16843, 141, 236, 20375, 12466, 115, 16142, 18849, 22177, 20375, 16843, 21169, 16843, 21727, 25443, 110, 16142, 22177, 22177, 15166, 21727, 20375, 45367, 12466, 110, 12466, 123, 21169, 16843, 43666, 21727, 20375, 16142, 38857, 30143, 16843, 22177, 18849, 18849, 220, 21169, 16843, 140, 115, 35072, 30143, 45367, 20375, 16142, 20375, 16142, 12466, 110, 12466, 123, 25443, 119, 25443, 114, 18849, 20375, 16843, 30143, 45367, 22177, 25443, 120, 220, 21727, 38857, 16843, 20375, 16843, 13, 12466, 248, 21169, 25443, 120, 16843, 220, 20375, 25443, 111, 15166, 11, 220, 141, 226, 16142, 31583, 20375, 18849, 
141, 229, 16843, 21727, 31583, 18849, 16843, 220, 141, 228, 16843, 140, 123, 15166, 141, 229, 31583, 18849, 12466, 123, 15166, 21727, 20375, 16142, 38857, 25443, 118, 12466, 122, 140, 109, 45035, 141, 229, 22177, 15166, 12466, 123, 25443, 119, 35072, 141, 229, 16142, 141, 236, 20375, 12466, 123, 25443, 119, 45367, 140, 115, 35072, 12466, 116, 30143, 18849, 12466, 123, 15166, 21727, 20375, 21169, 16142, 43666, 16142, 141, 236, 20375, 12466, 122, 20375, 220, 21727, 30143, 35072, 141, 229, 16142, 140, 117, 22177, 45035, 141, 227, 220, 35072, 21727, 30143, 25443, 110, 18849, 140, 117, 11, 12466, 118, 15166, 20375, 15166, 21169, 45035, 16843, 12466, 121, 18849, 31583, 16142, 31583, 12466, 121, 16843, 220, 21727, 38857, 40623, 140, 115, 16142, 22177, 45035, 220, 21727, 12466, 118, 16142, 141, 229, 16843, 21727, 20375, 38857, 25443, 120, 12466, 116, 141, 227, 12466, 116, 21727, 140, 123, 25443, 119, 22177, 16843, 22177, 18849, 40623, 13, 12466, 253, 16843, 21169, 21727, 15166, 22177, 16142, 140, 114, 18849, 220, 141, 228, 16843, 140, 123, 15166, 141, 229, 31583, 18849, 12466, 123, 15166, 21727, 20375, 16142, 38857, 25443, 118, 532, 220, 141, 235, 20375, 15166, 12466, 120, 16843, 20375, 25443, 112, 25443, 119, 25443, 111, 18849, 141, 229, 16843, 21727, 31583, 18849, 140, 117, 12466, 122, 20375, 38857, 16843, 20375, 12466, 121, 16142, 220, 141, 235, 20375, 18849, 12466, 123, 21169, 25443, 109, 30143, 16843, 43108, 45035, 13]
-EncodingName: p50k_base
-Sample:
-Encoded: []
+EncodingName: r50k_base
+Sample: <|endoftext|>
+Encoded: [50256]
+
+EncodingName: r50k_base
+Sample: Hello <|endoftext|> World
+Encoded: [15496, 220, 50256, 2159]
+
+EncodingName: r50k_base
+Sample: <|endoftext|>This is a test<|endoftext|>
+Encoded: [50256, 1212, 318, 257, 1332, 50256]
EncodingName: p50k_base
Sample: a
@@ -518,9 +522,17 @@ EncodingName: p50k_base
Sample: В цепочках поставок кейс-стадии, когда называются одна или несколько сторон, страдают от серьезных конфликтов интересов. Компании и их поддерживающие поставщики (программное обеспечение, консалтинг) имеют заинтересованность в представлении результата в положительном свете. Кроме того, фактические цепочки поставок обычно получают пользу или пострадают от случайных условий, которые никак не связаны с качеством их исполнения. Персонажи цепочки поставок - это методологический ответ на эти проблемы.
Encoded: [140, 240, 220, 141, 228, 16843, 140, 123, 15166, 141, 229, 31583, 16142, 141, 227, 12466, 123, 15166, 21727, 20375, 16142, 38857, 25443, 118, 12466, 118, 16843, 140, 117, 21727, 12, 21727, 20375, 16142, 43666, 18849, 18849, 11, 12466, 118, 25443, 111, 43666, 16142, 12466, 121, 16142, 140, 115, 45035, 38857, 16142, 141, 236, 20375, 21727, 40623, 12466, 122, 43666, 22177, 16142, 12466, 116, 30143, 18849, 12466, 121, 16843, 21727, 31583, 25443, 119, 45367, 31583, 15166, 220, 21727, 20375, 15166, 21169, 15166, 22177, 11, 220, 21727, 20375, 21169, 16142, 43666, 16142, 141, 236, 20375, 12466, 122, 20375, 220, 21727, 16843, 21169, 45367, 16843, 140, 115, 22177, 45035, 141, 227, 12466, 118, 15166, 22177, 141, 226, 30143, 18849, 31583, 20375, 25443, 110, 12466, 116, 22177, 20375, 16843, 21169, 16843, 21727, 25443, 110, 13, 12466, 248, 25443, 120, 140, 123, 16142, 22177, 18849, 18849, 12466, 116, 12466, 116, 141, 227, 12466, 123, 25443, 112, 43666, 16843, 21169, 140, 114, 18849, 38857, 16142, 141, 236, 141, 231, 18849, 16843, 12466, 123, 15166, 21727, 20375, 16142, 38857, 141, 231, 18849, 31583, 18849, 357, 140, 123, 21169, 25443, 111, 21169, 16142, 43108, 43108, 22177, 15166, 16843, 12466, 122, 140, 109, 16843, 21727, 140, 123, 16843, 141, 229, 16843, 22177, 18849, 16843, 11, 12466, 118, 15166, 22177, 21727, 16142, 30143, 20375, 18849, 22177, 140, 111, 8, 12466, 116, 43108, 16843, 141, 236, 20375, 12466, 115, 16142, 18849, 22177, 20375, 16843, 21169, 16843, 21727, 25443, 110, 16142, 22177, 22177, 15166, 21727, 20375, 45367, 12466, 110, 12466, 123, 21169, 16843, 43666, 21727, 20375, 16142, 38857, 30143, 16843, 22177, 18849, 18849, 220, 21169, 16843, 140, 115, 35072, 30143, 45367, 20375, 16142, 20375, 16142, 12466, 110, 12466, 123, 25443, 119, 25443, 114, 18849, 20375, 16843, 30143, 45367, 22177, 25443, 120, 220, 21727, 38857, 16843, 20375, 16843, 13, 12466, 248, 21169, 25443, 120, 16843, 220, 20375, 25443, 111, 15166, 11, 220, 141, 226, 16142, 31583, 20375, 18849, 
141, 229, 16843, 21727, 31583, 18849, 16843, 220, 141, 228, 16843, 140, 123, 15166, 141, 229, 31583, 18849, 12466, 123, 15166, 21727, 20375, 16142, 38857, 25443, 118, 12466, 122, 140, 109, 45035, 141, 229, 22177, 15166, 12466, 123, 25443, 119, 35072, 141, 229, 16142, 141, 236, 20375, 12466, 123, 25443, 119, 45367, 140, 115, 35072, 12466, 116, 30143, 18849, 12466, 123, 15166, 21727, 20375, 21169, 16142, 43666, 16142, 141, 236, 20375, 12466, 122, 20375, 220, 21727, 30143, 35072, 141, 229, 16142, 140, 117, 22177, 45035, 141, 227, 220, 35072, 21727, 30143, 25443, 110, 18849, 140, 117, 11, 12466, 118, 15166, 20375, 15166, 21169, 45035, 16843, 12466, 121, 18849, 31583, 16142, 31583, 12466, 121, 16843, 220, 21727, 38857, 40623, 140, 115, 16142, 22177, 45035, 220, 21727, 12466, 118, 16142, 141, 229, 16843, 21727, 20375, 38857, 25443, 120, 12466, 116, 141, 227, 12466, 116, 21727, 140, 123, 25443, 119, 22177, 16843, 22177, 18849, 40623, 13, 12466, 253, 16843, 21169, 21727, 15166, 22177, 16142, 140, 114, 18849, 220, 141, 228, 16843, 140, 123, 15166, 141, 229, 31583, 18849, 12466, 123, 15166, 21727, 20375, 16142, 38857, 25443, 118, 532, 220, 141, 235, 20375, 15166, 12466, 120, 16843, 20375, 25443, 112, 25443, 119, 25443, 111, 18849, 141, 229, 16843, 21727, 31583, 18849, 140, 117, 12466, 122, 20375, 38857, 16843, 20375, 12466, 121, 16142, 220, 141, 235, 20375, 18849, 12466, 123, 21169, 25443, 109, 30143, 16843, 43108, 45035, 13]
-EncodingName: p50k_edit
-Sample:
-Encoded: []
+EncodingName: p50k_base
+Sample: <|endoftext|>
+Encoded: [50256]
+
+EncodingName: p50k_base
+Sample: Hello <|endoftext|> World
+Encoded: [15496, 220, 50256, 2159]
+
+EncodingName: p50k_base
+Sample: <|endoftext|>This is a test<|endoftext|>
+Encoded: [50256, 1212, 318, 257, 1332, 50256]
EncodingName: p50k_edit
Sample: a
@@ -778,9 +790,37 @@ EncodingName: p50k_edit
Sample: В цепочках поставок кейс-стадии, когда называются одна или несколько сторон, страдают от серьезных конфликтов интересов. Компании и их поддерживающие поставщики (программное обеспечение, консалтинг) имеют заинтересованность в представлении результата в положительном свете. Кроме того, фактические цепочки поставок обычно получают пользу или пострадают от случайных условий, которые никак не связаны с качеством их исполнения. Персонажи цепочки поставок - это методологический ответ на эти проблемы.
Encoded: [140, 240, 220, 141, 228, 16843, 140, 123, 15166, 141, 229, 31583, 16142, 141, 227, 12466, 123, 15166, 21727, 20375, 16142, 38857, 25443, 118, 12466, 118, 16843, 140, 117, 21727, 12, 21727, 20375, 16142, 43666, 18849, 18849, 11, 12466, 118, 25443, 111, 43666, 16142, 12466, 121, 16142, 140, 115, 45035, 38857, 16142, 141, 236, 20375, 21727, 40623, 12466, 122, 43666, 22177, 16142, 12466, 116, 30143, 18849, 12466, 121, 16843, 21727, 31583, 25443, 119, 45367, 31583, 15166, 220, 21727, 20375, 15166, 21169, 15166, 22177, 11, 220, 21727, 20375, 21169, 16142, 43666, 16142, 141, 236, 20375, 12466, 122, 20375, 220, 21727, 16843, 21169, 45367, 16843, 140, 115, 22177, 45035, 141, 227, 12466, 118, 15166, 22177, 141, 226, 30143, 18849, 31583, 20375, 25443, 110, 12466, 116, 22177, 20375, 16843, 21169, 16843, 21727, 25443, 110, 13, 12466, 248, 25443, 120, 140, 123, 16142, 22177, 18849, 18849, 12466, 116, 12466, 116, 141, 227, 12466, 123, 25443, 112, 43666, 16843, 21169, 140, 114, 18849, 38857, 16142, 141, 236, 141, 231, 18849, 16843, 12466, 123, 15166, 21727, 20375, 16142, 38857, 141, 231, 18849, 31583, 18849, 357, 140, 123, 21169, 25443, 111, 21169, 16142, 43108, 43108, 22177, 15166, 16843, 12466, 122, 140, 109, 16843, 21727, 140, 123, 16843, 141, 229, 16843, 22177, 18849, 16843, 11, 12466, 118, 15166, 22177, 21727, 16142, 30143, 20375, 18849, 22177, 140, 111, 8, 12466, 116, 43108, 16843, 141, 236, 20375, 12466, 115, 16142, 18849, 22177, 20375, 16843, 21169, 16843, 21727, 25443, 110, 16142, 22177, 22177, 15166, 21727, 20375, 45367, 12466, 110, 12466, 123, 21169, 16843, 43666, 21727, 20375, 16142, 38857, 30143, 16843, 22177, 18849, 18849, 220, 21169, 16843, 140, 115, 35072, 30143, 45367, 20375, 16142, 20375, 16142, 12466, 110, 12466, 123, 25443, 119, 25443, 114, 18849, 20375, 16843, 30143, 45367, 22177, 25443, 120, 220, 21727, 38857, 16843, 20375, 16843, 13, 12466, 248, 21169, 25443, 120, 16843, 220, 20375, 25443, 111, 15166, 11, 220, 141, 226, 16142, 31583, 20375, 18849, 
141, 229, 16843, 21727, 31583, 18849, 16843, 220, 141, 228, 16843, 140, 123, 15166, 141, 229, 31583, 18849, 12466, 123, 15166, 21727, 20375, 16142, 38857, 25443, 118, 12466, 122, 140, 109, 45035, 141, 229, 22177, 15166, 12466, 123, 25443, 119, 35072, 141, 229, 16142, 141, 236, 20375, 12466, 123, 25443, 119, 45367, 140, 115, 35072, 12466, 116, 30143, 18849, 12466, 123, 15166, 21727, 20375, 21169, 16142, 43666, 16142, 141, 236, 20375, 12466, 122, 20375, 220, 21727, 30143, 35072, 141, 229, 16142, 140, 117, 22177, 45035, 141, 227, 220, 35072, 21727, 30143, 25443, 110, 18849, 140, 117, 11, 12466, 118, 15166, 20375, 15166, 21169, 45035, 16843, 12466, 121, 18849, 31583, 16142, 31583, 12466, 121, 16843, 220, 21727, 38857, 40623, 140, 115, 16142, 22177, 45035, 220, 21727, 12466, 118, 16142, 141, 229, 16843, 21727, 20375, 38857, 25443, 120, 12466, 116, 141, 227, 12466, 116, 21727, 140, 123, 25443, 119, 22177, 16843, 22177, 18849, 40623, 13, 12466, 253, 16843, 21169, 21727, 15166, 22177, 16142, 140, 114, 18849, 220, 141, 228, 16843, 140, 123, 15166, 141, 229, 31583, 18849, 12466, 123, 15166, 21727, 20375, 16142, 38857, 25443, 118, 532, 220, 141, 235, 20375, 15166, 12466, 120, 16843, 20375, 25443, 112, 25443, 119, 25443, 111, 18849, 141, 229, 16843, 21727, 31583, 18849, 140, 117, 12466, 122, 20375, 38857, 16843, 20375, 12466, 121, 16142, 220, 141, 235, 20375, 18849, 12466, 123, 21169, 25443, 109, 30143, 16843, 43108, 45035, 13]
-EncodingName: cl100k_base
-Sample:
-Encoded: []
+EncodingName: p50k_edit
+Sample: <|endoftext|>
+Encoded: [50256]
+
+EncodingName: p50k_edit
+Sample: <|fim_prefix|>
+Encoded: [50281]
+
+EncodingName: p50k_edit
+Sample: <|fim_middle|>
+Encoded: [50282]
+
+EncodingName: p50k_edit
+Sample: <|fim_suffix|>
+Encoded: [50283]
+
+EncodingName: p50k_edit
+Sample: Hello <|endoftext|> World
+Encoded: [15496, 220, 50256, 2159]
+
+EncodingName: p50k_edit
+Sample: <|endoftext|>This is a test<|endoftext|>
+Encoded: [50256, 1212, 318, 257, 1332, 50256]
+
+EncodingName: p50k_edit
+Sample: <|fim_prefix|>def hello():<|fim_suffix|> print('world')<|fim_middle|>
+Encoded: [50281, 4299, 23748, 33529, 50283, 50258, 3601, 10786, 6894, 11537, 50282]
+
+EncodingName: p50k_edit
+Sample: <|fim_prefix|>function test() {<|fim_suffix|>}<|fim_middle|> return true;
+Encoded: [50281, 8818, 1332, 3419, 1391, 50283, 92, 50282, 50258, 1441, 2081, 26]
EncodingName: cl100k_base
Sample: a
@@ -1038,9 +1078,49 @@ EncodingName: cl100k_base
Sample: В цепочках поставок кейс-стадии, когда называются одна или несколько сторон, страдают от серьезных конфликтов интересов. Компании и их поддерживающие поставщики (программное обеспечение, консалтинг) имеют заинтересованность в представлении результата в положительном свете. Кроме того, фактические цепочки поставок обычно получают пользу или пострадают от случайных условий, которые никак не связаны с качеством их исполнения. Персонажи цепочки поставок - это методологический ответ на эти проблемы.
Encoded: [16604, 39233, 67124, 38438, 13433, 10693, 18154, 68374, 15088, 7820, 21708, 2297, 12, 6735, 23680, 47273, 11, 7820, 14837, 40590, 6850, 19188, 35667, 1506, 70129, 21204, 53419, 14525, 46177, 19175, 66144, 54570, 18868, 9239, 17766, 11, 18868, 11001, 112, 1506, 70129, 20879, 66386, 4929, 32985, 44786, 38098, 19916, 11320, 4898, 1830, 6856, 98051, 23297, 6856, 13, 36479, 12507, 8164, 14332, 1840, 7740, 7740, 10693, 40842, 7094, 58669, 28089, 1506, 90457, 50306, 18154, 68374, 14476, 1840, 17165, 320, 8164, 2233, 69954, 6578, 48150, 21923, 23297, 8164, 56857, 17618, 11, 38098, 2297, 16331, 1830, 19479, 15752, 8, 48835, 1532, 70129, 44946, 19479, 58060, 23297, 6856, 7486, 61379, 5927, 57925, 68374, 3114, 84198, 18600, 71889, 43203, 5927, 29619, 21956, 56008, 94962, 45916, 8341, 1532, 13, 36479, 2233, 12507, 1532, 11047, 22885, 11, 18034, 16248, 23311, 6148, 23297, 17165, 1532, 39233, 67124, 38438, 17165, 18154, 68374, 15088, 21923, 4655, 6148, 13999, 52432, 1506, 70129, 5173, 28647, 3865, 46177, 18154, 6735, 11001, 112, 1506, 70129, 20879, 90226, 19039, 44786, 97991, 6856, 44938, 11, 38153, 51736, 6850, 38822, 16248, 19175, 45916, 64084, 7486, 4655, 5524, 7820, 50223, 35034, 12507, 7740, 10693, 53687, 47680, 18999, 13, 23227, 7753, 2297, 17766, 38657, 1840, 39233, 67124, 38438, 17165, 18154, 68374, 15088, 482, 68979, 56729, 7975, 14837, 81301, 17165, 12415, 93747, 13373, 21599, 23311, 12561, 14082, 37131, 4655, 13]
-EncodingName: o200k_base
-Sample:
-Encoded: []
+EncodingName: cl100k_base
+Sample: <|endoftext|>
+Encoded: [100257]
+
+EncodingName: cl100k_base
+Sample: <|fim_prefix|>
+Encoded: [100258]
+
+EncodingName: cl100k_base
+Sample: <|fim_middle|>
+Encoded: [100259]
+
+EncodingName: cl100k_base
+Sample: <|fim_suffix|>
+Encoded: [100260]
+
+EncodingName: cl100k_base
+Sample: <|endofprompt|>
+Encoded: [100276]
+
+EncodingName: cl100k_base
+Sample: Hello <|endoftext|> World
+Encoded: [9906, 220, 100257, 4435]
+
+EncodingName: cl100k_base
+Sample: <|endoftext|>This is a test<|endoftext|>
+Encoded: [100257, 2028, 374, 264, 1296, 100257]
+
+EncodingName: cl100k_base
+Sample: <|fim_prefix|>def hello():<|fim_suffix|> print('world')<|fim_middle|>
+Encoded: [100258, 755, 24748, 4658, 100260, 262, 1194, 493, 14957, 873, 100259]
+
+EncodingName: cl100k_base
+Sample: <|fim_prefix|>function test() {<|fim_suffix|>}<|fim_middle|> return true;
+Encoded: [100258, 1723, 1296, 368, 314, 100260, 92, 100259, 262, 471, 837, 26]
+
+EncodingName: cl100k_base
+Sample: User: Hello<|endofprompt|>Assistant: Hi there!
+Encoded: [1502, 25, 22691, 100276, 72803, 25, 21694, 1070, 0]
+
+EncodingName: cl100k_base
+Sample: Question<|endofprompt|>Answer
+Encoded: [14924, 100276, 16533]
EncodingName: o200k_base
Sample: a
@@ -1297,3 +1377,364 @@ Encoded: [102415, 237, 102415, 236, 102415, 238, 102415, 232, 102415, 233, 10241
EncodingName: o200k_base
Sample: В цепочках поставок кейс-стадии, когда называются одна или несколько сторон, страдают от серьезных конфликтов интересов. Компании и их поддерживающие поставщики (программное обеспечение, консалтинг) имеют заинтересованность в представлении результата в положительном свете. Кроме того, фактические цепочки поставок обычно получают пользу или пострадают от случайных условий, которые никак не связаны с качеством их исполнения. Персонажи цепочки поставок - это методологический ответ на эти проблемы.
Encoded: [3540, 121615, 4554, 20948, 39823, 2533, 36916, 369, 12, 749, 4260, 2779, 11, 21029, 56615, 10937, 39735, 7388, 31039, 89304, 11, 15356, 72215, 2814, 75580, 3959, 182012, 938, 29740, 938, 13, 130022, 20548, 816, 11229, 39471, 12590, 34491, 39823, 166426, 350, 9501, 162886, 7444, 181078, 11, 52047, 62242, 26951, 8, 59002, 133671, 5705, 8209, 743, 20311, 44678, 192243, 743, 35984, 198122, 4706, 11094, 13, 71337, 19182, 11, 26563, 79261, 13298, 121615, 51074, 39823, 2533, 69654, 172161, 140433, 7388, 15073, 714, 72215, 2814, 74971, 3959, 115462, 11, 16087, 121974, 1967, 26790, 8276, 669, 29268, 35963, 11229, 186377, 13, 60026, 106365, 62928, 121615, 51074, 39823, 2533, 533, 8577, 39009, 12062, 44920, 30168, 1235, 25671, 64524, 13]
+
+EncodingName: o200k_base
+Sample: <|endoftext|>
+Encoded: [199999]
+
+EncodingName: o200k_base
+Sample: <|endofprompt|>
+Encoded: [200018]
+
+EncodingName: o200k_base
+Sample: Hello <|endoftext|> World
+Encoded: [13225, 220, 199999, 5922]
+
+EncodingName: o200k_base
+Sample: <|endoftext|>This is a test<|endoftext|>
+Encoded: [199999, 2500, 382, 261, 1746, 199999]
+
+EncodingName: o200k_base
+Sample: User: Hello<|endofprompt|>Assistant: Hi there!
+Encoded: [1844, 25, 32949, 200018, 91655, 25, 19260, 1354, 0]
+
+EncodingName: o200k_base
+Sample: Question<|endofprompt|>Answer
+Encoded: [15143, 200018, 17045]
+
+EncodingName: o200k_harmony
+Sample: a
+Encoded: [64]
+
+EncodingName: o200k_harmony
+Sample: 1
+Encoded: [16]
+
+EncodingName: o200k_harmony
+Sample: a a
+Encoded: [64, 261]
+
+EncodingName: o200k_harmony
+Sample: hello
+Encoded: [24912]
+
+EncodingName: o200k_harmony
+Sample: Hello, World! How are you today? 🌍
+Encoded: [13225, 11, 5922, 0, 3253, 553, 481, 4044, 30, 130321, 235]
+
+EncodingName: o200k_harmony
+Sample: こんにちは、世界!お元気ですか?
+Encoded: [95839, 1395, 28428, 3393, 8930, 6753, 25717, 15121, 7128, 4802]
+
+EncodingName: o200k_harmony
+Sample: Hola, mundo! ¿Cómo estás hoy? 🇪🇸
+Encoded: [49864, 11, 10225, 0, 12873, 46515, 58166, 20502, 30, 173468, 103, 55506, 116]
+
+EncodingName: o200k_harmony
+Sample: Привет, мир! Как дела?
+Encoded: [23881, 131903, 11, 37934, 0, 26029, 78857, 30]
+
+EncodingName: o200k_harmony
+Sample: 안녕하세요, 세상! 오늘 기분이 어때요? 🇰🇷
+Encoded: [14307, 171731, 11, 28126, 8612, 0, 106820, 11061, 15567, 2186, 21252, 41856, 7952, 30, 173468, 108, 55506, 115]
+
+EncodingName: o200k_harmony
+Sample: Bonjour, le monde ! Comment ça va aujourd'hui ? 🇫🇷
+Encoded: [45751, 11, 505, 15807, 1073, 15406, 13590, 3423, 32226, 43820, 1423, 173468, 104, 55506, 115]
+
+EncodingName: o200k_harmony
+Sample: The quick brown fox jumps over 13 lazy dogs. 😺
+Encoded: [976, 4853, 19705, 68347, 65613, 1072, 220, 1311, 29082, 16798, 13, 22861, 118]
+
+EncodingName: o200k_harmony
+Sample: 1234567890!@#$%^&*()-=_+[]{};:'",.<>?/|`~ 🎉
+Encoded: [7633, 19354, 29338, 15, 0, 31, 108156, 108254, 5, 9, 31850, 51761, 10, 1951, 12083, 26, 9311, 672, 13, 28052, 153468, 91, 63, 93, 139786, 231]
+
+EncodingName: o200k_harmony
+Sample: C# is a great programming language for building apps.
+Encoded: [34, 2, 382, 261, 2212, 23238, 6439, 395, 6282, 12881, 13]
+
+EncodingName: o200k_harmony
+Sample: El área de un triángulo es (base * altura) / 2.
+Encoded: [4422, 20482, 334, 537, 12665, 30671, 5953, 878, 350, 5423, 425, 40031, 8, 820, 220, 17, 13]
+
+EncodingName: o200k_harmony
+Sample: Здравствуйте, это мой первый раз здесь. Что мне делать?
+Encoded: [182298, 11, 8577, 65733, 62134, 4702, 44039, 13, 53319, 27934, 45321, 30]
+
+EncodingName: o200k_harmony
+Sample: હેલો, વિશ્વ! તમે આજે કેમ છો? 🇮🇳
+Encoded: [6094, 187761, 11, 95706, 0, 52040, 59999, 104493, 72756, 30, 173468, 106, 55506, 111]
+
+EncodingName: o200k_harmony
+Sample: ความรักและการเป็นกันเองเป็นสิ่งสำคัญที่สุดในโลก 🇹🇭
+Encoded: [26224, 151737, 45798, 11855, 34044, 54361, 121316, 34044, 66688, 15177, 75160, 5131, 61134, 81833, 28208, 93469, 173468, 117, 55506, 255]
+
+EncodingName: o200k_harmony
+Sample: Python vs Java: Which programming language should you learn first?
+Encoded: [60502, 10217, 13114, 25, 21580, 23238, 6439, 1757, 481, 4484, 1577, 30]
+
+EncodingName: o200k_harmony
+Sample: A journey of a thousand miles begins with a single step. - Lao Tzu
+Encoded: [32, 12647, 328, 261, 26791, 10753, 18015, 483, 261, 4590, 5983, 13, 533, 144616, 353, 7846]
+
+EncodingName: o200k_harmony
+Sample: Die Grenzen meiner Sprache bedeuten die Grenzen meiner Welt. 🇩🇪
+Encoded: [8796, 111745, 39103, 89476, 93295, 9627, 1076, 111745, 39103, 23079, 13, 173468, 102, 55506, 103]
+
+EncodingName: o200k_harmony
+Sample: יש לי כמה שאלות בנוגע לפרויקט החדש שלך. 🇮🇱
+Encoded: [7899, 42151, 60962, 129852, 2433, 34083, 110495, 108591, 181894, 162562, 69019, 13, 173468, 106, 55506, 109]
+
+EncodingName: o200k_harmony
+Sample: Det är en vacker dag i Sverige. 🇸🇪
+Encoded: [3639, 7706, 469, 323, 17798, 8724, 575, 64714, 13, 173468, 116, 55506, 103]
+
+EncodingName: o200k_harmony
+Sample: A ∀ x (P(x) → Q(x)) ∧ (∃x P(x)) → ∃x Q(x)
+Encoded: [32, 35353, 222, 1215, 350, 47, 4061, 8, 15155, 1486, 4061, 915, 35353, 100, 350, 18085, 225, 87, 398, 4061, 915, 15155, 35353, 225, 87, 1486, 4061, 8]
+
+EncodingName: o200k_harmony
+Sample: O Brasil é o maior país da América do Sul. 🇧🇷
+Encoded: [46, 15278, 1212, 293, 15966, 11106, 1033, 45086, 621, 27109, 13, 173468, 100, 55506, 115]
+
+EncodingName: o200k_harmony
+Sample: L'amore è una forza potente che unisce le persone. 🇮🇹
+Encoded: [43, 30344, 510, 6272, 1969, 125511, 111848, 1378, 537, 48541, 505, 40144, 13, 173468, 106, 55506, 117]
+
+EncodingName: o200k_harmony
+Sample: Είναι μια ηλιόλουστη ημέρα στην Ελλάδα. 🇬🇷
+Encoded: [10303, 16239, 33246, 13115, 57330, 2097, 85087, 42851, 122278, 7648, 21399, 112618, 13, 173468, 105, 55506, 115]
+
+EncodingName: o200k_harmony
+Sample: Teslim tarihi yaklaşıyor, projeyi zamanında bitirmemiz gerekiyor. 🇹🇷
+Encoded: [110176, 5406, 162005, 16000, 148409, 17368, 11, 16022, 33468, 30355, 10884, 3546, 2835, 347, 482, 195151, 13, 173468, 117, 55506, 115]
+
+EncodingName: o200k_harmony
+Sample: Det finnes ingen bedre tid enn nå for å starte noe nytt. 🇳🇴
+Encoded: [3639, 145817, 30430, 56755, 8692, 23075, 19937, 395, 7086, 167203, 49921, 66369, 13, 173468, 111, 55506, 112]
+
+EncodingName: o200k_harmony
+Sample: Aanvaard de uitdagingen van het leven met moed en vastberadenheid. 🇳🇱
+Encoded: [68832, 84482, 334, 180964, 1164, 1448, 21987, 1421, 137256, 469, 11332, 718, 9519, 7157, 13, 173468, 111, 55506, 109]
+
+EncodingName: o200k_harmony
+Sample: Chào mừng bạn đến với thế giới của lập trình. 🇻🇳
+Encoded: [1205, 35134, 284, 75104, 22673, 27528, 18019, 46773, 69217, 12153, 96352, 49051, 13, 173468, 119, 55506, 111]
+
+EncodingName: o200k_harmony
+Sample: Dlaczego warto uczyć się języków obcych? 🇵🇱
+Encoded: [136923, 182265, 82074, 337, 150478, 9721, 140914, 3705, 87043, 1067, 55175, 30, 173468, 113, 55506, 109]
+
+EncodingName: o200k_harmony
+Sample: E = mc², uma equação famosa na física. 🇵🇹
+Encoded: [36, 314, 36958, 13848, 11, 3030, 2801, 3890, 96317, 898, 50251, 13, 173468, 113, 55506, 117]
+
+EncodingName: o200k_harmony
+Sample: 你今天遇到什么有趣的事情了吗?🇨🇳
+Encoded: [12370, 47256, 57127, 6946, 10555, 3666, 57922, 1616, 162913, 112451, 4802, 55506, 101, 55506, 111]
+
+EncodingName: o200k_harmony
+Sample: Nå er det tid for å feire med familie og venner. 🇳🇴
+Encoded: [45, 592, 1111, 1476, 8692, 395, 7086, 1193, 594, 1475, 39603, 2085, 131786, 13, 173468, 111, 55506, 112]
+
+EncodingName: o200k_harmony
+Sample: Þetta er góður dagur til að læra eitthvað nýtt. 🇮🇸
+Encoded: [7860, 20476, 1111, 91455, 17041, 8724, 330, 3453, 5993, 29333, 614, 180350, 49697, 1037, 13, 173468, 106, 55506, 116]
+
+EncodingName: o200k_harmony
+Sample: გამარჯობა! როგორ ხართ დღეს? 🇬🇪
+Encoded: [165502, 69106, 24045, 0, 57298, 10892, 10875, 55856, 30, 173468, 105, 55506, 103]
+
+EncodingName: o200k_harmony
+Sample: Mā te whakawhiti kōrero e whai hua ai tātou. 🇳🇿
+Encoded: [44, 2485, 729, 145047, 174352, 92760, 41643, 319, 101354, 76899, 8440, 260, 36813, 283, 13, 173468, 111, 55506, 123]
+
+EncodingName: o200k_harmony
+Sample: Это был незабываемый опыт, который я буду помнить всегда.
+Encoded: [63250, 11066, 37028, 66181, 42684, 6770, 67711, 11, 21903, 3277, 61571, 179329, 34056, 13]
+
+EncodingName: o200k_harmony
+Sample: Διαβάζοντας βιβλία, εμπλουτίζουμε τον εαυτό μας με γνώσεις.
+Encoded: [16611, 5690, 63324, 9153, 92025, 164613, 113428, 11, 109925, 85087, 30711, 9153, 33850, 20894, 4278, 727, 75653, 35170, 9173, 8558, 954, 92830, 13]
+
+EncodingName: o200k_harmony
+Sample: A számítástechnika világa tele van izgalmas lehetőségekkel. 🇭🇺
+Encoded: [32, 70578, 5348, 449, 168649, 3113, 11748, 449, 2225, 5443, 1164, 4297, 8298, 4227, 51215, 53922, 95521, 108844, 13, 173468, 255, 55506, 118]
+
+EncodingName: o200k_harmony
+Sample: Vždy je dobré mít plán B, pokud něco nevyjde. 🇨🇿
+Encoded: [53, 99728, 1264, 54560, 377, 98517, 192660, 418, 11, 118907, 134570, 453, 16670, 56244, 13, 173468, 101, 55506, 123]
+
+EncodingName: o200k_harmony
+Sample: Dragostea e un sentiment minunat care ne unește pe toți. 🇷🇴
+Encoded: [25765, 564, 12932, 319, 537, 39160, 182050, 266, 2631, 453, 2463, 74495, 1045, 316, 20660, 13, 173468, 115, 55506, 112]
+
+EncodingName: o200k_harmony
+Sample: دیکھو، آسمان میں کتنی تارے ہیں! 🇵🇰
+Encoded: [547, 55459, 417, 1368, 3382, 11248, 1195, 6431, 144008, 14148, 112711, 1531, 12406, 0, 173468, 113, 55506, 108]
+
+EncodingName: o200k_harmony
+Sample: Nenda polepole na ujifunze kila siku. 🇹🇿
+Encoded: [45, 5968, 25059, 112657, 898, 62112, 366, 119365, 52237, 54647, 13, 173468, 117, 55506, 123]
+
+EncodingName: o200k_harmony
+Sample: Каква е твоята любима храна? 🇧🇬
+Encoded: [29831, 2224, 2404, 70888, 8886, 2734, 13230, 27621, 2442, 73698, 30, 173468, 100, 55506, 105]
+
+EncodingName: o200k_harmony
+Sample: Sträva alltid efter att bli en bättre version av dig själv.
+Encoded: [3504, 450, 2873, 63479, 22852, 1927, 27757, 469, 100580, 3926, 1452, 3807, 71554, 13]
+
+EncodingName: o200k_harmony
+Sample: Філософія - це наука про знання. 🇺🇦
+Encoded: [10334, 17058, 107824, 30929, 533, 54543, 1235, 59929, 4964, 41072, 17561, 13, 173468, 118, 55506, 99]
+
+EncodingName: o200k_harmony
+Sample: Το πρόγραμμα αυτό είναι πολύ ενδιαφέρον. 🇬🇷
+Encoded: [63423, 198704, 43845, 17278, 60896, 162904, 171319, 13, 173468, 105, 55506, 115]
+
+EncodingName: o200k_harmony
+Sample: ^$%#*@!&)(_+=}{|:;"?><,~`'-./][
+Encoded: [61, 3, 4, 2, 154736, 0, 5, 168849, 18287, 29124, 91, 175979, 156569, 11, 93, 63, 44302, 2956, 2696]
+
+EncodingName: o200k_harmony
+Sample: 4gH@!0sT*#(9^%$[x{}j+|Yz6;Q]~8
+Encoded: [19, 70, 39, 31, 0, 15, 82, 51, 9, 2, 7, 24, 61, 4, 3, 58, 87, 12083, 73, 10, 91, 56, 89, 21, 26, 48, 60, 93, 23]
+
+EncodingName: o200k_harmony
+Sample: wNb)I<>#:i^P]*cR8ytUx1Q`6O@z/
+Encoded: [86, 67111, 8, 40, 28052, 97210, 72, 61, 47, 18579, 66, 49, 23, 5240, 182325, 16, 48, 63, 21, 46, 31, 89, 14]
+
+EncodingName: o200k_harmony
+Sample: ÄÜö¿¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿
+Encoded: [12921, 8858, 573, 11986, 20407, 61242, 18943, 43470, 43625, 41468, 18596, 64259, 19742, 25661, 4244, 74285, 8980, 98049, 6793, 32438, 13848, 45681, 14737, 39621, 69022, 5366, 68284, 84125, 11006, 1924, 43439, 27124, 75174, 11986]
+
+EncodingName: o200k_harmony
+Sample: ƒšŠŒŽƒšŠŒŽƒšŠŒŽƒšŠŒŽƒšŠŒŽƒšŠŒŽ
+Encoded: [99760, 812, 7490, 189136, 12915, 99760, 812, 7490, 189136, 12915, 99760, 812, 7490, 189136, 12915, 99760, 812, 7490, 189136, 12915, 99760, 812, 7490, 189136, 12915, 99760, 812, 7490, 189136, 12915]
+
+EncodingName: o200k_harmony
+Sample: 5ħÅŸēýïūē$%#^*()_+{[ö&!@#?>|,.<>
+Encoded: [20, 5762, 13631, 198355, 6238, 1840, 9954, 7637, 6238, 3, 4, 2, 61, 9, 416, 62, 10, 90, 58, 573, 5, 0, 31, 2, 10730, 91, 26887, 28052]
+
+EncodingName: o200k_harmony
+Sample: 1B4t#%&*()_+dF5g^hJk7LmN0pQrS<>?
+Encoded: [16, 33, 19, 83, 2, 4, 5, 9, 416, 62, 10, 67, 37, 20, 70, 61, 71, 41, 74, 22, 196093, 45, 15, 79, 135047, 50, 28052, 30]
+
+EncodingName: o200k_harmony
+Sample: ¬§±²³µ¶·¹ºª«»¦©¯°±!@#$%^&*()_+
+Encoded: [74285, 18596, 32438, 13848, 45681, 39621, 69022, 5366, 84125, 11006, 25661, 4244, 1924, 41468, 19742, 98049, 6793, 32438, 0, 31, 108156, 108254, 5, 9, 416, 62, 10]
+
+EncodingName: o200k_harmony
+Sample: 8mR5*w7^a$!F(0%#J9@X6vZ1)nU3]_Y/
+Encoded: [23, 76, 49, 20, 147727, 22, 61, 64, 3, 0, 37, 7, 15, 4, 2, 41, 24, 31, 55, 21, 85, 57, 16, 143612, 52, 18, 167793, 56, 14]
+
+EncodingName: o200k_harmony
+Sample: 😊😀😁😂🤣😃😄😅😆😉😊😋😎😍😘😗😙😚☺️🙂🤗🤔
+Encoded: [102630, 84083, 156437, 41736, 92916, 13865, 225, 13865, 226, 13865, 227, 13865, 228, 72041, 102630, 13865, 233, 13865, 236, 74762, 122588, 13865, 245, 13865, 247, 13865, 248, 155014, 15148, 37459, 50378, 245, 50378, 242]
+
+EncodingName: o200k_harmony
+Sample: 🤨😐😑😶🙄😏😣😥😮🤐😯😪😫😴😌🤓😛😜😝🤤
+Encoded: [50378, 101, 13865, 238, 13865, 239, 13865, 114, 70125, 226, 13865, 237, 13865, 96, 13865, 98, 13865, 106, 50378, 238, 13865, 107, 13865, 103, 13865, 104, 13865, 112, 13865, 234, 50378, 241, 13865, 249, 13865, 250, 13865, 251, 50378, 97]
+
+EncodingName: o200k_harmony
+Sample: 😒😓😔😕🙃🤑😲😷🤒🤕🤢🤧😈👿👹👺💀☠️
+Encoded: [13865, 240, 13865, 241, 13865, 242, 13865, 243, 70125, 225, 4103, 11566, 13865, 110, 13865, 115, 50378, 240, 50378, 243, 50378, 95, 50378, 100, 13865, 230, 28823, 123, 28823, 117, 28823, 118, 31446, 222, 8434, 254, 15148]
+
+EncodingName: o200k_harmony
+Sample: 😾😿🙀😽😼😻🙈🙉🙊👶👦👧👨👩👴👵👨⚕️👩⚕️
+Encoded: [13865, 122, 13865, 123, 70125, 222, 13865, 121, 13865, 120, 13865, 119, 70125, 230, 70125, 231, 70125, 232, 28823, 114, 28823, 99, 28823, 100, 28823, 101, 28823, 102, 28823, 112, 28823, 113, 28823, 101, 2524, 84396, 243, 15148, 28823, 102, 2524, 84396, 243, 15148]
+
+EncodingName: o200k_harmony
+Sample: 🌞🌝🌚🌛🌜🌙⭐️🌟💫✨🔥💥☄️🌈☀️🌤️⛅️🌥️
+Encoded: [64364, 252, 64364, 251, 64364, 248, 64364, 249, 64364, 250, 64364, 247, 62160, 15148, 64364, 253, 31446, 104, 97375, 96606, 31446, 98, 8434, 226, 15148, 64364, 230, 8434, 222, 15148, 64364, 97, 15148, 158, 249, 227, 15148, 64364, 98, 15148]
+
+EncodingName: o200k_harmony
+Sample: 🍏🍎🍐🍊🍋🍌🍉🍇🍓🍈🍒🍑
+Encoded: [102415, 237, 102415, 236, 102415, 238, 102415, 232, 102415, 233, 102415, 234, 102415, 231, 102415, 229, 102415, 241, 102415, 230, 102415, 240, 102415, 239]
+
+EncodingName: o200k_harmony
+Sample: В цепочках поставок кейс-стадии, когда называются одна или несколько сторон, страдают от серьезных конфликтов интересов. Компании и их поддерживающие поставщики (программное обеспечение, консалтинг) имеют заинтересованность в представлении результата в положительном свете. Кроме того, фактические цепочки поставок обычно получают пользу или пострадают от случайных условий, которые никак не связаны с качеством их исполнения. Персонажи цепочки поставок - это методологический ответ на эти проблемы.
+Encoded: [3540, 121615, 4554, 20948, 39823, 2533, 36916, 369, 12, 749, 4260, 2779, 11, 21029, 56615, 10937, 39735, 7388, 31039, 89304, 11, 15356, 72215, 2814, 75580, 3959, 182012, 938, 29740, 938, 13, 130022, 20548, 816, 11229, 39471, 12590, 34491, 39823, 166426, 350, 9501, 162886, 7444, 181078, 11, 52047, 62242, 26951, 8, 59002, 133671, 5705, 8209, 743, 20311, 44678, 192243, 743, 35984, 198122, 4706, 11094, 13, 71337, 19182, 11, 26563, 79261, 13298, 121615, 51074, 39823, 2533, 69654, 172161, 140433, 7388, 15073, 714, 72215, 2814, 74971, 3959, 115462, 11, 16087, 121974, 1967, 26790, 8276, 669, 29268, 35963, 11229, 186377, 13, 60026, 106365, 62928, 121615, 51074, 39823, 2533, 533, 8577, 39009, 12062, 44920, 30168, 1235, 25671, 64524, 13]
+
+EncodingName: o200k_harmony
+Sample: <|endoftext|>
+Encoded: [199999]
+
+EncodingName: o200k_harmony
+Sample: <|endofprompt|>
+Encoded: [200018]
+
+EncodingName: o200k_harmony
+Sample: <|startoftext|>
+Encoded: [199998]
+
+EncodingName: o200k_harmony
+Sample: <|reserved_200000|>
+Encoded: [200000]
+
+EncodingName: o200k_harmony
+Sample: <|reserved_200001|>
+Encoded: [200001]
+
+EncodingName: o200k_harmony
+Sample: <|return|>
+Encoded: [200002]
+
+EncodingName: o200k_harmony
+Sample: <|constrain|>
+Encoded: [200003]
+
+EncodingName: o200k_harmony
+Sample: <|reserved_200004|>
+Encoded: [200004]
+
+EncodingName: o200k_harmony
+Sample: <|channel|>
+Encoded: [200005]
+
+EncodingName: o200k_harmony
+Sample: <|start|>
+Encoded: [200006]
+
+EncodingName: o200k_harmony
+Sample: Hello <|endoftext|> World
+Encoded: [13225, 220, 199999, 5922]
+
+EncodingName: o200k_harmony
+Sample: <|endoftext|>This is a test<|endoftext|>
+Encoded: [199999, 2500, 382, 261, 1746, 199999]
+
+EncodingName: o200k_harmony
+Sample: User: Hello<|endofprompt|>Assistant: Hi there!
+Encoded: [1844, 25, 32949, 200018, 91655, 25, 19260, 1354, 0]
+
+EncodingName: o200k_harmony
+Sample: Question<|endofprompt|>Answer
+Encoded: [15143, 200018, 17045]
+
+EncodingName: o200k_harmony
+Sample: <|startoftext|>Hello World<|endoftext|>
+Encoded: [199998, 13225, 5922, 199999]
+
+EncodingName: o200k_harmony
+Sample: <|call|>function_name<|return|>result
+Encoded: [200012, 2706, 2483, 200002, 2521]
+
+EncodingName: o200k_harmony
+Sample: <|message|>user<|constrain|>safe<|channel|>text
+Encoded: [200008, 1428, 200003, 46891, 200005, 919]
+
+EncodingName: o200k_harmony
+Sample: <|start|>conversation<|message|>content<|end|>
+Encoded: [200006, 129279, 200008, 3252, 200007]
+
+EncodingName: o200k_harmony
+Sample: <|reserved_200000|>
+Encoded: [200000]
+
+EncodingName: o200k_harmony
+Sample: Text with <|reserved_200000|> reserved token
+Encoded: [1279, 483, 220, 200000, 9924, 6602]
+
diff --git a/SharpToken/Lib/Internals/ModelParamsGenerator.cs b/SharpToken/Lib/Internals/ModelParamsGenerator.cs
index e8e090f..15eefc8 100644
--- a/SharpToken/Lib/Internals/ModelParamsGenerator.cs
+++ b/SharpToken/Lib/Internals/ModelParamsGenerator.cs
@@ -56,6 +56,9 @@ public static ModelParams GetModelParams(string encodingName)
case "o200k_base":
return O200KBase();
+ case "o200k_harmony":
+ return O200KHarmony();
+
default:
throw new ArgumentException($"Unknown encoding name: {encodingName}");
}
@@ -140,6 +143,56 @@ private static ModelParams O200KBase()
specialTokens: specialTokens
);
}
+
+        /// <summary>Builds the <c>o200k_harmony</c> parameters: o200k_base merge ranks and regex plus an extended special-token table.</summary>
+        private static ModelParams O200KHarmony()
+        {
+            const int endOfPromptId = 200018; // shared by the initializer and the reserved-range skip below
+
+            // Harmony reuses the o200k_base merge ranks; only the special tokens differ.
+            var mergeableRanks = EmbeddedResourceReader.LoadTokenBytePairEncoding("SharpToken.data.o200k_base.tiktoken");
+
+            var specialTokens = new Dictionary<string, int>
+            {
+                // Special tokens shared with o200k_base
+                { EndOfText, 199999 },
+                { EndOfPrompt, endOfPromptId },
+
+                // Harmony-specific special tokens
+                { "<|startoftext|>", 199998 },
+                { "<|reserved_200000|>", 200000 },
+                { "<|reserved_200001|>", 200001 },
+                { "<|return|>", 200002 },
+                { "<|constrain|>", 200003 },
+                { "<|reserved_200004|>", 200004 },
+                { "<|channel|>", 200005 },
+                { "<|start|>", 200006 },
+                { "<|end|>", 200007 },
+                { "<|message|>", 200008 },
+                { "<|reserved_200009|>", 200009 },
+                { "<|reserved_200010|>", 200010 },
+                { "<|reserved_200011|>", 200011 },
+                { "<|call|>", 200012 }
+            };
+
+            // Reserved placeholders "<|reserved_{id}|>" for ids 200013..201087. Id 200018 is left
+            // to <|endofprompt|>, so no <|reserved_200018|> entry is registered for it.
+            for (int id = 200013; id < 201088; id++)
+            {
+                if (id == endOfPromptId)
+                {
+                    continue;
+                }
+
+                specialTokens[$"<|reserved_{id}|>"] = id;
+            }
+
+            return new ModelParams(
+                tokenizerRegex: ModelParamsGeneratorRegex.RegexO200KBase(),
+                mergeableRanks: mergeableRanks,
+                specialTokens: specialTokens
+            );
+        }
}
internal sealed partial class ModelParamsGeneratorRegex
diff --git a/SharpToken/Lib/Model.cs b/SharpToken/Lib/Model.cs
index 2d0f75f..99b2fee 100644
--- a/SharpToken/Lib/Model.cs
+++ b/SharpToken/Lib/Model.cs
@@ -9,6 +9,11 @@ public static class Model
{
// chat
{ "gpt-4o", "o200k_base" },
+ { "gpt-5", "o200k_base" },
+ { "gpt-5-mini", "o200k_base" },
+ { "gpt-5-nano", "o200k_base" },
+ { "gpt-5-pro", "o200k_base" },
+ { "gpt-5-thinking", "o200k_base" },
{ "gpt-4", "cl100k_base" },
{ "gpt-3.5-turbo-16k", "cl100k_base" },
{ "gpt-35-turbo-16k", "cl100k_base" }, // Azure deployment name
@@ -54,6 +59,7 @@ public static class Model
private static readonly Dictionary ModelPrefixToEncodingMapping = new Dictionary
{
+ { "gpt-5", "o200k_base" }, // e.g., gpt-5-2024-08-07, gpt-5-chat-latest, etc.
{ "gpt-4o", "o200k_base" }, // (NOTE: no trailing dash, on purpose). E.g., gpt-4o, gpt-4o-2024-05-13, etc.,
{ "gpt-4-", "cl100k_base" }, // e.g., gpt-4-0314, etc., plus gpt-4-32k
{ "gpt-3.5-turbo-", "cl100k_base" }, // e.g, gpt-3.5-turbo-0301, -0401, etc.