From 62b7be5b479c50cfce2f11963e83c8130e708d6d Mon Sep 17 00:00:00 2001 From: root Date: Fri, 5 Sep 2025 16:41:02 +0000 Subject: [PATCH 1/2] Fix issue #1: Add documents --- athena/utils/llm_util.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/athena/utils/llm_util.py b/athena/utils/llm_util.py index f7c40f5..29b81e3 100644 --- a/athena/utils/llm_util.py +++ b/athena/utils/llm_util.py @@ -6,14 +6,39 @@ def str_token_counter(text: str) -> int: + """Counts the number of tokens in a string using tiktoken's o200k_base encoding. + + Args: + text: The input string to count tokens for. + + Returns: + The number of tokens in the input string. + """ enc = tiktoken.get_encoding("o200k_base") return len(enc.encode(text)) def tiktoken_counter(messages: Sequence[BaseMessage]) -> int: - """Approximately reproduce https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb + """Counts tokens across multiple message types using tiktoken tokenization. + + Approximately reproduces the token counting methodology from OpenAI's cookbook: + https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb + + Args: + messages: A sequence of BaseMessage objects (HumanMessage, AIMessage, + ToolMessage, or SystemMessage) to count tokens for. + + Returns: + The total number of tokens across all messages, including overhead tokens. + + Raises: + ValueError: If an unsupported message type is encountered. - For simplicity only supports str Message.contents. + Notes: + - Uses a fixed overhead of 3 tokens for reply priming + - Adds 3 tokens per message for message formatting + - Adds 1 token per message name if present + - For simplicity, only supports string message contents """ output_parser = StrOutputParser() num_tokens = 3 # every reply is primed with <|start|>assistant<|message|> From cd87a726cacd0f07e68894071de7e08c8b1b6f4b Mon Sep 17 00:00:00 2001 From: Yue Pan <79363355+dcloud347@users.noreply.github.com> Date: Sat, 6 Sep 2025 00:46:13 +0800 Subject: [PATCH 2/2] style: fix formatting in llm_util.py to improve readability --- athena/utils/llm_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/athena/utils/llm_util.py b/athena/utils/llm_util.py index 29b81e3..db02477 100644 --- a/athena/utils/llm_util.py +++ b/athena/utils/llm_util.py @@ -25,7 +25,7 @@ def tiktoken_counter(messages: Sequence[BaseMessage]) -> int: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb Args: - messages: A sequence of BaseMessage objects (HumanMessage, AIMessage, + messages: A sequence of BaseMessage objects (HumanMessage, AIMessage, ToolMessage, or SystemMessage) to count tokens for. Returns: