Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion astrbot/core/message/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,13 @@ class Reply(BaseMessageComponent):
def __init__(self, **_):
super().__init__(**_)

async def to_dict(self) -> dict:
    """Serialize this Reply component to a plain dict.

    The nested message chain is serialized recursively; a missing
    (``None``) chain is emitted as an empty list.
    """
    serialized_chain = []
    if self.chain is not None:
        for component in self.chain:
            serialized_chain.append(await component.to_dict())
    return {
        "type": "reply",
        "data": {"id": self.id, "chain": serialized_chain},
    }


class Poke(BaseMessageComponent):
type: str = ComponentType.Poke
Expand Down Expand Up @@ -630,12 +637,23 @@ async def to_dict(self) -> dict:
class Json(BaseMessageComponent):
type = ComponentType.Json
data: dict
raw_data: str | None = None

def __init__(self, data: str | dict, **_):
if isinstance(data, str):
data = json.loads(data)
try:
self.raw_data = data
data = json.loads(data)
except json.JSONDecodeError:
data = {"raw": data}
super().__init__(data=data, **_)

async def to_dict(self) -> dict:
    """Serialize this Json component to a plain dict.

    Prefers the original raw JSON string (``raw_data``) when one was
    captured at construction time; otherwise re-serializes ``self.data``.

    Note: ``raw_data`` is a declared attribute defaulting to ``None``
    (set only when the component was built from a string), so a plain
    ``getattr(self, "raw_data", fallback)`` would return ``None`` for
    dict-constructed instances instead of falling back — hence the
    explicit ``None`` check below.
    """
    raw = getattr(self, "raw_data", None)
    content = raw if raw is not None else json.dumps(self.data)
    return {
        "type": "json",
        "data": {"content": content},
    }


class Unknown(BaseMessageComponent):
type = ComponentType.Unknown
Expand Down
69 changes: 57 additions & 12 deletions astrbot/core/provider/sources/openai_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,18 +247,63 @@ async def _parse_openai_completion(

# parse the text completion
if choice.message.content is not None:
# text completion
completion_text = str(choice.message.content).strip()
# specially, some providers may set <think> tags around reasoning content in the completion text,
# we use regex to remove them, and store them in the reasoning_content field
reasoning_pattern = re.compile(r"<think>(.*?)</think>", re.DOTALL)
matches = reasoning_pattern.findall(completion_text)
if matches:
llm_response.reasoning_content = "\n".join(
[match.strip() for match in matches],
)
completion_text = reasoning_pattern.sub("", completion_text).strip()
llm_response.result_chain = MessageChain().message(completion_text)
# content can be either a plain string or a multimodal list
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (complexity): 建议把多模态内容处理、图片处理以及 <think> 解析拆分到独立的辅助函数中,让 _parse_openai_completion 主要负责协调调用。

你可以保持当前的新功能,但通过将多模态解析以及图片/推理处理抽取为几个小的 helper 函数来降低复杂度。这样 _parse_openai_completion 就主要做流程编排,分支嵌套会明显减少。

例如,可以把多模态处理抽取出来:

def _build_message_chain_from_content(self, content) -> tuple[MessageChain, str | None]:
    mc = MessageChain()
    reasoning_parts: list[str] = []

    if isinstance(content, list):
        for part in content:
            if not isinstance(part, dict):
                mc.message(str(part))
                continue

            ptype = part.get("type")
            if ptype == "text":
                mc.message(part.get("text", ""))
            elif ptype == "image_url":
                self._append_image_part(mc, part.get("image_url"))
            elif ptype == "think":
                think_val = part.get("think")
                if think_val:
                    reasoning_parts.append(str(think_val))
            else:
                mc.message(json.dumps(part, ensure_ascii=False))
    else:
        # plain string, including <think> tag handling
        completion_text, reasoning = self._extract_reasoning_from_text(str(content))
        mc.message(completion_text)
        if reasoning:
            reasoning_parts.append(reasoning)

    reasoning_content = None
    if reasoning_parts:
        reasoning_content = "\n".join(rp.strip() for rp in reasoning_parts)

    return mc, reasoning_content

将图片 URL 处理再单独封装,以避免主流程中的嵌套条件判断:

def _append_image_part(self, mc: MessageChain, image_field: Any) -> None:
    url = image_field.get("url") if isinstance(image_field, dict) else image_field
    if not url or not isinstance(url, str):
        return

    if "base64," in url:
        mc.base64_image(url.split("base64,", 1)[1])
    elif url.startswith("base64://"):
        mc.base64_image(url.replace("base64://", ""))
    else:
        mc.url_image(url)

再为 <think> 标签处理写一个小 helper:

def _extract_reasoning_from_text(self, text: str) -> tuple[str, str | None]:
    reasoning_pattern = re.compile(r"<think>(.*?)</think>", re.DOTALL)
    matches = reasoning_pattern.findall(text)
    reasoning = None
    if matches:
        reasoning = "\n".join(match.strip() for match in matches)
        text = reasoning_pattern.sub("", text).strip()
    return text.strip(), reasoning

这样 _parse_openai_completion 就可以简化为:

if choice.message.content is not None:
    mc, reasoning_from_content = self._build_message_chain_from_content(
        choice.message.content
    )
    if reasoning_from_content:
        llm_response.reasoning_content = reasoning_from_content
    llm_response.result_chain = mc

# still let _extract_reasoning_content override if present
llm_response.reasoning_content = self._extract_reasoning_content(completion)

这样既保留了所有行为(多模态、文本、<think> 标签、不同图片格式、未知类型分支),又把复杂的分支逻辑移出了核心解析函数。

Original comment in English

issue (complexity): Consider extracting the multimodal content, image handling, and <think> parsing into dedicated helper functions so _parse_openai_completion mainly orchestrates them.

You can keep the new functionality but reduce complexity by pulling the multimodal parsing and image/reasoning handling into small helpers. That makes _parse_openai_completion mostly orchestration and flattens the nesting.

For example, extract the multimodal handling:

def _build_message_chain_from_content(self, content) -> tuple[MessageChain, str | None]:
    mc = MessageChain()
    reasoning_parts: list[str] = []

    if isinstance(content, list):
        for part in content:
            if not isinstance(part, dict):
                mc.message(str(part))
                continue

            ptype = part.get("type")
            if ptype == "text":
                mc.message(part.get("text", ""))
            elif ptype == "image_url":
                self._append_image_part(mc, part.get("image_url"))
            elif ptype == "think":
                think_val = part.get("think")
                if think_val:
                    reasoning_parts.append(str(think_val))
            else:
                mc.message(json.dumps(part, ensure_ascii=False))
    else:
        # plain string, including <think> tag handling
        completion_text, reasoning = self._extract_reasoning_from_text(str(content))
        mc.message(completion_text)
        if reasoning:
            reasoning_parts.append(reasoning)

    reasoning_content = None
    if reasoning_parts:
        reasoning_content = "\n".join(rp.strip() for rp in reasoning_parts)

    return mc, reasoning_content

Factor image URL handling separately to avoid nested conditionals in the main logic:

def _append_image_part(self, mc: MessageChain, image_field: Any) -> None:
    url = image_field.get("url") if isinstance(image_field, dict) else image_field
    if not url or not isinstance(url, str):
        return

    if "base64," in url:
        mc.base64_image(url.split("base64,", 1)[1])
    elif url.startswith("base64://"):
        mc.base64_image(url.replace("base64://", ""))
    else:
        mc.url_image(url)

And a small helper for the <think>-tag case:

def _extract_reasoning_from_text(self, text: str) -> tuple[str, str | None]:
    reasoning_pattern = re.compile(r"<think>(.*?)</think>", re.DOTALL)
    matches = reasoning_pattern.findall(text)
    reasoning = None
    if matches:
        reasoning = "\n".join(match.strip() for match in matches)
        text = reasoning_pattern.sub("", text).strip()
    return text.strip(), reasoning

Then _parse_openai_completion becomes less branched:

if choice.message.content is not None:
    mc, reasoning_from_content = self._build_message_chain_from_content(
        choice.message.content
    )
    if reasoning_from_content:
        llm_response.reasoning_content = reasoning_from_content
    llm_response.result_chain = mc

# still let _extract_reasoning_content override if present
llm_response.reasoning_content = self._extract_reasoning_content(completion)

This keeps all behaviors (multimodal, text, <think> tags, image formats, unknown parts) but moves the detailed branching out of the central parsing function.

content = choice.message.content
# handle multimodal content returned as a list of parts
if isinstance(content, list):
reasoning_parts = []
mc = MessageChain()
for part in content:
if not isinstance(part, dict):
# fallback: append as plain text
mc.message(str(part))
continue
ptype = part.get("type")
if ptype == "text":
Comment on lines +253 to +262
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (bug_risk): 多模态响应中的文本部分现在不会再去除 <think> 标签,也不会填充 reasoning_content,这与纯字符串路径的行为不一致。

在多模态的 list 分支中,ptype == "text" 时只是直接追加 part["text"],而字符串分支仍然会把 <think>...</think> 提取到 reasoning_content 中,并从可见输出中移除。这意味着多模态文本中的任何 <think> 内容现在都会直接展示给用户,而且两种路径的处理方式不一致。你很可能需要在这里对文本部分也执行相同的基于正则的提取逻辑(并把匹配结果追加到 reasoning_parts),从而让多模态和纯文本响应的处理保持一致。

建议实现如下:

            # content can be either a plain string or a multimodal list
            content = choice.message.content
            # handle multimodal content returned as a list of parts
            if isinstance(content, list):
                reasoning_parts = []
                mc = MessageChain()
                for part in content:
                    if not isinstance(part, dict):
                        # fallback: append as plain text
                        mc.message(str(part))
                        continue
                    ptype = part.get("type")
                    if ptype == "text":
                        text = part.get("text", "") or ""
                        if text:
                            # extract <think>...</think> segments into reasoning_parts
                            try:
                                import re  # local import in case module-level import is not present
                                think_matches = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
                                if think_matches:
                                    reasoning_parts.extend(think_matches)
                                    # strip all <think> blocks from the visible completion text
                                    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
                            except Exception:
                                # on any failure, fall back to using the raw text
                                pass
                        mc.message(text)
                    elif ptype == "image_url":
                        image_field = part.get("image_url")
                        url = None
                        if isinstance(image_field, dict):
                            url = image_field.get("url")
                        else:
                            url = image_field
                        if url:
                            # data:image/...;base64,xxx
                            if isinstance(url, str) and "base64," in url:
                                base64_data = url.split("base64,", 1)[1]
                                mc.base64_image(base64_data)
  1. 如果字符串分支中已经有用于提取 <think> 标签的共享正则或辅助函数(例如预编译的 pattern 或工具方法),请在这里复用,而不是在本地使用新的 re.findall / re.sub,以保持一致性。
  2. 如果模块顶层已经导入了 re,就可以去掉 try 块里的 import re,直接依赖模块级导入,避免重复导入。
  3. 请确保在该函数的其他位置,会把 reasoning_parts 合并并赋值给与字符串路径中相同的 reasoning_content(或等价字段),从而让调用方获得统一的推理内容。
Original comment in English

suggestion (bug_risk): Text parts in multimodal responses no longer strip <think> tags or populate reasoning_content, unlike the string path.

In the multimodal list branch, ptype == "text" just appends part["text"] as-is, while the string-only branch still extracts <think>...</think> into reasoning_content and strips it from the visible completion. This means any <think> in multimodal text parts will now be shown to users and handled inconsistently between the two paths. You likely want to run the same regex-based extraction on text parts here (and append matches to reasoning_parts) so multimodal and string responses are treated consistently.

Suggested implementation:

            # content can be either a plain string or a multimodal list
            content = choice.message.content
            # handle multimodal content returned as a list of parts
            if isinstance(content, list):
                reasoning_parts = []
                mc = MessageChain()
                for part in content:
                    if not isinstance(part, dict):
                        # fallback: append as plain text
                        mc.message(str(part))
                        continue
                    ptype = part.get("type")
                    if ptype == "text":
                        text = part.get("text", "") or ""
                        if text:
                            # extract <think>...</think> segments into reasoning_parts
                            try:
                                import re  # local import in case module-level import is not present
                                think_matches = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
                                if think_matches:
                                    reasoning_parts.extend(think_matches)
                                    # strip all <think> blocks from the visible completion text
                                    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
                            except Exception:
                                # on any failure, fall back to using the raw text
                                pass
                        mc.message(text)
                    elif ptype == "image_url":
                        image_field = part.get("image_url")
                        url = None
                        if isinstance(image_field, dict):
                            url = image_field.get("url")
                        else:
                            url = image_field
                        if url:
                            # data:image/...;base64,xxx
                            if isinstance(url, str) and "base64," in url:
                                base64_data = url.split("base64,", 1)[1]
                                mc.base64_image(base64_data)
  1. If there is already a shared regex or helper used in the string-only path for extracting <think> tags (e.g. a compiled pattern or utility function), replace the inline re.findall/re.sub with that shared mechanism for consistency.
  2. If re is already imported at the module level, you can safely remove the import re inside the try block to avoid duplicate imports and rely on the module-level import instead.
  3. Ensure that elsewhere in this function, reasoning_parts is joined and assigned to the same reasoning_content (or equivalent) field that is used in the string-only path so that callers receive a unified reasoning payload.

mc.message(part.get("text", ""))
elif ptype == "image_url":
image_field = part.get("image_url")
url = None
if isinstance(image_field, dict):
url = image_field.get("url")
else:
url = image_field
if url:
# data:image/...;base64,xxx
if isinstance(url, str) and "base64," in url:
base64_data = url.split("base64,", 1)[1]
mc.base64_image(base64_data)
elif isinstance(url, str) and url.startswith("base64://"):
mc.base64_image(url.replace("base64://", ""))
else:
mc.url_image(url)
elif ptype == "think":
# collect reasoning parts for later extraction
think_val = part.get("think")
if think_val:
reasoning_parts.append(str(think_val))
else:
# unknown part type, append its textual representation
mc.message(json.dumps(part, ensure_ascii=False))

if reasoning_parts:
llm_response.reasoning_content = "\n".join(
[rp.strip() for rp in reasoning_parts]
)
llm_response.result_chain = mc
else:
# text completion (string)
completion_text = str(content).strip()
# specially, some providers may set <think> tags around reasoning content in the completion text,
# we use regex to remove them, and store them in the reasoning_content field
reasoning_pattern = re.compile(r"<think>(.*?)</think>", re.DOTALL)
matches = reasoning_pattern.findall(completion_text)
if matches:
llm_response.reasoning_content = "\n".join(
[match.strip() for match in matches],
)
completion_text = reasoning_pattern.sub("", completion_text).strip()
llm_response.result_chain = MessageChain().message(completion_text)

# parse the reasoning content if any
# the priority is higher than the <think> tag extraction
Expand Down