Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 40 additions & 8 deletions experiments/benchmarks/HLE/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@
from src.envs.hle import HLEEnv
from src.solver.single_agent import SingleAgentSolver
from src.memory.methods.empty import EmptyMemory
from src.memory import GenerativeMASMemory, VoyagerMASMemory, MemoryBankMASMemory
from src.utils import EmbeddingFunc
from src.reasoning import ReasoningIO
from src.llm.model_caller import ModelCaller
from src.llm.token_tracker import token_tracker
Expand Down Expand Up @@ -390,14 +392,44 @@ def run(cfg: dict, logger: logging.Logger) -> None:
)
reasoning = ReasoningIO(llm_model=solver_caller)

working_dir = mem_cfg.get("working_dir", str(output_dir / "memory_store"))
Path(working_dir).mkdir(parents=True, exist_ok=True)
memory = EmptyMemory(
namespace=mem_cfg.get("namespace", "hle_empty"),
global_config={"working_dir": str(ts_dir / "memory_store")},
llm_model=None,
embedding_func=None,
)
memory_store_dir = str(ts_dir / "memory_store")
Path(memory_store_dir).mkdir(parents=True, exist_ok=True)

memory_method = exp_cfg.get("memory_method", "empty")
if memory_method == "empty":
memory = EmptyMemory(
namespace=mem_cfg.get("namespace", "hle_empty"),
global_config={"working_dir": memory_store_dir},
llm_model=None,
embedding_func=None,
)
elif memory_method in ("generative", "voyager", "memorybank"):
memory_caller = ModelCaller(
model=model_cfg["solver"],
role="memory",
base_url=base_url,
)
embedding_func = EmbeddingFunc()
mem_global_config = {
"working_dir": memory_store_dir,
"successful_topk": mem_cfg.get("successful_topk", 1),
"failed_topk": mem_cfg.get("failed_topk", 1),
}
mem_cls = {
"generative": GenerativeMASMemory,
"voyager": VoyagerMASMemory,
"memorybank": MemoryBankMASMemory,
}[memory_method]
memory = mem_cls(
namespace=mem_cfg.get("namespace", f"hle_{memory_method}"),
global_config=mem_global_config,
llm_model=memory_caller,
embedding_func=embedding_func,
)
logger.info(f"Memory: {memory_method} (namespace={memory.namespace})")
else:
logger.error(f"Unknown memory_method: {memory_method}")
sys.exit(1)

framework = exp_cfg.get("agent_framework", "single_agent")
if framework == "single_agent":
Expand Down
8 changes: 8 additions & 0 deletions experiments/configs/memory/generative.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
experiment:
memory_method: generative

memory_config:
namespace: hle_generative
working_dir: experiments/results/memory_store
successful_topk: 1
failed_topk: 1
8 changes: 8 additions & 0 deletions experiments/configs/memory/memorybank.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
experiment:
memory_method: memorybank

memory_config:
namespace: hle_memorybank
working_dir: experiments/results/memory_store
successful_topk: 1
failed_topk: 1
8 changes: 8 additions & 0 deletions experiments/configs/memory/voyager.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
experiment:
memory_method: voyager

memory_config:
namespace: hle_voyager
working_dir: experiments/results/memory_store
successful_topk: 1
failed_topk: 1
4 changes: 4 additions & 0 deletions experiments/run_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,10 @@ def main():
except (FileNotFoundError, AttributeError) as e:
logger.error(str(e))
sys.exit(1)
except Exception as e:
logger.error(f"load_runner() 异常: {type(e).__name__}: {e}")
import traceback; traceback.print_exc()
sys.exit(1)

runner.run(cfg, logger)

Expand Down
77 changes: 77 additions & 0 deletions experiments/scripts/run_HLE_memory_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/usr/bin/env bash
# experiments/scripts/run_HLE_memory_test.sh
#
# HLE adaptation test script for the three memory methods
# (generative / voyager / memorybank).
#
# Usage (OPENAI_API_KEY must already be exported in the calling shell):
#   # Full run (50 questions per method)
#   bash experiments/scripts/run_HLE_memory_test.sh
#
#   # Smoke test (2 questions per method)
#   bash experiments/scripts/run_HLE_memory_test.sh --smoke
#
#   # Run a single method only (generative / voyager / memorybank)
#   bash experiments/scripts/run_HLE_memory_test.sh --only generative

set -euo pipefail
# Clear proxy settings inherited from the calling shell before anything runs.
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY

# ── API configuration ─────────────────────────────────────────────────────────
# SECURITY: credentials must never be committed. The key is taken from the
# caller's environment and the script aborts with a clear message when it is
# missing. The key that was previously hardcoded on this line is exposed in
# version-control history and must be revoked and rotated.
: "${OPENAI_API_KEY:?OPENAI_API_KEY must be set in the environment; do not hardcode it in this script}"
export OPENAI_API_KEY
export OPENAI_API_BASE="https://gmn.chuangzuoli.com"

# ── Paths ─────────────────────────────────────────────────────────────────────
# Work from the repository root (two levels above this script's directory) so
# the relative config paths below resolve regardless of the caller's cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
cd "${REPO_ROOT}"

CONFIG_DIR="experiments/configs"
ENV_CFG="${CONFIG_DIR}/envs/hle.yaml"
SOLVER_CFG="${CONFIG_DIR}/solver/single_agent.yaml"
TOOL_CFG="${CONFIG_DIR}/tool/default.yaml"

# ── Argument parsing ──────────────────────────────────────────────────────────
#   --smoke         run 2 questions per method instead of 50
#   --only METHOD   run a single memory method
LIMIT=50   # questions per memory method
ONLY=""    # when non-empty, the only memory method to run

# Consume the positional parameters explicitly with `while`/`shift`. Shifting
# inside `for arg in "$@"` iterates over a snapshot of the arguments, so the
# value given to --only would be examined a second time as if it were a flag.
while [ "$#" -gt 0 ]; do
    case "$1" in
        --smoke)
            LIMIT=2
            shift
            ;;
        --only)
            # Without this check, `set -u` would abort with an opaque
            # "unbound variable" error when the method name is missing.
            if [ "$#" -lt 2 ]; then
                echo "error: --only requires a method name (generative / voyager / memorybank)" >&2
                exit 2
            fi
            ONLY="$2"
            shift 2
            ;;
        *)
            # Unknown arguments are still ignored (as before), but no longer silently.
            echo "warning: ignoring unknown argument: $1" >&2
            shift
            ;;
    esac
done

MODEL="gpt-5.2"
# Overrides are kept in an array so that each key=value pair reaches Python as
# exactly one argument, without relying on unquoted word splitting (which is
# also subject to pathname expansion). The base URL reuses OPENAI_API_BASE
# exported in the API configuration section instead of repeating the literal.
OVERRIDES=(
    "evaluation.limit=${LIMIT}"
    "evaluation.text_only=true"
    "model.solver=${MODEL}"
    "model.judge=${MODEL}"
    "model.base_url=${OPENAI_API_BASE}"
)

# ── Runner ────────────────────────────────────────────────────────────────────
# run_memory METHOD
#   Runs experiments/run_experiment.py once using the memory config
#   experiments/configs/memory/METHOD.yaml together with the shared env,
#   solver and tool configs and the OVERRIDES above.
#   Returns non-zero (aborting the script under `set -e`) when the memory
#   config file does not exist or when the Python run fails.
run_memory() {
    local method="$1"
    local mem_cfg="experiments/configs/memory/${method}.yaml"

    # Fail early with a readable message instead of launching Python with a
    # config path that does not exist.
    if [ ! -f "$mem_cfg" ]; then
        echo "error: memory config not found: $mem_cfg" >&2
        return 1
    fi

    echo ""
    echo "============================================================"
    echo " Memory: $method | Limit: $LIMIT | Model: $MODEL"
    echo "============================================================"
    echo ""

    python experiments/run_experiment.py \
        --env "$ENV_CFG" \
        --solver "$SOLVER_CFG" \
        --tool "$TOOL_CFG" \
        --memory "$mem_cfg" \
        --override "${OVERRIDES[@]}"
}

# ── Execution ─────────────────────────────────────────────────────────────────
METHODS=("generative" "voyager" "memorybank")

# --only narrows the run to the single requested method; otherwise every
# method in METHODS is run in order.
if [ -n "$ONLY" ]; then
    METHODS=("$ONLY")
fi

for method in "${METHODS[@]}"; do
    run_memory "$method"
done

echo ""
echo "All experiments completed."
14 changes: 7 additions & 7 deletions src/memory/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,20 @@
from .base import MemoryBase
from .methods import (
EmptyMemory,
# GenerativeMASMemory,
# VoyagerMASMemory,
# MemoryBankMASMemory,
GenerativeMASMemory,
VoyagerMASMemory,
MemoryBankMASMemory,
# ChatDevMASMemory,
# MetaGPTMASMemory,
# GMemory,
)

__all__ = [
# "MemoryBase",
"MemoryBase",
"EmptyMemory",
# "GenerativeMASMemory",
# "VoyagerMASMemory",
# "MemoryBankMASMemory",
"GenerativeMASMemory",
"VoyagerMASMemory",
"MemoryBankMASMemory",
# "ChatDevMASMemory",
# "MetaGPTMASMemory",
# "GMemory",
Expand Down
12 changes: 6 additions & 6 deletions src/memory/methods/__init__.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
# src/memory/methods/__init__.py
from .empty import EmptyMemory
# from .generative import GenerativeMASMemory
# from .voyager import VoyagerMASMemory
# from .memorybank import MemoryBankMASMemory
from .generative import GenerativeMASMemory
from .voyager import VoyagerMASMemory
from .memorybank import MemoryBankMASMemory
# from .chatdev import ChatDevMASMemory
# from .metagpt import MetaGPTMASMemory
# from .GMemory import GMemory

__all__ = [
"EmptyMemory",
# "GenerativeMASMemory",
# "VoyagerMASMemory",
# "MemoryBankMASMemory",
"GenerativeMASMemory",
"VoyagerMASMemory",
"MemoryBankMASMemory",
# "ChatDevMASMemory",
# "MetaGPTMASMemory",
# "GMemory",
Expand Down
Loading