6 changes: 3 additions & 3 deletions molmo_spaces/env/object_manager.py
@@ -1614,9 +1614,9 @@ def clear(self):
def get_body_to_geoms(self):
body_to_geom_ids = defaultdict(set)
for geom_id in range(0, self.model.ngeom):
body_id = self.model.geom(geom_id).bodyid
root_id = self.model.body(body_id).rootid
body_to_geom_ids[int(root_id)].add(int(geom_id))
body_id = int(self.model.geom(geom_id).bodyid.item())
root_id = int(self.model.body(body_id).rootid.item())
body_to_geom_ids[root_id].add(geom_id)
return {
key: sorted(values)
for key, values in body_to_geom_ids.items()
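
As the `.item()` calls suggest, `bodyid` and `rootid` come back from the MuJoCo bindings as single-element NumPy arrays, and calling `int()` on such an array is deprecated in recent NumPy releases; `.item()` extracts the plain Python scalar explicitly. A minimal standalone sketch of the difference (NumPy only; the values are illustrative):

```python
import numpy as np

# What model.geom(i).bodyid looks like from the MuJoCo bindings: a 1-element array.
body_id = np.array([3])

# int(body_id) still works but triggers a DeprecationWarning on NumPy >= 1.25;
# .item() is the explicit, warning-free way to pull out the Python scalar.
key = int(body_id.item())

body_to_geom_ids = {key: sorted({7, 9})}
print(body_to_geom_ids)  # {3: [7, 9]}
```
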
9 changes: 8 additions & 1 deletion molmo_spaces/evaluation/README.md
@@ -12,7 +12,6 @@ This README focuses on benchmark installation and running.
- For submitting results, see the GitHub issue in the repository [here](https://github.com/allenai/molmospaces/issues/8).
- Theoretical notes on policy comparison can be found [here](https://docs.google.com/document/d/1FcMxJgAQ_2Ojd2uu8HE2MBfD6RE53zcXa55_r8EfPts/export?format=pdf)


## Concepts

The MolmoSpaces **leaderboard** shows the results of various policies on benchmarks.
@@ -75,6 +74,10 @@ uv run scripts/serve_policy.py --port=8080 policy:checkpoint \

#### 2. Run the benchmark

The concrete commands for each task type on our [leaderboard](https://molmospaces.allen.ai/leaderboard) are documented here:
- MolmoSpaces tasks (`MS-` prefix): [ms-bench](ms-bench.md)
- MolmoBot tasks (`MB-` prefix): [mb-bench](mb-bench.md)

If using OpenPI models: `pip install openpi_client`.

For this we chose the easy `MS-Pick` benchmark, which is located at `assets/benchmarks/molmospaces-bench-v1/procthor-10k/FrankaPickDroidMiniBench/FrankaPickDroidMiniBench_json_benchmark_20251231/`.
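
A minimal invocation for that benchmark might look like the following, adapted from the Pick entry in [ms-bench](ms-bench.md) (substitute your own policy config):

```bash
python molmo_spaces/evaluation/eval_main.py <YOUR_POLICY_CONFIG> \
    --benchmark_dir assets/benchmarks/molmospaces-bench-v1/procthor-10k/FrankaPickDroidMiniBench/FrankaPickDroidMiniBench_json_benchmark_20251231
```
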
@@ -197,6 +200,10 @@ class MyEvalConfig(JsonBenchmarkEvalConfig):

### 4. Run Evaluation

The concrete commands for each task type on our [leaderboard](https://molmospaces.allen.ai/leaderboard) are documented here:
- MolmoSpaces tasks (`MS-` prefix): [ms-bench](ms-bench.md)
- MolmoBot tasks (`MB-` prefix): [mb-bench](mb-bench.md)

Command line:

```bash
26 changes: 24 additions & 2 deletions molmo_spaces/evaluation/eval_main.py
@@ -231,7 +231,14 @@ def get_args():
"--max_episodes",
type=int,
default=None,
help="Maximum number of episodes to evaluate from benchmark. If None, evaluates all episodes.",
help="Limit number of episodes to evaluate from benchmark. If None, evaluates all episodes, else, evaluates only the episodes for the houses used in the first `max_episodes`. Note that the final number of episodes can differ from `max_episodes` if more than one episode is sampled for any of the houses among the first `max_episodes` episodes.",
)
parser.add_argument(
"--camera_names",
type=str,
nargs="+",
default=None,
help="Override policy_config.camera_names (e.g. --camera_names randomized_zed2_analogue_1 wrist_camera).",
)

# Eval camera randomization flags (shared across all JSON eval entry points)
@@ -354,6 +361,7 @@ class EvalRuntimeParams:
"""

episode_idx: int | None = None
max_episodes: int | None = None
add_custom_object: bool = False
custom_object_path: str | Path | None = None
custom_object_name: str | None = None
@@ -442,6 +450,8 @@ def run_evaluation(
preloaded_policy: BasePolicy | None = None,
max_episodes: int | None = None,
camera_config_override: Any | None = None,
camera_names_override: list[str] | None = None,
use_filament: bool = False,
environment_light_intensity: float | None = None,
episode_idx: int | None = None,
add_custom_object: bool = False,
@@ -469,6 +479,8 @@ def run_evaluation(
max_episodes: Maximum number of episodes to evaluate from benchmark. If None, evaluates all episodes.
camera_config_override: Optional camera system config (e.g. FrankaEvalCameraSystem) to
replace the default camera_config on the experiment config.
camera_names_override: Optional list of camera names to override
policy_config.camera_names (e.g. ["randomized_zed2_analogue_1", "wrist_camera"]).
episode_idx: Index of a specific episode to evaluate. If None, evaluates all episodes.
add_custom_object: Whether to replace the target object with a custom object.
custom_object_path: Path to the custom object XML file. Required if add_custom_object is True.
@@ -604,15 +616,22 @@ def run_evaluation(
camera_config_override=camera_config_override,
)

# Custom filmanet settings to overwrite by the user
# Custom filament settings that the user can override
exp_config.use_filament |= use_filament
exp_config.environment_light_intensity = (
environment_light_intensity or exp_config.environment_light_intensity
)

# Override policy camera names if requested
if camera_names_override is not None:
log.info(f"Overriding policy_config.camera_names: {camera_names_override}")
exp_config.policy_config.camera_names = camera_names_override

# Patch config with evaluation-specific runtime parameters
exp_config = JsonEvalRunner.patch_config(
exp_config=exp_config,
episode_idx=episode_idx,
max_episodes=max_episodes,
add_custom_object=add_custom_object,
custom_object_path=custom_object_path,
custom_object_name=custom_object_name,
@@ -729,8 +748,11 @@ def main() -> None:
num_workers=args.num_workers,
use_wandb=not args.no_wandb,
wandb_project=args.wandb_project,
max_episodes=args.max_episodes,
use_filament=args.use_filament,
environment_light_intensity=args.environment_light_intensity,
camera_config_override=eval_camera_config,
camera_names_override=args.camera_names,
episode_idx=args.idx,
add_custom_object=args.add_custom_object,
custom_object_path=args.custom_object_path,
23 changes: 20 additions & 3 deletions molmo_spaces/evaluation/json_eval_runner.py
@@ -55,6 +55,7 @@ class JsonEvalRunner(ParallelRolloutRunner):
def patch_config(
exp_config: MlSpacesExpConfig,
episode_idx: int | None = None,
max_episodes: int | None = None,
add_custom_object: bool = False,
custom_object_path: str | Path | None = None,
custom_object_name: str | None = None,
@@ -69,6 +70,11 @@ def patch_config(
exp_config: The experiment config to patch
episode_idx: Optional index of a specific episode to evaluate. If provided,
only that episode will be evaluated and the process will stop after it.
max_episodes: Optional maximum number of episodes to evaluate. If provided,
only the episodes for the houses used in the first N episodes will be
evaluated. Note that the final number of episodes can differ from N
if more than one episode is sampled for any of the houses among the
first N episodes.
add_custom_object: Whether to replace the target object with a custom object.
custom_object_path: Path to the custom object XML file. Required if
add_custom_object is True.
@@ -89,6 +95,7 @@ def patch_config(
# eval_runtime_params is now a proper field in MlSpacesExpConfig, so normal assignment works
exp_config.eval_runtime_params = EvalRuntimeParams(
episode_idx=episode_idx,
max_episodes=max_episodes,
add_custom_object=add_custom_object,
custom_object_path=custom_object_path,
custom_object_name=custom_object_name,
@@ -127,13 +134,19 @@ def __init__(
f"Expected benchmark.json file with list of episode specs."
)

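# Optionally cap the benchmark at the first `max_episodes` episodes before grouping by house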
eval_params = exp_config.eval_runtime_params
if eval_params.max_episodes is not None and len(all_episodes) > eval_params.max_episodes:
log.info(
f"Limiting to first {eval_params.max_episodes} of {len(all_episodes)} episodes"
)
all_episodes = all_episodes[: eval_params.max_episodes]

self._episodes_by_house: dict[int, list[EpisodeSpec]] = defaultdict(list)
for ep in all_episodes:
self._episodes_by_house[ep.house_index].append(ep)
self._episodes_by_house = dict(self._episodes_by_house)

# If episode_idx is specified, only process the house containing that episode
eval_params = exp_config.eval_runtime_params
episode_idx = eval_params.episode_idx
if episode_idx is not None:
if episode_idx < 0 or episode_idx >= len(all_episodes):
@@ -190,8 +203,13 @@ def load_episodes_for_house(
)
return [], None

# Filter by episode index if specified
eval_params = exp_config.eval_runtime_params

# Truncate to max_episodes before any filtering
if eval_params.max_episodes is not None and len(all_episodes) > eval_params.max_episodes:
all_episodes = all_episodes[: eval_params.max_episodes]

# Filter by episode index if specified
episode_idx = eval_params.episode_idx
if episode_idx is not None:
if episode_idx < 0 or episode_idx >= len(all_episodes):
@@ -217,7 +235,6 @@
return [], None

# Apply custom object replacement if requested
eval_params = exp_config.eval_runtime_params
add_custom_object = eval_params.add_custom_object
custom_object_path = eval_params.custom_object_path
custom_object_name = eval_params.custom_object_name
93 changes: 93 additions & 0 deletions molmo_spaces/evaluation/mb-bench.md
@@ -0,0 +1,93 @@
# MolmoBot Benchmarks

## Usage

We first run an evaluation like
```bash
python molmo_spaces/evaluation/eval_main.py \
<YOUR_POLICY_CONFIG> \
[OPTIONS] \
--benchmark_dir <BENCHMARK_DIR> \
--output_dir <eval_output_dir>
```
Please see detailed commands for each task type below, and replace `<YOUR_POLICY_CONFIG>` with your evaluation config (e.g. `molmo_spaces.evaluation.configs.evaluation_configs:PiPolicyEvalConfig`).
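
For example, a complete invocation of the Pi policy config against the Pick-MSProc benchmark below might look like this (the output directory is a placeholder of your choosing):

```bash
python molmo_spaces/evaluation/eval_main.py \
    molmo_spaces.evaluation.configs.evaluation_configs:PiPolicyEvalConfig \
    --benchmark_dir $MLSPACES_ASSETS_DIR/benchmarks/molmospaces-bench-v2/procthor-10k/FrankaPickDroidMiniBench/FrankaPickDroidMiniBench_json_benchmark_20251231 \
    --output_dir ./eval_out
```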

Finally, run the evaluation output script, which aggregates the results into CSV files:
```bash
python scripts/benchmarks/eval_to_csv.py \
<eval_output_dir>/<date_str> \
<policy_name> \
--success-condition both \
--output-csv /eg/path/to/<task_type>/<policy_name>.csv
```
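
As a concrete example, if the run above wrote its results under `./eval_out/2026-01-15` and you want the policy reported as `pi0_base` (both names are hypothetical), the call becomes:

```bash
python scripts/benchmarks/eval_to_csv.py \
    ./eval_out/2026-01-15 \
    pi0_base \
    --success-condition both \
    --output-csv results/pick/pi0_base.csv
```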

## Benchmarks with classic renderer

For benchmarks using the classic renderer, install the `mujoco` extra from [our dependencies](../../pyproject.toml), e.g., by calling
```bash
pip install -e ".[mujoco]"
```
from the project root directory.

### Pick-MSProc (Pick-v1.5)

```bash
python molmo_spaces/evaluation/eval_main.py <YOUR_POLICY_CONFIG> \
--benchmark_dir $MLSPACES_ASSETS_DIR/benchmarks/molmospaces-bench-v2/procthor-10k/FrankaPickDroidMiniBench/FrankaPickDroidMiniBench_json_benchmark_20251231
```

### Pick-Classic (Pick-v2-classic)

```bash
python molmo_spaces/evaluation/eval_main.py <YOUR_POLICY_CONFIG> \
--benchmark_dir $MLSPACES_ASSETS_DIR/benchmarks/molmospaces-bench-v2/procthor-objaverse/FrankaPickHardBench/FrankaPickHardBench_20260206_json_benchmark
```

## Benchmarks with filament renderer

For benchmarks using the filament renderer, install the `mujoco-filament` extra from [our dependencies](../../pyproject.toml), e.g., by calling
```bash
pip install -e ".[mujoco-filament]"
```
from the project root directory and pass the `--use-filament` option to the evaluation script.

### Pick-Filament (Pick-v2-filament)

```bash
python molmo_spaces/evaluation/eval_main.py <YOUR_POLICY_CONFIG> \
--use-filament \
--benchmark_dir $MLSPACES_ASSETS_DIR/benchmarks/molmospaces-bench-v2/procthor-objaverse/FrankaPickHardBench/FrankaPickHardBench_20260206_json_benchmark
```

### Pick-RandCam (Pick-v2-rand-cam)

```bash
python molmo_spaces/evaluation/eval_main.py <YOUR_POLICY_CONFIG> \
--use-filament \
--camera_names randomized_zed2_analogue_1 wrist_camera_zed_mini \
--benchmark_dir $MLSPACES_ASSETS_DIR/benchmarks/molmospaces-bench-v2/procthor-objaverse/FrankaPickHardBench/FrankaPickHardBench_20260206_json_benchmark
```

### Pick & Place (PnP-v2)

```bash
python molmo_spaces/evaluation/eval_main.py <YOUR_POLICY_CONFIG> \
--use-filament \
--benchmark_dir $MLSPACES_ASSETS_DIR/benchmarks/molmospaces-bench-v2/procthor-objaverse/FrankaPickandPlaceHardBench/FrankaPickandPlaceHardBench_20260206_json_benchmark
```

### Pick & Place-NextTo (PnP-next-to-v2)

```bash
python molmo_spaces/evaluation/eval_main.py <YOUR_POLICY_CONFIG> \
--use-filament \
--benchmark_dir $MLSPACES_ASSETS_DIR/benchmarks/molmospaces-bench-v2/procthor-objaverse/FrankaPickandPlaceNextToHardBench/FrankaPickandPlaceNextToHardBench_20260305_json_benchmark
```

### Pick & Place-Color (PnP-color-v2)

```bash
python molmo_spaces/evaluation/eval_main.py <YOUR_POLICY_CONFIG> \
--use-filament \
--benchmark_dir $MLSPACES_ASSETS_DIR/benchmarks/molmospaces-bench-v2/procthor-objaverse/FrankaPickandPlaceColorHardBench/FrankaPickandPlaceColorHardBench_20260304_json_benchmark
```
26 changes: 19 additions & 7 deletions molmo_spaces/evaluation/ms-bench.md
@@ -2,36 +2,48 @@

## Usage

We first run an evaluation like
```bash
python molmo_spaces/evaluation/eval_main.py <YOUR_POLICY_CONFIG> --benchmark_dir <BENCHMARK_DIR>
python molmo_spaces/evaluation/eval_main.py \
<YOUR_POLICY_CONFIG> \
--benchmark_dir <BENCHMARK_DIR> \
--output_dir <eval_output_dir>
```
Please see detailed commands for each task type below, and replace `<YOUR_POLICY_CONFIG>` with your evaluation config (e.g. `molmo_spaces.evaluation.configs.evaluation_configs:PiPolicyEvalConfig`).

Finally, run the evaluation output script, which aggregates the results into CSV files:
```bash
python scripts/benchmarks/eval_to_csv.py \
<eval_output_dir>/<date_str> \
<policy_name> \
--success-condition both \
--output-csv /eg/path/to/<task_type>/<policy_name>.csv
```

Replace `<YOUR_POLICY_CONFIG>` with your evaluation config (e.g. `molmo_spaces.evaluation.configs.evaluation_configs:PiPolicyEvalConfig`).

## Benchmarks

### Close
### Close (Close-v1)

```bash
python molmo_spaces/evaluation/eval_main.py YOUR_POLICY_CONFIG \
--benchmark_dir assets/benchmarks/molmospaces-bench-v1/ithor/FrankaCloseDataGenConfig/FrankaCloseDataGenConfig_20260123_json_benchmark
```

### Open
### Open (Open-v1)

```bash
python molmo_spaces/evaluation/eval_main.py YOUR_POLICY_CONFIG \
--benchmark_dir assets/benchmarks/molmospaces-bench-v1/ithor/FrankaOpenDataGenConfig/FrankaOpenDataGenConfig_20260123_json_benchmark
```

### Pick
### Pick (Pick-v1.1)

```bash
python molmo_spaces/evaluation/eval_main.py YOUR_POLICY_CONFIG \
--benchmark_dir assets/benchmarks/molmospaces-bench-v1/procthor-10k/FrankaPickDroidMiniBench/FrankaPickDroidMiniBench_json_benchmark_20251231
```

### Pick and Place
### Pick and Place (PnP-v1)

```bash
python molmo_spaces/evaluation/eval_main.py YOUR_POLICY_CONFIG \
2 changes: 1 addition & 1 deletion molmo_spaces/molmo_spaces_constants.py
@@ -125,7 +125,7 @@ def resource_manager_log_level(log_level=logging.DEBUG):
},
benchmarks={
"molmospaces-bench-v1": "20260408",
"molmospaces-bench-v2": "20240407",
"molmospaces-bench-v2": "20260415",
},
)
