diff --git a/README.md b/README.md
index c9666b3..82446ec 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ See the following two examples to understand the mechanics behind Pathmind's Pyt
 
 - **Single Agent Example**: [Mouse chasing cheese example](https://github.com/PathmindAI/pathmind-api/blob/main/tests/examples/mouse/mouse_env_pathmind.py)
 - **Multi-Agent Example** - [Multiple mouses chasing cheese example](https://github.com/PathmindAI/pathmind-api/blob/main/tests/examples/mouse/multi_mouse_env_pathmind.py)
+- **Multi-Reward Example** - [Reward-balancing mouse chasing cheese example](https://github.com/PathmindAI/pathmind-api/blob/main/tests/examples/mouse/two_reward_balance.py)
 
 Once you have a `Simulation` ready, you can use it to train a model with Pathmind as follows.
 
@@ -142,6 +143,7 @@ The interface is inspired by OpenAI gym, but differs in certain points:
   select which ones to use for training in the Pathmind app.
 - You can rely on Pathmind's scalable backend for training and don't have to run your
   workloads on your machine.
+- Competing reward signals can optionally be weighted and auto-normalized.
 
 ## Development and running tests
 
diff --git a/pathmind/simulation.py b/pathmind/simulation.py
index 64fb51f..ce67b84 100644
--- a/pathmind/simulation.py
+++ b/pathmind/simulation.py
@@ -54,9 +54,16 @@ class Simulation:
     a single value for agent 0, then action[0] will be a float value, otherwise
     a numpy array with specified shape. You use "action" to apply the next actions
     to your agents in the "step" function.
+
+    During training, you may optionally provide "reward_weights", one weight per
+    reward term. At each step, the reward signal is the weighted sum of these terms.
+    Setting "auto_norm_reward" additionally normalizes the terms so that each
+    contributes a comparable total signal magnitude over an episode.
     """
 
     action: Dict[int, Union[float, np.ndarray]] = None
+    reward_weights: List[float] = None
+    auto_norm_reward: bool = False
 
     def __init__(self, *args, **kwargs):
         """Set any properties and initial states needed for your simulation."""
@@ -169,7 +176,13 @@ def run(
         write_table(table=table, out_csv=out_csv)
         write_table(table=summary, out_csv=summary_csv)
 
-    def train(self, base_folder: str = "./", observation_yaml: str = None):
+    def train(
+        self,
+        base_folder: str = "./",
+        observation_yaml: str = None,
+        reward_weights: Optional[List[float]] = None,
+        auto_norm_reward: bool = False,
+    ):
         """
         :param base_folder the path to your base folder containing all your Python code. Defaults to the
             current working directory, which assumes you start training from the base of your code base.
@@ -185,6 +198,9 @@ def train(self, base_folder: str = "./", observation_yaml: str = None):
         else:
             obs_yaml = observation_yaml
 
+        self.reward_weights = reward_weights
+        self.auto_norm_reward = auto_norm_reward
+
         token = os.environ.get("PATHMIND_TOKEN")
         if not token:
             raise ValueError(
@@ -202,6 +218,7 @@ def train(self, base_folder: str = "./", observation_yaml: str = None):
             -F 'start=true' \
             -F 'multiAgent={multi_agent}' \
             -F 'obsSelection={obs_yaml}' \
+            -F 'useAutoNorm={auto_norm_reward}' \
             https://api.pathmind.com/py/upload
         """
 
diff --git a/tests/examples/mouse/two_reward_balance.py b/tests/examples/mouse/two_reward_balance.py
new file mode 100644
index 0000000..7147386
--- /dev/null
+++ b/tests/examples/mouse/two_reward_balance.py
@@ -0,0 +1,55 @@
+import typing
+
+from pathmind.simulation import Continuous, Discrete, Simulation
+
+
+class TwoRewardMouseAndCheese(Simulation):
+
+    mouse = (0, 0)
+    cheese = (4, 4)
+    steps = 0
+    reward_weights = [1.0, 0.5]
+    auto_norm_reward = True
+
+    def number_of_agents(self) -> int:
+        return 1
+
+    def action_space(self, agent_id) -> typing.Union[Continuous, Discrete]:
+        return Discrete(4)
+
+    def reset(self) -> None:
+        self.mouse = (0, 0)
+        self.cheese = (4, 4)
+        self.steps = 0
+
+    def step(self) -> None:
+        self.steps += 1
+
+        action = self.action[0]
+
+        if action == 0:  # move up
+            self.mouse = (min(self.mouse[0] + 1, 5), self.mouse[1])
+        elif action == 1:  # move right
+            self.mouse = (self.mouse[0], min(self.mouse[1] + 1, 5))
+        elif action == 2:  # move down
+            self.mouse = (max(self.mouse[0] - 1, 0), self.mouse[1])
+        elif action == 3:  # move left
+            self.mouse = (self.mouse[0], max(self.mouse[1] - 1, 0))
+        else:
+            raise ValueError("Invalid action")
+
+    def get_observation(self, agent_id) -> typing.Dict[str, float]:
+        return {
+            "mouse_row": float(self.mouse[0]) / 5.0,
+            "mouse_col": float(self.mouse[1]) / 5.0,
+            "distance_to_cheese_row": abs(self.cheese[0] - self.mouse[0]) / 5.0,
+            "distance_to_cheese_col": abs(self.cheese[1] - self.mouse[1]) / 5.0,
+            "cheese_row": float(self.cheese[0]) / 5.0,
+            "cheese_col": float(self.cheese[1]) / 5.0,
+        }
+
+    def get_reward(self, agent_id) -> typing.Dict[str, float]:
+        return {"found_cheese": 1 if self.mouse == self.cheese else 0, "took_step": -1}
+
+    def is_done(self, agent_id) -> bool:
+        return self.mouse == self.cheese
diff --git a/tests/examples/mouse/two_reward_no_balance.py b/tests/examples/mouse/two_reward_no_balance.py
new file mode 100644
index 0000000..124ad63
--- /dev/null
+++ b/tests/examples/mouse/two_reward_no_balance.py
@@ -0,0 +1,53 @@
+import typing
+
+from pathmind.simulation import Continuous, Discrete, Simulation
+
+
+class TwoRewardMouseAndCheeseNoBalance(Simulation):
+
+    mouse = (0, 0)
+    cheese = (4, 4)
+    steps = 0
+
+    def number_of_agents(self) -> int:
+        return 1
+
+    def action_space(self, agent_id) -> typing.Union[Continuous, Discrete]:
+        return Discrete(4)
+
+    def reset(self) -> None:
+        self.mouse = (0, 0)
+        self.cheese = (4, 4)
+        self.steps = 0
+
+    def step(self) -> None:
+        self.steps += 1
+
+        action = self.action[0]
+
+        if action == 0:  # move up
+            self.mouse = (min(self.mouse[0] + 1, 5), self.mouse[1])
+        elif action == 1:  # move right
+            self.mouse = (self.mouse[0], min(self.mouse[1] + 1, 5))
+        elif action == 2:  # move down
+            self.mouse = (max(self.mouse[0] - 1, 0), self.mouse[1])
+        elif action == 3:  # move left
+            self.mouse = (self.mouse[0], max(self.mouse[1] - 1, 0))
+        else:
+            raise ValueError("Invalid action")
+
+    def get_observation(self, agent_id) -> typing.Dict[str, float]:
+        return {
"mouse_row": float(self.mouse[0]) / 5.0, + "mouse_col": float(self.mouse[1]) / 5.0, + "distance_to_cheese_row": abs(self.cheese[0] - self.mouse[0]) / 5.0, + "distance_to_cheese_col": abs(self.cheese[1] - self.mouse[1]) / 5.0, + "cheese_row": float(self.cheese[0]) / 5.0, + "cheese_col": float(self.cheese[1]) / 5.0, + } + + def get_reward(self, agent_id) -> typing.Dict[str, float]: + return {"found_cheese": 1 if self.mouse == self.cheese else 0, "took_step": -1} + + def is_done(self, agent_id) -> bool: + return self.mouse == self.cheese