From 0dd088b9263cd21d713c520aa1557129af5d16b3 Mon Sep 17 00:00:00 2001
From: brettskymind
Date: Thu, 21 Oct 2021 12:46:35 -0700
Subject: [PATCH 1/8] auto norm option

---
 pathmind/simulation.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/pathmind/simulation.py b/pathmind/simulation.py
index b665c6d..c7298d3 100644
--- a/pathmind/simulation.py
+++ b/pathmind/simulation.py
@@ -53,9 +53,16 @@ class Simulation:
     a single value for agent 0, then action[0] will be a float value, otherwise
     a numpy array with specified shape. You use "action" to apply the next
     actions to your agents in the "step" function.
+
+    During training, you may optionally provide "reward_weights", one weight per
+    reward term. At each step, the reward signal is the weighted sum of these
+    terms. Reward terms can also be normalized so that each contributes a
+    comparable total signal magnitude over an episode; enable "auto_norm_reward".
     """
 
     action: Dict[int, Union[float, np.ndarray]] = None
+    reward_weights: List[float] = None
+    auto_norm_reward: bool = False
 
     def __init__(self, *args, **kwargs):
         """Set any properties and initial states needed for your simulation."""
@@ -175,7 +182,8 @@ def run(
             writer = csv.writer(out)
             writer.writerows(result)
 
-    def train(self, base_folder: str = "./", observation_yaml: str = None):
+    def train(self, base_folder: str = "./", observation_yaml: str = None,
+              reward_weights: List[float] = None, auto_norm_reward: bool = False):
         """
         :param base_folder the path to your base folder containing all your Python code. Defaults to the
             current working directory, which assumes you start training from the base of your code base.
@@ -191,6 +199,9 @@ def train(self, base_folder: str = "./", observation_yaml: str = None):
         else:
             obs_yaml = observation_yaml
 
+        self.reward_weights = reward_weights
+        self.auto_norm_reward = auto_norm_reward
+
         token = os.environ.get("PATHMIND_TOKEN")
         if not token:
             raise ValueError(
@@ -208,6 +219,7 @@ def train(self, base_folder: str = "./", observation_yaml: str = None):
                 -F 'start=true' \
                 -F 'multiAgent={multi_agent}' \
                 -F 'obsSelection={obs_yaml}' \
+                -F 'useAutoNorm={auto_norm_reward}' \
             https://api.pathmind.com/py/upload
             """
 

From a7690122e7ccb891a523d54855d6627b31348703 Mon Sep 17 00:00:00 2001
From: brettskymind
Date: Thu, 21 Oct 2021 13:22:34 -0700
Subject: [PATCH 2/8] two reward example

---
 .../two_reward_signal_mouse_env_pathmind.py | 56 +++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 tests/examples/mouse/two_reward_signal_mouse_env_pathmind.py

diff --git a/tests/examples/mouse/two_reward_signal_mouse_env_pathmind.py b/tests/examples/mouse/two_reward_signal_mouse_env_pathmind.py
new file mode 100644
index 0000000..0b7e550
--- /dev/null
+++ b/tests/examples/mouse/two_reward_signal_mouse_env_pathmind.py
@@ -0,0 +1,56 @@
+import typing
+
+from pathmind.simulation import Continuous, Discrete, Simulation
+
+
+class TwoRewardMouseAndCheese(Simulation):
+
+    mouse = (0, 0)
+    cheese = (4, 4)
+    steps = 0
+
+    def number_of_agents(self) -> int:
+        return 1
+
+    def action_space(self, agent_id) -> typing.Union[Continuous, Discrete]:
+        return Discrete(4)
+
+    def reset(self) -> None:
+        self.mouse = (0, 0)
+        self.cheese = (4, 4)
+        self.steps = 0
+
+    def step(self) -> None:
+        self.steps += 1
+
+        action = self.action[0]
+
+        if action == 0:  # move up
+            self.mouse = (min(self.mouse[0] + 1, 5), self.mouse[1])
+        elif action == 1:  # move right
+            self.mouse = (self.mouse[0], min(self.mouse[1] + 1, 5))
+        elif action == 2:  # move down
+            self.mouse = (max(self.mouse[0] - 1, 0), self.mouse[1])
+        elif action == 3:  # move left
+            self.mouse = (self.mouse[0], max(self.mouse[1] - 1, 0))
+        else:
+            raise ValueError("Invalid action")
+
+    def get_observation(self, agent_id) -> typing.Dict[str, float]:
+        return {
+            "mouse_row": float(self.mouse[0]) / 5.0,
+            "mouse_col": float(self.mouse[1]) / 5.0,
+            "distance_to_cheese_row": abs(self.cheese[0] - self.mouse[0]) / 5.0,
+            "distance_to_cheese_col": abs(self.cheese[1] - self.mouse[1]) / 5.0,
+            "cheese_row": float(self.cheese[0]) / 5.0,
+            "cheese_col": float(self.cheese[1]) / 5.0,
+        }
+
+    def get_reward(self, agent_id) -> typing.Dict[str, float]:
+        return {
+            "found_cheese": 1 if self.mouse == self.cheese else 0,
+            "took_step": -1
+        }
+
+    def is_done(self, agent_id) -> bool:
+        return self.mouse == self.cheese

From 05bc03045cad02637ffbee1fb9968b1f582a46b4 Mon Sep 17 00:00:00 2001
From: brettskymind
Date: Mon, 25 Oct 2021 12:20:04 -0700
Subject: [PATCH 3/8] Optional

---
 pathmind/simulation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pathmind/simulation.py b/pathmind/simulation.py
index c7298d3..e971aa9 100644
--- a/pathmind/simulation.py
+++ b/pathmind/simulation.py
@@ -183,7 +183,7 @@ def run(
             writer.writerows(result)
 
     def train(self, base_folder: str = "./", observation_yaml: str = None,
-              reward_weights: List[float] = None, auto_norm_reward: bool = False):
+              reward_weights: Optional[List[float]] = None, auto_norm_reward: bool = False):
         """
         :param base_folder the path to your base folder containing all your Python code. Defaults to the
             current working directory, which assumes you start training from the base of your code base.

From c21c0146b6688c8a4029d6b426b40da20be5d955 Mon Sep 17 00:00:00 2001
From: brettskymind
Date: Mon, 25 Oct 2021 16:16:13 -0700
Subject: [PATCH 4/8] update mouses

---
 .../{two_reward_signal_mouse_env_pathmind.py => mouses.py} | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)
 rename tests/examples/mouse/{two_reward_signal_mouse_env_pathmind.py => mouses.py} (91%)

diff --git a/tests/examples/mouse/two_reward_signal_mouse_env_pathmind.py b/tests/examples/mouse/mouses.py
similarity index 91%
rename from tests/examples/mouse/two_reward_signal_mouse_env_pathmind.py
rename to tests/examples/mouse/mouses.py
index 0b7e550..7147386 100644
--- a/tests/examples/mouse/two_reward_signal_mouse_env_pathmind.py
+++ b/tests/examples/mouse/mouses.py
@@ -8,6 +8,8 @@ class TwoRewardMouseAndCheese(Simulation):
     mouse = (0, 0)
     cheese = (4, 4)
     steps = 0
+    reward_weights = [1.0, 0.5]
+    auto_norm_reward = True
 
     def number_of_agents(self) -> int:
         return 1
@@ -47,10 +49,7 @@ def get_observation(self, agent_id) -> typing.Dict[str, float]:
         }
 
     def get_reward(self, agent_id) -> typing.Dict[str, float]:
-        return {
-            "found_cheese": 1 if self.mouse == self.cheese else 0,
-            "took_step": -1
-        }
+        return {"found_cheese": 1 if self.mouse == self.cheese else 0, "took_step": -1}
 
     def is_done(self, agent_id) -> bool:
         return self.mouse == self.cheese

From ecc97ce6bc2aa9175de211a150b3ab3a4d4d74d0 Mon Sep 17 00:00:00 2001
From: brettskymind
Date: Mon, 25 Oct 2021 16:43:42 -0700
Subject: [PATCH 5/8] readme

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 0f2e4b9..975aa83 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ See the following two examples to understand the mechanics behind Pathmind's Pyt
 
 - **Single Agent Example**: [Mouse chasing cheese example](https://github.com/PathmindAI/pathmind-api/blob/main/tests/examples/mouse/mouse_env_pathmind.py)
 - **Multi-Agent Example** - [Multiple mouses chasing cheese example](https://github.com/PathmindAI/pathmind-api/blob/main/tests/examples/mouse/multi_mouse_env_pathmind.py)
+- **Multi-Reward Example** - [Reward balancing mouses chasing cheese example](https://github.com/PathmindAI/pathmind-api/blob/main/tests/examples/mouse/mouses.py)
 
 Once you have a `Simulation` ready, you can use it to train a model with Pathmind as follows.
 
@@ -142,3 +143,4 @@ The interface is inspired by OpenAI gym, but differs in certain points:
   select which ones to use for training in the Pathmind app.
 - You can rely on Pathmind's scalable backend for training and don't have to
   run your workloads on your machine.
+- Competing reward signals may optionally be weighted and auto-normalized.

From 53c83744f5a1fe22279bc56ddaed0d01e0af8ffe Mon Sep 17 00:00:00 2001
From: brettskymind
Date: Tue, 26 Oct 2021 11:21:55 -0700
Subject: [PATCH 6/8] mouse

---
 tests/examples/mouse/reward.py                |  2 +-
 .../{mouses.py => two_reward_balance.py}      |  0
 tests/examples/mouse/two_reward_no_balance.py | 53 +++++++++++++++++++
 3 files changed, 54 insertions(+), 1 deletion(-)
 rename tests/examples/mouse/{mouses.py => two_reward_balance.py} (100%)
 create mode 100644 tests/examples/mouse/two_reward_no_balance.py

diff --git a/tests/examples/mouse/reward.py b/tests/examples/mouse/reward.py
index 1bdb277..310e4b6 100644
--- a/tests/examples/mouse/reward.py
+++ b/tests/examples/mouse/reward.py
@@ -1,2 +1,2 @@
-def reward_function(rew: dict) -> float:
+def reward_function(rew: dict):
     return rew["found_cheese"] * 2
diff --git a/tests/examples/mouse/mouses.py b/tests/examples/mouse/two_reward_balance.py
similarity index 100%
rename from tests/examples/mouse/mouses.py
rename to tests/examples/mouse/two_reward_balance.py
diff --git a/tests/examples/mouse/two_reward_no_balance.py b/tests/examples/mouse/two_reward_no_balance.py
new file mode 100644
index 0000000..124ad63
--- /dev/null
+++ b/tests/examples/mouse/two_reward_no_balance.py
@@ -0,0 +1,53 @@
+import typing
+
+from pathmind.simulation import Continuous, Discrete, Simulation
+
+
+class TwoRewardMouseAndCheeseNoBalance(Simulation):
+
+    mouse = (0, 0)
+    cheese = (4, 4)
+    steps = 0
+
+    def number_of_agents(self) -> int:
+        return 1
+
+    def action_space(self, agent_id) -> typing.Union[Continuous, Discrete]:
+        return Discrete(4)
+
+    def reset(self) -> None:
+        self.mouse = (0, 0)
+        self.cheese = (4, 4)
+        self.steps = 0
+
+    def step(self) -> None:
+        self.steps += 1
+
+        action = self.action[0]
+
+        if action == 0:  # move up
+            self.mouse = (min(self.mouse[0] + 1, 5), self.mouse[1])
+        elif action == 1:  # move right
+            self.mouse = (self.mouse[0], min(self.mouse[1] + 1, 5))
+        elif action == 2:  # move down
+            self.mouse = (max(self.mouse[0] - 1, 0), self.mouse[1])
+        elif action == 3:  # move left
+            self.mouse = (self.mouse[0], max(self.mouse[1] - 1, 0))
+        else:
+            raise ValueError("Invalid action")
+
+    def get_observation(self, agent_id) -> typing.Dict[str, float]:
+        return {
+            "mouse_row": float(self.mouse[0]) / 5.0,
+            "mouse_col": float(self.mouse[1]) / 5.0,
+            "distance_to_cheese_row": abs(self.cheese[0] - self.mouse[0]) / 5.0,
+            "distance_to_cheese_col": abs(self.cheese[1] - self.mouse[1]) / 5.0,
+            "cheese_row": float(self.cheese[0]) / 5.0,
+            "cheese_col": float(self.cheese[1]) / 5.0,
+        }
+
+    def get_reward(self, agent_id) -> typing.Dict[str, float]:
"took_step": -1} + + def is_done(self, agent_id) -> bool: + return self.mouse == self.cheese From cb231de68139d8b6e595fbc10ebf76631c9eb521 Mon Sep 17 00:00:00 2001 From: brettskymind Date: Tue, 26 Oct 2021 11:47:08 -0700 Subject: [PATCH 7/8] update example --- README.md | 2 +- tests/examples/mouse/reward.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 975aa83..215063d 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ See the following two examples to understand the mechanics behind Pathmind's Pyt - **Single Agent Example**: [Mouse chasing cheese example](https://github.com/PathmindAI/pathmind-api/blob/main/tests/examples/mouse/mouse_env_pathmind.py) - **Multi-Agent Example** - [Multiple mouses chasing cheese example](https://github.com/PathmindAI/pathmind-api/blob/main/tests/examples/mouse/multi_mouse_env_pathmind.py) -- **Multi-Reward Example** - [Reward balancing mouses chasing cheese example](https://github.com/PathmindAI/pathmind-api/blob/main/tests/examples/mouse/mouses.py) +- **Multi-Reward Example** - [Reward balancing mouses chasing cheese example](https://github.com/PathmindAI/pathmind-api/blob/main/tests/examples/mouse/two_reward_balance.py) Once you have a `Simulation` ready, you can use it to train a model with Pathmind as follows. diff --git a/tests/examples/mouse/reward.py b/tests/examples/mouse/reward.py index 310e4b6..1bdb277 100644 --- a/tests/examples/mouse/reward.py +++ b/tests/examples/mouse/reward.py @@ -1,2 +1,2 @@ -def reward_function(rew: dict): +def reward_function(rew: dict) -> float: return rew["found_cheese"] * 2 From ab8b78cf908f34652af00d4a0b305d81ff825877 Mon Sep 17 00:00:00 2001 From: brettskymind Date: Tue, 26 Oct 2021 11:52:06 -0700 Subject: [PATCH 8/8] format --- README.md | 2 +- pathmind/simulation.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e178c64..82446ec 100644 --- a/README.md +++ b/README.md @@ -158,4 +158,4 @@ Run the tests locally: ```shell pre-commit run --all-files pytest -``` \ No newline at end of file +``` diff --git a/pathmind/simulation.py b/pathmind/simulation.py index 13c9026..ce67b84 100644 --- a/pathmind/simulation.py +++ b/pathmind/simulation.py @@ -176,8 +176,13 @@ def run( write_table(table=table, out_csv=out_csv) write_table(table=summary, out_csv=summary_csv) - def train(self, base_folder: str = "./", observation_yaml: str = None, - reward_weights: Optional[List[float]] = None, auto_norm_reward: bool = False): + def train( + self, + base_folder: str = "./", + observation_yaml: str = None, + reward_weights: Optional[List[float]] = None, + auto_norm_reward: bool = False, + ): """ :param base_folder the path to your base folder containing all your Python code. Defaults to the current working directory, which assumes you start training from the base of your code base.