2 changes: 2 additions & 0 deletions README.md
@@ -28,6 +28,7 @@ See the following two examples to understand the mechanics behind Pathmind's Pyt

- **Single Agent Example**: [Mouse chasing cheese example](https://github.com/PathmindAI/pathmind-api/blob/main/tests/examples/mouse/mouse_env_pathmind.py)
- **Multi-Agent Example** - [Multiple mice chasing cheese example](https://github.com/PathmindAI/pathmind-api/blob/main/tests/examples/mouse/multi_mouse_env_pathmind.py)
- **Multi-Reward Example** - [Reward-balancing mice chasing cheese example](https://github.com/PathmindAI/pathmind-api/blob/main/tests/examples/mouse/two_reward_balance.py)

Once you have a `Simulation` ready, you can use it to train a model with Pathmind
as follows.
@@ -142,6 +143,7 @@ The interface is inspired by OpenAI gym, but differs in certain points:
select which ones to use for training in the Pathmind app.
- You can rely on Pathmind's scalable backend for training and don't have to
run your workloads on your machine.
- Competing reward signals can optionally be weighted and auto-normalized.

## Development and running tests

19 changes: 18 additions & 1 deletion pathmind/simulation.py
@@ -54,9 +54,16 @@ class Simulation:
a single value for agent 0, then action[0] will be a float value, otherwise
a numpy array with specified shape. You use "action" to apply the next actions
to your agents in the "step" function.

During training, you may optionally provide "reward_weights", one weight per
reward term. At each step, the reward signal is the weighted sum of these terms.
Reward terms can optionally be preprocessed so that each contributes a comparable
total signal magnitude over the episode; enable this with "auto_norm_reward".
"""

action: Dict[int, Union[float, np.ndarray]] = None
reward_weights: List[float] = None
auto_norm_reward: bool = False

def __init__(self, *args, **kwargs):
"""Set any properties and initial states needed for your simulation."""
@@ -169,7 +176,13 @@ def run(
write_table(table=table, out_csv=out_csv)
write_table(table=summary, out_csv=summary_csv)

def train(self, base_folder: str = "./", observation_yaml: str = None):
def train(
self,
base_folder: str = "./",
observation_yaml: str = None,
reward_weights: Optional[List[float]] = None,
auto_norm_reward: bool = False,
):
"""
:param base_folder the path to your base folder containing all your Python code. Defaults to the current
working directory, which assumes you start training from the base of your code base.
@@ -185,6 +198,9 @@ def train(self, base_folder: str = "./", observation_yaml: str = None):
else:
obs_yaml = observation_yaml

self.reward_weights = reward_weights
self.auto_norm_reward = auto_norm_reward

token = os.environ.get("PATHMIND_TOKEN")
if not token:
raise ValueError(
@@ -202,6 +218,7 @@ def train(self, base_folder: str = "./", observation_yaml: str = None):
-F 'start=true' \
-F 'multiAgent={multi_agent}' \
-F 'obsSelection={obs_yaml}' \
-F 'useAutoNorm={auto_norm_reward}' \
https://api.pathmind.com/py/upload
"""

55 changes: 55 additions & 0 deletions tests/examples/mouse/two_reward_balance.py
@@ -0,0 +1,55 @@
import typing

from pathmind.simulation import Continuous, Discrete, Simulation


class TwoRewardMouseAndCheese(Simulation):

mouse = (0, 0)
cheese = (4, 4)
steps = 0
reward_weights = [1.0, 0.5]
auto_norm_reward = True

def number_of_agents(self) -> int:
return 1

def action_space(self, agent_id) -> typing.Union[Continuous, Discrete]:
return Discrete(4)

def reset(self) -> None:
self.mouse = (0, 0)
self.cheese = (4, 4)
self.steps = 0

def step(self) -> None:
self.steps += 1

action = self.action[0]

if action == 0: # move up
self.mouse = (min(self.mouse[0] + 1, 5), self.mouse[1])
elif action == 1: # move right
self.mouse = (self.mouse[0], min(self.mouse[1] + 1, 5))
elif action == 2: # move down
self.mouse = (max(self.mouse[0] - 1, 0), self.mouse[1])
elif action == 3: # move left
self.mouse = (self.mouse[0], max(self.mouse[1] - 1, 0))
else:
raise ValueError("Invalid action")

def get_observation(self, agent_id) -> typing.Dict[str, float]:
return {
"mouse_row": float(self.mouse[0]) / 5.0,
"mouse_col": float(self.mouse[1]) / 5.0,
"distance_to_cheese_row": abs(self.cheese[0] - self.mouse[0]) / 5.0,
"distance_to_cheese_col": abs(self.cheese[1] - self.mouse[1]) / 5.0,
"cheese_row": float(self.cheese[0]) / 5.0,
"cheese_col": float(self.cheese[1]) / 5.0,
}

def get_reward(self, agent_id) -> typing.Dict[str, float]:
return {"found_cheese": 1 if self.mouse == self.cheese else 0, "took_step": -1}

def is_done(self, agent_id) -> bool:
return self.mouse == self.cheese
53 changes: 53 additions & 0 deletions tests/examples/mouse/two_reward_no_balance.py
@@ -0,0 +1,53 @@
import typing

from pathmind.simulation import Continuous, Discrete, Simulation


class TwoRewardMouseAndCheeseNoBalance(Simulation):

mouse = (0, 0)
cheese = (4, 4)
steps = 0

def number_of_agents(self) -> int:
return 1

def action_space(self, agent_id) -> typing.Union[Continuous, Discrete]:
return Discrete(4)

def reset(self) -> None:
self.mouse = (0, 0)
self.cheese = (4, 4)
self.steps = 0

def step(self) -> None:
self.steps += 1

action = self.action[0]

if action == 0: # move up
self.mouse = (min(self.mouse[0] + 1, 5), self.mouse[1])
elif action == 1: # move right
self.mouse = (self.mouse[0], min(self.mouse[1] + 1, 5))
elif action == 2: # move down
self.mouse = (max(self.mouse[0] - 1, 0), self.mouse[1])
elif action == 3: # move left
self.mouse = (self.mouse[0], max(self.mouse[1] - 1, 0))
else:
raise ValueError("Invalid action")

def get_observation(self, agent_id) -> typing.Dict[str, float]:
return {
"mouse_row": float(self.mouse[0]) / 5.0,
"mouse_col": float(self.mouse[1]) / 5.0,
"distance_to_cheese_row": abs(self.cheese[0] - self.mouse[0]) / 5.0,
"distance_to_cheese_col": abs(self.cheese[1] - self.mouse[1]) / 5.0,
"cheese_row": float(self.cheese[0]) / 5.0,
"cheese_col": float(self.cheese[1]) / 5.0,
}

def get_reward(self, agent_id) -> typing.Dict[str, float]:
return {"found_cheese": 1 if self.mouse == self.cheese else 0, "took_step": -1}

def is_done(self, agent_id) -> bool:
return self.mouse == self.cheese