diff --git a/Macodiac.ML/.gitignore b/Macodiac.ML/.gitignore index 197028e..32fa32c 100644 --- a/Macodiac.ML/.gitignore +++ b/Macodiac.ML/.gitignore @@ -1,7 +1,16 @@ training/results/logs/* +training_multiagent/results/logs/* +!training_multiagent/results/logs/.keep !training/results/logs/.keep training/results/saved_models/* !training/results/saved_models/.keep +training_multiagent/results/saved_models/* +!training_multiagent/results/saved_models/.keep +training_multiagent_16may/results/logs/* +training_multiagent_16may/results/saved_models/* +training_multiagent/results_price_setter/logs/* +training_multiagent/results_price_setter/saved_models/* + # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/Macodiac.ML/environment.py b/Macodiac.ML/environment.py index fa1d065..498513f 100644 --- a/Macodiac.ML/environment.py +++ b/Macodiac.ML/environment.py @@ -41,7 +41,6 @@ def step(self, action): self.state += action - 1 self.environment_timesteps -=1 - if self.state > 0: reward = 1 elif self.state == 0: diff --git a/Macodiac.ML/main.py b/Macodiac.ML/main.py index 829146b..1619c2f 100644 --- a/Macodiac.ML/main.py +++ b/Macodiac.ML/main.py @@ -2,6 +2,7 @@ import gymnasium as gym from gymnasium import Env from environment import MacodiacEnvironment +from multiagentenvironment import MultiAgentMacodiacEnvironment from stable_baselines3.common.evaluation import evaluate_policy import numpy as np @@ -22,7 +23,7 @@ def __init__(self): self.save_path = os.path.join(filePath,'saved_models', 'model') self.save_path_intermittent = os.path.join(filePath,'saved_models', 'intermittent_saved_models') self.env = MacodiacEnvironment(envTimesteps=100) - self.numTrainingIterations = 10_000_000 + self.numTrainingIterations = 10_000 self.numEpisodes = 10 @@ -31,7 +32,7 @@ def __init__(self): # NOTES: # if loadmodel is set to false, and trainmodel is set to true, # the currently saved model is overwritten - self.__MODE_LOADMODEL__ = True + self.__MODE_LOADMODEL__ =False # set to true if you want to train and then save the model self.__MODE_TRAINMODEL__ = True diff --git a/Macodiac.ML/md_multiagent_main.py b/Macodiac.ML/md_multiagent_main.py new file mode 100644 index 0000000..9c307c9 --- /dev/null +++ b/Macodiac.ML/md_multiagent_main.py @@ -0,0 +1,207 @@ +import os +import gymnasium as gym +from gymnasium import Env +from environment import MacodiacEnvironment +from md_multiagentenvironment import MdMultiAgentMacodiacEnvironment +from stable_baselines3.common.evaluation import evaluate_policy +from stable_baselines3.common.env_checker import check_env +import numpy as np +import random + + + +from stable_baselines3 import PPO + + +class MultiagentMain(): + isRunning = False + + def __init__(self): + """ + init the class + """ + filePath = os.path.join('Macodiac.ML', 'training_multiagent','results') + self.log_path = os.path.join(filePath,'Logs') + self.save_path = os.path.join(filePath,'saved_models', 'model') + self.save_path_intermittent = os.path.join(filePath,'saved_models', 'intermittent_saved_models') + self.numTrainingIterations = 10 + self.numEpisodes = 10 + self.numAgents = 3 + + self.env = MdMultiAgentMacodiacEnvironment(envTimesteps=15, numAgents=self.numAgents) + check_env(self.env) + + + # set to true if you want to load an existing model + # model loading happens first, then training + # NOTES: + # if loadmodel is set to false, and trainmodel is set to true, + # the currently saved model is overwritten + self.__MODE_LOADMODEL__ = False + + # set to true if you want to train and then save the model + self.__MODE_TRAINMODEL__ = True + + # set to true to use the randomsample mode for testing, + # rather than the model version + self.__MODE_RANDOMSAMPLE__ = False + + + def Run(self): + """ + Runs the project + """ + + if self.__MODE_RANDOMSAMPLE__: + self.run_multiagent_project_with_rand_test(self.env, 5) + + model = self.create_model(self.env, self.log_path) + + if self.__MODE_LOADMODEL__: + model = self.load_model(self.env, model, self.save_path) + + if self.__MODE_TRAINMODEL__: + model = self.train_model(model, + self.numTrainingIterations, self.save_path_intermittent) + self.save_model(model, self.save_path) + + + else: + self.run_project(self.env, self.numEpisodes, model) + self.policy_evaluation(model, self.env, self.numEpisodes) + + + def run_multiagent_project_with_rand_test(self, env:MdMultiAgentMacodiacEnvironment, numEpisodes: int): + """ + Runs the project with random sampling, using the multiagent env + """ + + for episode in range(numEpisodes): + obs = env.reset() + done = False + score = 0 + agent_scores = [] + iterator = 0 + while not done: + #env.render() + iterator+=1 + print(f'iterator:{iterator}') + action_arr = [] + for i in range(len(env.policy_agents)): + action_arr.append(random.randint(0,2)) + # agentActionSpace = env.action_space[i] + # actionForAgent = agentActionSpace.sample() + # action_arr.append(actionForAgent) + + print(f'action for agents:\t{action_arr}') + + obs_arr, reward_arr, done_arr, isTerminal, info_arr = env.step(action_arr) + + # for i, reward in enumerate(reward_arr): + # print(f'reward is {reward}') + + agent_scores.append(sum(reward_arr)) + + print(f'rewards for agents:\t{reward_arr}') + + if any(done_arr): + isTerminal = True + + done = isTerminal + print(f'Episode:{episode} | Aggregate agent scores:(Sum:{sum(agent_scores)})') + env.close() + + + + def run_project(self, env:MdMultiAgentMacodiacEnvironment, numEpisodes: int, model): + """ + Runs the project with an actual model, instead of random sampling + of a model + + @param env: The environment to run this project with + @param numEpisodes: the count of episodes to run the environment for + """ + scores = [] + for episode in range(numEpisodes): + obs = env.reset() + done = False + score = 0 + while not done: + #env.render() + action, _discard = model.predict(obs) + obs, reward, isTerminal, info = env.step(action) + score += reward + done = isTerminal + scores.append(score) + + runningAvg = np.mean(scores) + + print(f'Episode:{episode} \t| Score:{score} \t\t| RunningAvg: {round(runningAvg, 2)}') + env.close() + + + def create_model(self, env: MdMultiAgentMacodiacEnvironment, log_path: str): + env.reset() + model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path) + return model + + + def train_model(self, model, numTimesteps: int, savePath:str): + """ + Trains a model with the number of iterations in + numtimesteps. Creates a n intermediate save every 1m iterations + + @param model: The model to train. The model must have been instantiated + @param numTimesteps: the number of training iterations + """ + + saveEveryNSteps = 1_000_000 + + if numTimesteps < saveEveryNSteps: + model.learn(total_timesteps=numTimesteps) + + else: + rangeUpper = int(numTimesteps / saveEveryNSteps) + for i in range(1,rangeUpper+1): + model.learn(total_timesteps=saveEveryNSteps) + model.save(os.path.join(savePath, f'interim-{i}')) + + return model + + def policy_evaluation(self, model, env: MdMultiAgentMacodiacEnvironment, numEpisodes:int=50): + """ + Prints a policy evaluation, including the mean episode reward + and the standard deviation + + @param model: The model to be evaluated + @param env: The environment to evaluate the model against + @param numEpisodes: The count of episodes to evaluate against + """ + print('\nevalResult:(mean episode reward, standard deviation)') + print(f'evalResult:{evaluate_policy(model, env, n_eval_episodes=numEpisodes)}\n') + + + + def save_model(self, model, modelPath): + """ + Saves a model to a given path + + @param model: The model to save + @param modelPath: The path to save to + """ + model.save(modelPath) + + + def load_model(self, env: MdMultiAgentMacodiacEnvironment, model, modelPath: str): + """ + Saves a model to a given path + + @param model: The model to save + @param modelPath: The modelPath to save to + """ + model = PPO.load(modelPath, env=env) + return model + + +main = MultiagentMain() +main.Run() \ No newline at end of file diff --git a/Macodiac.ML/md_multiagentenvironment.py b/Macodiac.ML/md_multiagentenvironment.py new file mode 100644 index 0000000..6c23e8a --- /dev/null +++ b/Macodiac.ML/md_multiagentenvironment.py @@ -0,0 +1,166 @@ +import gymnasium as gym +from gym import Env +# use `gym.spaces` here, even though we're using `gymnasium` +# https://stackoverflow.com/questions/75108957/assertionerror-the-algorithm-only-supports-class-gym-spaces-box-box-as-acti +from gym import spaces +import numpy as np +import random + +class AgentObject: + def __init__(self): + self.state = [] + +class MdMultiAgentMacodiacEnvironment(Env): + """ + Builds a profit maximising agent environment, supporting + up to n_agent agents + """ + state = 0 + environment_timesteps = 15 + + def __init__(self, envTimesteps:int, numAgents: int): + """ + Initialises the class + """ + self.environment_timesteps = envTimesteps + self.policy_agents = [] + for i in range(numAgents): + self.policy_agents.append(AgentObject()) + + #self_action_space = [] + #self.observation_space = [] + self.agents = [numAgents] + mdActionSpace_arr = [] + mdObservationSpace_arr = [] + + for agent in self.policy_agents: + mdActionSpace_arr.append(3) + mdObservationSpace_arr.append(10) + #md_action_space = spaces.MultiDiscrete(np.array([3,3]), seed=42) + #self_action_space.append(spaces.Discrete(3)) + #self_observation_space.append(spaces.Box(low=np.array([0]), high=np.array([100]))) + + #self.observation_space.append(spaces.Box(low=np.array([0]), high=np.array([100]))) + + md_action_space = spaces.MultiDiscrete(np.array(mdActionSpace_arr))#, seed=42) + md_observation_space = spaces.MultiDiscrete(np.array(mdObservationSpace_arr))#, seed=42) + self.action_space = md_action_space + self.observation_space = md_observation_space + # self.action_space = np.array(self_action_space) + # self.observation_space = self_observation_space + self.reset() + + + + print('-- ENV SETTINGS --') + print(self.observation_space) + #print(self.observation_space.sample()) + print(self.action_space) + #print(self.action_space.sample()) + print(self.environment_timesteps) + print('-- ENV SETTINGS --') + + + def set_agent_action(self, action, agent, actionSpace): + agent.state = action + + def step_agent(self, agent): + myState = agent.state - 1 + if myState > 0: + reward = 1 + elif myState == 0: + reward = 0 + else: + reward = -1 + + info = {} + return agent.state, reward, False, info + + + def step(self, action_arr): + """ + Processes an action for an agent. + + Loops through each agent and sets its action. + Then calls world.step to progress the entire world's actions + + Builds up arrays of results, and returns them in a tuple of arrays + + """ + self.environment_timesteps -=1 + + obs_arr = [] + reward_arr = [] + done_arr = [] + info_arr = [{'n': []}] + + agent_arr = self.policy_agents + + for i, agent in enumerate(agent_arr): + self.set_agent_action(action_arr[i], agent, self.action_space[i]) + + for i, agent in enumerate(agent_arr): + agent.state, agent.reward, agent.done, agent.info = self.step_agent(agent) + + for agent in self.policy_agents: + obs_arr.append(self._get_obs(agent)) + reward_arr.append(self._get_reward(agent)) + done_arr.append(self._get_done(agent)) + info_arr.append(self._get_info(agent)) + + if self.environment_timesteps <= 0: + isTerminal = True + elif any(done_arr): + isTerminal = True + else: + isTerminal = False + + return obs_arr, reward_arr, done_arr, isTerminal, info_arr + + + def _get_obs(self, agent): + """ + accepts an Agent, and returns its observation/state + """ + return agent.state + + def _get_reward(self, agent): + """ + accepts an Agent, and returns its reward + """ + return agent.reward + + def _get_done(self, agent): + """ + accepts an Agent, and returns its done/terminal property + """ + return agent.done + + def _get_info(self, agent): + """ + accepts an Agent, and returns its info object + """ + return agent.info + + + def render(self) -> None: + """ + Does nothing, the environment is fully headless + """ + pass + + def reset(self) -> float: + """ + Sets the application to its initial conditions + + Sets state to a random float between negative 100 to positive 100 + """ + for i in range(len(self.policy_agents)): + self.policy_agents[i].state = np.array([0 + random.randint(0,10)]).astype(float) + self.policy_agents[i].reward = 0 + self.policy_agents[i].info = {} + self.policy_agents[i].done = False + + self.environment_timesteps = 10 + return self.environment_timesteps + \ No newline at end of file diff --git a/Macodiac.ML/multiagent_main.py b/Macodiac.ML/multiagent_main.py new file mode 100644 index 0000000..cfcd4d0 --- /dev/null +++ b/Macodiac.ML/multiagent_main.py @@ -0,0 +1,267 @@ +import os +import gymnasium as gym +from gymnasium import Env +from multiagentenvironment import TensorboardPriceCallback +from multiagentenvironment import MultiAgentMacodiacEnvironment +from stable_baselines3.common.evaluation import evaluate_policy +from stable_baselines3.common.env_checker import check_env +import numpy as np +from stable_baselines3 import PPO + + +class MultiagentMain(): + isRunning = False + + def __init__(self, mode): + """ + init the class + """ + + # set to true if you want to load an existing model + # model loading happens first, then training + # NOTES: + # if loadmodel is set to false, and trainmodel is set to true, + # the currently saved model is overwritten + self.__MODE_LOADMODEL__ = False + + # set to true if you want to train and then save the model + self.__MODE_TRAINMODEL__ = True + + # set to true to use the randomsample mode for testing, + # rather than the model version + self.__MODE_RANDOMSAMPLE__ = False + + self.mode = mode + + + filePath = os.path.join('Macodiac.ML', 'training_multiagent','results') + self.log_path = os.path.join(filePath,'Logs') + self.save_path = os.path.join(filePath,'saved_models', self.mode) + self.save_path_intermittent = os.path.join(filePath,'saved_models', 'intermittent_saved_models') + + self.numEpisodes = 20 + self.envTimesteps = 25 + + if self.mode == 'MONOPOLY': + self.numAgents = 1 + self.numTrainingIterations = 2_000_000 + elif self.mode == 'DUOPOLY': + self.numAgents = 2 + self.numTrainingIterations = 3_000_000 + elif self.mode == 'OLIGOPOLY': + self.numAgents = 5 + self.numTrainingIterations = 5_000_000 + elif self.mode == 'PERFECT_COMP': + self.numAgents = 10 + self.numTrainingIterations = 15_000_000 + else: + raise ValueError(f'self.mode [{self.mode}] was not in mode options list [{self.__MODE_OPTIONS__}]') + + if self.numAgents == 0 or self.numTrainingIterations == 0: + raise ValueError('both numAgents and numTrainingItterations must be above 0') + + self.env = MultiAgentMacodiacEnvironment(envTimesteps=self.envTimesteps, numAgents=self.numAgents) + check_env(self.env) + + + def Run(self): + """ + Runs the project + """ + if self.__MODE_RANDOMSAMPLE__: + self.run_multiagent_project_with_rand_test(self.env, self.numEpisodes) + + model = self.create_model(self.env, self.log_path) + + if self.__MODE_LOADMODEL__: + model = self.load_model(self.env, model, self.save_path) + + if self.__MODE_TRAINMODEL__: + + model = self.train_model(model, + self.numTrainingIterations, + self.save_path_intermittent, + self.mode) + self.save_model(model, self.save_path) + + if not self.__MODE_RANDOMSAMPLE__: + self.run_project(self.env, self.numEpisodes, model) + self.policy_evaluation(model, self.env, self.numEpisodes) + + + def run_multiagent_project_with_rand_test(self, env:MultiAgentMacodiacEnvironment, numEpisodes: int): + """ + Runs the project with random sampling, using the multiagent env + """ + + for episode in range(numEpisodes): + obs = env.reset() + done = False + score = 0 + agent_scores = [] + iterator = 0 + while not done: + #env.render() + iterator+=1 + #print(f'iterator:{iterator}') + # action_arr = env.action_space.sample() + + action_arr = [] + for i in range(self.numAgents): + action_arr.append(11) + + print(f'action for agents:\t{action_arr}') + + obs_arr, reward, isDone, info_arr = env.step(action_arr) + + agent_scores.append(reward) + + # print(f'rewards for agents:\t{reward}') + # print(f'obs for agents:\t{obs_arr}') + + info_arr = info_arr['n'] + print(f'px for agents:\t{info_arr}') + quantitySold = 0 + moneySales = 0 + for i, agentInfo in enumerate(info_arr): + agent_sales = info_arr[i]['sold'] + agent_price = info_arr[i]['price'] + agent_sales_in_money = agent_sales * agent_price + moneySales += agent_sales_in_money + quantitySold += agent_sales + + print(f'a_vending/quantity_sold_count: {quantitySold} at cost [{moneySales}]/[{env.peek_env_consumer_money()}. Consumer money per turn:{env.peek_env_consumer_money_each()}]') + if moneySales > env.peek_env_consumer_money(): + print(f'Money sales of [{moneySales}]/[{env.peek_env_consumer_money()}] were too high. Consumer money per turn:{env.peek_env_consumer_money_each()}') + return # raise Exception(f'Money sales of [{moneySales}]/[{env.peek_env_consumer_money()}] were too high. Consumer money per turn:{env.peek_env_consumer_money_each()}') + + + if done: + print(f'is done') + done = isDone + print(f'Episode:{episode} | \nAggregate agent scores:(Sum:{sum(agent_scores)})\n MeanAvg agent scores:({np.mean(agent_scores)})') + env.close() + + + def run_project_with_rand_test(self, env:MultiAgentMacodiacEnvironment, numEpisodes:int): + """ + Runs the project with random sampling, instead + of a model + + @param env: The environment to run this project with + @param numEpisodes: the count of episodes to run the environment for + """ + for episode in range(numEpisodes): + obs = env.reset() + done = False + score = 0 + while not done: + #env.render() + action = env.action_space.sample() + obs, reward, isTerminal, info = env.step(action) + score += reward + done = isTerminal + print(f'Episode:{episode} | Score:{score}') + env.close() + + + def run_project(self, env:MultiAgentMacodiacEnvironment, numEpisodes: int, model): + """ + Runs the project with an actual model, instead of random sampling + of a model + + @param env: The environment to run this project with + @param numEpisodes: the count of episodes to run the environment for + """ + scores = [] + for episode in range(numEpisodes): + obs = env.reset() + done = False + score = 0 + while not done: + #env.render() + action, _discard = model.predict(obs) + obs, reward, isTerminal, info = env.step(action) + score += reward + done = isTerminal + scores.append(score) + + runningAvg = np.mean(scores) + + print(f'Episode:\t{episode} \t| Score:\t{score} \t\t| RunningAvg: {round(runningAvg, 2)}') + env.close() + + + def create_model(self, env: MultiAgentMacodiacEnvironment, log_path: str): + model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, device="cpu") + return model + + + def train_model(self, model, numTimesteps: int, savePath:str, saveName: str): + """ + Trains a model with the number of iterations in + numtimesteps. Creates a n intermediate save every 1m iterations + + @param model: The model to train. The model must have been instantiated + @param numTimesteps: the number of training iterations + """ + + saveEveryNSteps = 1_000_000 + + if numTimesteps < saveEveryNSteps: + model.learn(total_timesteps=numTimesteps, + callback=TensorboardPriceCallback(), + tb_log_name=saveName) + + else: + rangeUpper = int(numTimesteps / saveEveryNSteps) + for i in range(1,rangeUpper+1): + model.learn(total_timesteps=saveEveryNSteps, + callback=TensorboardPriceCallback(), + tb_log_name=saveName) + model.save(os.path.join(savePath, f'interim-{i}')) + + return model + + def policy_evaluation(self, model, env: MultiAgentMacodiacEnvironment, numEpisodes:int=50): + """ + Prints a policy evaluation, including the mean episode reward + and the standard deviation + + @param model: The model to be evaluated + @param env: The environment to evaluate the model against + @param numEpisodes: The count of episodes to evaluate against + """ + print('\nevalResult:(mean episode reward, standard deviation)') + print(f'evalResult:{evaluate_policy(model, env, n_eval_episodes=numEpisodes)}\n') + + + + def save_model(self, model, modelPath): + """ + Saves a model to a given path + + @param model: The model to save + @param modelPath: The path to save to + """ + model.save(modelPath) + + + def load_model(self, env: MultiAgentMacodiacEnvironment, model, modelPath: str): + """ + Saves a model to a given path + + @param model: The model to save + @param modelPath: The modelPath to save to + """ + model = PPO.load(modelPath, env=env) + return model + + + + + +__MODE_OPTIONS__ = ['MONOPOLY', 'DUOPOLY', 'OLIGOPOLY', 'PERFECT_COMP'] +for mode in __MODE_OPTIONS__: + main = MultiagentMain(mode) + main.Run() \ No newline at end of file diff --git a/Macodiac.ML/multiagentenvironment.py b/Macodiac.ML/multiagentenvironment.py new file mode 100644 index 0000000..ce0742d --- /dev/null +++ b/Macodiac.ML/multiagentenvironment.py @@ -0,0 +1,444 @@ +import gymnasium as gym +from gym import Env +# use `gym.spaces` here, even though we're using `gymnasium` +# https://stackoverflow.com/questions/75108957/assertionerror-the-algorithm-only-supports-class-gym-spaces-box-box-as-acti +from gym import spaces +import numpy as np +import random +from random import shuffle +from stable_baselines3.common.callbacks import BaseCallback + +class AgentObject: + def __init__(self): + self.state = [] + self.reset_values() + + def reset_values(self): + self.vendingPrice = 0 + self.reward = 0 + self.quantitySold = 0 + self.vendCost = 5 + self.totalVendingCost = 0 + self.vendCostTrend = 'down' + +class ConsumerObject: + def __init__(self): + self.demand = 1 + self.utility = 0 + self.money = 0 + self.total_consumed = 0 + + +class TensorboardPriceCallback(BaseCallback): + """ + custom logger to record the price charged by agents + """ + runningAvgMeanPxOffered = 0 + runningAvgAcceptedVendedPx = 0 + + def __init__(self, verbose=0): + self.reset() + super().__init__(verbose) + + def reset(self): + self.iterator = 0 + + def _on_rollout_end(self) -> None: + self.reset() + return super()._on_rollout_end() + + def _on_step(self) -> bool: + self.reset() + # self.iterator +=1 + # agent_arr = self.training_env.get_attr('policy_agents')[0] + + ### generate a dict like: + # [{'agent_num': 0, 'price': 10, 'sold': 0.0, 'reward': 0.0}, + # {'agent_num': 1, 'price': 10, 'sold': 50.0, 'reward': 0.0}] + info_arr = self.locals['infos'][0]['n'] + + + pxList = [] + acceptedPxList = [] + vendCostList = [] + vendorsMadeSale = 0 + quantitySold = 0 + countNoSale = 0 + countWiSale = 0 + meanPxOffered = 0 + agent_sales = 0 + agent_vend_px = 0 + agent_reward = 0 + money_sales = 0 + agent_sales_in_money = 0 + agent_total_vend_cost = 0 + agent_final_vend_cost = 0 + meanPxAccepted = 0 + meanVendCost = 0 + + for i, agentInfo in enumerate(info_arr): + agent_sales = info_arr[i]['sold'] + agent_vend_px = info_arr[i]['price'] + agent_reward = info_arr[i]['reward'] + agent_final_vend_cost = info_arr[i]['vendCost'] + agent_total_vend_cost = info_arr[i]['totalVendingCost'] + agent_sales_in_money = agent_sales * agent_vend_px + money_sales += agent_sales_in_money + + pxList.append(agent_vend_px) + + if agent_sales > 0: + vendorsMadeSale += 1 + quantitySold += agent_sales + countWiSale += 1 + acceptedPxList.append(agent_vend_px) + vendCostList.append(agent_final_vend_cost) + else: + countNoSale += 1 + + self.logger.record(f'a_vending_agent_{agentInfo["agent_num"]}/offered_px', agent_vend_px) + self.logger.record(f'a_vending_agent_{agentInfo["agent_num"]}/sales_complete', agent_sales) + self.logger.record(f'a_vending_agent_{agentInfo["agent_num"]}/sales_value', agent_sales_in_money) + self.logger.record(f'a_vending_agent_{agentInfo["agent_num"]}/final_vend_cost', agent_final_vend_cost) + self.logger.record(f'a_vending_agent_{agentInfo["agent_num"]}/total_vend_cost', agent_total_vend_cost) + self.logger.record(f'a_vending_agent_{agentInfo["agent_num"]}/individual_reward', agent_reward) + + if len(pxList) > 0: + meanPxOffered = np.mean(pxList) + if len(acceptedPxList) > 0: + meanPxAccepted = np.mean(acceptedPxList) + if len(vendCostList) > 0: + meanVendCost = np.mean(vendCostList) + + + self.logger.record('a_vending/avgerage_offered_px_value', meanPxOffered) + self.logger.record('a_vending/average_accepted_px_value', meanPxAccepted) + self.logger.record('a_vending/average_final_vend_cost', meanVendCost) + self.logger.record('a_vending/quantity_sold_count', quantitySold) + self.logger.record('a_vending/total_value_sold', money_sales) + self.logger.record('a_vending/vendors_made_sale_count', vendorsMadeSale) + self.logger.record('a_vending/count_no_sale', countNoSale) + self.logger.record('a_vending/count_wi_sale', countWiSale) + + return True + +class MultiAgentMacodiacEnvironment(Env): + """ + Builds a profit maximising agent environment, supporting + up to n_agent agents + """ + state = 0 + environment_timesteps = 0 + environment_starter_timesteps = 150 + env_wholesale_price = 8 # the price agents pay to purchase goods + env_agent_marginal_cost = 0 # the marginal cost of vending + num_consumers = 25 + consumer_total_money_per_turn = 475 + consumers_arr = [] + + + def __init__(self, envTimesteps:int, numAgents: int): + """ + Initialises the class + """ + self.environment_starter_timesteps = envTimesteps + self.policy_agents = [] + self.consumers_arr = [] + self.observation_space = [] + + for i in range(numAgents): + self.policy_agents.append(AgentObject()) + + for i in range(self.num_consumers): + self.consumers_arr.append(ConsumerObject()) + + + # creates an array full of 10's shaped [20,20,20], of length numAgents + self.action_space = spaces.MultiDiscrete(np.full(numAgents, 15) ) + + + # the observation space is a nAgents by nActions array of float32 numbers between -99-99 + # also contains the wholesale price + # Observations space: + # 0: agent's state, after the action has been applied + # 1: agent's vending price in this round + # 2: agent's count of sold items + # 3: the wholesale price in this round + self.observation_space = spaces.Box(low=0,high=200, shape=(numAgents, 3), dtype=np.int32) + + print(f'obs_space.sample: {self.observation_space.sample()}') + + self.reset() + + + + print('-- ENV SETTINGS --') + print(f'obs:{self.observation_space}') + print(f'sample:{self.observation_space.sample()}') + print(self.action_space) + print(self.action_space.sample()) + print(self.environment_timesteps) + print('-- ENV SETTINGS --') + + def peek_env_consumer_money(self): + return self.consumer_total_money_per_turn + def peek_env_consumer_money_each(self): + return self.consumerMoneyEach + + def clear_consumer_stats(self, consumer): + consumer.money = self.consumerMoneyEach + + def clear_agent_stats(self, agent): + agent.reset_values() + + + + def set_agent_action(self, action, agent): + # agent.state is the percentage price diff from the wholesale price + agent.state = action + agent.vendingPrice = self.env_wholesale_price + agent.state + + if agent.vendingPrice == 0: + print(f'error') + agent.vendingPrice = max(1, agent.vendingPrice) + + + # agentBaseVendingPriceAdjust = self.env_wholesale_price * (agent.state / 100) + # baseAgentVendingPrice = self.env_wholesale_price + agentBaseVendingPriceAdjust + # #agentMarginalCostAddedVendingPrice = agentBaseVendingPriceAdjust + self.env_agent_marginal_cost + # agent.vendingPrice = max(baseAgentVendingPrice, 1) + # print(f'agent vending price was {agent.vendingPrice}') + + def step_agent(self, agent): + reward = agent.reward + info = {} + return agent.state, reward, False, info + + + def step(self, action_arr): + """ + Processes an action for an agent. + + Loops through each agent and sets its action. + Then calls world.step to progress the entire world's actions + + Builds up arrays of results, and returns them in a tuple of arrays + + """ + self.environment_timesteps -=1 + + obs_arr = [] + reward_arr = [] + done_arr = [] + info_arr = {'n': []} + + for i, agent in enumerate(self.policy_agents): + self.clear_agent_stats(agent) + self.set_agent_action(action_arr[i], agent) + + for i, consumer in enumerate(self.consumers_arr): + self.clear_consumer_stats(consumer) + self.alt_set_consumer_purchases(self.policy_agents, consumer) + + for i, agent in enumerate(self.policy_agents): + agent.state, agent.reward, agent.done, agent.info = self.step_agent(agent) + obs_arr.append(self._get_obs(agent)) + reward_arr.append(self._get_reward(agent)) + done_arr.append(self._get_done(agent)) + info_arr['n'].append(self._get_info(agent, i)) + + if self.environment_timesteps <= 0: + isTerminal = True + elif any(done_arr): + isTerminal = True + else: + isTerminal = False + + tmpObsArray = [] + for i, agent in enumerate(self.policy_agents): + partialObservationResult = self.get_agent_default_observation_array() + partialObservationResult[0] = self._get_obs(agent) #The agent's result is present in the 0th element of its result + partialObservationResult[1] = self._get_final_vend_price(agent) #The agent's result is present in the 0th element of its result + partialObservationResult[2] = self._get_quantity_sold(agent) #The agent's result is present in the 0th element of its result + tmpObsArray.append(partialObservationResult) + + concatObsArray = np.array(tmpObsArray).astype(np.int32) + return concatObsArray, float(sum(reward_arr)), isTerminal, info_arr + + def alt_set_consumer_purchases(self, agents_arr, consumer): + """ + So long as the consumer has money, loops through the agents, and selects the lowest + priced agent. + + if multiple agents share the same price points, distributes the sales across them all + """ + lowestAbsolutePrice = 0 + lowestPriceAgentIndexList = [] + vendingPrices = [] + + for i, agent in enumerate(agents_arr): + vendingPrices.append(agent.vendingPrice) + + + # print(f'prices are: {vendingPrices}') + lowestAbsolutePrice = min(vendingPrices) + + # gather all of the lowest price agents + for i, agent in enumerate(agents_arr): + if agent.vendingPrice == lowestAbsolutePrice: + lowestPriceAgentIndexList.append(i) + + shuffle(lowestPriceAgentIndexList) + + # while the consumer still has money, purchase + # items from the vendors + while consumer.money > 0: + # loop through each vendor, purchase one item from them + for agentIndex in lowestPriceAgentIndexList: + if consumer.money > 0: + agentToPurchaseFrom = agents_arr[agentIndex] + + if agentToPurchaseFrom.vendingPrice != lowestAbsolutePrice: + raise ValueError(f'agent vending price [{agentToPurchaseFrom.vendingPrice}] is not the same as lowestAbsPrice:[{lowestAbsolutePrice}]') + + if consumer.money >= agentToPurchaseFrom.vendingPrice: + if consumer.money < agentToPurchaseFrom.vendingPrice: + raise ValueError(f'consumer with: [{consumer.money}] money attempted to purchase from agent charging: [{agentToPurchaseFrom.vendingPrice}]') + consumer.money -= agentToPurchaseFrom.vendingPrice + # print(f'consumer money: {consumer.money}') + consumer.total_consumed += 1 + agentToPurchaseFrom.quantitySold += 1 + + # Marginal cost trends down towards 1, then increases upwards + if agentToPurchaseFrom.vendCostTrend == 'up': + agentToPurchaseFrom.vendCost += 0.66 + elif agentToPurchaseFrom.vendCostTrend == 'down': + agentToPurchaseFrom.vendCost -= 0.66 + if agentToPurchaseFrom.vendCost < 1: + agentToPurchaseFrom.vendCostTrend = 'up' + + agentToPurchaseFrom.totalVendingCost += agentToPurchaseFrom.vendCost + agentToPurchaseFrom.reward += (agentToPurchaseFrom.vendingPrice - self.env_wholesale_price - agentToPurchaseFrom.vendCost) + else: + # print(f'consumer money was {consumer.money}, setting to 0') + consumer.money = 0 + break + + + def set_consumer_purchases(self, agents_arr, consumer): + """ + So long as the consumer has money, loops through the agents, and selects the lowest + price agent. + + Purchases as many items from the agent as possible + """ + + lowestPriceAgnetIndex = 0 + vendingPrices = [] + + for i, agent in enumerate(agents_arr): + vendingPrices.append(agent.vendingPrice) + if agent.vendingPrice < agents_arr[lowestPriceAgnetIndex].vendingPrice: + lowestPriceAgnetIndex = i + + lowestAgentVendPrice = agents_arr[lowestPriceAgnetIndex].vendingPrice + + # instead of this while loop, just return the + quantityPurchasable = np.floor(consumer.money / lowestAgentVendPrice) + consumerConsumed = quantityPurchasable + tmpAgentRewardPerUnitSold = (lowestAgentVendPrice - self.env_wholesale_price) + agentReward = tmpAgentRewardPerUnitSold * consumerConsumed + + # consumer.money = 0 + consumer.total_consumed += consumerConsumed + agents_arr[lowestPriceAgnetIndex].reward += agentReward + agents_arr[lowestPriceAgnetIndex].quantitySold += consumerConsumed + return lowestPriceAgnetIndex, lowestAgentVendPrice, vendingPrices + + + def _get_quantity_sold(self, agent): + """ + accepts an agent, and returns the number of items it sold + """ + return agent.quantitySold + + def _get_final_vend_price(self, agent): + """ + accepts an agent and returns its final vending + """ + return agent.vendingPrice + + def _get_obs(self, agent): + """ + accepts an Agent, and returns its observation/state + """ + return agent.state + + def _get_reward(self, agent): + """ + accepts an Agent, and returns its reward + """ + return agent.reward + + def _get_done(self, agent): + """ + accepts an Agent, and returns its done/terminal property + """ + return agent.done + + def _get_info(self, agent , i): + """ + accepts an Agent, and returns its info object + """ + return { + "agent_num": i, + "price": agent.vendingPrice, + "sold": agent.quantitySold, + "reward": agent.reward, + "vendCost": agent.vendCost, + "totalVendingCost": agent.totalVendingCost + + } + + + def render(self) -> None: + """ + Does nothing, the environment is fully headless + """ + pass + + def reset(self): #-> float: + """ + Sets the application to its initial conditions + + Sets state to a random float between negative 100 to positive 100 + """ + obs_arr =[] + for i in range(len(self.policy_agents)): + self.policy_agents[i].state = np.array( + self.get_agent_default_observation_array(), + dtype=np.int32) + obs_arr.append(self.policy_agents[i].state) + self.policy_agents[i].reward = 0 + self.policy_agents[i].info = {} + self.policy_agents[i].done = False + self.policy_agents[i].vendingPrice = 0 + self.policy_agents[i].quantitySold = 0 + self.policy_agents[i].vendCost = 1 + + + self.consumerMoneyEach = self.consumer_total_money_per_turn / self.num_consumers + # for i in range(len(self.consumers_arr)): + # self.clear_consumer_stats(self.consumers_arr[i]) + # self.consumers_arr[i].money = consumerMoneyEach + + self.environment_timesteps = self.environment_starter_timesteps + + return np.array(obs_arr).astype(np.int32) + + + def get_agent_default_observation_array(self): + """ + Gets a default observation for this space + """ + return [0.0, 0.0, 0]# , self.env_wholesale_price] \ No newline at end of file