# play_episode.py
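"""Replay one episode of the alice/bob gridworld experiment.

Alice, a tabular REINFORCE agent, walks toward her goal; bob, an RNN
observer, watches her state-action trajectory (or is handed the goal
directly, depending on bob_goal_access) and tries to reach the same goal.
play_from_directory() restores bob's trained weights from a saved
experiment directory and renders one episode step by step.
"""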
import itertools
import copy
import os
import sys
import pickle
import tensorflow as tf
import numpy as np
from envs.TwoGoalGridWorld import TwoGoalGridWorld
from agents.bob import RNNObserver
from agents.alice import TabularREINFORCE

def play_from_directory(experiment_name):
  cwd = os.getcwd()
  directory = cwd+'/results/'+experiment_name+'/'
  os.chdir(directory)
  # make the experiment's config modules importable from the results directory
  sys.path.insert(0, directory)
  # unpickle results
  with open(directory+'results.pkl', 'rb') as f:
    results = pickle.load(f)
  # import configs
  import alice_config
  alice_agent_param, alice_training_param, alice_experiment_name = alice_config.get_config()
  import env_config
  env_param, _ = env_config.get_config()
  import bob_config
  agent_param, training_param, experiment_name, alice_experiment = bob_config.get_config()
  # initialize experiment using configs
  tf.reset_default_graph()
  env = TwoGoalGridWorld(shape = env_param.shape,
                         r_correct = env_param.r_correct,
                         r_incorrect = env_param.r_incorrect,
                         r_step = env_param.r_step,
                         r_wall = env_param.r_wall,
                         p_rand = env_param.p_rand,
                         goal_locs = env_param.goal_locs,
                         goal_dist = env_param.goal_dist)
  with tf.variable_scope('alice'):
    alice = TabularREINFORCE(env = env,
                             use_action_info = alice_agent_param.use_action_info,
                             use_state_info = alice_agent_param.use_state_info)
  with tf.variable_scope('bob'):
    bob = RNNObserver(env = env,
                      shared_layer_sizes = agent_param.shared_layer_sizes,
                      policy_layer_sizes = agent_param.policy_layer_sizes,
                      value_layer_sizes = agent_param.value_layer_sizes,
                      use_RNN = agent_param.use_RNN)
  bob_saver = tf.train.Saver()
  # simulate an episode: restore bob's trained weights; alice stays freshly
  # initialized (no alice checkpoint is restored here)
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    bob_saver.restore(sess, directory+'bob/bob.ckpt')
    # if alice.use_state_info, also pass her state_goal_counts here
    play(env = env, alice = alice, bob = bob,
         bob_goal_access = training_param.bob_goal_access,
         gamma = training_param.discount_factor)
  os.chdir(cwd)
  return
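
# Layout of results/<experiment_name>/ assumed by play_from_directory above:
#   results.pkl                                    - pickled training results
#   alice_config.py, env_config.py, bob_config.py  - experiment config modules
#   bob/bob.ckpt                                   - bob's trained TF checkpoint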

def play(env, alice, bob = None, state_goal_counts = None,
         max_episode_length = 100, bob_goal_access = None, gamma = None):
  # if alice.use_state_info, need to include her state_goal_counts
  if isinstance(env, tuple): # for games where each agent has different env (e.g. KeyGame)
    alice_env = env[0]
    bob_env = env[1]
  else: # for games where agents use same env (e.g. TwoGoalGridWorld)
    alice_env = env
    if bob: bob_env = copy.copy(alice_env)
  alice_state, goal = alice_env._reset()
  if bob: bob_state, _ = bob_env.set_goal(goal)
  else: bob_state = None
  alice_states = []
  alice_actions = []
  alice_done = False
  alice_total_reward = 0
  alice_episode_length = 0
  if alice.use_action_info: total_kl = 0
  else: total_kl = None
  if alice.use_state_info: total_lso = 0
  else: total_lso = None
  draw_alice = True
  if bob:
    bob_done = False
    bob_rewards = []
    bob_total_reward = 0
    bob_episode_length = 0
    draw_bob = True
  else:
    bob_done = True
    draw_bob = False
  # draw initial env
  print('')
  if isinstance(env, tuple):
    print('alice')
    alice_env._render()
    print('')
    print('bob')
    bob_env._render()
  else:
    alice_env._render(bob_state = bob_state)
  print('')

  # one step in the environment
  for t in itertools.count(start = 1):

    # first alice
    if not alice_done:
      # take a step
      alice_action_probs, alice_value, alice_logits = alice.predict(alice_state, goal)
      alice_action = np.random.choice(np.arange(len(alice_action_probs)), p = alice_action_probs)
      next_alice_state, alice_reward, alice_done, _ = alice_env.step(alice_action)
      # update stats
      if alice.use_action_info:
        total_kl += alice.get_kl(state = alice_state, goal = goal)
      if alice.use_state_info:
        # pointwise state-goal information: log2 of p(state|goal) / p(state)
        ps_g = state_goal_counts[alice_state, goal] / np.sum(state_goal_counts[:, goal])
        ps = np.sum(state_goal_counts[alice_state, :]) / np.sum(state_goal_counts)
        total_lso += np.log2(ps_g/ps)
      alice_total_reward += alice_reward
      alice_episode_length = t
    else: # if done, sit still
      alice_action = alice_env.action_to_index['STAY']
      next_alice_state = alice_state
    alice_states.append(alice_state)
    alice_actions.append(alice_action)
    # draw env with alice step
    if draw_alice:
      if total_kl is not None: kl_str = ', tot kl = %.2f' % total_kl
      else: kl_str = ''
      if total_lso is not None: lso_str = ', tot lso = %.2f' % total_lso
      else: lso_str = ''
      if str(alice_env) == 'KeyGame': key_str = ', key = {}'.format(alice_env.state_to_key[alice_state])
      else: key_str = ''
      print('alice step %i: reward = %.1f%s%s%s, action: %s' %
            (t, alice_reward, kl_str, lso_str, key_str, alice_env.index_to_action[alice_action]))
      print('policy: L = %.2f, U = %.2f, R = %.2f, D = %.2f, S = %.2f' %
            (alice_action_probs[alice_env.action_to_index['LEFT']],
             alice_action_probs[alice_env.action_to_index['UP']],
             alice_action_probs[alice_env.action_to_index['RIGHT']],
             alice_action_probs[alice_env.action_to_index['DOWN']],
             alice_action_probs[alice_env.action_to_index['STAY']]))
      print('logits: L = %.2f, U = %.2f, R = %.2f, D = %.2f, S = %.2f' %
            (alice_logits[alice_env.action_to_index['LEFT']],
             alice_logits[alice_env.action_to_index['UP']],
             alice_logits[alice_env.action_to_index['RIGHT']],
             alice_logits[alice_env.action_to_index['DOWN']],
             alice_logits[alice_env.action_to_index['STAY']]))
      print('')
      if isinstance(env, tuple):
        alice_env._render()
      else:
        alice_env._render(bob_state = bob_state)
      print('')
    if alice_done: draw_alice = False # only draw alice step first step after done

    # then bob takes a step
    if not bob_done:
      if bob_goal_access is None:
        # bob infers the goal from alice's trajectory so far
        bob_action_probs, bob_value, z, logits = bob.predict(state = bob_state,
                                                             obs_states = alice_states,
                                                             obs_actions = alice_actions)
      elif bob_goal_access == 'immediate':
        # bob is told the goal from the start, encoded as z = -1 or +1
        if goal == 0: z = [-1]
        elif goal == 1: z = [+1]
        bob_action_probs, bob_value, _, logits = bob.predict(state = bob_state, z = z)
      elif bob_goal_access == 'delayed':
        # bob is told the goal only after alice's actions have leaked enough
        # information (assumes alice.use_action_info, so total_kl is tracked)
        kl_thresh = .8
        if total_kl > kl_thresh:
          if goal == 0: z = [-1]
          elif goal == 1: z = [+1]
        else:
          z = [0]
        bob_action_probs, bob_value, _, logits = bob.predict(state = bob_state, z = z)
      bob_action = np.random.choice(np.arange(len(bob_action_probs)), p = bob_action_probs)
      next_bob_state, bob_reward, bob_done, _ = bob_env.step(bob_action)
      bob_total_reward += bob_reward
      bob_rewards.append(bob_reward)
      bob_episode_length = t
    else: # if done, sit still
      next_bob_state = bob_state
    # draw env with bob step
    if draw_bob:
      if bob_goal_access is not None: z = z[0]
      if str(bob_env) == 'KeyGame': key_str = ', key = {}'.format(bob_env.state_to_key[bob_state])
      else: key_str = ''
      print('bob step %i: total reward = %i, value = %.2f, rnn latent = %.2f%s, action: %s' %
            (t, bob_total_reward, bob_value, z, key_str, bob_env.index_to_action[bob_action]))
      print('policy: L = %.2f, U = %.2f, R = %.2f, D = %.2f, S = %.2f' %
            (bob_action_probs[bob_env.action_to_index['LEFT']],
             bob_action_probs[bob_env.action_to_index['UP']],
             bob_action_probs[bob_env.action_to_index['RIGHT']],
             bob_action_probs[bob_env.action_to_index['DOWN']],
             bob_action_probs[bob_env.action_to_index['STAY']]))
      print('logits: L = %.2f, U = %.2f, R = %.2f, D = %.2f, S = %.2f' %
            (logits[bob_env.action_to_index['LEFT']],
             logits[bob_env.action_to_index['UP']],
             logits[bob_env.action_to_index['RIGHT']],
             logits[bob_env.action_to_index['DOWN']],
             logits[bob_env.action_to_index['STAY']]))
      print('')
      if isinstance(env, tuple):
        bob_env._render()
      else:
        alice_env._render(bob_state = next_bob_state)
      print('')
    if bob_done: draw_bob = False # only draw bob step first step after done
    if (alice_done and bob_done) or t > max_episode_length: break
    alice_state = next_alice_state
    bob_state = next_bob_state

  # print bob's discounted return from each step: G_t = sum_i gamma^i * r_{t+i}
  if bob and (gamma is not None):
    returns = [sum(np.array([gamma**i for i in range(len(bob_rewards)-t)])*np.array(bob_rewards[t:]))
               for t in range(len(bob_rewards))]
    str_returns = ['%.2f' % r for r in returns]
    print('bob returns:', end = ' ')
    for i in range(len(str_returns)):
      print('%i: %s' % (i+1, str_returns[i].lstrip('0')), end = '')
      if i < len(str_returns)-1: print(',', end = ' ')
    print('')
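
# Usage note: play() also runs without a trained bob, e.g. to watch alice
# alone (bob defaults to None, so only alice's steps are rendered), with env
# and alice built as in play_from_directory:
#   play(env = env, alice = alice)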

if __name__ == "__main__":
  play_from_directory('job16321705_task6_2018_03_03_020524_bob_with_cooperative_alice_shared128_1M_5x5')