Implementation of DDPG on Gym
Algorithm of DDPG
The DDPG algorithm is clearly described here; its pseudocode is shown below.
- DDPG Algorithm Pseudocode
Implementation
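The snippets that follow are excerpts from a single training script, written in the style of OpenAI Spinning Up's TensorFlow-1.x DDPG implementation. They reference a Gym environment, a core helper module, a logger, and several hyperparameters that are defined once near the top of that script. The sketch below shows one plausible version of that setup; the environment id, the EpochLogger, the core import, and all numeric values are assumptions (chosen to match common Spinning Up defaults), not something fixed by the excerpts themselves.
import time
import gym
import numpy as np
import tensorflow as tf
from spinup.utils.logx import EpochLogger   # assumed logger; any object with the same API works

import core   # the core.py module quoted below (placeholders, mlp_actor_critic, ...)

env, test_env = gym.make('HalfCheetah-v2'), gym.make('HalfCheetah-v2')   # assumed environment id
obs_dim = env.observation_space.shape[0]    # observation dimension
act_dim = env.action_space.shape[0]         # action dimension
act_limit = env.action_space.high[0]        # assumes a symmetric action range [-act_limit, act_limit]

# Hyperparameters (assumed values, matching common Spinning Up defaults)
steps_per_epoch, epochs = 5000, 100
replay_size = int(1e6)
gamma, polyak = 0.99, 0.995
pi_lr, q_lr = 1e-3, 1e-3
batch_size = 100
start_steps = 10000
act_noise = 0.1
max_ep_len = 1000
save_freq = 1
ac_kwargs = dict(action_space=env.action_space)

logger = EpochLogger()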
Replay Buffer
The agent stores its experience as transition tuples in a FIFO replay buffer:
\[D = \{(s,a,r,s',d), \cdots \}.\]
class ReplayBuffer:
    """
    A simple FIFO experience replay buffer for DDPG agents.
    """

    def __init__(self, obs_dim, act_dim, size):
        self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.acts_buf = np.zeros([size, act_dim], dtype=np.float32)
        self.rews_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        self.obs1_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.acts_buf[self.ptr] = act
        self.rews_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        # Overwrite the oldest entries once the buffer is full (FIFO)
        self.ptr = (self.ptr+1) % self.max_size
        self.size = min(self.size+1, self.max_size)

    def sample_batch(self, batch_size=32):
        # Sample a minibatch uniformly at random from the stored transitions
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(obs1=self.obs1_buf[idxs],
                    obs2=self.obs2_buf[idxs],
                    acts=self.acts_buf[idxs],
                    rews=self.rews_buf[idxs],
                    done=self.done_buf[idxs])
# Experience buffer
replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)
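A quick illustration of how the buffer behaves; the dimensions here are made up for the example.
# Illustrative only: a tiny buffer with obs_dim=3, act_dim=1
buf = ReplayBuffer(obs_dim=3, act_dim=1, size=1000)
buf.store(np.zeros(3), np.zeros(1), 1.0, np.ones(3), False)
buf.store(np.ones(3), np.ones(1), 0.5, np.zeros(3), True)
batch = buf.sample_batch(batch_size=2)
print(batch['obs1'].shape, batch['rews'].shape)   # (2, 3) (2,)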
Input
# core.py
def placeholder(dim=None):
    return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,))

def placeholders(*args):
    return [placeholder(dim) for dim in args]
# Inputs to computation graph
x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)
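Note that placeholder(dim) gives a (None, dim) placeholder, while placeholder(None) gives a flat (None,) one, which is why the reward and done inputs are one-dimensional. The shapes below assume HalfCheetah (obs_dim=17, act_dim=6) and are purely illustrative.
# Illustrative shapes for HalfCheetah (obs_dim=17, act_dim=6):
#   x_ph, x2_ph : (None, 17)   observations
#   a_ph        : (None, 6)    actions
#   r_ph, d_ph  : (None,)      rewards and done flags
print(x_ph.shape, a_ph.shape, r_ph.shape)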
Actor-Critic Multi-Layer Perceptron (MLP)
def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None):
    for h in hidden_sizes[:-1]:
        # [batch_size, obs_dim] => [batch_size, h]
        x = tf.layers.dense(x, units=h, activation=activation)
    # [batch_size, h] => [batch_size, hidden_sizes[-1]]
    return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation)

def mlp_actor_critic(x, a, hidden_sizes=(400,300), activation=tf.nn.relu,
                     output_activation=tf.tanh, action_space=None):
    act_dim = a.shape.as_list()[-1]
    act_limit = action_space.high[0]
    with tf.variable_scope('pi'):
        # [batch_size, obs_dim] => [batch_size, act_dim]
        pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation)
    with tf.variable_scope('q'):
        # [batch_size, obs_dim + act_dim] => [batch_size, 1] => [batch_size]
        q = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1)
    with tf.variable_scope('q', reuse=True):
        # [batch_size, obs_dim + act_dim] => [batch_size, 1] => [batch_size]
        q_pi = tf.squeeze(mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1)
    return pi, q, q_pi
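A quick shape check of the three outputs, run under a throwaway scope so it does not interfere with the main/target networks built later; this block is purely illustrative and assumes the placeholders and env from the setup sketch above.
# Illustrative shape check; 'shape_check' is a scratch scope, not part of the training graph
with tf.variable_scope('shape_check'):
    pi_chk, q_chk, q_pi_chk = core.mlp_actor_critic(x_ph, a_ph, action_space=env.action_space)
print(pi_chk.shape)    # (?, act_dim): deterministic actions scaled to [-act_limit, act_limit]
print(q_chk.shape)     # (?,): Q(s, a) for the actions fed through a_ph
print(q_pi_chk.shape)  # (?,): Q(s, pi(s)); reuse=True shares the critic weights with q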
Output
"""
actor_critic: A function which takes in placeholder symbols
for state, ``x_ph``, and action, ``a_ph``, and returns the main
outputs from the agent's Tensorflow computation graph:
=========== ================ ======================================
Symbol Shape Description
=========== ================ ======================================
``pi`` (batch, act_dim) | Deterministically computes actions
| from policy given states.
``q`` (batch,) | Gives the current estimate of Q* for
| states in ``x_ph`` and actions in
| ``a_ph``.
``q_pi`` (batch,) | Gives the composition of ``q`` and
| ``pi`` for states in ``x_ph``:
| q(x, pi(x)).
=========== ================ ======================================
"""
actor_critic=core.mlp_actor_critic
# Main outputs from computation graph
with tf.variable_scope('main'):
    pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

# Target networks
with tf.variable_scope('target'):
    # Note that the action placeholder going to actor_critic here is
    # irrelevant, because we only need q_targ(s, pi_targ(s)).
    pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)
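At this point every trainable variable lives under one of four scopes, which is exactly what the get_vars calls in the later sections rely on. One illustrative way to confirm that:
# Illustrative: list the top-level scopes created for the training graph
scopes = sorted({'/'.join(v.name.split('/')[:2])
                 for v in tf.trainable_variables()
                 if v.name.startswith(('main/', 'target/'))})
print(scopes)   # ['main/pi', 'main/q', 'target/pi', 'target/q']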
Loss Function
\[y(r,s',d) = r + \gamma (1-d) Q_{\phi_{targ}} (s', \mu_{\theta_{targ}}(s')),\]
\[Q_{loss} = \frac{1}{\vert B \vert} \sum_{(s,a,r,s',d) \in B} (Q_\phi (s,a) - y(r,s',d))^2,\]
\[\pi_{loss} = -\frac{1}{\vert B \vert} \sum_{s \in B} Q_\phi (s, \mu_\theta (s)).\]
The minus sign in the policy loss means that minimizing it performs gradient ascent on the expected Q-value under the current policy.
# Bellman backup for Q function
backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*q_pi_targ)

# DDPG losses
pi_loss = -tf.reduce_mean(q_pi)
q_loss = tf.reduce_mean((q-backup)**2)
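As a concrete, made-up example of the backup: with a reward of 1, gamma = 0.99, and a target estimate of 10 for the next state's value under the target policy, a non-terminal transition (d = 0) gives
\[y = 1 + 0.99 \cdot (1-0) \cdot 10 = 10.9,\]
while a terminal transition (d = 1) drops the bootstrap term entirely:
\[y = 1 + 0.99 \cdot (1-1) \cdot 10 = 1.\]
Wrapping the backup in tf.stop_gradient ensures that gradients of q_loss flow only into the main Q-network, never into the target networks.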
Optimizer
# Separate train ops for pi, q
pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))
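Note that pi_loss depends on the critic through q_pi, so restricting var_list to 'main/pi' is what keeps the policy step from also updating the critic weights (and vice versa for the Q step). A quick, illustrative sanity check using the get_vars helper defined in the next section:
# Illustrative: the two optimizers touch disjoint sets of variables
pi_vars = {v.name for v in get_vars('main/pi')}
q_vars = {v.name for v in get_vars('main/q')}
print(pi_vars & q_vars)   # set(): the policy step never modifies the critic, and vice versa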
Update Target Parameters
\[\phi_{targ} \leftarrow \rho \phi_{targ} + (1-\rho) \phi,\]
\[\theta_{targ} \leftarrow \rho \theta_{targ} + (1-\rho) \theta.\]
def get_vars(scope):
    return [x for x in tf.global_variables() if scope in x.name]

# Polyak averaging for target variables
# tf.assign: updates 'v_targ' in place with 'polyak*v_targ + (1-polyak)*v_main'
# tf.group: bundles all of the assign ops into a single op
target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
                          for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

# Initializing targets to match main variables
target_init = tf.group([tf.assign(v_targ, v_main)
                        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])
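With polyak = 0.995 (the value assumed in the setup sketch above), each call to target_update moves every target weight only 0.5% of the way toward its main-network counterpart. A scalar toy example:
# Toy illustration of Polyak averaging on a single scalar weight
v_targ, v_main = 0.0, 1.0
for _ in range(3):
    v_targ = 0.995 * v_targ + 0.005 * v_main
print(round(v_targ, 4))   # 0.0149: the target network trails the main network slowly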
TensorBoard
# tensorboard
tf.summary.scalar("pi_loss", pi_loss)
tf.summary.scalar("q_loss", q_loss)
summary_op = tf.summary.merge_all()
Initialize Variables
sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(target_init)
writer = tf.summary.FileWriter('graphs', graph=tf.get_default_graph())
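With the FileWriter pointed at the graphs directory, the logged scalars can be viewed during or after training by running tensorboard --logdir graphs and opening the address it prints.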
Run Session
# Setup model saving
logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q': q})

# Get an action from the 'pi' network for an observation, then add exploration noise
def get_action(o, noise_scale):
    a = sess.run(pi, feed_dict={x_ph: o.reshape(1,-1)})[0]
    a += noise_scale * np.random.randn(act_dim)
    return np.clip(a, -act_limit, act_limit)
# Evaluate the current policy with the action noise set to 0
def test_agent(n=10):
    for j in range(n):
        o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
        while not(d or (ep_len == max_ep_len)):
            # Take deterministic actions at test time (noise_scale=0)
            o, r, d, _ = test_env.step(get_action(o, 0))
            ep_ret += r
            ep_len += 1
        logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
start_time = time.time()
o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
total_steps = steps_per_epoch * epochs
# Main loop: collect experience in env and update/log each epoch
for t in range(total_steps):
    """
    Until start_steps have elapsed, randomly sample actions
    from a uniform distribution for better exploration. Afterwards,
    use the learned policy (with some noise, via act_noise).
    """
    # o => a => o2, r, d => (o, a, r, o2, d) => store to replay buffer => o = o2 => ...
    if t > start_steps:  # start_steps = 10000
        a = get_action(o, act_noise)   # action from the 'pi' network, plus exploration noise
    else:
        a = env.action_space.sample()  # uniformly random action for early exploration

    # Step the env
    o2, r, d, _ = env.step(a)
    ep_ret += r   # accumulate the episode return
    ep_len += 1   # count the episode length

    # Ignore the "done" signal if it comes from hitting the time
    # horizon (that is, when it's an artificial terminal signal
    # that isn't based on the agent's state)
    d = False if ep_len==max_ep_len else d

    # Store experience to replay buffer
    replay_buffer.store(o, a, r, o2, d)

    # Super critical, easy to overlook step: make sure to update
    # most recent observation!
    o = o2

    if d or (ep_len == max_ep_len):
        """
        Perform all DDPG updates at the end of the trajectory,
        in accordance with tuning done by TD3 paper authors.
        """
        for _ in range(ep_len):
            # Sample a minibatch from the replay buffer
            batch = replay_buffer.sample_batch(batch_size)
            feed_dict = {x_ph: batch['obs1'],
                         x2_ph: batch['obs2'],
                         a_ph: batch['acts'],
                         r_ph: batch['rews'],
                         d_ph: batch['done']
                         }

            # Q-learning update
            outs = sess.run([q_loss, q, train_q_op], feed_dict)
            logger.store(LossQ=outs[0], QVals=outs[1])

            # Policy update, then target parameter update
            outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict)
            logger.store(LossPi=outs[0])

        logger.store(EpRet=ep_ret, EpLen=ep_len)
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # End of epoch wrap-up
    if t > 0 and t % steps_per_epoch == 0:
        epoch = t // steps_per_epoch

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs-1):
            logger.save_state({'env': env}, None)

        # Test the performance of the deterministic version of the agent.
        test_agent()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('TestEpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('TestEpLen', average_only=True)
        logger.log_tabular('TotalEnvInteracts', t)
        logger.log_tabular('QVals', with_min_and_max=True)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossQ', average_only=True)
        logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()

        # Write the TensorBoard summaries, reusing the last update batch's feed_dict
        summary = sess.run(summary_op, feed_dict)
        writer.add_summary(summary, epoch)
TensorBoard Results
- pi_loss on HalfCheetah
- q_loss on HalfCheetah