DDPG Algorithm

The DDPG algorithm is described clearly here; the pseudocode below summarizes the full training loop.

Figure: DDPG algorithm pseudocode

Implementation

Replay Buffer

\[D = \{(s,a,r,s',d), \cdots \}.\]
import numpy as np

class ReplayBuffer:
    """
    A simple FIFO experience replay buffer for DDPG agents.
    """

    def __init__(self, obs_dim, act_dim, size):
        self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.acts_buf = np.zeros([size, act_dim], dtype=np.float32)
        self.rews_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        self.obs1_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.acts_buf[self.ptr] = act
        self.rews_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr+1) % self.max_size
        self.size = min(self.size+1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(obs1=self.obs1_buf[idxs],
                    obs2=self.obs2_buf[idxs],
                    acts=self.acts_buf[idxs],
                    rews=self.rews_buf[idxs],
                    done=self.done_buf[idxs])

# Experience buffer
replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)
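
To sanity check the buffer, here is a minimal usage sketch with made-up dimensions (a 3-dimensional observation space and a 1-dimensional action space); none of these values come from the original code.

import numpy as np

# hypothetical dimensions, for illustration only
buf = ReplayBuffer(obs_dim=3, act_dim=1, size=1000)

# store one (s, a, r, s', d) transition
buf.store(obs=np.zeros(3), act=np.array([0.5]), rew=1.0,
          next_obs=np.ones(3), done=False)

# sample_batch returns a dict of NumPy arrays keyed obs1/obs2/acts/rews/done
batch = buf.sample_batch(batch_size=4)
print(batch['obs1'].shape)   # (4, 3)
print(batch['rews'].shape)   # (4,)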

Input

# core.py
import tensorflow as tf

def placeholder(dim=None):
    return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,))

def placeholders(*args):
    return [placeholder(dim) for dim in args]

# Inputs to computation graph
x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)
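
The helper returns a rank-2 placeholder when a dimension is given and a rank-1 placeholder otherwise, which is why the reward and done slots are passed as None above. A quick check (a sketch, assuming TF 1.x):

print(placeholder(3).shape)   # (?, 3)  -- per-sample vectors, e.g. observations
print(placeholder().shape)    # (?,)    -- per-sample scalars, e.g. rewards and done flags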

Actor-Critic Multi-Layer Perceptron (MLP)

def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None):
    for h in hidden_sizes[:-1]:
        # [batch_size, input_dim] => [batch_size, h]
        x = tf.layers.dense(x, units=h, activation=activation)
    # [batch_size, h] => [batch_size, hidden_sizes[-1]]
    return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation)

def mlp_actor_critic(x, a, hidden_sizes=(400,300), activation=tf.nn.relu, 
                     output_activation=tf.tanh, action_space=None):
    act_dim = a.shape.as_list()[-1]
    act_limit = action_space.high[0]
    with tf.variable_scope('pi'):
        # [batch_size, obs_dim] => [batch_size, act_dim]
        pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation)
    with tf.variable_scope('q'):
        # [batch_size, obs_dim + act_dim] => [batch_size, 1] => [batch_size]
        q = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1)
    with tf.variable_scope('q', reuse=True):
        # [batch_size, obs_dim + act_dim] => [batch_size, 1] => [batch_size]
        q_pi = tf.squeeze(mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1)
    return pi, q, q_pi
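
The sketch below wires mlp_actor_critic up for a hypothetical environment with 3-dimensional observations and 1-dimensional actions bounded in [-2, 2]; the gym Box and the dimensions are assumptions for illustration, not part of the original code.

import numpy as np
import tensorflow as tf
from gym.spaces import Box

obs_dim, act_dim = 3, 1
action_space = Box(low=-2.0, high=2.0, shape=(act_dim,), dtype=np.float32)

x_ph = tf.placeholder(tf.float32, (None, obs_dim))
a_ph = tf.placeholder(tf.float32, (None, act_dim))

with tf.variable_scope('main'):
    pi, q, q_pi = mlp_actor_critic(x_ph, a_ph, action_space=action_space)

print(pi.shape)     # (?, 1)  tanh output scaled by act_limit = 2.0
print(q.shape)      # (?,)    Q(s, a) for the actions fed through a_ph
print(q_pi.shape)   # (?,)    Q(s, pi(s)), reusing the same 'q' weights via reuse=True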

Output

"""
actor_critic: A function which takes in placeholder symbols 
    for state, ``x_ph``, and action, ``a_ph``, and returns the main 
    outputs from the agent's Tensorflow computation graph:

    ===========  ================  ======================================
    Symbol       Shape             Description
    ===========  ================  ======================================
    ``pi``       (batch, act_dim)  | Deterministically computes actions
                                   | from policy given states.
    ``q``        (batch,)          | Gives the current estimate of Q* for 
                                   | states in ``x_ph`` and actions in
                                   | ``a_ph``.
    ``q_pi``     (batch,)          | Gives the composition of ``q`` and 
                                   | ``pi`` for states in ``x_ph``: 
                                   | q(x, pi(x)).
    ===========  ================  ======================================
"""
actor_critic=core.mlp_actor_critic

# Main outputs from computation graph
with tf.variable_scope('main'):
    pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

# Target networks
with tf.variable_scope('target'):
    # Note that the action placeholder going to actor_critic here is 
    # irrelevant, because we only need q_targ(s, pi_targ(s)).
    pi_targ, _, q_pi_targ  = actor_critic(x2_ph, a_ph, **ac_kwargs)

Loss Function

\[y(r,s',d) = r + \gamma (1-d) Q_{\phi_{targ}} (s', \mu_{\theta_{targ}}(s')),\] \[Q_{loss} = \frac{1}{\vert B \vert} \sum_{(s,a,r,s',d) \in B} (Q_\phi (s,a) - y(r,s',d))^2,\] \[\pi_{loss} = -\frac{1}{\vert B \vert} \sum_{s \in B} Q_\phi (s, \mu_\theta (s)).\]
# Bellman backup for Q function
backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*q_pi_targ)

# DDPG losses
pi_loss = -tf.reduce_mean(q_pi)
q_loss = tf.reduce_mean((q-backup)**2)
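
To make the stop-gradient target concrete, here is a small NumPy sketch of the backup on a toy batch of two transitions (values made up; gamma assumed to be 0.99):

import numpy as np

gamma = 0.99
r         = np.array([1.0, 0.5])     # rewards
d         = np.array([0.0, 1.0])     # done flags (second transition is terminal)
q_pi_targ = np.array([10.0, 10.0])   # Q_targ(s', mu_targ(s')) from the target networks

backup = r + gamma * (1 - d) * q_pi_targ
print(backup)   # [10.9  0.5] -- the bootstrap term vanishes for the terminal transition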

Optimizer

# Separate train ops for pi, q
pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))
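
Passing var_list restricts each optimizer to its own sub-network: the actor step only touches weights under 'main/pi' and the critic step only those under 'main/q'. A quick check (a sketch; get_vars is defined in the next block):

print([v.name for v in get_vars('main/pi')])   # only the policy's dense layers
print([v.name for v in get_vars('main/q')])    # only the Q-network's dense layers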

Update Target Parameters

\[\phi_{targ} \leftarrow \rho \phi_{targ} + (1-\rho) \phi,\] \[\theta_{targ} \leftarrow \rho \theta_{targ} + (1-\rho) \theta.\]
def get_vars(scope):
    return [x for x in tf.global_variables() if scope in x.name]

# Polyak averaging for target variables
# tf.assign(v_targ, ...): move each target variable toward its main counterpart
# tf.group(...): bundle all the individual assign ops into a single op
target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
                          for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

# Initializing targets to match main variables
target_init = tf.group([tf.assign(v_targ, v_main)
                          for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])
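
A short numeric sketch of the Polyak update (polyak = 0.995 is assumed here, matching the Spinning Up default) shows how slowly the target parameters track the main ones:

polyak = 0.995       # assumed default value
v_main = 1.0         # pretend a main-network weight has settled at 1.0
v_targ = 0.0         # target copy starts at 0.0

for _ in range(1000):
    v_targ = polyak * v_targ + (1 - polyak) * v_main

print(v_targ)        # ~0.993 -- even after 1000 updates the target still lags the main network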

TensorBoard

# tensorboard
tf.summary.scalar("pi_loss", pi_loss)
tf.summary.scalar("q_loss", q_loss)
summary_op = tf.summary.merge_all()

Initialize Variables

sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(target_init)

writer = tf.summary.FileWriter('graphs', graph=tf.get_default_graph())

Run Session

# Setup model saving
logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q': q})

# get an action from the 'pi' network given an observation, then add exploration noise
def get_action(o, noise_scale):
    a = sess.run(pi, feed_dict={x_ph: o.reshape(1,-1)})[0]
    a += noise_scale * np.random.randn(act_dim)
    return np.clip(a, -act_limit, act_limit)
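
# Intuition for the noise and the clip (a sketch; act_limit = 1.0 is an assumed
# bound): if pi outputs 0.95 and the Gaussian draw adds +0.12, the noisy action
# 1.07 is clipped back to 1.0, so exploration can never leave the valid range.
# With noise_scale = 0 (as in test_agent below), get_action returns pi's output unchanged.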

# evaluate the current policy with the action noise set to 0
def test_agent(n=10):
    for j in range(n):
        o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
        while not(d or (ep_len == max_ep_len)):
            # Take deterministic actions at test time (noise_scale=0)
            o, r, d, _ = test_env.step(get_action(o, 0))
            ep_ret += r
            ep_len += 1
        logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

start_time = time.time()
o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
total_steps = steps_per_epoch * epochs

# Main loop: collect experience in env and update/log each epoch
for t in range(total_steps):

    """
    Until start_steps have elapsed, randomly sample actions
    from a uniform distribution for better exploration. Afterwards, 
    use the learned policy (with some noise, via act_noise). 
    """
    # o => a => o2, r, d => (o, a, r, o2, d) => store to replay buffer => o = o2 => ...
    if t > start_steps:    # start_steps defaults to 10000
        a = get_action(o, act_noise)    # action from the 'pi' network plus exploration noise
    else:
        a = env.action_space.sample()   # uniformly random action for early exploration

    # Step the env
    o2, r, d, _ = env.step(a)
    ep_ret += r     # reward
    ep_len += 1     # number of step

    # Ignore the "done" signal if it comes from hitting the time
    # horizon (that is, when it's an artificial terminal signal
    # that isn't based on the agent's state)
    d = False if ep_len==max_ep_len else d

    # Store experience to replay buffer
    replay_buffer.store(o, a, r, o2, d)

    # Super critical, easy to overlook step: make sure to update 
    # most recent observation!
    o = o2

    if d or (ep_len == max_ep_len):
        """
        Perform all DDPG updates at the end of the trajectory,
        in accordance with tuning done by TD3 paper authors.
        """
        for _ in range(ep_len):
            # sample from the replay buffer
            batch = replay_buffer.sample_batch(batch_size)
            feed_dict = {x_ph: batch['obs1'],
                         x2_ph: batch['obs2'],
                         a_ph: batch['acts'],
                         r_ph: batch['rews'],
                         d_ph: batch['done']
                        }

            # Q-learning update
            outs = sess.run([q_loss, q, train_q_op], feed_dict)
            logger.store(LossQ=outs[0], QVals=outs[1])

            # Policy update, target parameters update
            outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict)
            logger.store(LossPi=outs[0])

        logger.store(EpRet=ep_ret, EpLen=ep_len)
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # End of epoch wrap-up
    if t > 0 and t % steps_per_epoch == 0:
        epoch = t // steps_per_epoch

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs-1):
            logger.save_state({'env': env}, None)

        # Test the performance of the deterministic version of the agent.
        test_agent()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('TestEpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('TestEpLen', average_only=True)
        logger.log_tabular('TotalEnvInteracts', t)
        logger.log_tabular('QVals', with_min_and_max=True)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossQ', average_only=True)
        logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()

        # write the summaries computed on the most recent update batch
        summary = sess.run(summary_op, feed_dict)
        writer.add_summary(summary, epoch)

TensorBoard Results

Figure: pi_loss on HalfCheetah

Figure: q_loss on HalfCheetah