from meta_policy_search.utils import logger
from meta_policy_search.meta_algos.base import MAMLAlgo
from meta_policy_search.optimizers.maml_first_order_optimizer import MAMLPPOOptimizer
import tensorflow as tf
import numpy as np
from collections import OrderedDict
class ProMP(MAMLAlgo):
"""
ProMP Algorithm
Args:
policy (Policy): policy object
name (str): tf variable scope
learning_rate (float): learning rate for optimizing the meta-objective
num_ppo_steps (int): number of ProMP steps (without re-sampling)
num_minibatches (int): number of minibatches for computing the ppo gradient steps
clip_eps (float): PPO clip range
target_inner_step (float) : target inner kl divergence, used only when adaptive_inner_kl_penalty is true
init_inner_kl_penalty (float) : initial penalty for inner kl
        adaptive_inner_kl_penalty (bool): whether to use a fixed or adaptive KL penalty on the inner gradient update
anneal_factor (float) : multiplicative factor for annealing clip_eps. If anneal_factor < 1, clip_eps <- anneal_factor * clip_eps at each iteration
inner_lr (float) : gradient step size used for inner step
meta_batch_size (int): number of meta-learning tasks
num_inner_grad_steps (int) : number of gradient updates taken per maml iteration
        trainable_inner_step_size (boolean): whether to make the inner step size a trainable variable
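
    Example:
        A minimal construction sketch; ``policy`` is a placeholder for a policy object
        with the interface expected by MAMLAlgo, and the numeric values are purely
        illustrative::

            algo = ProMP(
                policy=policy,                  # pre-built policy object (placeholder)
                inner_lr=0.1,                   # inner gradient step size
                meta_batch_size=40,             # number of meta-learning tasks
                num_inner_grad_steps=1,         # adaptation steps per task
                learning_rate=1e-3,             # outer (meta) learning rate
                num_ppo_steps=5,                # ProMP steps per outer update
                clip_eps=0.3,                   # PPO clip range
                target_inner_step=0.01,
                init_inner_kl_penalty=1e-2,
            )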
"""
def __init__(
self,
*args,
name="ppo_maml",
learning_rate=1e-3,
num_ppo_steps=5,
num_minibatches=1,
clip_eps=0.2,
target_inner_step=0.01,
init_inner_kl_penalty=1e-2,
adaptive_inner_kl_penalty=True,
anneal_factor=1.0,
**kwargs
):
super(ProMP, self).__init__(*args, **kwargs)
self.optimizer = MAMLPPOOptimizer(learning_rate=learning_rate, max_epochs=num_ppo_steps, num_minibatches=num_minibatches)
self.clip_eps = clip_eps
self.target_inner_step = target_inner_step
self.adaptive_inner_kl_penalty = adaptive_inner_kl_penalty
self.inner_kl_coeff = init_inner_kl_penalty * np.ones(self.num_inner_grad_steps)
self.anneal_coeff = 1
self.anneal_factor = anneal_factor
self._optimization_keys = ['observations', 'actions', 'advantages', 'agent_infos']
self.name = name
self.kl_coeff = [init_inner_kl_penalty] * self.meta_batch_size * self.num_inner_grad_steps
self.build_graph()
def _adapt_objective_sym(self, action_sym, adv_sym, dist_info_old_sym, dist_info_new_sym):
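        """
        Builds the inner (adaptation) surrogate objective for one task: the negative
        importance-weighted advantage -E[(pi_new / pi_old) * A], whose gradient drives
        the differentiable inner update.
        """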
with tf.variable_scope("likelihood_ratio"):
likelihood_ratio_adapt = self.policy.distribution.likelihood_ratio_sym(action_sym,
dist_info_old_sym, dist_info_new_sym)
with tf.variable_scope("surrogate_loss"):
surr_obj_adapt = -tf.reduce_mean(likelihood_ratio_adapt * adv_sym)
return surr_obj_adapt
    def build_graph(self):
"""
Creates the computation graph
"""
""" Create Variables """
with tf.variable_scope(self.name):
self.step_sizes = self._create_step_size_vars()
""" --- Build inner update graph for adapting the policy and sampling trajectories --- """
# this graph is only used for adapting the policy and not computing the meta-updates
self.adapted_policies_params, self.adapt_input_ph_dict = self._build_inner_adaption()
""" ----- Build graph for the meta-update ----- """
self.meta_op_phs_dict = OrderedDict()
obs_phs, action_phs, adv_phs, dist_info_old_phs, all_phs_dict = self._make_input_placeholders('step0')
self.meta_op_phs_dict.update(all_phs_dict)
distribution_info_vars, current_policy_params = [], []
all_surr_objs, all_inner_kls = [], []
for i in range(self.meta_batch_size):
dist_info_sym = self.policy.distribution_info_sym(obs_phs[i], params=None)
distribution_info_vars.append(dist_info_sym) # step 0
current_policy_params.append(self.policy.policy_params) # set to real policy_params (tf.Variable)
with tf.variable_scope(self.name):
""" Inner updates"""
for step_id in range(1, self.num_inner_grad_steps+1):
surr_objs, kls, adapted_policy_params = [], [], []
# inner adaptation step for each task
for i in range(self.meta_batch_size):
surr_loss = self._adapt_objective_sym(action_phs[i], adv_phs[i], dist_info_old_phs[i], distribution_info_vars[i])
kl_loss = tf.reduce_mean(self.policy.distribution.kl_sym(dist_info_old_phs[i], distribution_info_vars[i]))
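                    # one differentiable inner gradient step on the surrogate loss,
                    # i.e. adapted params = params - step_size * grad(surr_loss)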
adapted_params_var = self._adapt_sym(surr_loss, current_policy_params[i])
adapted_policy_params.append(adapted_params_var)
kls.append(kl_loss)
surr_objs.append(surr_loss)
all_surr_objs.append(surr_objs)
all_inner_kls.append(kls)
# Create new placeholders for the next step
obs_phs, action_phs, adv_phs, dist_info_old_phs, all_phs_dict = self._make_input_placeholders('step%i' % step_id)
self.meta_op_phs_dict.update(all_phs_dict)
# dist_info_vars_for_next_step
distribution_info_vars = [self.policy.distribution_info_sym(obs_phs[i], params=adapted_policy_params[i])
for i in range(self.meta_batch_size)]
current_policy_params = adapted_policy_params
# per step: compute mean of kls over tasks
mean_inner_kl_per_step = tf.stack([tf.reduce_mean(tf.stack(inner_kls)) for inner_kls in all_inner_kls])
""" Outer objective """
surr_objs, outer_kls = [], []
# Create placeholders
inner_kl_coeff = tf.placeholder(tf.float32, shape=[self.num_inner_grad_steps], name='inner_kl_coeff')
self.meta_op_phs_dict['inner_kl_coeff'] = inner_kl_coeff
clip_eps_ph = tf.placeholder(tf.float32, shape=[], name='clip_eps')
self.meta_op_phs_dict['clip_eps'] = clip_eps_ph
# meta-objective
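            # For each task: clipped PPO surrogate on the post-adaptation policy,
            #   min(r * A, clip(r, 1 - eps, 1 + eps) * A)  with  r = pi_new / pi_old;
            # the outer KL (old vs. current post-update distribution) is tracked for statistics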
for i in range(self.meta_batch_size):
likelihood_ratio = self.policy.distribution.likelihood_ratio_sym(action_phs[i], dist_info_old_phs[i],
distribution_info_vars[i])
outer_kl = tf.reduce_mean(self.policy.distribution.kl_sym(dist_info_old_phs[i], distribution_info_vars[i]))
# clipped likelihood ratio
clipped_obj = tf.minimum(likelihood_ratio * adv_phs[i],
tf.clip_by_value(likelihood_ratio,
1 - clip_eps_ph,
1 + clip_eps_ph) * adv_phs[i])
surr_obj = - tf.reduce_mean(clipped_obj)
surr_objs.append(surr_obj)
outer_kls.append(outer_kl)
mean_outer_kl = tf.reduce_mean(tf.stack(outer_kls))
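            # proximal term: per-step KL coefficients (fed via placeholder) weight the mean inner KLs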
inner_kl_penalty = tf.reduce_mean(inner_kl_coeff * mean_inner_kl_per_step)
""" Mean over meta tasks """
meta_objective = tf.reduce_mean(tf.stack(surr_objs, 0)) + inner_kl_penalty
self.optimizer.build_graph(
loss=meta_objective,
target=self.policy,
input_ph_dict=self.meta_op_phs_dict,
inner_kl=mean_inner_kl_per_step,
outer_kl=mean_outer_kl,
)
    def optimize_policy(self, all_samples_data, log=True):
"""
Performs MAML outer step
Args:
all_samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and
meta task
log (bool) : whether to log statistics
Returns:
None
"""
meta_op_input_dict = self._extract_input_dict_meta_op(all_samples_data, self._optimization_keys)
# add kl_coeffs / clip_eps to meta_op_input_dict
meta_op_input_dict['inner_kl_coeff'] = self.inner_kl_coeff
meta_op_input_dict['clip_eps'] = self.clip_eps
if log: logger.log("Optimizing")
loss_before = self.optimizer.optimize(input_val_dict=meta_op_input_dict)
if log: logger.log("Computing statistics")
loss_after, inner_kls, outer_kl = self.optimizer.compute_stats(input_val_dict=meta_op_input_dict)
if self.adaptive_inner_kl_penalty:
if log: logger.log("Updating inner KL loss coefficients")
self.inner_kl_coeff = self.adapt_kl_coeff(self.inner_kl_coeff, inner_kls, self.target_inner_step)
if log:
logger.logkv('LossBefore', loss_before)
logger.logkv('LossAfter', loss_after)
logger.logkv('KLInner', np.mean(inner_kls))
logger.logkv('KLCoeffInner', np.mean(self.inner_kl_coeff))
def adapt_kl_coeff(self, kl_coeff, kl_values, kl_target):
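        """
        Adapts the inner KL penalty coefficient(s): each coefficient is halved when its
        measured KL is well below the target and doubled when it is well above
        (see _adapt_kl_coeff). Accepts either a scalar KL or one KL value per inner step.
        """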
if hasattr(kl_values, '__iter__'):
assert len(kl_coeff) == len(kl_values)
return np.array([_adapt_kl_coeff(kl_coeff[i], kl, kl_target) for i, kl in enumerate(kl_values)])
else:
return _adapt_kl_coeff(kl_coeff, kl_values, kl_target)
def _adapt_kl_coeff(kl_coeff, kl, kl_target):
if kl < kl_target / 1.5:
kl_coeff /= 2
elif kl > kl_target * 1.5:
kl_coeff *= 2
return kl_coeff
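# Illustrative behaviour of the adaptation rule (values chosen for this example only):
#   _adapt_kl_coeff(1e-2, kl=0.002, kl_target=0.01) -> 5e-3   (KL far below target: relax penalty)
#   _adapt_kl_coeff(1e-2, kl=0.030, kl_target=0.01) -> 2e-2   (KL far above target: strengthen penalty)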