
Commit 6c12f7d

TF-Agents Team authored and copybara-github committed
Fixes LOG_PROBABILITY action info in RewardPredictionBasePolicy
The `RewardPredictionBasePolicy` overrides the `_distribution` method to return a deterministic distribution over sampled actions and to populate the action probabilities in the returned policy info. However, it also sets the `emit_log_probability` parameter in the base `TFPolicy` initializer, which causes the base `TFPolicy.action` method to ignore the log probabilities returned by subclasses and instead query the distribution object returned by subclasses for the log probabilities. Because `RewardPredictionBasePolicy` returns a deterministic distribution, the emitted log probabilities are always 0.

To fix this, we change `RewardPredictionBasePolicy._distribution` to return the actual distribution instead of a deterministic one built from sampled actions. Since it now returns the actual distribution, it can no longer populate `chosen_arm_features`, because action sampling has not happened yet at that point. We therefore also override the `_action` method in `RewardPredictionBasePolicy` to populate `chosen_arm_features` after action sampling.

In addition, we add a few tests for `BoltzmannRewardPredictionPolicy` to verify the expected behavior around its parameters.

PiperOrigin-RevId: 470985015
Change-Id: I0bce336d9ab4b60811401129da87e0d84866d9ac
1 parent aed7287 commit 6c12f7d
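To make the new control flow concrete, here is a minimal sketch of the `_action` override described above. It assumes the standard TF-Agents `TFPolicy._action(time_step, policy_state, seed)` signature and the `_maybe_save_chosen_arm_features` helper that appears in the greedy-policy diff below; it is an illustration, not the committed implementation.

  def _action(self, time_step, policy_state, seed):
    # The base class samples from the distribution returned by `_distribution`,
    # and `TFPolicy.action` derives the log probability from that distribution
    # when `emit_log_probability` is set.
    step = super()._action(time_step, policy_state, seed)
    # Sampling has now happened, so the features of the chosen arm can be
    # copied into the policy info.
    return self._maybe_save_chosen_arm_features(time_step, step.action, step)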

File tree: 4 files changed, +244, -56 lines

tf_agents/bandits/policies/boltzmann_reward_prediction_policy.py
Lines changed: 17 additions & 13 deletions

@@ -30,6 +30,10 @@
 from tf_agents.policies import utils as policy_utilities
 from tf_agents.typing import types
 
+# The temperature parameter is internally lower-bounded at this value to avoid
+# numerical issues.
+_MIN_TEMPERATURE = 1e-12
+
 
 class BoltzmannRewardPredictionPolicy(
     reward_prediction_base_policy.RewardPredictionBasePolicy):
@@ -45,7 +49,7 @@ def __init__(
       observation_and_action_constraint_splitter: Optional[
           types.Splitter] = None,
       accepts_per_arm_features: bool = False,
-      constraints: Iterable[constr.NeuralConstraint] = (),
+      constraints: Iterable[constr.BaseConstraint] = (),
       emit_policy_info: Tuple[Text, ...] = (),
       num_samples_list: Sequence[tf.Variable] = (),
       name: Optional[Text] = None):
@@ -77,7 +81,7 @@ def __init__(
       accepts_per_arm_features: (bool) Whether the policy accepts per-arm
         features.
       constraints: iterable of constraints objects that are instances of
-        `tf_agents.bandits.agents.NeuralConstraint`.
+        `tf_agents.bandits.agents.BaseConstraint`.
       emit_policy_info: (tuple of strings) what side information we want to get
         as part of the policy info. Allowed values can be found in
         `policy_utilities.PolicyInfo`.
@@ -119,11 +123,12 @@ def __init__(
         self._expected_num_actions)
 
   def _get_temperature_value(self):
-    if callable(self._temperature):
-      return self._temperature()
-    return self._temperature
+    return tf.math.maximum(
+        _MIN_TEMPERATURE,
+        self._temperature()
+        if callable(self._temperature) else self._temperature)
 
-  def _sample_action(self, mask, predicted_rewards):
+  def _action_distribution(self, mask, predicted_rewards):
     batch_size = tf.shape(predicted_rewards)[0]
     if self._boltzmann_gumbel_exploration_constant is not None:
       logits = predicted_rewards
@@ -146,9 +151,11 @@ def _sample_action(self, mask, predicted_rewards):
       final_logits = logits + exploration_weights * gumbel_samples
       actions = tf.cast(
           tf.math.argmax(final_logits, axis=1), self._action_spec.dtype)
-      # Log probability is not available in closed form. We treat this as a
-      # deterministic policy at the moment.
-      log_probability = tf.zeros([batch_size], tf.float32)
+      # To conform with the return type, we construct a deterministic
+      # distribution here. Note that this results in the log_probability of
+      # the chosen arm being 0. The true sampling probability here has no simple
+      # closed-form.
+      distribution = tfp.distributions.Deterministic(loc=actions)
     else:
       # Apply the temperature scaling, needed for Boltzmann exploration.
       logits = predicted_rewards / self._get_temperature_value()
@@ -170,9 +177,6 @@ def _sample_action(self, mask, predicted_rewards):
           logits=logits,
           dtype=self._action_spec.dtype)
 
-      actions = distribution.sample()
-      log_probability = distribution.log_prob(actions)
-
     bandit_policy_values = tf.fill([batch_size, 1],
                                    policy_utilities.BanditPolicyType.BOLTZMANN)
-    return actions, log_probability, bandit_policy_values
+    return distribution, bandit_policy_values
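For readers unfamiliar with the Boltzmann branch above, the following standalone sketch uses TensorFlow Probability directly (it is not the policy class itself, and the reward values and temperature below are made up) to show why returning the categorical distribution yields informative log probabilities, whereas the previously returned `Deterministic` distribution always reports a log probability of 0.

  import tensorflow as tf
  import tensorflow_probability as tfp

  predicted_rewards = tf.constant([[1.0, 2.0, 3.0]])
  # Mirror the `_MIN_TEMPERATURE` lower bound added in this commit.
  temperature = tf.math.maximum(1e-12, 0.5)
  distribution = tfp.distributions.Categorical(
      logits=predicted_rewards / temperature, dtype=tf.int32)
  action = distribution.sample(seed=1)              # shape [1]
  log_probability = distribution.log_prob(action)   # log of a softmax probability
  print(action.numpy(), log_probability.numpy())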

tf_agents/bandits/policies/boltzmann_reward_prediction_policy_test.py
Lines changed: 133 additions & 4 deletions

@@ -14,13 +14,13 @@
 # limitations under the License.
 
 """Test for boltzmann_reward_prediction_policy."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+
+import numpy as np
 
 import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import
 from tf_agents.bandits.policies import boltzmann_reward_prediction_policy as boltzmann_reward_policy
 from tf_agents.networks import network
+from tf_agents.policies import utils
 from tf_agents.specs import tensor_spec
 from tf_agents.trajectories import time_step as ts
 from tf_agents.utils import test_utils
@@ -73,7 +73,7 @@ def testBoltzmannGumbelPredictedRewards(self):
         self._action_spec,
         reward_network=DummyNet(self._obs_spec),
         boltzmann_gumbel_exploration_constant=10.0,
-        emit_policy_info=('predicted_rewards_mean',),
+        emit_policy_info=(utils.InfoFields.PREDICTED_REWARDS_MEAN,),
         num_samples_list=num_samples_list)
     observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
     time_step = ts.restart(observations, batch_size=2)
@@ -85,6 +85,135 @@ def testBoltzmannGumbelPredictedRewards(self):
     p_info = self.evaluate(action_step.info)
     self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3])
 
+  def testLargeTemperature(self):
+    # With a very large temperature, the sampling probability will be uniform.
+    policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
+        self._time_step_spec,
+        self._action_spec,
+        reward_network=DummyNet(self._obs_spec),
+        temperature=10e8,
+        emit_policy_info=(utils.InfoFields.LOG_PROBABILITY,))
+    batch_size = 3000
+    observations = tf.constant([[1, 2]] * batch_size, dtype=tf.float32)
+    time_step = ts.restart(observations, batch_size=batch_size)
+    action_step = policy.action(time_step, seed=1)
+    # Initialize all variables
+    self.evaluate(tf.compat.v1.global_variables_initializer())
+    p_info = self.evaluate(action_step.info)
+    # Check the log probabilities in the policy info are uniform.
+    self.assertAllEqual(p_info.log_probability,
+                        tf.math.log([1.0 / 3] * batch_size))
+    # Check the empirical distribution of the chosen arms is uniform.
+    actions = self.evaluate(action_step.action)
+    self.assertAllInSet(actions, [0, 1, 2])
+    # Set tolerance in the chosen count to be 4 std.
+    tol = 4.0 * np.sqrt(batch_size * 1.0 / 3 * 2.0 / 3)
+    for action in range(3):
+      action_chosen_count = np.sum(actions == action)
+      self.assertNear(
+          action_chosen_count,
+          1000,
+          tol,
+          msg=f'action: {action} is expected to be chosen between {1000 - tol} '
+          f'and {1000 + tol} times, but was actually chosen '
+          f'{action_chosen_count} times.')
+
+  def testZeroTemperature(self):
+    # With zero temperature, the chosen actions should be greedy.
+    policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
+        self._time_step_spec,
+        self._action_spec,
+        reward_network=DummyNet(self._obs_spec),
+        temperature=0.0,
+        emit_policy_info=(utils.InfoFields.LOG_PROBABILITY,))
+    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
+    time_step = ts.restart(observations, batch_size=2)
+    action_step = policy.action(time_step, seed=1)
+    # Initialize all variables
+    self.evaluate(tf.compat.v1.global_variables_initializer())
+    actions = self.evaluate(action_step.action)
+    self.assertAllEqual(actions, [1, 2])
+
+  def testZeroGumbelExploration(self):
+    # When the Boltzmann-Gumbel exploration constant is almost 0, the chosen
+    # actions should be greedy actions.
+    num_samples_list = []
+    for k in range(3):
+      num_samples_list.append(
+          tf.compat.v2.Variable(
+              tf.zeros([], dtype=tf.int32), name='num_samples_{}'.format(k)))
+    num_samples_list[0].assign_add(2)
+    num_samples_list[1].assign_add(4)
+    num_samples_list[2].assign_add(1)
+    policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
+        self._time_step_spec,
+        self._action_spec,
+        reward_network=DummyNet(self._obs_spec),
+        boltzmann_gumbel_exploration_constant=1e-12,
+        num_samples_list=num_samples_list,
+        emit_policy_info=(utils.InfoFields.PREDICTED_REWARDS_MEAN,))
+    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
+    time_step = ts.restart(observations, batch_size=2)
+    action_step = policy.action(time_step, seed=1)
+    # Initialize all variables
+    self.evaluate(tf.compat.v1.global_variables_initializer())
+    actions = self.evaluate(action_step.action)
+    self.assertAllEqual(actions, [1, 2])
+
+  def testAllLargeNumSamples(self):
+    # When every action has a very large number of samples, the chosen actions
+    # should be greedy actions.
+    num_samples_list = []
+    for k in range(3):
+      num_samples_list.append(
+          tf.compat.v2.Variable(
+              tf.zeros([], dtype=tf.int32), name='num_samples_{}'.format(k)))
+    num_samples_list[0].assign_add(tf.int32.max - 10)
+    num_samples_list[1].assign_add(tf.int32.max - 10)
+    num_samples_list[2].assign_add(tf.int32.max - 10)
+    policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
+        self._time_step_spec,
+        self._action_spec,
+        reward_network=DummyNet(self._obs_spec),
+        boltzmann_gumbel_exploration_constant=100.0,
+        num_samples_list=num_samples_list,
+        emit_policy_info=(utils.InfoFields.PREDICTED_REWARDS_MEAN,))
+    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
+    time_step = ts.restart(observations, batch_size=2)
+    action_step = policy.action(time_step, seed=1)
+    # Initialize all variables
+    self.evaluate(tf.compat.v1.global_variables_initializer())
+    actions = self.evaluate(action_step.action)
+    self.assertAllEqual(actions, [1, 2])
+
+  def testSomeSmallNumSamples(self):
+    # When some action has a much smaller number of samples, it should be chosen
+    # more frequently than other actions.
+    num_samples_list = []
+    for k in range(3):
+      num_samples_list.append(
+          tf.compat.v2.Variable(
+              tf.zeros([], dtype=tf.int32), name='num_samples_{}'.format(k)))
+    num_samples_list[0].assign_add(tf.int32.max - 10)
+    num_samples_list[1].assign_add(1)
+    num_samples_list[2].assign_add(tf.int32.max - 10)
+    policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
+        self._time_step_spec,
+        self._action_spec,
+        reward_network=DummyNet(self._obs_spec),
+        boltzmann_gumbel_exploration_constant=10.0,
+        num_samples_list=num_samples_list,
+        emit_policy_info=(utils.InfoFields.PREDICTED_REWARDS_MEAN,))
+    batch_size = 3000
+    observations = tf.constant([[1, 2]] * batch_size, dtype=tf.float32)
+    time_step = ts.restart(observations, batch_size=batch_size)
+    action_step = policy.action(time_step, seed=1)
+    # Initialize all variables
+    self.evaluate(tf.compat.v1.global_variables_initializer())
+    actions = self.evaluate(action_step.action)
+    self.assertAllInSet(actions, [0, 1, 2])
+    action_counts = {action: np.sum(actions == action) for action in range(3)}
+    self.assertAllLess([action_counts[0], action_counts[2]], action_counts[1])
 
 if __name__ == '__main__':
   tf.test.main()
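As a side note on `testLargeTemperature` above, the 4-standard-deviation tolerance follows from ordinary binomial statistics (the check below is not part of the commit): with `batch_size = 3000` and a uniform choice over 3 arms, each arm count is Binomial(3000, 1/3).

  import math

  batch_size = 3000
  p = 1.0 / 3
  expected_count = batch_size * p            # 1000.0
  std = math.sqrt(batch_size * p * (1 - p))  # ~25.8
  tol = 4.0 * std                            # ~103.3, the tolerance used in the test
  print(expected_count, std, tol)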

tf_agents/bandits/policies/greedy_reward_prediction_policy.py
Lines changed: 15 additions & 8 deletions

@@ -15,11 +15,8 @@
 
 """Policy for greedy reward prediction."""
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import
+import tensorflow_probability as tfp
 
 from tf_agents.bandits.policies import reward_prediction_base_policy
 from tf_agents.policies import utils as policy_utilities
@@ -29,7 +26,7 @@ class GreedyRewardPredictionPolicy(
     reward_prediction_base_policy.RewardPredictionBasePolicy):
   """Class to build GreedyNNPredictionPolicies."""
 
-  def _sample_action(self, mask, predicted_rewards):
+  def _action_distribution(self, mask, predicted_rewards):
     """Returns the action with largest predicted reward."""
     # Argmax.
     batch_size = tf.shape(predicted_rewards)[0]
@@ -44,6 +41,16 @@ def _sample_action(self, mask, predicted_rewards):
 
     bandit_policy_values = tf.fill([batch_size, 1],
                                    policy_utilities.BanditPolicyType.GREEDY)
-    # This deterministic policy chooses the greedy action with probability 1.
-    log_probability = tf.zeros([batch_size], tf.float32)
-    return actions, log_probability, bandit_policy_values
+    return tfp.distributions.Deterministic(loc=actions), bandit_policy_values
+
+  def _distribution(self, time_step, policy_state):
+    step = super(GreedyRewardPredictionPolicy,
+                 self)._distribution(time_step, policy_state)
+    # Greedy is deterministic, so we know the chosen arm features here. We
+    # save them here so the chosen arm features get correctly returned when
+    # `tf_agents.policies.epsilon_greedy_policy.EpsilonGreedyPolicy` wraps a
+    # `GreedyRewardPredictionPolicy`, because `EpsilonGreedyPolicy` only
+    # accesses the `distribution` method of the wrapped policy via
+    # `tf_agents.policies.greedy_policy.GreedyPolicy`.
+    action = step.action.sample()
+    return self._maybe_save_chosen_arm_features(time_step, action, step)
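To illustrate why the `_distribution` override above is needed, here is a hypothetical wrapper function (not the real `EpsilonGreedyPolicy` source) that, like the wrappers mentioned in the comment, only goes through the wrapped policy's public `distribution` method and therefore never reaches its `_action`:

  def exploit_via_distribution(wrapped_policy, time_step, policy_state=()):
    # Only `distribution` is called, so any info that should reach the caller,
    # including `chosen_arm_features`, must already be populated there.
    step = wrapped_policy.distribution(time_step, policy_state)
    # For `GreedyRewardPredictionPolicy` the returned distribution is
    # Deterministic, so sampling recovers the greedy action.
    return step.action.sample(), step.info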
