#ifndef MLPACK_METHODS_RL_Q_LEARNING_IMPL_HPP
#define MLPACK_METHODS_RL_Q_LEARNING_IMPL_HPP

// In case it hasn't been included yet.
#include "q_learning.hpp"

namespace mlpack {
namespace rl {

template <typename EnvironmentType, typename NetworkType, typename UpdaterType,
          typename BehaviorPolicyType, typename ReplayType>
QLearning<EnvironmentType, NetworkType, UpdaterType, BehaviorPolicyType,
    ReplayType>::QLearning(TrainingConfig& config,
                           NetworkType& network,
                           BehaviorPolicyType& policy,
                           ReplayType& replayMethod,
                           UpdaterType updater,
                           EnvironmentType environment):
    config(config),
    learningNetwork(network),
    policy(policy),
    replayMethod(replayMethod),
    updater(std::move(updater)),
#if ENS_VERSION_MAJOR >= 2
    updatePolicy(NULL),
#endif
    environment(std::move(environment)),
    totalSteps(0),
    deterministic(false)
{
  // The target network starts as a copy of the learning network.
  targetNetwork = learningNetwork;

  if (learningNetwork.Parameters().is_empty())
    learningNetwork.ResetParameters();
  targetNetwork.ResetParameters();

#if ENS_VERSION_MAJOR == 1
  this->updater.Initialize(learningNetwork.Parameters().n_rows,
                           learningNetwork.Parameters().n_cols);
#else
  this->updatePolicy = new typename UpdaterType::template
      Policy<arma::mat, arma::mat>(this->updater,
                                   learningNetwork.Parameters().n_rows,
                                   learningNetwork.Parameters().n_cols);
#endif

  // Initialize the target network with the parameters of the learning network.
  targetNetwork.Parameters() = learningNetwork.Parameters();
}
template <typename EnvironmentType, typename NetworkType, typename UpdaterType,
          typename BehaviorPolicyType, typename ReplayType>
QLearning<EnvironmentType, NetworkType, UpdaterType, BehaviorPolicyType,
    ReplayType>::~QLearning()
{
#if ENS_VERSION_MAJOR >= 2
  delete updatePolicy;
#endif
}

template <typename EnvironmentType, typename NetworkType, typename UpdaterType,
          typename BehaviorPolicyType, typename ReplayType>
arma::Col<size_t> QLearning<EnvironmentType, NetworkType, UpdaterType,
    BehaviorPolicyType, ReplayType>::BestAction(const arma::mat& actionValues)
{
  // Each column of actionValues holds the values of one sampled state; take
  // the index of the maximum entry (the greedy action) per column.
  arma::Col<size_t> bestActions(actionValues.n_cols);
  arma::rowvec maxActionValues = arma::max(actionValues, 0);
  for (size_t i = 0; i < actionValues.n_cols; ++i)
  {
    bestActions(i) = arma::as_scalar(
        arma::find(actionValues.col(i) == maxActionValues[i], 1));
  }
  return bestActions;
}
template <typename EnvironmentType, typename NetworkType, typename UpdaterType,
          typename BehaviorPolicyType, typename ReplayType>
void QLearning<EnvironmentType, NetworkType, UpdaterType, BehaviorPolicyType,
    ReplayType>::TrainAgent()
{
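  // Experience replay: learn from a minibatch of stored transitions rather
  // than only the most recent step, which decorrelates consecutive updates.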
  arma::mat sampledStates;
  std::vector<ActionType> sampledActions;
  arma::rowvec sampledRewards;
  arma::mat sampledNextStates;
  arma::irowvec isTerminal;

  replayMethod.Sample(sampledStates, sampledActions, sampledRewards,
      sampledNextStates, isTerminal);
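  // Q-values of the next states come from the separate target network, which
  // is only synced periodically. With double Q-learning, the learning network
  // selects the best next action while the target network still evaluates it,
  // which reduces the overestimation bias of standard DQN.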
  arma::mat nextActionValues;
  targetNetwork.Predict(sampledNextStates, nextActionValues);

  arma::Col<size_t> bestActions;
  if (config.DoubleQLearning())
  {
    // Use the learning network to select the best next action.
    arma::mat nextActionValues;
    learningNetwork.Predict(sampledNextStates, nextActionValues);
    bestActions = BestAction(nextActionValues);
  }
  else
  {
    bestActions = BestAction(nextActionValues);
  }
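  // n-step TD target for the taken action of sample i:
  //   y_i = r_i + discount * Q_target(s'_i, bestActions(i)),
  // with discount = Discount()^NSteps(); the bootstrap term is dropped for
  // terminal transitions. All other entries keep the learning network's own
  // predictions, so only the taken action contributes to the loss.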
  arma::mat target;
  learningNetwork.Forward(sampledStates, target);

  double discount = std::pow(config.Discount(), replayMethod.NSteps());

  for (size_t i = 0; i < sampledNextStates.n_cols; ++i)
  {
    target(sampledActions[i].action, i) = sampledRewards(i) + discount *
        nextActionValues(bestActions(i), i) * (1 - isTerminal[i]);
  }
  // Learn from experience.
  arma::mat gradients;
  learningNetwork.Backward(sampledStates, target, gradients);

  // Let the replay method update its internal state (e.g. the priorities of
  // prioritized experience replay).
  replayMethod.Update(target, sampledActions, nextActionValues, gradients);
#if ENS_VERSION_MAJOR == 1
  updater.Update(learningNetwork.Parameters(), config.StepSize(), gradients);
#else
  updatePolicy->Update(learningNetwork.Parameters(), config.StepSize(),
      gradients);
#endif
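  // Noisy Q-learning (NoisyNet DQN) replaces epsilon-greedy exploration with
  // learned parametric noise in the linear layers; the noise is re-sampled
  // after every update.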
  if (config.NoisyQLearning() == true)
  {
    learningNetwork.ResetNoise();
    targetNetwork.ResetNoise();
  }

  // Periodically sync the target network with the learning network.
  if (totalSteps % config.TargetNetworkSyncInterval() == 0)
    targetNetwork.Parameters() = learningNetwork.Parameters();
}
template <typename EnvironmentType, typename NetworkType, typename UpdaterType,
          typename BehaviorPolicyType, typename ReplayType>
void QLearning<EnvironmentType, NetworkType, UpdaterType, BehaviorPolicyType,
    ReplayType>::TrainCategoricalAgent()
{
  arma::mat sampledStates;
  std::vector<ActionType> sampledActions;
  arma::rowvec sampledRewards;
  arma::mat sampledNextStates;
  arma::irowvec isTerminal;

  replayMethod.Sample(sampledStates, sampledActions, sampledRewards,
      sampledNextStates, isTerminal);
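  // Categorical (C51) DQN: for every action the network predicts a
  // probability distribution over atomSize fixed support points spaced
  // evenly between VMin() and VMax(), instead of a single Q-value.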
  size_t atomSize = config.AtomSize();
  arma::colvec support = arma::linspace<arma::colvec>(config.VMin(),
      config.VMax(), atomSize);

  size_t batchSize = sampledNextStates.n_cols;
  arma::mat nextActionValues;
  targetNetwork.Predict(sampledNextStates, nextActionValues);

  arma::Col<size_t> nextAction;
  if (config.DoubleQLearning())
  {
    // Use the learning network to select the best next action.
    arma::mat nextActionValues;
    learningNetwork.Predict(sampledNextStates, nextActionValues);
    nextAction = BestAction(nextActionValues);
  }
  else
  {
    nextAction = BestAction(nextActionValues);
  }
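  // Each output column stacks the atom probabilities of all actions, so the
  // distribution of action a occupies rows [a * atomSize, (a + 1) * atomSize).
  // Extract the distribution of the chosen next action for every sample.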
  arma::mat nextDists, nextDist(atomSize, batchSize);
  targetNetwork.Forward(sampledNextStates, nextDists);
  for (size_t i = 0; i < batchSize; ++i)
  {
    nextDist.col(i) = nextDists(nextAction(i) * atomSize, i,
        arma::size(atomSize, 1));
  }
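  // Distributional Bellman projection: shift and scale the support
  // (tZ = r + discount * z, with z zeroed for terminal transitions), clamp it
  // to [VMin, VMax], convert it to fractional atom indices b, and split each
  // atom's probability mass between the neighbouring atoms floor(b) and
  // ceil(b).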
  arma::mat tZ = (arma::conv_to<arma::mat>::from(config.Discount() *
      (support * (1 - isTerminal))).each_row() + sampledRewards);
  tZ = arma::clamp(tZ, config.VMin(), config.VMax());
  arma::mat b = (tZ - config.VMin()) / (config.VMax() - config.VMin()) *
      (atomSize - 1);
  arma::mat l = arma::floor(b);
  arma::mat u = arma::ceil(b);
  arma::mat projDistUpper = nextDist % (u - b);
  arma::mat projDistLower = nextDist % (b - l);
  arma::mat projDist = arma::zeros<arma::mat>(arma::size(nextDist));
  for (size_t batchNo = 0; batchNo < batchSize; batchNo++)
  {
    for (size_t j = 0; j < atomSize; j++)
    {
      projDist(l(j, batchNo), batchNo) += projDistUpper(j, batchNo);
      projDist(u(j, batchNo), batchNo) += projDistLower(j, batchNo);
    }
  }
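  // The loss is the cross-entropy between the projected target distribution
  // and the predicted distribution of the taken action; its gradient with
  // respect to the predicted probabilities is -projDist / dists (the 1e-10
  // term guards against division by zero).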
  arma::mat dists;
  learningNetwork.Forward(sampledStates, dists);
  arma::mat lossGradients = arma::zeros<arma::mat>(arma::size(dists));
  for (size_t i = 0; i < batchSize; ++i)
  {
    lossGradients(sampledActions[i].action * atomSize, i,
        arma::size(atomSize, 1)) = -(projDist.col(i) / (1e-10 + dists(
        sampledActions[i].action * atomSize, i, arma::size(atomSize, 1))));
  }
  // Learn from experience.
  arma::mat gradients;
  learningNetwork.Backward(sampledStates, lossGradients, gradients);

#if ENS_VERSION_MAJOR == 1
  updater.Update(learningNetwork.Parameters(), config.StepSize(), gradients);
#else
  updatePolicy->Update(learningNetwork.Parameters(), config.StepSize(),
      gradients);
#endif
  if (config.NoisyQLearning() == true)
  {
    learningNetwork.ResetNoise();
    targetNetwork.ResetNoise();
  }

  // Periodically sync the target network with the learning network.
  if (totalSteps % config.TargetNetworkSyncInterval() == 0)
    targetNetwork.Parameters() = learningNetwork.Parameters();
}
template <typename EnvironmentType, typename NetworkType, typename UpdaterType,
          typename BehaviorPolicyType, typename ReplayType>
void QLearning<EnvironmentType, NetworkType, UpdaterType, BehaviorPolicyType,
    ReplayType>::SelectAction()
{
  // Get the predicted action values for the current state.
  arma::colvec actionValue;
  learningNetwork.Predict(state.Encode(), actionValue);

  // Select an action according to the behavior policy.
  action = policy.Sample(actionValue, deterministic, config.NoisyQLearning());
}
template <typename EnvironmentType, typename NetworkType, typename UpdaterType,
          typename BehaviorPolicyType, typename ReplayType>
double QLearning<EnvironmentType, NetworkType, UpdaterType, BehaviorPolicyType,
    ReplayType>::Episode()
{
  // Get the initial state from the environment.
  state = environment.InitialSample();

  // Track the return of this episode.
  double totalReturn = 0.0;

  // Run until a terminal state is reached.
  while (!environment.IsTerminal(state))
  {
    SelectAction();

    // Interact with the environment to advance to the next state.
    StateType nextState;
    double reward = environment.Sample(state, action, nextState);

    totalReturn += reward;
    totalSteps++;

    // Store the transition for experience replay.
    replayMethod.Store(state, action, reward, nextState,
        environment.IsTerminal(nextState), config.Discount());
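    // Advance to the next state.
    state = nextState;

    // Train only after the exploration phase has passed, and never in
    // deterministic (evaluation) mode.
    if (deterministic || totalSteps < config.ExplorationSteps())
      continue;

    if (config.IsCategorical())
      TrainCategoricalAgent();
    else
      TrainAgent();
  }
  return totalReturn;
}

} // namespace rl
} // namespace mlpack

#endif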