mlpack
sac_impl.hpp
#ifndef MLPACK_METHODS_RL_SAC_IMPL_HPP
#define MLPACK_METHODS_RL_SAC_IMPL_HPP

#include <mlpack/prereqs.hpp>

#include "sac.hpp"

namespace mlpack {
namespace rl {

template <
  typename EnvironmentType,
  typename QNetworkType,
  typename PolicyNetworkType,
  typename UpdaterType,
  typename ReplayType
>
SAC<
  EnvironmentType,
  QNetworkType,
  PolicyNetworkType,
  UpdaterType,
  ReplayType
>::SAC(TrainingConfig& config,
       QNetworkType& learningQ1Network,
       PolicyNetworkType& policyNetwork,
       ReplayType& replayMethod,
       UpdaterType qNetworkUpdater,
       UpdaterType policyNetworkUpdater,
       EnvironmentType environment):
    config(config),
    learningQ1Network(learningQ1Network),
    policyNetwork(policyNetwork),
    replayMethod(replayMethod),
    qNetworkUpdater(std::move(qNetworkUpdater)),
    #if ENS_VERSION_MAJOR >= 2
    qNetworkUpdatePolicy(NULL),
    #endif
    policyNetworkUpdater(std::move(policyNetworkUpdater)),
    #if ENS_VERSION_MAJOR >= 2
    policyNetworkUpdatePolicy(NULL),
    #endif
    environment(std::move(environment)),
    totalSteps(0),
    deterministic(false)
{
  // Set up q-learning and policy networks.
  targetQ1Network = learningQ1Network;
  learningQ2Network = learningQ1Network;
  targetQ2Network = learningQ2Network;

  // Reset all the networks.
  // Note: the Q and policy networks are only reset if they are empty, so that
  // a loaded (possibly pre-trained) model passed to this constructor is not
  // overwritten.
  if (learningQ1Network.Parameters().is_empty())
  {
    learningQ1Network.ResetParameters();
    learningQ2Network.ResetParameters();
  }
  if (policyNetwork.Parameters().is_empty())
    policyNetwork.ResetParameters();
  targetQ1Network.ResetParameters();
  targetQ2Network.ResetParameters();

  #if ENS_VERSION_MAJOR == 1
  this->qNetworkUpdater.Initialize(learningQ1Network.Parameters().n_rows,
      learningQ1Network.Parameters().n_cols);
  #else
  this->qNetworkUpdatePolicy = new typename UpdaterType::template
      Policy<arma::mat, arma::mat>(this->qNetworkUpdater,
                                   learningQ1Network.Parameters().n_rows,
                                   learningQ1Network.Parameters().n_cols);
  #endif

  #if ENS_VERSION_MAJOR == 1
  this->policyNetworkUpdater.Initialize(policyNetwork.Parameters().n_rows,
      policyNetwork.Parameters().n_cols);
  #else
  this->policyNetworkUpdatePolicy = new typename UpdaterType::template
      Policy<arma::mat, arma::mat>(this->policyNetworkUpdater,
                                   policyNetwork.Parameters().n_rows,
                                   policyNetwork.Parameters().n_cols);
  #endif

  // Copy over the learning networks to their respective target networks.
  targetQ1Network.Parameters() = learningQ1Network.Parameters();
  targetQ2Network.Parameters() = learningQ2Network.Parameters();
}

template <
  typename EnvironmentType,
  typename QNetworkType,
  typename PolicyNetworkType,
  typename UpdaterType,
  typename ReplayType
>
SAC<
  EnvironmentType,
  QNetworkType,
  PolicyNetworkType,
  UpdaterType,
  ReplayType
>::~SAC()
{
  #if ENS_VERSION_MAJOR >= 2
  delete qNetworkUpdatePolicy;
  delete policyNetworkUpdatePolicy;
  #endif
}

template <
  typename EnvironmentType,
  typename QNetworkType,
  typename PolicyNetworkType,
  typename UpdaterType,
  typename ReplayType
>
void SAC<
  EnvironmentType,
  QNetworkType,
  PolicyNetworkType,
  UpdaterType,
  ReplayType
>::SoftUpdate(double rho)
{
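  // Polyak averaging: each target parameter moves a fraction rho toward the
  // corresponding learning-network parameter and keeps the rest of its value.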
  targetQ1Network.Parameters() = (1 - rho) * targetQ1Network.Parameters() +
      rho * learningQ1Network.Parameters();
  targetQ2Network.Parameters() = (1 - rho) * targetQ2Network.Parameters() +
      rho * learningQ2Network.Parameters();
}

template <
  typename EnvironmentType,
  typename QNetworkType,
  typename PolicyNetworkType,
  typename UpdaterType,
  typename ReplayType
>
void SAC<
  EnvironmentType,
  QNetworkType,
  PolicyNetworkType,
  UpdaterType,
  ReplayType
>::Update()
{
  // Sample from previous experience.
  arma::mat sampledStates;
  std::vector<ActionType> sampledActions;
  arma::rowvec sampledRewards;
  arma::mat sampledNextStates;
  arma::irowvec isTerminal;

  replayMethod.Sample(sampledStates, sampledActions, sampledRewards,
      sampledNextStates, isTerminal);

  // Critic network update.

  // Get the actions for the sampled next states from the policy.
  arma::mat nextStateActions;
  policyNetwork.Predict(sampledNextStates, nextStateActions);

  arma::mat targetQInput = arma::join_vert(nextStateActions,
      sampledNextStates);
  arma::rowvec Q1, Q2;
  targetQ1Network.Predict(targetQInput, Q1);
  targetQ2Network.Predict(targetQInput, Q2);
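  // Clipped double-Q target: nextQ = reward + discount * (1 - terminal) *
  // min(Q1, Q2), with both Q values taken from the target networks at the
  // policy's action for the next state.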
  arma::rowvec nextQ = sampledRewards + config.Discount() * ((1 - isTerminal)
      % arma::min(Q1, Q2));

  arma::mat sampledActionValues(action.size, sampledActions.size());
  for (size_t i = 0; i < sampledActions.size(); i++)
    sampledActionValues.col(i) = arma::conv_to<arma::colvec>::from
        (sampledActions[i].action);
  arma::mat learningQInput = arma::join_vert(sampledActionValues,
      sampledStates);
  learningQ1Network.Forward(learningQInput, Q1);
  learningQ2Network.Forward(learningQInput, Q2);

  arma::mat gradQ1Loss, gradQ2Loss;
  lossFunction.Backward(Q1, nextQ, gradQ1Loss);
  lossFunction.Backward(Q2, nextQ, gradQ2Loss);

  // Update the critic networks.
  arma::mat gradientQ1, gradientQ2;
  learningQ1Network.Backward(learningQInput, gradQ1Loss, gradientQ1);
  #if ENS_VERSION_MAJOR == 1
  qNetworkUpdater.Update(learningQ1Network.Parameters(), config.StepSize(),
      gradientQ1);
  #else
  qNetworkUpdatePolicy->Update(learningQ1Network.Parameters(),
      config.StepSize(), gradientQ1);
  #endif
  learningQ2Network.Backward(learningQInput, gradQ2Loss, gradientQ2);
  #if ENS_VERSION_MAJOR == 1
  qNetworkUpdater.Update(learningQ2Network.Parameters(), config.StepSize(),
      gradientQ2);
  #else
  qNetworkUpdatePolicy->Update(learningQ2Network.Parameters(),
      config.StepSize(), gradientQ2);
  #endif

  // Actor network update.

  arma::mat pi;
  policyNetwork.Predict(sampledStates, pi);

  arma::mat qInput = arma::join_vert(pi, sampledStates);
  learningQ1Network.Predict(qInput, Q1);
  learningQ2Network.Predict(qInput, Q2);

  // Get the size of the first hidden layer in the Q network.
  size_t hidden1 = boost::get<mlpack::ann::Linear<> *>
      (learningQ1Network.Model()[0])->OutputSize();

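  // Policy gradient, computed per sample through whichever learning Q network
  // currently gives the smaller estimate: calling Backward() with an output
  // gradient of -1 yields the parameter gradients for maximizing Q; the first
  // layer's bias gradient equals the gradient at that layer's pre-activations,
  // and mapping it back through the action block of the first Linear layer's
  // weights gives dQ/d(action), which is then backpropagated through the
  // policy network.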
  arma::mat gradient;
  for (size_t i = 0; i < sampledStates.n_cols; i++)
  {
    arma::mat grad, gradQ, q;
    arma::colvec singleState = sampledStates.col(i);
    arma::colvec singlePi;
    policyNetwork.Forward(singleState, singlePi);
    arma::colvec input = arma::join_vert(singlePi, singleState);
    arma::mat weightLastLayer;

    if (Q1(i) < Q2(i))
    {
      learningQ1Network.Forward(input, q);
      learningQ1Network.Backward(input, -1, gradQ);
      weightLastLayer = arma::reshape(learningQ1Network.Parameters().
          rows(0, hidden1 * singlePi.n_rows - 1), hidden1, singlePi.n_rows);
    }
    else
    {
      learningQ2Network.Forward(input, q);
      learningQ2Network.Backward(input, -1, gradQ);
      weightLastLayer = arma::reshape(learningQ2Network.Parameters().
          rows(0, hidden1 * singlePi.n_rows - 1), hidden1, singlePi.n_rows);
    }

    arma::colvec gradQBias = gradQ(input.n_rows * hidden1, 0,
        arma::size(hidden1, 1));
    arma::mat gradPolicy = weightLastLayer.t() * gradQBias;
    policyNetwork.Backward(singleState, gradPolicy, grad);
    if (i == 0)
    {
      gradient.copy_size(grad);
      gradient.fill(0.0);
    }
    gradient += grad;
  }
  gradient /= sampledStates.n_cols;

  #if ENS_VERSION_MAJOR == 1
  policyNetworkUpdater.Update(policyNetwork.Parameters(), config.StepSize(),
      gradient);
  #else
  policyNetworkUpdatePolicy->Update(policyNetwork.Parameters(),
      config.StepSize(), gradient);
  #endif

  // Update the target networks.
  if (totalSteps % config.TargetNetworkSyncInterval() == 0)
    SoftUpdate(config.Rho());
}

template <
  typename EnvironmentType,
  typename QNetworkType,
  typename PolicyNetworkType,
  typename UpdaterType,
  typename ReplayType
>
void SAC<
  EnvironmentType,
  QNetworkType,
  PolicyNetworkType,
  UpdaterType,
  ReplayType
>::SelectAction()
{
  // Get the action at the current state from the policy.
  arma::colvec outputAction;
  policyNetwork.Predict(state.Encode(), outputAction);

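  // In training (non-deterministic) mode, add zero-mean Gaussian exploration
  // noise with standard deviation 0.1, clipped to [-0.25, 0.25].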
  if (!deterministic)
  {
    arma::colvec noise = arma::randn<arma::colvec>(outputAction.n_rows) * 0.1;
    noise = arma::clamp(noise, -0.25, 0.25);
    outputAction = outputAction + noise;
  }
  action.action = arma::conv_to<std::vector<double>>::from(outputAction);
}

template <
  typename EnvironmentType,
  typename QNetworkType,
  typename PolicyNetworkType,
  typename UpdaterType,
  typename ReplayType
>
double SAC<
  EnvironmentType,
  QNetworkType,
  PolicyNetworkType,
  UpdaterType,
  ReplayType
>::Episode()
{
  // Get the initial state from the environment.
  state = environment.InitialSample();

  // Track the steps in this episode.
  size_t steps = 0;

  // Track the return of this episode.
  double totalReturn = 0.0;

  // Run until a terminal state is reached.
  while (!environment.IsTerminal(state))
  {
    if (config.StepLimit() && steps >= config.StepLimit())
      break;
    SelectAction();

    // Interact with the environment to advance to the next state.
    StateType nextState;
    double reward = environment.Sample(state, action, nextState);

    totalReturn += reward;
    steps++;
    totalSteps++;

    // Store the transition for replay.
    replayMethod.Store(state, action, reward, nextState,
        environment.IsTerminal(nextState), config.Discount());

    // Update the current state.
    state = nextState;

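    // Gradient updates start only after ExplorationSteps() environment steps
    // have been collected, and never in deterministic (evaluation) mode; each
    // subsequent environment step then performs UpdateInterval() updates.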
    if (deterministic || totalSteps < config.ExplorationSteps())
      continue;
    for (size_t i = 0; i < config.UpdateInterval(); i++)
      Update();
  }
  return totalReturn;
}

} // namespace rl
} // namespace mlpack
#endif
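
For reference, below is a minimal sketch of how this class is typically constructed and trained. It assumes the mlpack 3.x ANN and reinforcement learning headers (FFN, EmptyLoss, GaussianInitialization, RandomReplay, the Pendulum environment) and ensmallen's AdamUpdate; the layer sizes and hyperparameter values are illustrative only and are not taken from this file.

#include <mlpack/core.hpp>
#include <mlpack/methods/ann/ffn.hpp>
#include <mlpack/methods/ann/init_rules/gaussian_init.hpp>
#include <mlpack/methods/ann/layer/layer.hpp>
#include <mlpack/methods/ann/loss_functions/empty_loss.hpp>
#include <mlpack/methods/reinforcement_learning/sac.hpp>
#include <mlpack/methods/reinforcement_learning/environment/pendulum.hpp>
#include <mlpack/methods/reinforcement_learning/training_config.hpp>
#include <ensmallen.hpp>

using namespace mlpack::ann;
using namespace mlpack::rl;

int main()
{
  // Policy network: maps the 3-dimensional Pendulum state to a
  // 1-dimensional action in [-1, 1].
  FFN<EmptyLoss<>, GaussianInitialization>
      policyNetwork(EmptyLoss<>(), GaussianInitialization(0, 0.1));
  policyNetwork.Add(new Linear<>(3, 128));
  policyNetwork.Add(new ReLULayer<>());
  policyNetwork.Add(new Linear<>(128, 1));
  policyNetwork.Add(new TanHLayer<>());

  // Q network: takes the concatenated (action, state) input, as in Update().
  FFN<EmptyLoss<>, GaussianInitialization>
      qNetwork(EmptyLoss<>(), GaussianInitialization(0, 0.1));
  qNetwork.Add(new Linear<>(3 + 1, 128));
  qNetwork.Add(new ReLULayer<>());
  qNetwork.Add(new Linear<>(128, 1));

  // Replay buffer: batch size 32, capacity 10000 (illustrative values).
  RandomReplay<Pendulum> replayMethod(32, 10000);

  TrainingConfig config;
  config.StepSize() = 0.001;
  config.Discount() = 0.99;
  config.TargetNetworkSyncInterval() = 1;
  config.UpdateInterval() = 3;
  config.Rho() = 0.001;
  config.ExplorationSteps() = 100;

  // Construct the agent, passing the updaters and environment explicitly.
  SAC<Pendulum, decltype(qNetwork), decltype(policyNetwork), ens::AdamUpdate,
      RandomReplay<Pendulum>>
      agent(config, qNetwork, policyNetwork, replayMethod,
            ens::AdamUpdate(), ens::AdamUpdate(), Pendulum());

  // Run a few training episodes; Episode() returns the undiscounted return.
  for (size_t episode = 0; episode < 100; ++episode)
    agent.Episode();

  return 0;
}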