AIToolbox
A library that offers tools for AI problem solving.
OffPolicyTemplate.hpp
#ifndef AI_TOOLBOX_MDP_OFF_POLICY_TEMPLATE_HEADER_FILE
#define AI_TOOLBOX_MDP_OFF_POLICY_TEMPLATE_HEADER_FILE

#include <tuple>
#include <vector>
#include <limits>
#include <stdexcept>

#include <AIToolbox/MDP/Types.hpp>
#include <AIToolbox/MDP/Policies/PolicyInterface.hpp>

namespace AIToolbox::MDP {
    /// This class contains all the boilerplate for off-policy methods.
    class OffPolicyBase {
        public:
            using Trace = std::tuple<size_t, size_t, double>;
            using Traces = std::vector<Trace>;

            /// Basic constructor.
            OffPolicyBase(size_t s, size_t a, double discount = 1.0, double alpha = 0.1, double tolerance = 0.001);

            /// This function sets the learning rate parameter.
            void setLearningRate(double a);

            /// This function returns the currently set learning rate parameter.
            double getLearningRate() const;

            /// This function sets the new discount parameter.
            void setDiscount(double d);

            /// This function returns the currently set discount parameter.
            double getDiscount() const;

            /// This function sets the trace cutoff parameter.
            void setTolerance(double t);

            /// This function returns the currently set trace cutoff parameter.
            double getTolerance() const;

            /// This function clears the already set traces.
            void clearTraces();

            /// This function returns the currently set traces.
            const Traces & getTraces() const;

            /// This function sets the internal traces.
            void setTraces(const Traces & t);

            /// This function returns the number of states on which the method is working.
            size_t getS() const;

            /// This function returns the number of actions on which the method is working.
            size_t getA() const;

            /// This function returns a reference to the internal QFunction.
            const QFunction & getQFunction() const;

            /// This function allows directly setting the internal QFunction.
            void setQFunction(const QFunction & qfun);

        protected:
            size_t S, A;
            double alpha_, discount_, tolerance_;

            /// This function updates the traces using the input data.
            void updateTraces(size_t s, size_t a, double error, double traceDiscount);

            QFunction q_;
            Traces traces_;
    };

    /// This class is a general version of off-policy evaluation.
    template <typename Derived>
    class OffPolicyEvaluation : public OffPolicyBase {
        public:
            using Parent = OffPolicyBase;

            /// Basic constructor.
            OffPolicyEvaluation(const PolicyInterface & target, double discount = 1.0,
                                double alpha = 0.1, double tolerance = 0.001);

            /// This function updates the internal QFunction using the discount set during construction.
            void stepUpdateQ(const size_t s, const size_t a, const size_t s1, const double rew);

        protected:
            const PolicyInterface & target_;
    };

    /// This class is a general version of off-policy control.
    template <typename Derived>
    class OffPolicyControl : public OffPolicyBase {
        public:
            using Parent = OffPolicyBase;

            /// Basic constructor.
            OffPolicyControl(size_t s, size_t a, double discount = 1.0, double alpha = 0.1,
                             double tolerance = 0.001, double epsilon = 0.1);

            /// This function updates the internal QFunction using the discount set during construction.
            void stepUpdateQ(const size_t s, const size_t a, const size_t s1, const double rew);

            /// This function sets the epsilon parameter.
            void setEpsilon(double e);

            /// This function returns the currently set epsilon parameter.
            double getEpsilon() const;

        protected:
            double epsilon_;
    };

    template <typename Derived>
    void OffPolicyEvaluation<Derived>::stepUpdateQ(const size_t s, const size_t a, const size_t s1, const double rew) {
        // Compute the expectation of Q(s1, .) under the target policy.
        auto expectedQ = 0.0;
        for (size_t aa = 0; aa < A; ++aa)
            expectedQ += q_(s1, aa) * target_.getActionProbability(s1, aa);

        const auto error = alpha_ * ( rew + discount_ * expectedQ - q_(s, a) );
        const auto traceDiscount = discount_ * static_cast<Derived*>(this)->getTraceDiscount(s, a, s1, rew);

        updateTraces(s, a, error, traceDiscount);
    }

    template <typename Derived>
    void OffPolicyControl<Derived>::stepUpdateQ(const size_t s, const size_t a, const size_t s1, const double rew) {
        // The basic idea here is that, differently from the evaluation, we
        // want to do a maximization. At the same time, to make this work we
        // need to "round the edges" a bit, and that's why we assume an
        // epsilon-greedy policy.
        //
        // The expected value is easy to compute, since each action has the
        // same probability of being chosen, except for the greedy one which
        // is more likely.
        size_t maxA = 0;
        double expectedQ = 0.0;
        double maxV = std::numeric_limits<double>::lowest();
        for (size_t aa = 0; aa < A; ++aa) {
            expectedQ += q_(s1, aa);
            if (maxV < q_(s1, aa)) {
                maxA = aa;
                maxV = q_(s1, aa);
            }
        }
        expectedQ *= epsilon_ / A;
        expectedQ += (1.0 - epsilon_) * maxV;
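        // In formula form, the expectation computed above is
        //
        //   E[Q(s1, .)] = (epsilon / |A|) * sum_a' Q(s1, a')
        //               + (1 - epsilon)   * max_a' Q(s1, a')
        //
        // i.e. the expected value of Q(s1, .) under an epsilon-greedy policy
        // that acts greedily with probability (1 - epsilon) and otherwise
        // picks an action uniformly at random.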

        const auto error = alpha_ * ( rew + discount_ * expectedQ - q_(s, a) );
        const auto traceDiscount = discount_ * static_cast<Derived*>(this)->getTraceDiscount(s, a, s1, rew, maxA);

        updateTraces(s, a, error, traceDiscount);
    }

    template <typename Derived>
    OffPolicyEvaluation<Derived>::OffPolicyEvaluation(
            const PolicyInterface & target,
            const double discount, const double alpha, const double tolerance
        ) :
            Parent(target.getS(), target.getA(), discount, alpha, tolerance),
            target_(target) {}

    template <typename Derived>
    OffPolicyControl<Derived>::OffPolicyControl(
            const size_t s, const size_t a, const double discount,
            const double alpha, const double tolerance, const double epsilon
        ) :
            Parent(s, a, discount, alpha, tolerance)
    {
        setEpsilon(epsilon);
    }

    template <typename Derived>
    void OffPolicyControl<Derived>::setEpsilon(const double e) {
        if ( e < 0.0 || e > 1.0 ) throw std::invalid_argument("Epsilon must be >= 0 and <= 1");
        epsilon_ = e;
    }

    template <typename Derived>
    double OffPolicyControl<Derived>::getEpsilon() const {
        return epsilon_;
    }
}

#endif
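
Both templates use the curiously recurring template pattern: stepUpdateQ calls getTraceDiscount on the Derived class, which decides how much of each eligibility trace survives the current step. As a minimal sketch of how a control method could be built on top of OffPolicyControl, the class below decays traces by a lambda factor while the performed action is the greedy one and cuts them otherwise, in the spirit of Watkins' Q(lambda). The class name WatkinsQL, the lambda parameter, and the include path are illustrative assumptions, not part of this header.

// Minimal sketch (assumed class name, lambda parameter, and include path).
#include <AIToolbox/MDP/Algorithms/Utils/OffPolicyTemplate.hpp>

namespace AIToolbox::MDP {
    class WatkinsQL : public OffPolicyControl<WatkinsQL> {
        public:
            using Parent = OffPolicyControl<WatkinsQL>;

            WatkinsQL(size_t s, size_t a, double discount = 1.0, double alpha = 0.1,
                      double tolerance = 0.001, double epsilon = 0.1, double lambda = 0.9) :
                    Parent(s, a, discount, alpha, tolerance, epsilon), lambda_(lambda) {}

            // Called by OffPolicyControl::stepUpdateQ via CRTP: traces decay by
            // lambda while the greedy action is followed, and are cut otherwise.
            double getTraceDiscount(size_t, size_t a, size_t, double, size_t maxA) const {
                return a == maxA ? lambda_ : 0.0;
            }

        private:
            double lambda_;
    };
}

Such a class would then be driven one transition at a time, e.g. solver.stepUpdateQ(s, a, s1, reward), with getQFunction() exposing the current estimates.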