AIToolbox
A library that offers tools for AI problem solving.
QSoftmaxPolicyWrapper.hpp
Go to the documentation of this file.
1 #ifndef AI_TOOLBOX_BANDIT_Q_SOFTMAX_POLICY_WRAPPER_HEADER_FILE
2 #define AI_TOOLBOX_BANDIT_Q_SOFTMAX_POLICY_WRAPPER_HEADER_FILE
3 
7 
8 namespace AIToolbox::Bandit {
// Softmax action-selection wrapper over a vector of action values.
// V is the type used to store the values (a value type or `const V &`, depending on how the
// wrapper is constructed); Gen is the type of the random generator.
// NOTE(review): the class-head line (listing line 22) is absent from this extract; the index at
// the end of the listing names the class AIToolbox::Bandit::QSoftmaxPolicyWrapper.
21  template <typename V, typename Gen>
23  public:
// Stores the temperature t and the values q, and binds references to the two buffers and the
// random generator supplied by the caller.
33  QSoftmaxPolicyWrapper(double t, V q, Vector & valueBuffer, std::vector<size_t> & buffer, Gen & gen);
34 
// Draws one action index: greedy when the temperature is approximately zero, otherwise with
// weights exp(q / temperature).
51  size_t sampleAction();
52 
// Returns the probability of action a under the same rule. Although declared const, its
// definition overwrites the Vector that valueBuffer_ refers to.
62  double getActionProbability(size_t a) const;
63 
// Writes the probability of every action into p.
71  template <typename P>
72  void getPolicy(P && p) const;
73 
74  private:
// Divisor applied to the values before exponentiation; approximately zero selects greedy behavior.
75  double temperature_;
// The action values, held by value or by const reference depending on V.
76  V q_;
// Scratch Vector that receives exp(q_ / temperature_).
77  Vector & valueBuffer_;
// Scratch index buffer; its size is used as the number of actions.
78  std::vector<size_t> & buffer_;
// Random generator used for sampling.
79  Gen & rand_;
80  };
81 
// Class template argument deduction guides: they choose V from the value category of the
// second constructor argument.
82  // If we get a temporary (rvalue), V is deduced as a plain type, so the wrapper stores the values itself.
83  template <typename V, typename Gen>
84  QSoftmaxPolicyWrapper(double, const V &&, Vector &, std::vector<size_t>&, Gen &) -> QSoftmaxPolicyWrapper<V, Gen>;
85 
86  // If we get a reference (lvalue), the wrapper is instantiated with `const V &`, so q_ only refers to the caller's object.
// NOTE(review): in this case the referenced object presumably has to outlive the wrapper.
87  template <typename V, typename Gen>
88  QSoftmaxPolicyWrapper(double, const V &, Vector &, std::vector<size_t>&, Gen &) -> QSoftmaxPolicyWrapper<const V &, Gen>;
89 
// Constructor: records the temperature and the values, and binds the buffer and generator references.
//   t      - softmax temperature.
//   q      - action values (V may itself be a const reference type).
//   vb     - scratch Vector for the exponentiated values.
//   buffer - scratch index buffer; must have as many entries as q.
//   gen    - random generator.
90  template <typename V, typename Gen>
91  QSoftmaxPolicyWrapper<V, Gen>::QSoftmaxPolicyWrapper(double t, V q, Vector & vb, std::vector<size_t> & buffer, Gen & gen)
// q is moved into q_; when V is a const reference type this simply binds the reference.
92  : temperature_(t), q_(std::move(q)), valueBuffer_(vb), buffer_(buffer), rand_(gen)
93  {
// Only buffer_ is checked against the number of values; vb is not checked here (the member
// functions that use valueBuffer_ assign to it before reading it).
94  assert(static_cast<size_t>(q_.size()) == buffer_.size());
95  }
96 
// sampleAction: draws one action index according to the softmax of q_ at temperature_.
// NOTE(review): the signature line (listing line 98) is absent from this extract; the index at
// the end of the listing gives it as `size_t sampleAction()`.
97  template <typename V, typename Gen>
// An (approximately) zero temperature is handed to the greedy wrapper instead of being used as a divisor.
99  if ( checkEqualSmall(temperature_, 0.0) ) {
100  auto wrap = QGreedyPolicyWrapper(q_, buffer_, rand_);
101  return wrap.sampleAction();
102  }
103 
// Unnormalized weights exp(q / temperature) (element-wise, assuming V is an Eigen type).
104  valueBuffer_ = (q_ / temperature_).array().exp();
105 
// Gather the indices whose weight is infinite at the front of buffer_.
106  unsigned infinities = 0;
107  for ( size_t a = 0; a < buffer_.size(); ++a )
108  if ( std::isinf(valueBuffer_(a)) )
109  buffer_[infinities++] = a;
110 
// Infinite weights cannot be normalized, so pick uniformly among the indices gathered above.
111  if (infinities) {
112  auto pickDistribution = std::uniform_int_distribution<unsigned>(0, infinities-1);
113  unsigned selection = pickDistribution(rand_);
114 
115  return buffer_[selection];
116  } else {
// Normalize the weights into probabilities and sample an index from them.
// NOTE(review): a sum of 0 is not guarded here, so this division can produce NaN; getPolicy
// handles that case by falling back to a uniform distribution.
117  valueBuffer_ /= valueBuffer_.sum();
118 
119  return sampleProbability(buffer_.size(), valueBuffer_, rand_);
120  }
121  }
122 
// getActionProbability: returns the probability that action a is selected.
// NOTE(review): the signature line (listing line 124) is absent from this extract; the index at
// the end of the listing gives it as `double getActionProbability(size_t a) const`.
123  template <typename V, typename Gen>
// An (approximately) zero temperature is answered by the greedy wrapper.
125  if ( checkEqualSmall(temperature_, 0.0) ) {
126  auto wrap = QGreedyPolicyWrapper(q_, buffer_, rand_);
127  return wrap.getActionProbability(a);
128  }
129 
// Unnormalized weights exp(q / temperature). This writes through the valueBuffer_ reference
// even though the function is const.
130  valueBuffer_ = (q_ / temperature_).array().exp();
131 
// Count the infinite weights and remember whether a is one of them.
132  bool isAInfinite = false;
133  unsigned infinities = 0;
134  for ( size_t aa = 0; aa < buffer_.size(); ++aa ) {
135  if ( std::isinf(valueBuffer_(aa)) ) {
136  infinities++;
137  isAInfinite |= (aa == a);
138  }
139  }
// With infinite weights present the probability mass is split evenly among them and every
// other action gets 0.
140  if ( infinities ) {
141  if ( isAInfinite ) return 1.0 / infinities;
142  return 0.0;
143  }
// Regular case: the weight of a divided by the sum of all weights.
// NOTE(review): a sum of 0 is not guarded here, so this can return NaN; getPolicy falls back
// to a uniform distribution in that case.
144  return valueBuffer_(a) / valueBuffer_.sum();
145  }
146 
// getPolicy: writes the probability of every action into p.
// NOTE(review): the signature line (listing line 149) is absent from this extract; the index at
// the end of the listing gives it as `void getPolicy(P &&p) const`.
147  template <typename V, typename Gen>
148  template <typename P>
// An (approximately) zero temperature is answered by the greedy wrapper, which fills p itself.
150  if ( checkEqualSmall(temperature_, 0.0) ) {
151  auto wrap = QGreedyPolicyWrapper(q_, buffer_, rand_);
152  return wrap.getPolicy(p);
153  }
154 
// Unnormalized weights exp(q / temperature), written directly into the output.
155  p = (q_ / temperature_).array().exp();
156 
// One pass to accumulate the sum of the weights and count the infinite ones.
157  unsigned infinities = 0;
158  double sum = 0.0;
159  for ( size_t a = 0; a < buffer_.size(); ++a ) {
160  sum += p[a];
161  if ( std::isinf(p[a]) )
162  infinities++;
163  }
164 
// Three cases: infinite weights share the probability evenly (everything else gets 0);
// an (approximately) zero sum yields a uniform distribution; otherwise normalize by the sum.
165  if ( infinities )
166  p = p.array().isInf().template cast<double>() / infinities;
167  else if ( checkEqualSmall(sum, 0.0) )
168  p.fill(1.0 / buffer_.size());
169  else
170  p /= sum;
171  }
172 }
173 
174 #endif
AIToolbox::Bandit::QSoftmaxPolicyWrapper
This class implements some basic softmax policy primitives.
Definition: QSoftmaxPolicyWrapper.hpp:22
Core.hpp
AIToolbox::Bandit::QGreedyPolicyWrapper
QGreedyPolicyWrapper(const V &&, std::vector< size_t > &, Gen &) -> QGreedyPolicyWrapper< V, Gen >
AIToolbox::Bandit::QSoftmaxPolicyWrapper
QSoftmaxPolicyWrapper(double, const V &&, Vector &, std::vector< size_t > &, Gen &) -> QSoftmaxPolicyWrapper< V, Gen >
AIToolbox::Bandit::QSoftmaxPolicyWrapper::getActionProbability
double getActionProbability(size_t a) const
This function returns the probability of taking the specified action.
Definition: QSoftmaxPolicyWrapper.hpp:124
AIToolbox::Bandit::QSoftmaxPolicyWrapper::sampleAction
size_t sampleAction()
This function chooses an action with probability dependent on its value.
Definition: QSoftmaxPolicyWrapper.hpp:98
QGreedyPolicyWrapper.hpp
AIToolbox::Vector
Eigen::Matrix< double, Eigen::Dynamic, 1 > Vector
Definition: Types.hpp:16
AIToolbox::Bandit::QSoftmaxPolicyWrapper::getPolicy
void getPolicy(P &&p) const
This function writes in a vector all probabilities of the policy.
Definition: QSoftmaxPolicyWrapper.hpp:149
AIToolbox::Bandit
Definition: Experience.hpp:6
AIToolbox::checkEqualSmall
bool checkEqualSmall(const double a, const double b)
This function checks if two doubles near [0,1] are reasonably equal.
Definition: Core.hpp:45
AIToolbox::Bandit::QSoftmaxPolicyWrapper::QSoftmaxPolicyWrapper
QSoftmaxPolicyWrapper(double t, V q, Vector &valueBuffer, std::vector< size_t > &buffer, Gen &gen)
Basic constructor.
Definition: QSoftmaxPolicyWrapper.hpp:91
AIToolbox::sampleProbability
size_t sampleProbability(const size_t d, const T &in, G &generator)
This function samples an index from a probability vector.
Definition: Probability.hpp:188
Probability.hpp