AIToolbox
A library that offers tools for AI problem solving.
CooperativePrioritizedSweeping.hpp
#ifndef AI_TOOLBOX_FACTORED_MDP_COOPERATIVE_PRIORITIZED_SWEEPING_HEADER_FILE
#define AI_TOOLBOX_FACTORED_MDP_COOPERATIVE_PRIORITIZED_SWEEPING_HEADER_FILE

#include <AIToolbox/Seeder.hpp>
// This file also includes the library's Types.hpp, Core.hpp, Utils.hpp,
// FactoredMatrix.hpp, FasterTrie.hpp, CPSQueue.hpp and QGreedyPolicy.hpp headers.

namespace AIToolbox::Factored::MDP {
    // This class implements PrioritizedSweeping for cooperative environments.
    template <typename M, typename Maximizer = Bandit::VariableElimination>
    class CooperativePrioritizedSweeping {
        public:
            // Basic constructor.
            CooperativePrioritizedSweeping(const M & m, std::vector<std::vector<size_t>> basisDomains, double alpha = 0.3, double theta = 0.001);

            // This function performs a single update of the Q-Function with the input data.
            void stepUpdateQ(const State & s, const Action & a, const State & s1, const Rewards & r);

            // This function performs a series of batch updates using the model to sample.
            void batchUpdateQ(const unsigned N = 50);

            // These functions return the QGreedyPolicy we use to determine a1* in the updates.
            QGreedyPolicy<Maximizer> & getInternalQGreedyPolicy();
            const QGreedyPolicy<Maximizer> & getInternalQGreedyPolicy() const;

            // This function returns a reference to the internal QFunction.
            const QFunction & getQFunction() const;

            // This function sets the QFunction to a given value.
            void setQFunction(double val);

        private:
            // Performs the actual Q update used by both stepUpdateQ and batchUpdateQ.
            void updateQ(const State & s, const Action & a, const State & s1, const Rewards & r);

            // Updates the priority queue with the parents of s1, based on the accumulated deltas.
            void addToQueue(const State & s1);

            const M & model_;
            double alpha_, theta_;

            std::vector<std::vector<size_t>> qDomains_;
            Vector rewardWeights_, deltaStorage_, rewardStorage_;

            QFunction q_;
            QGreedyPolicy<Maximizer> gp_;  // Greedy policy over q_, used to pick a1*.
            CPSQueue queue_;               // Priority queue for prioritized sweeping.

            mutable RandomEngine rand_;
    };

    template <typename M, typename Maximizer>
    CooperativePrioritizedSweeping<M, Maximizer>::CooperativePrioritizedSweeping(const M & m, std::vector<std::vector<size_t>> basisDomains, double alpha, double theta) :
            model_(m),
            alpha_(alpha), theta_(theta),
            qDomains_(std::move(basisDomains)),
            rewardWeights_(model_.getS().size()),
            deltaStorage_(model_.getS().size()),
            rewardStorage_(model_.getS().size()),
            q_(makeQFunction(model_.getGraph(), qDomains_)),
            gp_(model_.getS(), model_.getA(), q_),
            queue_(model_.getGraph()),
            rand_(Seeder::getSeed())
    {
        // We weight the rewards so that they are split correctly between the
        // components of the QFunction.
        // Note that unused reward weights might result in r/0 or 0/0
        // operations, but since we never use those elements anyway it's not
        // a problem.
        rewardWeights_.setZero();
        deltaStorage_.setZero();
        // We don't need to zero rewardStorage_.

        // We weight rewards based on the state features of each Q factor.
        for (const auto & q : q_.bases)
            for (const auto d : q.tag)
                rewardWeights_[d] += 1.0;
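        // Illustrative example (hypothetical factorization): if state feature 3
        // appears in the tags of two Q factors, rewardWeights_[3] ends up as 2.0,
        // so in updateQ() the value accumulated for feature 3 is divided by two
        // before being redistributed to those factors.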
    }

    template <typename M, typename Maximizer>
    void CooperativePrioritizedSweeping<M, Maximizer>::stepUpdateQ(const State & s, const Action & a, const State & s1, const Rewards & r) {
        updateQ(s, a, s1, r);
        addToQueue(s);
    }

    template <typename M, typename Maximizer>
    void CooperativePrioritizedSweeping<M, Maximizer>::batchUpdateQ(const unsigned N) {
        // Initialize some variables to avoid reallocations.
        State s(model_.getS().size());
        State s1(model_.getS().size());
        Action a(model_.getA().size());
        Rewards rews(model_.getS().size());

        for (size_t n = 0; n < N; ++n) {
            if (!queue_.getNonZeroPriorities()) return;

            queue_.reconstruct(s, a);

            // Randomly fill the missing elements (reconstruct marks them with
            // the size of their factor).
            for (size_t i = 0; i < s.size(); ++i) {
                if (s[i] == model_.getS()[i]) {
                    std::uniform_int_distribution<size_t> dist(0, model_.getS()[i]-1);
                    s[i] = dist(rand_);
                }
            }
            for (size_t i = 0; i < a.size(); ++i) {
                if (a[i] == model_.getA()[i]) {
                    std::uniform_int_distribution<size_t> dist(0, model_.getA()[i]-1);
                    a[i] = dist(rand_);
                }
            }

            // Finally, sample a new s1/rews from the model.
            model_.sampleSRs(s, a, &s1, &rews);

            // And use them to update Q.
            updateQ(s, a, s1, rews);

            // Update the queue.
            addToQueue(s);
        }
    }

    template <typename M, typename Maximizer>
    void CooperativePrioritizedSweeping<M, Maximizer>::updateQ(const State & s, const Action & a, const State & s1, const Rewards & r) {
        // Compute the optimal next action to perform the Q-Learning update.
        const auto a1 = gp_.sampleAction(s1);

        // The standard Q-update has the form:
        //
        //     Q(s,a) += alpha * ( R(s,a) + gamma * Q(s', a') - Q(s,a) )
        //
        // Since our Q-function is factored, we want to split the rewards per
        // state feature (similar to SparseCooperativeQLearning).
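        //
        // Illustrative sketch (hypothetical factorization): with two Q factors,
        // one with tag {0,1} and one with tag {1}, the first spreads its
        // (gamma * Q(s',a') - Q(s,a)) / 2 over features 0 and 1, while the
        // second adds its whole difference to feature 1. Feature 1 is covered
        // twice, so rewardWeights_[1] == 2 and the normalization below averages
        // the two contributions before they are summed back into each factor.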
        // Start with R...
        rewardStorage_ = r.array();
        // ...then go over the factored Q-function for the rest.
        for (const auto & q : q_.bases) {
            const auto sid = toIndexPartial(q.tag, model_.getS(), s);
            const auto aid = toIndexPartial(q.actionTag, model_.getA(), a);

            const auto s1id = toIndexPartial(q.tag, model_.getS(), s1);
            const auto a1id = toIndexPartial(q.actionTag, model_.getA(), a1);

            // gamma * Q(s', a') - Q(s, a)
            // We normalize it per state feature, since we distribute the diff
            // to all elements of rewardStorage_ covered by this factor.
            const auto diff = (model_.getDiscount() * q.values(s1id, a1id) - q.values(sid, aid)) / q.tag.size();

            // Apply the value to each state feature that applies to this Q factor.
            // R(s,a) + ...
            for (const auto s : q.tag)
                rewardStorage_[s] += diff;
        }

        // Normalize all values based on how many Q factors cover each feature.
        rewardStorage_.array() /= rewardWeights_.array();
        rewardStorage_.array() *= alpha_;

        // We update each Q factor separately.
        for (size_t i = 0; i < q_.bases.size(); ++i) {
            auto & q = q_.bases[i];

            const auto sid = toIndexPartial(q.tag, model_.getS(), s);
            const auto aid = toIndexPartial(q.actionTag, model_.getA(), a);

            // Compute the numerical update from the state features covered by
            // this Q factor.
            double td = 0.0;
            for (const auto s : q.tag)
                td += rewardStorage_[s];

            q.values(sid, aid) += td;

            // Split the delta across the state features referenced by this Q
            // factor. Note that we add to the storage, which is only cleared
            // once we call addToQueue; this means that multiple calls to this
            // function accumulate their deltas.
            const auto delta = std::fabs(td) / q.tag.size();
            for (const auto s : q.tag)
                deltaStorage_[s] += delta;
        }
    }

    template <typename M, typename Maximizer>
    void CooperativePrioritizedSweeping<M, Maximizer>::addToQueue(const State & s1) {
        // Note that s1 was s before, but here we consider it as the "future"
        // state as we look for its parents.
        const auto & T = model_.getTransitionFunction();
        const auto & graph = model_.getGraph();

        for (size_t i = 0; i < s1.size(); ++i) {
            // If the delta to apply is still small (less than the node's current
            // max priority), we don't bother with it yet. This allows us to save
            // some work until it's actually worth it.
            if (deltaStorage_[i] < queue_.getNodeMaxPriority(i)) continue;

            // Here we need to iterate over j, but the queue still needs the
            // a,s variables, so we keep track of all of them to keep things simple.
            size_t j = 0;
            for (size_t a = 0; a < graph.getPartialSize(i); ++a) {
                for (size_t s = 0; s < graph.getPartialSize(i, a); ++s) {
                    // Priority: probability that this parent (a,s) pair produces
                    // s1[i], weighted by the accumulated delta for feature i.
                    const auto p = T.transitions[i](j, s1[i]) * deltaStorage_[i];

                    // Increase j before we check whether we want to skip.
                    ++j;
                    // If it's not large enough, skip it.
                    if (p < theta_) continue;

                    queue_.update(i, a, s, p);
                }
            }
            // Reset this delta.
            deltaStorage_[i] = 0.0;
        }
    }

    template <typename M, typename Maximizer>
    void CooperativePrioritizedSweeping<M, Maximizer>::setQFunction(double val) {
        for (auto & q : q_.bases)
            q.values.fill(val);

        // Add some noise to avoid non-unique maxima, which MaxPlus cannot handle.
        if constexpr(std::is_same_v<Maximizer, Bandit::MaxPlus>) {
            std::uniform_real_distribution<double> dist(-0.01 * val, 0.01 * val);
            for (auto & q : q_.bases)
                q.values += Matrix2D::NullaryExpr(q.values.rows(), q.values.cols(), [&](){ return dist(rand_); });
        }
    }

    template <typename M, typename Maximizer>
    const QFunction & CooperativePrioritizedSweeping<M, Maximizer>::getQFunction() const {
        return q_;
    }

    template <typename M, typename Maximizer>
    QGreedyPolicy<Maximizer> & CooperativePrioritizedSweeping<M, Maximizer>::getInternalQGreedyPolicy() {
        return gp_;
    }

    template <typename M, typename Maximizer>
    const QGreedyPolicy<Maximizer> & CooperativePrioritizedSweeping<M, Maximizer>::getInternalQGreedyPolicy() const {
        return gp_;
    }
}

#endif
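
As an orientation aid, here is a minimal usage sketch. It is not part of the header: MyCooperativeModel and envStep are hypothetical placeholders standing in for a cooperative factored model (exposing getS(), getA(), getGraph(), getDiscount(), getTransitionFunction() and sampleSRs() as used above) and for the environment interaction, and the basis domains are made up for illustration.

    // Hypothetical sketch; MyCooperativeModel, makeMyModel and envStep are placeholders.
    using namespace AIToolbox::Factored;

    MyCooperativeModel model = makeMyModel();

    // One entry per Q factor: the state features that factor depends on.
    std::vector<std::vector<size_t>> basisDomains{{0, 1}, {1, 2}};

    MDP::CooperativePrioritizedSweeping<MyCooperativeModel> cps(model, basisDomains);

    State s(model.getS().size()); // start from the all-zero state
    for (unsigned t = 0; t < 10000; ++t) {
        // Act greedily with respect to the current factored QFunction.
        const Action a = cps.getInternalQGreedyPolicy().sampleAction(s);

        // envStep is a stand-in for the real environment step.
        const auto [s1, rews] = envStep(s, a);

        cps.stepUpdateQ(s, a, s1, rews); // update from real experience
        cps.batchUpdateQ(50);            // prioritized sweeping on the model

        s = s1;
    }

The call to batchUpdateQ after each real step is the usual prioritized-sweeping pattern; the count of 50 simply mirrors the default argument of the method.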