1 #ifndef AI_TOOLBOX_FACTORED_MDP_COOPERATIVE_PRIORITIZED_SWEEPING_HEADER_FILE
2 #define AI_TOOLBOX_FACTORED_MDP_COOPERATIVE_PRIORITIZED_SWEEPING_HEADER_FILE
// NOTE(review): this extract elides many original lines (the leading numbers
// on each line are the original file's line numbers); comments are hedged
// where the elided code would be needed to confirm a claim.
32 template <
typename M,
typename Maximizer = Bandit::VariableElimination>
// (class body lines elided here)
// Private helper: refreshes priority-queue entries related to state s1
// (its definition appears further down in this header).
109 void addToQueue(
const State & s1);
// Learning rate (alpha_) and minimum-priority threshold (theta_), both set
// by the constructor.
112 double alpha_, theta_;
// One state-variable domain (tag) per Q basis; moved in by the constructor.
114 std::vector<std::vector<size_t>> qDomains_;
// Per-state-variable vectors, each sized |S| factors:
//   rewardWeights_[d] — number of Q bases whose tag covers variable d
//                        (used to normalize per-variable TD contributions);
//   deltaStorage_[d]  — accumulated priority deltas for variable d;
//   rewardStorage_    — scratch buffer used during updateQ.
115 Vector rewardWeights_, deltaStorage_, rewardStorage_;
124 template <
typename M,
typename Maximizer>
// Constructor fragment (the signature line is elided in this extract).
// Initializer list:
127 alpha_(alpha), theta_(theta),
128 qDomains_(std::move(basisDomains)),
// Per-state-variable storage, all sized to the number of state factors.
129 rewardWeights_(model_.getS().size()),
130 deltaStorage_(model_.getS().size()),
131 rewardStorage_(model_.getS().size()),
// Greedy policy over the factored Q-function, and the priority queue built
// on the model's graph.
133 gp_(model_.getS(), model_.getA(), q_),
134 queue_(model_.getGraph()),
// Constructor body: zero the accumulators...
142 rewardWeights_.setZero();
143 deltaStorage_.setZero();
// ...then count, for each state variable d, how many Q bases include d in
// their tag. updateQ later divides per-variable contributions by these
// counts to average them.
147 for (
const auto & q : q_.
bases)
148 for (
const auto d : q.tag)
149 rewardWeights_[d] += 1.0;
152 template <
typename M,
typename Maximizer>
// Fragment: member function whose signature is elided in this extract; the
// visible body forwards a single experience tuple to updateQ — presumably
// the public single-step learning entry point. TODO confirm against the
// full header.
154 updateQ(s, a, s1, r);
158 template <
typename M,
typename Maximizer>
// Fragment of the queue-driven batch update (signature elided). Performs up
// to N simulated sweeps, each drawn from the priority queue.
161 State s(model_.getS().size());
162 State s1(model_.getS().size());
163 Action a(model_.getA().size());
164 Rewards rews(model_.getS().size());
166 for (
size_t n = 0; n < N; ++n) {
// Stop early once no entry in the queue has a non-zero priority.
167 if (!queue_.getNonZeroPriorities())
return;
// Rebuild the highest-priority partial (s, a) assignment from the queue...
169 queue_.reconstruct(s, a);
// ...then fill in any unset state factors uniformly at random. The sentinel
// for "unset" is the factor's size itself (one past the last valid value);
// note uniform_int_distribution bounds are inclusive, hence the -1.
172 for (
size_t i = 0; i < s.size(); ++i) {
173 if (s[i] == model_.getS()[i]) {
174 std::uniform_int_distribution<size_t> dist(0, model_.getS()[i]-1);
// Same random completion for unset action factors.
178 for (
size_t i = 0; i < a.size(); ++i) {
179 if (a[i] == model_.getA()[i]) {
180 std::uniform_int_distribution<size_t> dist(0, model_.getA()[i]-1);
// Sample a successor state and per-factor rewards from the model, then run
// the standard Q update on the simulated experience.
186 model_.sampleSRs(s, a, &s1, &rews);
189 updateQ(s, a, s1, rews);
196 template <
typename M,
typename Maximizer>
// Fragment of updateQ (signature elided): factored Q-learning update for the
// experience (s, a, s1, r).
// Next action chosen by the greedy policy in the successor state.
199 const auto a1 = gp_.sampleAction(s1);
// Start from the observed rewards, one entry per state variable.
209 rewardStorage_ = r.array();
// First pass: for every Q basis, spread its (discounted next-Q minus
// current-Q) difference evenly across the state variables in its tag.
// NOTE(review): sid/aid/s1id/a1id are computed in elided lines — presumably
// the basis-local joint indices of (s,a) and (s1,a1); verify upstream.
211 for (
const auto & q : q_.bases) {
221 const auto diff = (model_.getDiscount() * q.values(s1id, a1id) - q.values(sid, aid)) / q.tag.size();
225 for (
const auto s : q.tag)
226 rewardStorage_[s] += diff;
// Normalize each variable's contribution by how many bases cover it, then
// scale by the learning rate.
230 rewardStorage_.array() /= rewardWeights_.array();
231 rewardStorage_.array() *= alpha_;
// Second pass: apply to each basis the summed contribution of its tag...
234 for (
size_t i = 0; i < q_.bases.size(); ++i) {
235 auto & q = q_.bases[i];
243 for (
const auto s : q.tag)
244 td += rewardStorage_[s];
246 q.values(sid, aid) += td;
// ...and record the tag-averaged magnitude of the change as a priority
// delta for every covered state variable (consumed later by addToQueue).
252 const auto delta = std::fabs(td) / q.tag.size();
253 for (
const auto s : q.tag)
254 deltaStorage_[s] += delta;
258 template <
typename M,
typename Maximizer>
// Refreshes the priority queue from s1: for each state variable i whose
// accumulated delta is large enough, enqueues the (action, parent-state)
// partial assignments that could have produced s1[i], with priority equal to
// transition probability times the accumulated delta.
259 void CooperativePrioritizedSweeping<M, Maximizer>::addToQueue(
const State & s1) {
262 const auto & T = model_.getTransitionFunction();
263 const auto & graph = model_.getGraph();
265 for (
size_t i = 0; i < s1.size(); ++i) {
// Skip variables whose accumulated delta cannot beat the best priority
// already queued for this node.
268 if (deltaStorage_[i] < queue_.getNodeMaxPriority(i))
continue;
// Enumerate the partial action/parent-state assignments of node i.
273 for (
size_t a = 0; a < graph.getPartialSize(i); ++a) {
274 for (
size_t s = 0; s < graph.getPartialSize(i, a); ++s) {
// NOTE(review): 'j' is computed in lines elided from this extract —
// presumably the joint row index for the (a, s) pair into the transition
// matrix of node i; verify against the full source.
275 const auto p = T.transitions[i](j, s1[i]) * deltaStorage_[i];
// Discard candidate priorities below the theta_ threshold.
280 if (p < theta_)
continue;
282 queue_.update(i, a, s, p);
// Reset the accumulated delta once this variable's entries are refreshed.
286 deltaStorage_[i] = 0.0;
290 template <
typename M,
typename Maximizer>
// Fragment (signature and loop body partially elided): iterates over all Q
// bases (the body of this first loop is not visible in this extract)...
292 for (
auto & q : q_.bases)
// ...then, only when the Maximizer is MaxPlus, perturbs every Q matrix
// entry with small uniform noise — presumably to break ties/symmetries for
// the message-passing optimizer; 'val' is defined in elided lines, TODO
// confirm its meaning upstream.
296 if constexpr(std::is_same_v<Maximizer, Bandit::MaxPlus>) {
297 std::uniform_real_distribution<double> dist(-0.01 * val, 0.01 * val);
298 for (
auto & q : q_.bases)
299 q.values += Matrix2D::NullaryExpr(q.values.rows(), q.values.cols(), [&](){return dist(rand_);});
303 template <
typename M,
typename Maximizer>
308 template <
typename M,
typename Maximizer>
313 template <
typename M,
typename Maximizer>