1 #ifndef AI_TOOLBOX_POMDP_SARSOP_HEADER_FILE
2 #define AI_TOOLBOX_POMDP_SARSOP_HEADER_FILE
41 SARSOP(
double tolerance,
double delta = 0.1);
87 std::tuple<double, double, VList, MDP::QFunction>
operator()(
const M & model,
const Belief & initialBelief);
117 Eigen::Matrix<double, 3, Eigen::Dynamic, Eigen::RowMajor | Eigen::AutoAlign> actionData;
125 boost::multi_array<Children, 2> children;
169 size_t id,
const M & model,
201 TreeNode & node,
const M & model,
225 size_t id,
const M & model,
230 double predictValue(
size_t id,
const TreeNode & node);
231 void deltaPrune(
VList & lbV);
232 void deltaUpdate(
const VList & lbV);
233 void updateSubOptimalPaths(TreeNode & root);
234 void treePrune(TreeNode & root);
235 void treeRevive(TreeNode & root);
249 LBPredictor(
size_t entropyBins,
size_t UBBins,
const MDP::QFunction & ubQ);
269 std::pair<double, double> predict(
size_t id,
const TreeNode & node);
289 const Bin & update(
size_t id,
const TreeNode & node);
292 size_t entropyBins_, UBBins_;
293 double entropyStep_, UBMin_, UBStep_;
296 std::unordered_map<size_t, std::tuple<bool, size_t, size_t, double, double>> nodes_;
298 boost::multi_array<Bin, 2> bins_;
301 double tolerance_, initialDelta_;
306 std::vector<TreeNode> treeStorage_;
314 std::unordered_map<Belief, size_t, boost::hash<Belief>> beliefToNode_;
315 std::vector<LBPredictor> predictors_;
318 std::vector<size_t> sampledNodes_;
319 std::vector<char> backuppedActions_;
320 Belief intermediateBeliefTmp_, nextBeliefTmp_;
325 constexpr
unsigned infiniteHorizon = 1000000;
327 AI_LOGGER(
AI_SEVERITY_DEBUG,
"Running SARSOP; POMDP S: " << pomdp.getS() <<
"; A: " << pomdp.getA() <<
"; O: " << pomdp.getO());
335 delta_ = initialDelta_;
338 if constexpr (!MDP::IsModelEigen<M>)
342 treeStorage_.clear();
343 treeStorage_.reserve(pomdp.getA() * pomdp.getO() + 1);
345 beliefToNode_.clear();
357 backuppedActions_.resize(pomdp.getA());
358 intermediateBeliefTmp_.resize(pomdp.getS());
359 nextBeliefTmp_.resize(pomdp.getS());
376 VList lbVList = std::get<1>(bs(pomdp,
true));
418 for (
auto & ve : lbVList)
419 ve.observations.push_back(1);
423 for (
size_t s = 0; s < pomdp.getS(); ++s) {
426 ++it->observations[0];
429 it->observations.push_back(s);
433 ++it->observations[0];
435 it->observations.push_back(pomdp.getS());
447 {initialBelief}, {(initialBelief.transpose() * ubQ).maxCoeff()}
456 const auto initialUbQ = ubQ;
458 constexpr
unsigned numBins = 2;
459 constexpr
unsigned entropyBins = 5;
460 constexpr
unsigned ubBins = 5;
461 constexpr
unsigned binScaling = 2;
463 for (
unsigned i = 0; i < numBins; ++i) {
464 const unsigned scaling = std::pow(binScaling, i);
466 predictors_.emplace_back(entropyBins * scaling, ubBins * scaling, initialUbQ);
473 treeStorage_.emplace_back();
477 treeStorage_[0].belief = initialBelief;
478 treeStorage_[0].count = 1;
479 updateNode(treeStorage_[0], pomdp, lbVList, ubQ, ubV,
false);
492 samplePoints(pomdp, lbVList, ubQ, ubV);
496 if (sampledNodes_.size() == 0) {
505 for (
auto rIt = std::rbegin(sampledNodes_); rIt != std::rend(sampledNodes_); ++rIt)
506 backupNode(*rIt, pomdp, lbVList, ubQ, ubV);
524 size_t i = ubV.first.size();
531 std::swap(ubV.first[i], ubV.first.back());
532 std::swap(ubV.second[i], ubV.second.back());
534 auto belief = std::move(ubV.first.back());
535 auto value = ubV.second.back();
537 ubV.first.pop_back();
538 ubV.second.pop_back();
544 ubV.first.emplace_back(std::move(belief));
545 ubV.second.emplace_back(value);
547 }
while (i != 0 && ubV.first.size() > 1);
550 "Root lower bound: " << treeStorage_[0].LB <<
551 "; upper bound: " << treeStorage_[0].UB <<
552 "; alpha vectors: " << lbVList.size() <<
553 "; belief points: " << ubV.first.size());
555 if (treeStorage_[0].UB - treeStorage_[0].LB <= tolerance_)
561 for (
auto & ventry : lbVList)
562 ventry.observations.clear();
564 return std::make_tuple(treeStorage_[0].LB, treeStorage_[0].UB, lbVList, ubQ);
569 sampledNodes_.clear();
573 size_t currentNodeId = 0;
574 const double rootGap = (treeStorage_[0].UB - treeStorage_[0].LB) * 0.95;
577 double L = treeStorage_[0].LB;
578 double U = L + rootGap;
582 const double targetGap = rootGap * std::pow(pomdp.getDiscount(), -depth);
588 const TreeNode & node = treeStorage_[currentNodeId];
590 const double finalExcess = node.UB - node.LB - 0.5 * targetGap;
591 if (finalExcess <= 0.0)
596 const auto vHat = predictValue(currentNodeId, node);
597 if (vHat <= L && node.UB <= std::max(U, node.LB + targetGap))
603 sampledNodes_.push_back(currentNodeId);
606 if (treeStorage_[currentNodeId].children.size() == 0)
607 expandLeaf(currentNodeId, pomdp, lbVList, ubQ, ubV);
610 const TreeNode & node = treeStorage_[currentNodeId];
613 const auto L1 = std::max(L, node.LB);
614 const auto U1 = std::max(U, node.LB + targetGap);
617 const auto a1 = node.actionUb;
621 const double nextDepthGap = targetGap / pomdp.getDiscount();
622 double maxVal = std::numeric_limits<double>::lowest();
623 for (
size_t o = 0; o < pomdp.getO(); ++o) {
624 if (node.children[a1][o].observationProbability == 0.0)
continue;
626 const auto & childNode = treeStorage_[node.children[a1][o].id];
627 const auto val = (childNode.UB - childNode.LB - nextDepthGap) * node.children[a1][o].observationProbability;
635 double Lnorm = 0.0, Unorm = 0.0;
636 for (
size_t o = 0; o < pomdp.getO(); ++o) {
637 if (o == o1)
continue;
639 const auto & childNode = treeStorage_[node.children[a1][o].id];
641 Lnorm += childNode.LB * node.children[a1][o].observationProbability;
642 Unorm += childNode.UB * node.children[a1][o].observationProbability;
646 L = ((L1 - node.actionData(0, a1)) / pomdp.getDiscount() - Lnorm) / node.children[a1][o1].observationProbability;
647 U = ((U1 - node.actionData(0, a1)) / pomdp.getDiscount() - Unorm) / node.children[a1][o1].observationProbability;
650 currentNodeId = node.children[a1][o1].id;
657 void SARSOP::expandLeaf(
658 const size_t id,
const M & pomdp,
659 const VList & lbVList,
667 TreeNode * nodep = &treeStorage_[id];
669 assert(nodep->children.size() == 0);
674 assert(nodep->count > 0);
677 updateNode(*nodep, pomdp, lbVList, ubQ, ubV,
true);
680 nodep->children.resize(boost::extents[pomdp.getA()][pomdp.getO()]);
682 for (
size_t a = 0; a < pomdp.getA(); ++a) {
685 for (
size_t o = 0; o < pomdp.getO(); ++o) {
686 auto & child = nodep->children[a][o];
690 const auto prob = nextBeliefTmp_.sum();
697 nextBeliefTmp_ /= prob;
699 child.observationProbability = prob;
701 const auto it = beliefToNode_.find(nextBeliefTmp_);
702 if (it != beliefToNode_.end()) {
705 child.id = it->second;
706 if (++treeStorage_[child.id].count == 1) {
717 treeRevive(treeStorage_[child.id]);
724 child.id = treeStorage_.size();
725 beliefToNode_[nextBeliefTmp_] = child.id;
730 treeStorage_.emplace_back();
732 nodep = &treeStorage_[id];
734 auto & childNode = treeStorage_.back();
736 childNode.belief = nextBeliefTmp_;
739 updateNode(childNode, pomdp, lbVList, ubQ, ubV,
false);
745 void SARSOP::updateNode(
746 TreeNode & node,
const M & pomdp,
747 const VList & lbVList,
752 const auto & ir = [&]{
753 if constexpr (MDP::IsModelEigen<M>)
return pomdp.getRewardFunction();
754 else return immediateRewards_;
759 const auto ub = bestPromisingAction<false>(pomdp, ir, node.belief, ubQ, ubV, &ubs);
760 node.UB = std::get<1>(ub);
761 node.actionUb = std::get<0>(ub);
767 node.actionData.resize(Eigen::NoChange, pomdp.getA());
768 node.actionData.row(0) = node.belief.transpose() * ir;
769 node.actionData.row(1) = ubs;
770 node.actionData.row(2).fill(0);
775 node.LB = std::get<1>(lb);
781 const auto & ir = [&]{
782 if constexpr (MDP::IsModelEigen<M>)
return pomdp.getRewardFunction();
783 else return immediateRewards_;
786 TreeNode & node = treeStorage_[id];
791 node.LB = std::get<1>(result);
793 lbVList.emplace_back(std::move(alpha), std::get<0>(result),
VObs{1,
id + pomdp.getS()});
801 std::fill(std::begin(backuppedActions_), std::end(backuppedActions_),
false);
802 auto maxAction = node.actionUb;
804 while (!backuppedActions_[maxAction]) {
806 for (
size_t o = 0; o < pomdp.getO(); ++o) {
807 const double obsP = node.children[maxAction][o].observationProbability;
809 if (obsP == 0.0)
continue;
811 const auto & childNode = treeStorage_[node.children[maxAction][o].id];
815 sum = node.actionData(0, maxAction) + pomdp.getDiscount() * sum;
817 node.actionData(1, maxAction) = sum;
818 backuppedActions_[maxAction] =
true;
820 node.UB = node.actionData.row(1).maxCoeff(&maxAction);
822 node.actionUb = maxAction;
827 for (
size_t s = 0; s < pomdp.getS(); ++s) {
829 ubQ(s, maxAction) = node.UB;
833 ubV.first.push_back(node.belief);
834 ubV.second.push_back(node.UB);