1 #ifndef AI_TOOLBOX_POMDP_rPOMCP_HEADER_FILE
2 #define AI_TOOLBOX_POMDP_rPOMCP_HEADER_FILE
4 #include <unordered_map>
// Fragment of the rPOMCP class declaration (a POMCP-style Monte Carlo tree
// search solver over a generative POMDP model M; UseEntropy presumably
// selects an entropy-based knowledge measure -- TODO confirm against the
// full header). Interior lines are missing from this chunk.
54 template <IsGenerativeModel M,
bool UseEntropy>
// Constructor: takes the generative model, the particle-belief size,
// the number of MCTS iterations per decision, the exploration constant,
// and a parameter k (default 500) whose semantics are not visible here.
71 rPOMCP(
const M& m,
size_t beliefSize,
unsigned iterations,
double exp,
unsigned k = 500);
// Overload that advances the internal tree after executing action a and
// observing o, then samples the next action for the remaining horizon.
110 size_t sampleAction(
size_t a,
size_t o,
unsigned horizon);
// Cached model sizes and belief particle count.
180 size_t S, A, beliefSize_;
// Iteration budget and the horizon currently being planned for.
181 unsigned iterations_, maxDepth_;
// Runs iterations_ simulations from the root and returns the best action.
190 size_t runSimulation(
unsigned horizon);
// One recursive MCTS rollout through belief node b from state s.
191 double simulate(
BNode & b,
size_t s,
unsigned horizon);
// Incrementally maintains a belief node's best action after child a changed.
193 void maxBeliefNodeUpdate(
BNode * bn,
const ANode & aNode,
size_t a);
// Greedy argmax over action nodes by value V (no exploration bonus).
195 template <
typename Iterator>
196 Iterator findBestA(Iterator begin, Iterator end);
// UCB-style argmax over action nodes: value plus an exploration bonus
// computed from the parent visit count.
198 template <
typename Iterator>
199 Iterator findBestBonusA(Iterator begin, Iterator end,
unsigned count);
// Constructor: copies the model reference and caches its state/action space
// sizes via getS()/getA(); stores the belief size, iteration budget,
// exploration constant and k. NOTE(review): the initializer list is cut off
// here (it ends with a trailing comma) -- remaining members are initialized
// on lines not visible in this chunk.
202 template <IsGenerativeModel M,
bool UseEntropy>
203 rPOMCP<M, UseEntropy>::rPOMCP(
const M& m,
const size_t beliefSize,
const unsigned iter,
const double exp,
const unsigned k) : model_(m), S(model_.getS()), A(model_.getA()),
204 beliefSize_(beliefSize), iterations_(iter),
205 exploration_(exp), k_(k),
// Fragment of sampleAction(belief, horizon): rebuilds the search tree root
// from scratch around the given belief b, then plans via runSimulation.
// The function signature line is missing from this chunk.
208 template <IsGenerativeModel M,
bool UseEntropy>
// Discard any previous tree and start a fresh root node for this belief.
211 graph_ =
HNode(A, beliefSize_, b, rand_);
213 return runSimulation(horizon);
// Fragment of sampleAction(a, o, horizon): tries to reuse the subtree that
// corresponds to having executed action a and observed o. If that subtree
// does not exist, or its sampled belief is empty, it falls back to planning
// from a uniform belief (Belief(S, 1.0 / S)). Interior lines (including
// some closing braces) are missing from this chunk.
216 template <IsGenerativeModel M,
bool UseEntropy>
// Children of the action node taken last step, keyed by observation.
218 auto & obs = graph_.children[a].children;
220 auto it = obs.find(o);
221 if ( it == obs.end() ) {
// Observation never sampled during search: restart from a uniform belief.
223 return sampleAction(
Belief(S, 1.0 / S), horizon);
// Promote the matching observation subtree to be the new root.
// tmp is needed because graph_ owns it->second's parent structure.
231 {
BNode tmp = std::move(it->second); graph_ =
HNode(A, std::move(tmp), rand_); }
233 if ( graph_.isSampleBeliefEmpty() ) {
// No particles survived the root swap: restart from a uniform belief.
235 return sampleAction(
Belief(S, 1.0 / S), horizon);
238 return runSimulation(horizon);
// Fragment of runSimulation(horizon): runs iterations_ rollouts from the
// root (each starting at a state sampled from the root belief), then picks
// the greedy best action. The signature line and the final return are not
// visible in this chunk.
241 template <IsGenerativeModel M,
bool UseEntropy>
// Nothing to plan for a zero horizon; action 0 is the trivial answer.
243 if ( !horizon )
return 0;
247 for (
unsigned i = 0; i < iterations_; ++i )
248 simulate(graph_, graph_.sampleBelief(), 0);
// Greedy (no exploration bonus) selection over the root's action children.
250 auto begin = std::begin(graph_.children);
251 size_t bestA = std::distance(begin, findBestA(begin, std::end(graph_.children)));
// Cache the chosen action's value as the root value.
255 graph_.V = graph_.children[bestA].V;
// Fragment of simulate(b, s, depth): one MCTS rollout step. Selects an
// action by UCB (findBestBonusA), samples (s', o) from the generative
// model, descends/expands the matching observation node, and backs up
// both the action value and the belief node's knowledge-adjusted value.
// Several interior lines (counters, oldV, new-node bookkeeping) are
// missing from this chunk -- do not assume the visible lines are adjacent.
259 template <IsGenerativeModel M,
bool UseEntropy>
260 double rPOMCP<M, UseEntropy>::simulate(BNode & b,
size_t s,
unsigned depth) {
// UCB action selection using this node's visit count b.N.
264 auto begin = std::begin(b.children);
265 size_t a = std::distance(begin, findBestBonusA(begin, std::end(b.children), b.N));
266 auto & aNode = b.children[a];
// Sample next state and observation; the reward component is ignored
// (rPOMCP optimizes a knowledge measure rather than model reward).
270 std::tie(s1, o, std::ignore) = model_.sampleSOR(s, a);
272 double immAndFutureRew = 0.0;
274 typename decltype(aNode.children)::iterator ot;
275 bool newNode =
false;
// Find or lazily create the observation child for o.
278 ot = aNode.children.find(o);
279 if ( ot == aNode.children.end() ) {
281 std::tie(ot, std::ignore) = aNode.children.insert(std::make_pair(o, BNode()));
// Record the sampled particle s1 into the child's belief/knowledge.
286 ot->second.updateBeliefAndKnowledge(s1);
// Recurse only into previously-expanded, non-terminal, in-horizon nodes.
289 if ( depth + 1 < maxDepth_ && !model_.isTerminal(s1) && !newNode) {
290 ot->second.children.resize(A);
291 immAndFutureRew = simulate( ot->second, s1, depth + 1 );
// At the horizon the leaf's knowledge measure is the rollout value.
297 if ( depth + 1 >= maxDepth_ )
298 immAndFutureRew = ot->second.getKnowledgeMeasure();
// Incremental mean update of the action value.
304 aNode.V += ( immAndFutureRew - aNode.V ) /
static_cast<double>(aNode.N);
// The root's return value is unused by callers.
309 if ( depth == 0 )
return 0.0;
// NOTE(review): lines between 309 and 319 are missing; the condition
// guarding this reset (HUGE_VAL sentinel before a recompute) is not visible.
319 b.actionsV = HUGE_VAL;
322 maxBeliefNodeUpdate(&b, aNode, a);
// Incremental mean of the max-action value stream at this belief node.
325 b.actionsV += ( immAndFutureRew - b.actionsV ) /
static_cast<double>(b.N);
// Node value: discounted future action value plus local knowledge measure.
332 b.V = model_.getDiscount() * b.actionsV + b.getKnowledgeMeasure();
// Back up the change in this node's contribution to the parent's mean
// (oldV is captured on a line not visible in this chunk).
334 return (b.N - 1)*(b.V - oldV) + b.V;
// Fragment of maxBeliefNodeUpdate: keeps b.actionsV / b.bestAction in sync
// after action child a's value changed. If the updated child now dominates,
// adopt it; if the previously-best action got worse, re-scan all children.
// NOTE(review): the body references `b` while the parameter is `bp` -- a
// missing interior line (presumably `auto & b = *bp;`) is not visible here.
337 template <IsGenerativeModel M,
bool UseEntropy>
338 void rPOMCP<M, UseEntropy>::maxBeliefNodeUpdate(BNode * bp,
const ANode & aNode,
const size_t a) {
341 if ( aNode.V >= b.actionsV ) {
342 b.actionsV = aNode.V;
// Previously-best action may have decreased: recompute the argmax.
346 else if ( a == b.bestAction ) {
347 auto begin = std::begin(b.children);
348 auto it = findBestA(begin, std::end(b.children));
350 b.bestAction = std::distance(begin, it);
// Greedy argmax: returns the iterator to the action node with the highest
// mean value V, with no exploration bonus. Ties resolve to the first
// maximum (std::max_element semantics). Closing brace not visible here.
354 template <IsGenerativeModel M,
bool UseEntropy>
355 template <
typename Iterator>
356 Iterator rPOMCP<M, UseEntropy>::findBestA(
const Iterator begin,
const Iterator end) {
357 return std::max_element(begin, end, [](
const ANode & lhs,
const ANode & rhs){
return lhs.V < rhs.V; });
// UCB1-style action selection: scores each action node by its mean value V
// plus exploration_ * sqrt(log(count + 1) / N), where count is the parent
// belief node's visit count. An unvisited action (N == 0) yields an
// infinite bonus, so unexplored actions are tried first. The final return
// statement and closing brace are not visible in this chunk.
360 template <IsGenerativeModel M,
bool UseEntropy>
361 template <
typename Iterator>
362 Iterator rPOMCP<M, UseEntropy>::findBestBonusA(Iterator begin,
const Iterator end,
const unsigned count) {
// +1 keeps the logarithm defined on the first visit (count == 0).
365 double logCount = std::log(count + 1.0);
368 auto evaluationFunction = [
this, logCount](
const ANode & an){
369 return an.V + exploration_ * std::sqrt( logCount / an.N );
// Manual max-scan: evaluate the first action, then compare the rest.
// (Uses `begin < end`, so Iterator is presumably random-access -- confirm.)
372 auto bestIterator = begin++;
373 double bestValue = evaluationFunction(*bestIterator);
375 for ( ; begin < end; ++begin ) {
376 double actionValue = evaluationFunction(*begin);
377 if ( actionValue > bestValue ) {
378 bestValue = actionValue;
379 bestIterator = begin;
// Fragment of a setter (signature line not visible): updates the number of
// belief particles used when constructing new root nodes.
386 template <IsGenerativeModel M,
bool UseEntropy>
388 beliefSize_ = beliefSize;
391 template <IsGenerativeModel M,
bool UseEntropy>
396 template <IsGenerativeModel M,
bool UseEntropy>
401 template <IsGenerativeModel M,
bool UseEntropy>
406 template <IsGenerativeModel M,
bool UseEntropy>
411 template <IsGenerativeModel M,
bool UseEntropy>
416 template <IsGenerativeModel M,
bool UseEntropy>
421 template <IsGenerativeModel M,
bool UseEntropy>