// AI-Toolbox/FastInformedBound_8hpp_source.html

#ifndef AI_TOOLBOX_POMDP_FAST_INFORMED_BOUND_HEADER_FILE

#define AI_TOOLBOX_POMDP_FAST_INFORMED_BOUND_HEADER_FILE


#include <AIToolbox/Utils/Core.hpp>


#include <AIToolbox/MDP/Utils.hpp>

#include <AIToolbox/POMDP/Types.hpp>

#include <AIToolbox/POMDP/TypeTraits.hpp>

#include <AIToolbox/POMDP/Utils.hpp>


namespace AIToolbox::POMDP {

    /**
     * @brief Iterative solver that builds an upper-bound Q-function for a POMDP.
     *
     * The bound is refined with repeated sweeps of the update
     *
     *     Q(s,a) = R(s,a) + gamma * Sum_o max_a' Sum_s' P(s',o|s,a) * Q(s',a')
     *
     * Iteration stops once the configured horizon is reached or, when the
     * tolerance check is enabled, once the largest change between two
     * consecutive sweeps is no greater than the tolerance.
     */
    class FastInformedBound {
        public:
            /**
             * @brief Builds a solver with the given iteration limits.
             *
             * The constructor body is not defined in this header; the
             * parameters are presumably stored in the matching private
             * members.
             *
             * @param horizon Maximum number of update sweeps to perform.
             * @param tolerance Convergence threshold on the largest change of
             *                  a Q-value between sweeps. A tolerance that is
             *                  (approximately) zero disables the check.
             */
            FastInformedBound(unsigned horizon, double tolerance = 0.001);

            /**
             * @brief Computes the bound, building the SOSA matrices internally.
             *
             * @param m The model to solve.
             * @param oldQ Optional starting Q-function; when empty a default
             *             optimistic one is created.
             *
             * @return The last measured variation (0.0 if the tolerance check
             *         is disabled) and the resulting Q-function.
             */
            template <IsModel M>
            std::tuple<double, MDP::QFunction> operator()(const M & m, const MDP::QFunction & oldQ = {});

            /**
             * @brief Computes the bound reusing caller-provided SOSA matrices.
             *
             * @param m The model to solve.
             * @param sosa Container indexed as sosa[a][o], each entry being a
             *             matrix that can be multiplied by a Q-function.
             * @param oldQ Optional starting Q-function; when empty a default
             *             optimistic one is created.
             *
             * @return The last measured variation (0.0 if the tolerance check
             *         is disabled) and the resulting Q-function.
             */
            template <IsModel M, typename SOSA>
            std::tuple<double, MDP::QFunction> operator()(const M & m, const SOSA & sosa, MDP::QFunction oldQ = {});

            // --- Horizon -----------------------------------------------------

            /// @brief Sets the maximum number of update sweeps.
            void setHorizon(unsigned h);

            /// @brief Returns the maximum number of update sweeps.
            unsigned getHorizon() const;

            // --- Tolerance ---------------------------------------------------

            /// @brief Sets the convergence threshold.
            void setTolerance(double tolerance);

            /// @brief Returns the convergence threshold.
            double getTolerance() const;

        private:
            // Upper limit on the number of sweeps done by operator().
            size_t horizon_;
            // Convergence threshold; (approximately) zero disables the check.
            double tolerance_;
    };


    /**
     * @brief Convenience overload: builds the SOSA matrices and delegates.
     *
     * @param m The model to solve.
     * @param oldQ Starting Q-function, forwarded (copied) to the main overload.
     *
     * @return Whatever the three-argument overload returns for the same inputs.
     */
    template <IsModel M>
    std::tuple<double, MDP::QFunction> FastInformedBound::operator()(const M & m, const MDP::QFunction & oldQ) {
        // Bind the result of makeSOSA so it stays alive for the whole call.
        const auto & sosa = makeSOSA(m);
        return (*this)(m, sosa, oldQ);
    }


    /**
     * @brief Computes the Fast Informed Bound Q-function for the input model.
     *
     * Starting from oldQ (or from an optimistic default when oldQ is empty),
     * the Q-function is repeatedly updated with
     *
     *     Q(s,a) = R(s,a) + gamma * Sum_o max_a' Sum_s' P(s',o|s,a) * Q(s',a')
     *
     * until either horizon_ sweeps have been done or, if the tolerance check
     * is enabled, the largest absolute change between two consecutive sweeps
     * is no greater than tolerance_.
     *
     * @param m The model to solve.
     * @param sosa Container indexed as sosa[a][o]; each entry is multiplied by
     *             the current Q-function.
     * @param oldQ Starting Q-function. If it has no elements it is resized to
     *             S x A and filled with an upper bound on the discounted return.
     *
     * @return A tuple with the last measured variation (0.0 when the tolerance
     *         check is disabled) and the final Q-function.
     */
    template <IsModel M, typename SOSA>
    std::tuple<double, MDP::QFunction> FastInformedBound::operator()(const M & m, const SOSA & sosa, MDP::QFunction oldQ) {
        // decltype(auto) preserves a reference return from the model, so an
        // already available reward matrix is not copied. A matrix computed on
        // the fly is returned by value and kept alive by the const reference.
        const auto & ir = [&]() -> decltype(auto) {
            if constexpr (IsModelEigen<M>) return m.getRewardFunction();
            else return computeImmediateRewards(m);
        }();
        auto newQ = MDP::QFunction(m.getS(), m.getA());

        if (oldQ.size() == 0) {
            oldQ.resize(m.getS(), m.getA());

            double max;
            using Tmp = std::remove_cvref_t<decltype(ir)>;
            if constexpr(std::is_base_of_v<Eigen::SparseMatrixBase<Tmp>, Tmp>) {
                // A sparse matrix only stores nonZeros() values, while size()
                // is rows * cols: we must not read size() entries from the
                // value array. We walk the stored entries instead, which also
                // works when the matrix is not in compressed mode.
                //
                // Entries that are not stored are implicit zeros and are real
                // rewards, so they take part in the max: we seed with 0.0
                // whenever at least one of them exists.
                bool seeded = ir.nonZeros() < ir.size();
                max = 0.0;
                for (Eigen::Index k = 0; k < ir.outerSize(); ++k) {
                    for (typename Tmp::InnerIterator it(ir, k); it; ++it) {
                        if (seeded) {
                            max = std::max(max, static_cast<double>(it.value()));
                        } else {
                            max = it.value();
                            seeded = true;
                        }
                    }
                }
            } else {
                max = ir.maxCoeff();
            }

            // Note that here we take the max over all IR: since we're
            // computing an upper bound, we want to assume that we're going to
            // do the best possible thing after each action forever.
            // The denominator is clamped to avoid dividing by zero when the
            // discount is 1.
            oldQ.fill(max / std::max(0.0001, 1.0 - m.getDiscount()));
        }

        unsigned timestep = 0;
        // A tolerance of (approximately) zero means we only stop on horizon.
        const bool useTolerance = checkDifferentSmall(tolerance_, 0.0);
        double variation = tolerance_ * 2; // Make it bigger
        while ( timestep < horizon_ && ( !useTolerance || variation > tolerance_ ) ) {
            ++timestep;
            newQ.setZero();
            // Q(s,a) = R(s,a) + gamma * Sum_o max_a' Sum_s' P(s',o|s,a) * Q(s',a')
            for (size_t a = 0; a < m.getA(); ++a)
                for (size_t o = 0; o < m.getO(); ++o)
                    newQ.col(a) += (sosa[a][o] * oldQ).rowwise().maxCoeff();
            newQ *= m.getDiscount();
            newQ += ir;

            if (useTolerance)
                variation = (oldQ - newQ).cwiseAbs().maxCoeff();

            // After the swap oldQ always holds the most recent estimate.
            std::swap(oldQ, newQ);
        }
        return std::make_tuple(useTolerance ? variation : 0.0, std::move(oldQ));
    }

}


#endif