// AI-Toolbox/BlindStrategies_8hpp_source.html

#ifndef AI_TOOLBOX_POMDP_BLIND_STRATEGIES_HEADER_FILE

#define AI_TOOLBOX_POMDP_BLIND_STRATEGIES_HEADER_FILE


#include <AIToolbox/Utils/Core.hpp>

#include <AIToolbox/Utils/Prune.hpp>


#include <AIToolbox/MDP/Utils.hpp>

#include <AIToolbox/POMDP/Types.hpp>

#include <AIToolbox/POMDP/TypeTraits.hpp>


namespace AIToolbox::POMDP {

    /**
     * @brief Computes a simple lower bound for the value function of a POMDP.
     *
     * For each action, the bound is the value obtained by a "blind" policy
     * that keeps taking that same action forever. The result is one alpha
     * vector per action.
     *
     * The per-action computation is iterative: it stops after `horizon_`
     * iterations, or earlier if the tolerance is enabled and the largest
     * per-state change between two consecutive iterations does not exceed it.
     */
    class BlindStrategies {

        public:

            /**
             * @brief Basic constructor.
             *
             * The definition is not in this header; presumably it stores the
             * two parameters (possibly validating them) -- confirm against
             * the implementation file.
             *
             * @param horizon The maximum number of iterations performed for each action.
             * @param tolerance The convergence threshold; defaults to 0.001.
             */
            BlindStrategies(unsigned horizon, double tolerance = 0.001);


            /**
             * @brief Computes the blind-strategy alpha vectors for the input model.
             *
             * @param m The model to compute the bound for.
             * @param fasterConvergence If true, each alpha vector is initialized to the
             *        action's minimum immediate reward divided by
             *        max(0.0001, 1 - discount); if false, it is initialized to the
             *        action's immediate rewards.
             *
             * @return A tuple containing the largest final variation over all actions
             *         (0.0 when the tolerance is disabled) and the list of computed
             *         alpha vectors, one per action.
             */
            template <IsModel M>

            std::tuple<double, VList> operator()(const M & m, bool fasterConvergence);


            /**
             * @brief Sets the tolerance parameter.
             *
             * A tolerance that `checkDifferentSmall` considers equal to zero
             * disables the convergence check in operator(), which then always
             * runs for the full horizon. Any validation performed here is not
             * visible in this header.
             *
             * @param tolerance The new tolerance.
             */
            void setTolerance(double tolerance);


            /**
             * @brief Sets the maximum number of iterations performed for each action.
             *
             * @param h The new horizon.
             */
            void setHorizon(unsigned h);


            /**
             * @brief Returns the currently set tolerance (definition not shown here).
             */
            double getTolerance() const;


            /**
             * @brief Returns the currently set horizon (definition not shown here).
             */
            unsigned getHorizon() const;


        private:

            // Upper limit on the iterations operator() performs per action.
            // NOTE(review): stored as size_t while the public API uses unsigned.
            size_t horizon_;

            // Convergence threshold compared against the per-iteration variation.
            double tolerance_;

    };


    /**
     * @brief Computes the blind-strategy lower bound for the input POMDP.
     *
     * This function produces a very simple lower bound for the POMDP. The
     * bound for each action is computed assuming to take the same action
     * forever (so the bound for action 0 assumes to forever take action 0,
     * the bound for action 1 assumes to take action 1, etc.).
     *
     * For each action `a` the alpha vector is refined with the update
     *
     *     alpha <- R(., a) + discount * T_a * alpha
     *
     * for at most `horizon_` iterations. If the tolerance is enabled (i.e.
     * `checkDifferentSmall(tolerance_, 0.0)` holds), iteration for an action
     * also stops as soon as the largest absolute per-state change between two
     * consecutive iterations is not greater than `tolerance_`.
     *
     * @param m The model to compute the bound for.
     * @param fasterConvergence If true, every alpha vector starts from the
     *        action's minimum immediate reward divided by
     *        max(0.0001, 1 - discount); otherwise it starts from the action's
     *        immediate rewards.
     *
     * @return A tuple whose first element is the largest variation, over all
     *         actions, measured at each action's last iteration (0.0 if the
     *         tolerance is disabled), and whose second element contains one
     *         entry per action, in action order.
     */
    template <IsModel M>
    std::tuple<double, VList> BlindStrategies::operator()(const M & m, const bool fasterConvergence) {
        // Immediate rewards, transposed so that row `a` holds the rewards of action `a`.
        //
        // The lambda's return type is spelled out on purpose. With a deduced
        // return type the lambda would return Eigen's lazy transpose
        // expression, which only references its operand; if
        // computeImmediateRewards() returns its matrix by value, that operand
        // is a temporary which is destroyed before `ir` is built from it.
        // Returning a QFunction forces the evaluation while the operand is
        // still alive.
        const MDP::QFunction ir = [&]() -> MDP::QFunction {
            if constexpr(MDP::IsModelEigen<M>) return m.getRewardFunction().transpose();
            else return MDP::computeImmediateRewards(m).transpose();
        }();

        // Loop-invariant model parameters.
        const size_t S = m.getS();
        const size_t A = m.getA();
        const double discount = m.getDiscount();

        const bool useTolerance = checkDifferentSmall(tolerance_, 0.0);

        VList retval;
        retval.reserve(A);

        double maxVariation = 0.0;
        for (size_t a = 0; a < A; ++a) {
            auto newAlpha = Vector(S);
            auto oldAlpha = Vector(S);

            // Note that here we can take the minimum for each action
            // separately, since the implied policy will take that action
            // forever anyway so there cannot be "cross-pollination" between
            // different actions.
            if (fasterConvergence)
                oldAlpha.fill(ir.row(a).minCoeff() / std::max(0.0001, 1.0 - discount));
            else
                oldAlpha = ir.row(a);

            // Start above the threshold so that the first iteration always runs.
            double variation = tolerance_ * 2;

            for (size_t timestep = 0;
                 timestep < horizon_ && ( !useTolerance || variation > tolerance_ );
                 ++timestep)
            {
                if constexpr(MDP::IsModelEigen<M>) {
                    newAlpha = ir.row(a) + (discount * m.getTransitionFunction(a) * oldAlpha).transpose();
                } else {
                    // Same update as above, going through the generic
                    // per-transition model interface.
                    newAlpha = ir.row(a);
                    for (size_t s = 0; s < S; ++s) {
                        double sum = 0.0;
                        for (size_t s1 = 0; s1 < S; ++s1)
                            sum += m.getTransitionProbability(s, a, s1) * oldAlpha[s1];
                        newAlpha[s] += discount * sum;
                    }
                }

                if (useTolerance)
                    variation = (oldAlpha - newAlpha).cwiseAbs().maxCoeff();

                // newAlpha is fully reassigned at the start of the next iteration.
                oldAlpha = std::move(newAlpha);
            }

            maxVariation = std::max(maxVariation, variation);
            retval.emplace_back(std::move(oldAlpha), a, VObs(0));
        }

        return std::make_tuple(useTolerance ? maxVariation : 0.0, std::move(retval));
    }

}


#endif