// AI-Toolbox/PolicyEvaluation_8hpp_source.html

#ifndef AI_TOOLBOX_MDP_POLICY_EVALUATION_HEADER_FILE

#define AI_TOOLBOX_MDP_POLICY_EVALUATION_HEADER_FILE


#include <tuple>

#include <iterator>


#include <AIToolbox/Logging.hpp>

#include <AIToolbox/MDP/Types.hpp>

#include <AIToolbox/MDP/TypeTraits.hpp>

#include <AIToolbox/MDP/Utils.hpp>

#include <AIToolbox/Utils/Probability.hpp>

#include <AIToolbox/MDP/Policies/PolicyInterface.hpp>


namespace AIToolbox::MDP {

    /**
     * @brief Iteratively evaluates a fixed policy on a model.
     *
     * Calling the object on a policy repeatedly updates a value vector and
     * a QFunction for that policy. The process stops after at most
     * `horizon` iterations, or earlier once the largest absolute change of
     * the values between two consecutive iterations is no longer greater
     * than the tolerance.
     *
     * The model is kept by reference and is not owned by this class.
     *
     * @tparam M The type of the model to evaluate policies on.
     */
    template <IsModel M>

    class PolicyEvaluation {

        public:

            /**
             * @brief Basic constructor.
             *
             * @param m The model to evaluate policies on; it is stored by reference.
             * @param horizon The maximum number of iterations to perform.
             * @param tolerance The early-stop threshold; a negative value makes
             *                  setTolerance() throw std::invalid_argument. If
             *                  checkDifferentSmall() does not consider it different
             *                  from 0, the threshold is not used and all `horizon`
             *                  iterations are run.
             * @param v The values to start the evaluation from. If its size differs
             *          from the model's getS(), zeros are used instead.
             */
            PolicyEvaluation(const M & m, unsigned horizon, double tolerance = 0.001, Values v = Values());


            /**
             * @brief Evaluates the input policy.
             *
             * @param p The policy to evaluate.
             *
             * @return A tuple containing the last measured variation of the values
             *         (0.0 if the tolerance was not used), the computed values and
             *         the computed QFunction.
             */
            std::tuple<double, Values, QFunction> operator()(const PolicyInterface & p);


            /**
             * @brief Sets the tolerance used to stop the evaluation early.
             *
             * @param e The new tolerance; a negative value results in a
             *          std::invalid_argument exception.
             */
            void setTolerance(double e);


            /**
             * @brief Sets the maximum number of iterations.
             *
             * @param h The new horizon.
             */
            void setHorizon(unsigned h);


            /**
             * @brief Sets the values the evaluation starts from.
             *
             * @param v The new starting values. They are only used by operator()
             *          if their size matches the model's getS().
             */
            void setValues(Values v);


            /**
             * @brief Returns the currently set tolerance.
             */
            double getTolerance() const;


            /**
             * @brief Returns the currently set horizon.
             */
            unsigned getHorizon() const;


            /**
             * @brief Returns the currently set starting values, exactly as provided.
             */
            const Values & getValues() const;


        private:

            // Parameters

            // Early-stop threshold; only assigned through setTolerance().
            double tolerance_;

            // Maximum number of iterations performed by operator().
            unsigned horizon_;

            // Starting values as provided by the user (possibly empty or mis-sized).
            Values vParameter_;

            // Model being evaluated; held by reference, not owned.
            const M & model_;


            // Internals

            // Filled by the constructor only when M does not satisfy IsModelEigen.
            QFunction immediateRewards_;

            // Working value vector of operator(); moved into its return value.
            Values v1_;

            // Results of model_.getS() and model_.getA(), cached at construction.
            size_t S, A;

    };


    /**
     * @brief Builds an evaluator bound to the input model.
     *
     * The model is stored by reference. The tolerance goes through
     * setTolerance(), so a negative value throws std::invalid_argument.
     *
     * @param m The model to evaluate policies on.
     * @param horizon The maximum number of iterations to perform.
     * @param tolerance The early-stop threshold.
     * @param v The values to start the evaluation from.
     */
    template <IsModel M>
    PolicyEvaluation<M>::PolicyEvaluation(const M & m, const unsigned horizon, const double tolerance, Values v) :
            horizon_(horizon), vParameter_(std::move(v)), model_(m), S(0), A(0)
    {
        // Validate before touching the model, so a bad tolerance fails first.
        setTolerance(tolerance);

        // Cache the model's sizes so we do not have to query them again.
        S = model_.getS();
        A = model_.getA();

        // Models satisfying IsModelEigen are asked for their reward function
        // directly during evaluation; for all the others we precompute the
        // immediate rewards once here.
        if constexpr ( !IsModelEigen<M> ) {
            immediateRewards_ = computeImmediateRewards(model_);
        }
    }


    /**
     * @brief Evaluates the input policy on the stored model.
     *
     * The evaluation starts from the configured starting values, or from
     * zeros when their size does not match the cached model size S. Each
     * iteration discounts the current values, computes a QFunction from
     * them, and then sets each state's value to the product of that state's
     * QFunction row with the policy's row for the same state.
     *
     * Iterations stop after `horizon_` steps. If checkDifferentSmall()
     * reports the tolerance as different from 0, they also stop as soon as
     * the largest absolute change of the values in one iteration is not
     * greater than the tolerance.
     *
     * @param policy The policy to evaluate; getPolicy() is read once up front.
     *
     * @return A tuple containing the last measured variation (0.0 if the
     *         tolerance is not used; twice the tolerance if it is used but
     *         no iteration was run), the final values and the final QFunction.
     */
    template <IsModel M>
    std::tuple<double, Values, QFunction> PolicyEvaluation<M>::operator()(const PolicyInterface & policy) {
        // Choose the starting point: the user-provided values when they fit
        // the model, zeros otherwise.
        const size_t startingSize = vParameter_.size();
        if ( startingSize == S ) {
            v1_ = vParameter_;
        } else {
            // An empty parameter just selects the default, so we only warn
            // when values of the wrong size were actually provided.
            if ( startingSize != 0 ) {
                AI_LOGGER(AI_SEVERITY_WARNING, "Size of starting value function is incorrect, ignoring...");
            }
            v1_ = Values(S);
            v1_.setZero();
        }

        // Start above the tolerance so that the first iteration is never skipped.
        double variation = tolerance_ * 2;

        Values previous;
        QFunction q = makeQFunction(S, A);
        const auto p = policy.getPolicy();

        const bool useTolerance = checkDifferentSmall(tolerance_, 0.0);

        for ( unsigned completed = 0; completed < horizon_; ++completed ) {
            // Written as a negated '>' (rather than '<=') so that a NaN
            // variation stops the loop too.
            if ( useTolerance && !(variation > tolerance_) )
                break;

            AI_LOGGER(AI_SEVERITY_DEBUG, "Processing timestep " << (completed + 1));

            previous = v1_;

            // The discount is applied in place on the values, before they
            // are used to build the QFunction.
            v1_ *= model_.getDiscount();

            // Use the model's own reward function when it exposes one,
            // otherwise the immediate rewards computed in the constructor.
            if constexpr ( IsModelEigen<M> )
                q = computeQFunction(model_, v1_, model_.getRewardFunction());
            else
                q = computeQFunction(model_, v1_, immediateRewards_);

            // Collapse the QFunction into values following the policy.
            for ( size_t state = 0; state < S; ++state )
                v1_(state) = q.row(state) * p.row(state).transpose();

            // Only tracked when the tolerance is in use; otherwise we simply
            // run for all the timesteps.
            if ( useTolerance )
                variation = (v1_ - previous).cwiseAbs().maxCoeff();
        }

        // The results are only as precise as the tolerance/horizon allow.
        // v1_ is moved out here; it is re-initialized at the start of each call.
        const double reportedVariation = useTolerance ? variation : 0.0;
        return std::make_tuple(reportedVariation, std::move(v1_), std::move(q));
    }


    /**
     * @brief Sets the tolerance used to stop the evaluation early.
     *
     * @param t The new tolerance; it must be >= 0.
     *
     * @throws std::invalid_argument If t is negative or NaN.
     */
    template <IsModel M>
    void PolicyEvaluation<M>::setTolerance(const double t) {
        // Checked as a negated '>=' on purpose: NaN fails every ordered
        // comparison, so a plain 't < 0.0' test would let it through even
        // though it does not satisfy the '>= 0' requirement.
        if ( !(t >= 0.0) ) throw std::invalid_argument("Tolerance must be >= 0");
        tolerance_ = t;
    }


    /**
     * @brief Sets the maximum number of iterations performed by operator().
     *
     * @param h The new horizon.
     */
    template <IsModel M>
    void PolicyEvaluation<M>::setHorizon(const unsigned h) {
        horizon_ = h;
    }


    /**
     * @brief Sets the values the evaluation starts from.
     *
     * The input is stored as-is, with no size check at this point.
     *
     * @param v The new starting values; taken by value and moved into place.
     */
    template <IsModel M>
    void PolicyEvaluation<M>::setValues(Values v) {
        vParameter_ = std::move(v);
    }


    /**
     * @brief Returns the currently set tolerance.
     */
    template <IsModel M>
    double PolicyEvaluation<M>::getTolerance() const {
        return tolerance_;
    }


    /**
     * @brief Returns the currently set horizon.
     */
    template <IsModel M>
    unsigned PolicyEvaluation<M>::getHorizon() const {
        return horizon_;
    }


    /**
     * @brief Returns the currently set starting values, exactly as provided.
     */
    template <IsModel M>
    const Values & PolicyEvaluation<M>::getValues() const {
        return vParameter_;
    }

}


#endif