AIToolbox
A library that offers tools for AI problem solving.
#ifndef AI_TOOLBOX_MDP_THOMPSON_MODEL_HEADER_FILE
#define AI_TOOLBOX_MDP_THOMPSON_MODEL_HEADER_FILE

// (The #include directives for <random>, <tuple>, <stdexcept>, <cmath> and the
//  AIToolbox Experience/Seeder/probability headers are omitted from this extract.)
namespace AIToolbox::MDP {
    template <IsExperience E>
    class ThompsonModel {
        public:
            // ... (remaining public interface elided in this extract; the
            //      corresponding definitions appear below)

            // Resamples the transition and reward functions of a single
            // state-action pair from the posteriors implied by the Experience.
            void sync(size_t s, size_t a);

            // Samples a new state and a reward for the given state-action pair.
            std::tuple<size_t, double> sampleSR(size_t s, size_t a) const;

        private:
            size_t S, A;
            double discount_;
            const E & experience_;
            std::vector<Matrix2D> transitions_;   // one sampled transition matrix per action
            Matrix2D rewards_;                    // sampled expected rewards, S x A
            mutable RandomEngine rand_;
    };
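The IsExperience and IsExperienceEigen constraints are defined elsewhere in the library. Judging only from the calls made in this file, the template parameter E has to expose roughly the interface sketched below; the concept name is hypothetical and the real IsExperience requirement may be stricter.

    #include <concepts>
    #include <cstddef>

    // Hypothetical sketch only; the real constraint is AIToolbox's IsExperience concept.
    template <typename E>
    concept ExperienceLikeForThompson = requires(const E e, std::size_t s, std::size_t a, std::size_t s1) {
        { e.getS() }              -> std::convertible_to<std::size_t>;  // number of states
        { e.getA() }              -> std::convertible_to<std::size_t>;  // number of actions
        { e.getVisits(s, a, s1) } -> std::convertible_to<double>;       // visit count of (s, a, s')
        { e.getVisitsSum(s, a) }  -> std::convertible_to<double>;       // total visits of (s, a)
        { e.getReward(s, a) }     -> std::convertible_to<double>;       // running mean reward
        { e.getM2(s, a) }         -> std::convertible_to<double>;       // sum of squared reward deviations
    };
    // Eigen-backed Experiences additionally expose getVisitsTable(a), used in the
    // faster branch of sync() below.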
    template <IsExperience E>
    ThompsonModel<E>::ThompsonModel(const E & exp, const double discount) :
            S(exp.getS()), A(exp.getA()), experience_(exp),
            transitions_(A, Matrix2D(S, S)), rewards_(S, A),
            rand_(Seeder::getSeed())
    {
        setDiscount(discount);
        sync();
    }

    template <IsExperience E>
    void ThompsonModel<E>::setDiscount(const double d) {
        if ( d <= 0.0 || d > 1.0 )
            throw std::invalid_argument("Discount parameter must be in (0,1]");
        discount_ = d;
    }
    // Resamples every state-action pair.
    template <IsExperience E>
    void ThompsonModel<E>::sync() {
        for ( size_t a = 0; a < A; ++a )
            for ( size_t s = 0; s < S; ++s )
                sync(s, a);
    }
    template <IsExperience E>
    void ThompsonModel<E>::sync(const size_t s, const size_t a) {
        // Transitions: draw the row T(s, a, .) from a Dirichlet posterior whose
        // concentration parameters are the recorded visit counts plus 0.5.
        if constexpr (IsExperienceEigen<E>) {
            sampleDirichletDistribution(
                experience_.getVisitsTable(a).row(s).array().template cast<double>() + 0.5,
                rand_, transitions_[a].row(s)
            );
        } else {
            // Equivalent fallback: independent Gamma(visits + 0.5, 1) draws, then normalize.
            double sum = 0.0;
            for ( size_t s1 = 0; s1 < S; ++s1 ) {
                std::gamma_distribution<double> dist(experience_.getVisits(s, a, s1) + 0.5, 1.0);
                transitions_[a](s, s1) = dist(rand_);
                sum += transitions_[a](s, s1);
            }
            transitions_[a].row(s) /= sum;
        }

        // Rewards: sample the expected reward from the posterior of its mean, using
        // the running mean and sum of squared deviations (M2) tracked by the Experience.
        const auto visits = experience_.getVisitsSum(s, a);
        const auto MLEReward = experience_.getReward(s, a);
        const auto M2 = experience_.getM2(s, a);

        if ( visits < 2 ) {
            rewards_(s, a) = MLEReward;
        } else {
            std::student_t_distribution<double> dist(visits - 1);
            rewards_(s, a) = MLEReward + dist(rand_) * std::sqrt(M2 / (visits * (visits - 1)));
        }
    }
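As a self-contained illustration of the two sampling tricks used in sync() above (independent of AIToolbox, with made-up counts and reward statistics), a Dirichlet draw can be obtained by normalizing independent Gamma(count + 0.5, 1) variates, and a plausible mean reward by scaling a Student-t draw with sqrt(M2 / (n * (n - 1))):

    #include <cmath>
    #include <iostream>
    #include <random>
    #include <vector>

    int main() {
        std::mt19937 rng(42);

        // Transition row for one (s, a): Dirichlet(visits + 0.5) via normalized Gamma draws.
        const std::vector<double> visits = {12, 3, 0, 7};          // made-up visit counts
        std::vector<double> row(visits.size());
        double sum = 0.0;
        for (std::size_t s1 = 0; s1 < visits.size(); ++s1) {
            std::gamma_distribution<double> gamma(visits[s1] + 0.5, 1.0);
            row[s1] = gamma(rng);
            sum += row[s1];
        }
        for (double & p : row) p /= sum;                           // now a probability distribution

        // Reward for the same (s, a): posterior of the mean via a Student-t draw.
        const double n = 22.0, mean = 1.3, M2 = 4.8;               // made-up statistics
        std::student_t_distribution<double> t(n - 1);
        const double sampledReward = mean + t(rng) * std::sqrt(M2 / (n * (n - 1)));

        for (double p : row) std::cout << p << ' ';
        std::cout << "| sampled reward: " << sampledReward << '\n';
    }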
    template <IsExperience E>
    std::tuple<size_t, double> ThompsonModel<E>::sampleSR(const size_t s, const size_t a) const {
        const size_t s1 = sampleProbability(S, transitions_[a].row(s), rand_);
        return std::make_tuple(s1, rewards_(s, a));
    }

    template <IsExperience E>
    double ThompsonModel<E>::getTransitionProbability(const size_t s, const size_t a, const size_t s1) const {
        return transitions_[a](s, s1);
    }

    template <IsExperience E>
    double ThompsonModel<E>::getExpectedReward(const size_t s, const size_t a) const {
        return rewards_(s, a);
    }
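The next-state line in sampleSR() above is reconstructed here with what is presumably the library's sampleProbability helper; whichever helper is used, the draw amounts to sampling from the categorical distribution defined by the sampled transition row, which in plain standard C++ looks like:

    #include <cstddef>
    #include <random>
    #include <vector>

    // A categorical draw over an already-normalized transition row.
    std::size_t sampleNextState(const std::vector<double> & row, std::mt19937 & rng) {
        std::discrete_distribution<std::size_t> dist(row.begin(), row.end());
        return dist(rng);
    }

    int main() {
        std::mt19937 rng(std::random_device{}());
        const std::vector<double> row = {0.7, 0.2, 0.1};    // example sampled row for some (s, a)
        return static_cast<int>(sampleNextState(row, rng)); // exit code = sampled next state
    }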
    template <IsExperience E>
    bool ThompsonModel<E>::isTerminal(const size_t s) const {
        // A state is terminal when every action keeps the agent in it with probability 1.
        for ( size_t a = 0; a < A; ++a )
            if ( !checkEqualSmall(1.0, transitions_[a](s, s)) )
                return false;
        return true;
    }

    // ... (the remaining member definitions are omitted from this extract)
}

#endif
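Finally, a minimal usage sketch, assuming the usual AIToolbox::MDP::Experience interface (an Experience(S, A) constructor plus record(s, a, s1, reward)); header paths and exact signatures should be checked against the installed version of the library:

#include <cstddef>

#include <AIToolbox/MDP/Experience.hpp>
#include <AIToolbox/MDP/ThompsonModel.hpp>

int main() {
    constexpr std::size_t S = 4, A = 2;

    // Record a few made-up transitions and rewards.
    AIToolbox::MDP::Experience exp(S, A);
    exp.record(0, 0, 1, 1.0);
    exp.record(0, 0, 1, 0.5);
    exp.record(1, 1, 2, 2.0);

    // Draw one plausible MDP from the posterior and query it.
    AIToolbox::MDP::ThompsonModel model(exp, 0.9);
    const auto [s1, r] = model.sampleSR(0, 0);
    (void)s1; (void)r;

    model.sync(0, 0);   // re-sample the (0, 0) entry after new data arrives
}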