// AI-Toolbox/Dyna2_8hpp_source.html

#ifndef AI_TOOLBOX_MDP_DYNA2_HEADER_FILE

#define AI_TOOLBOX_MDP_DYNA2_HEADER_FILE


#include <AIToolbox/MDP/Types.hpp>

#include <AIToolbox/MDP/TypeTraits.hpp>

#include <AIToolbox/MDP/Algorithms/SARSAL.hpp>

#include <AIToolbox/Bandit/Policies/RandomPolicy.hpp>

#include <AIToolbox/MDP/Policies/BanditPolicyAdaptor.hpp>


namespace AIToolbox::MDP {

    /**
     * @brief Learner that pairs a "permanent" and a "transient" SARSAL instance.
     *
     * The permanent learner is only updated from the transitions passed to
     * stepUpdateQ(). The transient learner receives those same transitions
     * and, in addition, the experience sampled from the generative model in
     * batchUpdateQ(). resetTransientLearning() copies the permanent QFunction
     * over the transient one.
     *
     * The model is stored by reference, so it must outlive this object.
     *
     * @tparam M The type of the generative model sampled during batch updates.
     */
    template <IsGenerativeModel M>
    class Dyna2 {
        public:
            /**
             * @brief Basic constructor.
             *
             * Both internal SARSAL instances are built from the same model,
             * alpha, lambda and tolerance. The internal policy defaults to a
             * BanditPolicyAdaptor wrapping Bandit::RandomPolicy.
             *
             * @param m The model sampled during batch updates (kept by reference).
             * @param alpha Forwarded to both SARSAL instances.
             * @param lambda Forwarded to both SARSAL instances.
             * @param tolerance Forwarded to both SARSAL instances.
             * @param n The number of model samples taken by each batchUpdateQ() call.
             */
            explicit Dyna2(const M & m, double alpha = 0.1, double lambda = 0.9, double tolerance = 0.001, unsigned n = 50);

            // ---------------------------------------------------------------
            // Learning
            // ---------------------------------------------------------------

            /**
             * @brief Feeds one transition to both the permanent and the transient learner.
             *
             * @param s The previous state.
             * @param a The action performed.
             * @param s1 The new state.
             * @param a1 The action to be performed from the new state.
             * @param rew The reward obtained.
             */
            void stepUpdateQ(size_t s, size_t a, size_t s1, size_t a1, double rew);

            /**
             * @brief Updates the transient learner with N transitions sampled from the model.
             *
             * Sampling starts at the input state, follows the internal
             * policy, and restarts from the input state every time the
             * sampled next state is terminal. The transient traces are
             * cleared before sampling starts.
             *
             * @param s The state the simulated experience starts from.
             */
            void batchUpdateQ(size_t s);

            /**
             * @brief Overwrites the transient QFunction with the permanent one.
             */
            void resetTransientLearning();

            // ---------------------------------------------------------------
            // Configuration: setters
            // ---------------------------------------------------------------

            /**
             * @brief Replaces the policy used to pick actions in batchUpdateQ().
             *
             * This object takes ownership of the pointer and destroys the
             * previously held policy.
             *
             * @param p The new policy, allocated with new.
             */
            void setInternalPolicy(PolicyInterface * p);

            /**
             * @brief Sets the lambda parameter of the permanent learner.
             *
             * @param l The new lambda.
             */
            void setPermanentLambda(double l);

            /**
             * @brief Sets the lambda parameter of the transient learner.
             *
             * @param l The new lambda.
             */
            void setTransientLambda(double l);

            /**
             * @brief Sets the number of model samples taken by each batchUpdateQ() call.
             *
             * @param n The new number of samples.
             */
            void setN(unsigned n);

            /**
             * @brief Sets the tolerance of both the permanent and the transient learner.
             *
             * @param t The new tolerance.
             */
            void setTolerance(double t);

            // ---------------------------------------------------------------
            // Configuration: getters
            // ---------------------------------------------------------------

            /**
             * @brief Returns the lambda parameter of the permanent learner.
             */
            double getPermanentLambda() const;

            /**
             * @brief Returns the lambda parameter of the transient learner.
             */
            double getTransientLambda() const;

            /**
             * @brief Returns the number of model samples taken by each batchUpdateQ() call.
             */
            unsigned getN() const;

            /**
             * @brief Returns the tolerance, as reported by the permanent learner.
             */
            double getTolerance() const;

            // ---------------------------------------------------------------
            // Accessors
            // ---------------------------------------------------------------

            /**
             * @brief Returns a reference to the QFunction of the permanent learner.
             */
            const QFunction & getPermanentQFunction() const;

            /**
             * @brief Returns a reference to the QFunction of the transient learner.
             */
            const QFunction & getTransientQFunction() const;

            /**
             * @brief Returns a reference to the model passed at construction.
             */
            const M & getModel() const;

        private:
            // NOTE: the constructor initializes members in this declaration
            // order, and model_ is read while building the members below it.
            // Do not reorder.

            // Number of model samples per batchUpdateQ() call.
            unsigned N;
            // Non-owning reference to the generative model.
            const M & model_;
            // Learner updated only from real experience (stepUpdateQ).
            SARSAL permanentLearning_;
            // Learner updated from both real and simulated experience.
            SARSAL transientLearning_;
            // Owned policy used to select actions during batchUpdateQ().
            std::unique_ptr<PolicyInterface> internalPolicy_;
    };


    /**
     * @brief Builds both SARSAL learners from the same parameters and installs the default policy.
     *
     * The default internal policy is a BanditPolicyAdaptor wrapping
     * Bandit::RandomPolicy, sized from the model's getS() and getA().
     *
     * @param m The model; only a reference is stored, so it must outlive this object.
     * @param alpha Forwarded to both SARSAL instances.
     * @param lambda Forwarded to both SARSAL instances.
     * @param tolerance Forwarded to both SARSAL instances.
     * @param n The number of model samples taken by each batchUpdateQ() call.
     */
    template <IsGenerativeModel M>
    Dyna2<M>::Dyna2(const M & m, const double alpha, const double lambda, const double tolerance, const unsigned n) :
            N(n),
            model_(m),
            permanentLearning_(m, alpha, lambda, tolerance),
            transientLearning_(m, alpha, lambda, tolerance),
            internalPolicy_(std::make_unique<BanditPolicyAdaptor<Bandit::RandomPolicy>>(m.getS(), m.getA()))
    {}


    /**
     * @brief Feeds one transition to both the permanent and the transient learner.
     *
     * @param s The previous state.
     * @param a The action performed.
     * @param s1 The new state.
     * @param a1 The action to be performed from the new state.
     * @param rew The reward obtained.
     */
    template <IsGenerativeModel M>
    void Dyna2<M>::stepUpdateQ(const size_t s, const size_t a, const size_t s1, const size_t a1, const double rew) {
        // Before updating anything, the transient learner is handed the
        // traces of the permanent one, so that both learners then update
        // their own QFunction in (nearly) the same way.
        //
        // This deviates slightly from the paper: there only the permanent
        // memory is updated, and the very same changes are then applied to
        // the transient QFunction. Here each learner computes its update
        // against its own QFunction, so the two updates do not match
        // exactly.
        //
        // The mismatch is expected to be minor: after each reset (or end of
        // episode) the transient memory should be set back to the permanent
        // one, which removes these differences.
        //
        // Updating the two QFunctions directly would mean re-implementing
        // SARSAL both here and in batchUpdateQ, which is avoided for
        // practicality.
        const auto & permanentTraces = permanentLearning_.getTraces();
        transientLearning_.setTraces(permanentTraces);

        // The traces must be copied before the permanent learner steps.
        permanentLearning_.stepUpdateQ(s, a, s1, a1, rew);
        transientLearning_.stepUpdateQ(s, a, s1, a1, rew);
    }


    /**
     * @brief Updates the transient learner with N transitions sampled from the model.
     *
     * Starting from initS, actions are chosen by the internal policy and
     * next states/rewards are sampled from the model. Every sampled
     * transition is passed to the transient learner. When the sampled next
     * state is terminal, the simulation restarts from initS as a new
     * episode, with fresh eligibility traces.
     *
     * The internal policy is dereferenced unconditionally, so this must not
     * be called after setInternalPolicy(nullptr).
     *
     * @param initS The state every simulated episode starts from.
     */
    template <IsGenerativeModel M>
    void Dyna2<M>::batchUpdateQ(const size_t initS) {
        // This clearing may not be needed if this is called after stepUpdateQ
        // with the same s1 (since the traces set there will be correct then).
        // We do it anyway in case this method is called in different settings
        // and/or multiple times in a row.
        transientLearning_.clearTraces();

        size_t s = initS;
        size_t a = internalPolicy_->sampleAction(s);
        for ( unsigned i = 0; i < N; ++i ) {
            const auto [s1, rew] = model_.sampleSR(s, a);
            const size_t a1 = internalPolicy_->sampleAction(s1);

            transientLearning_.stepUpdateQ(s, a, s1, a1, rew);

            if (model_.isTerminal(s1)) {
                // The simulated episode is over and a new one starts from
                // initS. Eligibility traces are per-episode: if they were
                // kept, the TD errors of the new episode would also be
                // applied to the state-action pairs of the finished one,
                // which did not lead to the new transitions.
                transientLearning_.clearTraces();

                s = initS;
                a = internalPolicy_->sampleAction(s);
            } else {
                s = s1;
                a = a1;
            }
        }
    }


    /**
     * @brief Overwrites the transient QFunction with the permanent one.
     */
    template <IsGenerativeModel M>
    void Dyna2<M>::resetTransientLearning() {
        const QFunction & permanentQ = permanentLearning_.getQFunction();
        transientLearning_.setQFunction(permanentQ);
    }

    /**
     * @brief Replaces the policy used to pick actions in batchUpdateQ().
     *
     * Ownership of the pointer is taken by this object, and the previously
     * held policy is destroyed. Passing nullptr leaves no policy installed;
     * batchUpdateQ() dereferences the policy without checking it.
     *
     * @param p The new policy, allocated with new.
     */
    template <IsGenerativeModel M>
    void Dyna2<M>::setInternalPolicy(PolicyInterface * p) {
        internalPolicy_ = std::unique_ptr<PolicyInterface>(p);
    }


    /**
     * @brief Sets the number of model samples taken by each batchUpdateQ() call.
     *
     * This member is declared in the class but had no definition in this
     * header, so any call to it failed to link.
     *
     * @param n The new number of samples.
     */
    template <IsGenerativeModel M>
    void Dyna2<M>::setN(const unsigned n) {
        N = n;
    }

    /**
     * @brief Returns the number of model samples taken by each batchUpdateQ() call.
     *
     * @return The current number of samples.
     */
    template <IsGenerativeModel M>
    unsigned Dyna2<M>::getN() const {
        return N;
    }


    /**
     * @brief Sets the same tolerance on both learners.
     *
     * The transient learner is updated first, then the permanent one.
     *
     * @param t The new tolerance.
     */
    template <IsGenerativeModel M>
    void Dyna2<M>::setTolerance(const double t) {
        SARSAL * const learners[] = { &transientLearning_, &permanentLearning_ };
        for (SARSAL * const learner : learners)
            learner->setTolerance(t);
    }


    /**
     * @brief Returns the tolerance, as reported by the permanent learner.
     *
     * The constructor and setTolerance() always give both learners the same
     * tolerance, so only the permanent one is queried.
     *
     * @return The current tolerance.
     */
    template <IsGenerativeModel M>
    double Dyna2<M>::getTolerance() const {
        const double tolerance = permanentLearning_.getTolerance();
        return tolerance;
    }


    /**
     * @brief Returns a reference to the QFunction of the permanent learner.
     *
     * @return The permanent QFunction.
     */
    template <IsGenerativeModel M>
    const QFunction & Dyna2<M>::getPermanentQFunction() const {
        const QFunction & permanentQ = permanentLearning_.getQFunction();
        return permanentQ;
    }


    /**
     * @brief Returns a reference to the QFunction of the transient learner.
     *
     * @return The transient QFunction.
     */
    template <IsGenerativeModel M>
    const QFunction & Dyna2<M>::getTransientQFunction() const {
        const QFunction & transientQ = transientLearning_.getQFunction();
        return transientQ;
    }


    /**
     * @brief Returns a reference to the model passed at construction.
     *
     * @return The generative model sampled by batchUpdateQ().
     */
    template <IsGenerativeModel M>
    const M & Dyna2<M>::getModel() const {
        const M & model = model_;
        return model;
    }


    /**
     * @brief Sets the lambda parameter of the permanent learner.
     *
     * @param l The new lambda, forwarded to the permanent SARSAL instance.
     */
    template <IsGenerativeModel M>
    void Dyna2<M>::setPermanentLambda(const double l) {
        permanentLearning_.setLambda(l);
    }

    /**
     * @brief Returns the lambda parameter of the permanent learner.
     *
     * @return The lambda reported by the permanent SARSAL instance.
     */
    template <IsGenerativeModel M>
    double Dyna2<M>::getPermanentLambda() const {
        const double lambda = permanentLearning_.getLambda();
        return lambda;
    }

    /**
     * @brief Sets the lambda parameter of the transient learner.
     *
     * @param l The new lambda, forwarded to the transient SARSAL instance.
     */
    template <IsGenerativeModel M>
    void Dyna2<M>::setTransientLambda(const double l) {
        transientLearning_.setLambda(l);
    }

    /**
     * @brief Returns the lambda parameter of the transient learner.
     *
     * @return The lambda reported by the transient SARSAL instance.
     */
    template <IsGenerativeModel M>
    double Dyna2<M>::getTransientLambda() const {
        const double lambda = transientLearning_.getLambda();
        return lambda;
    }

}


#endif