1 #ifndef AI_TOOLBOX_POMDP_rPOMCP_HEADER_FILE
2 #define AI_TOOLBOX_POMDP_rPOMCP_HEADER_FILE
4 #include <unordered_map>
// Fragment of the rPOMCP class declaration (a POMCP-style Monte Carlo tree
// search solver over a generative POMDP model M; UseEntropy presumably
// selects an entropy-based knowledge measure -- TODO confirm against the
// full header). Interior lines are missing from this chunk.
54 template <IsGenerativeModel M,
bool UseEntropy>
// Constructor: takes the generative model, the particle-belief size,
// the number of MCTS iterations per decision, the exploration constant,
// and a parameter k (default 500) whose semantics are not visible here.
71 rPOMCP(
const M& m,
size_t beliefSize,
unsigned iterations,
double exp,
unsigned k = 500);
// Overload that advances the internal tree after executing action a and
// observing o, then samples the next action for the remaining horizon.
110 size_t sampleAction(
size_t a,
size_t o,
unsigned horizon);
// Cached model sizes and belief particle count.
180 size_t S, A, beliefSize_;
// Iteration budget and the horizon currently being planned for.
181 unsigned iterations_, maxDepth_;
// Runs iterations_ simulations from the root and returns the best action.
190 size_t runSimulation(
unsigned horizon);
// One recursive MCTS rollout through belief node b from state s.
191 double simulate(
BNode & b,
size_t s,
unsigned horizon);
// Incrementally maintains a belief node's best action after child a changed.
193 void maxBeliefNodeUpdate(
BNode * bn,
const ANode & aNode,
size_t a);
// Greedy argmax over action nodes by value V (no exploration bonus).
195 template <
typename Iterator>
196 Iterator findBestA(Iterator begin, Iterator end);
// UCB-style argmax over action nodes: value plus an exploration bonus
// computed from the parent visit count.
198 template <
typename Iterator>
199 Iterator findBestBonusA(Iterator begin, Iterator end,
unsigned count);
// Constructor: copies the model reference and caches its state/action space
// sizes via getS()/getA(); stores the belief size, iteration budget,
// exploration constant and k. NOTE(review): the initializer list is cut off
// here (it ends with a trailing comma) -- remaining members are initialized
// on lines not visible in this chunk.
202 template <IsGenerativeModel M,
bool UseEntropy>
203 rPOMCP<M, UseEntropy>::rPOMCP(
const M& m,
const size_t beliefSize,
const unsigned iter,
const double exp,
const unsigned k) : model_(m), S(model_.getS()), A(model_.getA()),
204 beliefSize_(beliefSize), iterations_(iter),
205 exploration_(exp), k_(k),
// Fragment of sampleAction(belief, horizon): rebuilds the search tree root
// from scratch around the given belief b, then plans via runSimulation.
// The function signature line is missing from this chunk.
208 template <IsGenerativeModel M,
bool UseEntropy>
// Discard any previous tree and start a fresh root node for this belief.
211 graph_ =
HNode(A, beliefSize_, b, rand_);
213 return runSimulation(horizon);
// Fragment of sampleAction(a, o, horizon): tries to reuse the subtree that
// corresponds to having executed action a and observed o. If that subtree
// does not exist, or its sampled belief is empty, it falls back to planning
// from a uniform belief (Belief(S, 1.0 / S)). Interior lines (including
// some closing braces) are missing from this chunk.
216 template <IsGenerativeModel M,
bool UseEntropy>
// Children of the action node taken last step, keyed by observation.
218 auto & obs = graph_.children[a].children;
220 auto it = obs.find(o);
221 if ( it == obs.end() ) {
// Observation never sampled during search: restart from a uniform belief.
223 return sampleAction(
Belief(S, 1.0 / S), horizon);
// Promote the matching observation subtree to be the new root.
// tmp is needed because graph_ owns it->second's parent structure.
231 {
BNode tmp = std::move(it->second); graph_ =
HNode(A, std::move(tmp), rand_); }
233 if ( graph_.isSampleBeliefEmpty() ) {
// No particles survived the root swap: restart from a uniform belief.
235 return sampleAction(
Belief(S, 1.0 / S), horizon);
238 return runSimulation(horizon);
// Fragment of runSimulation(horizon): runs iterations_ rollouts from the
// root (each starting at a state sampled from the root belief), then picks
// the greedy best action. The signature line and the final return are not
// visible in this chunk.
241 template <IsGenerativeModel M,
bool UseEntropy>
// Nothing to plan for a zero horizon; action 0 is the trivial answer.
243 if ( !horizon )
return 0;
247 for (
unsigned i = 0; i < iterations_; ++i )
248 simulate(graph_, graph_.sampleBelief(), 0);
// Greedy (no exploration bonus) selection over the root's action children.
250 auto begin = std::begin(graph_.children);
251 size_t bestA = std::distance(begin, findBestA(begin, std::end(graph_.children)));
// Cache the chosen action's value as the root value.
255 graph_.V = graph_.children[bestA].V;
// Fragment of simulate(b, s, depth): one MCTS rollout step. Selects an
// action by UCB (findBestBonusA), samples (s', o) from the generative
// model, descends/expands the matching observation node, and backs up
// both the action value and the belief node's knowledge-adjusted value.
// Several interior lines (counters, oldV, new-node bookkeeping) are
// missing from this chunk -- do not assume the visible lines are adjacent.
259 template <IsGenerativeModel M,
bool UseEntropy>
260 double rPOMCP<M, UseEntropy>::simulate(BNode & b,
size_t s,
unsigned depth) {
// UCB action selection using this node's visit count b.N.
264 auto begin = std::begin(b.children);
265 size_t a = std::distance(begin, findBestBonusA(begin, std::end(b.children), b.N));
266 auto & aNode = b.children[a];
// Sample next state and observation; the reward component is ignored
// (rPOMCP optimizes a knowledge measure rather than model reward).
270 std::tie(s1, o, std::ignore) = model_.sampleSOR(s, a);
272 double immAndFutureRew = 0.0;
274 typename decltype(aNode.children)::iterator ot;
275 bool newNode =
false;
// Find or lazily create the observation child for o.
278 ot = aNode.children.find(o);
279 if ( ot == aNode.children.end() ) {
281 std::tie(ot, std::ignore) = aNode.children.insert(std::make_pair(o, BNode()));
// Record the sampled particle s1 into the child's belief/knowledge.
286 ot->second.updateBeliefAndKnowledge(s1);
// Recurse only into previously-expanded, non-terminal, in-horizon nodes.
289 if ( depth + 1 < maxDepth_ && !model_.isTerminal(s1) && !newNode) {
290 ot->second.children.resize(A);
291 immAndFutureRew = simulate( ot->second, s1, depth + 1 );
// At the horizon the leaf's knowledge measure is the rollout value.
297 if ( depth + 1 >= maxDepth_ )
298 immAndFutureRew = ot->second.getKnowledgeMeasure();
// Incremental mean update of the action value.
304 aNode.V += ( immAndFutureRew - aNode.V ) /
static_cast<double>(aNode.N);
// The root's return value is unused by callers.
309 if ( depth == 0 )
return 0.0;
// NOTE(review): lines between 309 and 319 are missing; the condition
// guarding this reset (HUGE_VAL sentinel before a recompute) is not visible.
319 b.actionsV = HUGE_VAL;
322 maxBeliefNodeUpdate(&b, aNode, a);
// Incremental mean of the max-action value stream at this belief node.
325 b.actionsV += ( immAndFutureRew - b.actionsV ) /
static_cast<double>(b.N);
// Node value: discounted future action value plus local knowledge measure.
332 b.V = model_.getDiscount() * b.actionsV + b.getKnowledgeMeasure();
// Back up the change in this node's contribution to the parent's mean
// (oldV is captured on a line not visible in this chunk).
334 return (b.N - 1)*(b.V - oldV) + b.V;
// Fragment of maxBeliefNodeUpdate: keeps b.actionsV / b.bestAction in sync
// after action child a's value changed. If the updated child now dominates,
// adopt it; if the previously-best action got worse, re-scan all children.
// NOTE(review): the body references `b` while the parameter is `bp` -- a
// missing interior line (presumably `auto & b = *bp;`) is not visible here.
337 template <IsGenerativeModel M,
bool UseEntropy>
338 void rPOMCP<M, UseEntropy>::maxBeliefNodeUpdate(BNode * bp,
const ANode & aNode,
const size_t a) {
341 if ( aNode.V >= b.actionsV ) {
342 b.actionsV = aNode.V;
// Previously-best action may have decreased: recompute the argmax.
346 else if ( a == b.bestAction ) {
347 auto begin = std::begin(b.children);
348 auto it = findBestA(begin, std::end(b.children));
350 b.bestAction = std::distance(begin, it);
// Greedy argmax: returns the iterator to the action node with the highest
// mean value V, with no exploration bonus. Ties resolve to the first
// maximum (std::max_element semantics). Closing brace not visible here.
354 template <IsGenerativeModel M,
bool UseEntropy>
355 template <
typename Iterator>
356 Iterator rPOMCP<M, UseEntropy>::findBestA(
const Iterator begin,
const Iterator end) {
357 return std::max_element(begin, end, [](
const ANode & lhs,
const ANode & rhs){
return lhs.V < rhs.V; });
// UCB1-style action selection: scores each action node by its mean value V
// plus exploration_ * sqrt(log(count + 1) / N), where count is the parent
// belief node's visit count. An unvisited action (N == 0) yields an
// infinite bonus, so unexplored actions are tried first. The final return
// statement and closing brace are not visible in this chunk.
360 template <IsGenerativeModel M,
bool UseEntropy>
361 template <
typename Iterator>
362 Iterator rPOMCP<M, UseEntropy>::findBestBonusA(Iterator begin,
const Iterator end,
const unsigned count) {
// +1 keeps the logarithm defined on the first visit (count == 0).
365 double logCount = std::log(count + 1.0);
368 auto evaluationFunction = [
this, logCount](
const ANode & an){
369 return an.V + exploration_ * std::sqrt( logCount / an.N );
// Manual max-scan: evaluate the first action, then compare the rest.
// (Uses `begin < end`, so Iterator is presumably random-access -- confirm.)
372 auto bestIterator = begin++;
373 double bestValue = evaluationFunction(*bestIterator);
375 for ( ; begin < end; ++begin ) {
376 double actionValue = evaluationFunction(*begin);
377 if ( actionValue > bestValue ) {
378 bestValue = actionValue;
379 bestIterator = begin;
// Fragment of a setter (signature line not visible): updates the number of
// belief particles used when constructing new root nodes.
386 template <IsGenerativeModel M,
bool UseEntropy>
388 beliefSize_ = beliefSize;
391 template <IsGenerativeModel M,
bool UseEntropy>
396 template <IsGenerativeModel M,
bool UseEntropy>
401 template <IsGenerativeModel M,
bool UseEntropy>
406 template <IsGenerativeModel M,
bool UseEntropy>
411 template <IsGenerativeModel M,
bool UseEntropy>
416 template <IsGenerativeModel M,
bool UseEntropy>
421 template <IsGenerativeModel M,
bool UseEntropy>