1 #ifndef AI_TOOLBOX_BANDIT_Q_SOFTMAX_POLICY_WRAPPER_HEADER_FILE
2 #define AI_TOOLBOX_BANDIT_Q_SOFTMAX_POLICY_WRAPPER_HEADER_FILE
21 template <
typename V,
typename Gen>
78 std::vector<size_t> & buffer_;
83 template <
typename V,
typename Gen>
87 template <
typename V,
typename Gen>
90 template <
typename V,
typename Gen>
92 : temperature_(t), q_(std::move(q)), valueBuffer_(vb), buffer_(buffer), rand_(gen)
94 assert(
static_cast<size_t>(q_.size()) == buffer_.size());
97 template <
typename V,
typename Gen>
101 return wrap.sampleAction();
104 valueBuffer_ = (q_ / temperature_).array().exp();
106 unsigned infinities = 0;
107 for (
size_t a = 0; a < buffer_.size(); ++a )
108 if ( std::isinf(valueBuffer_(a)) )
109 buffer_[infinities++] = a;
112 auto pickDistribution = std::uniform_int_distribution<unsigned>(0, infinities-1);
113 unsigned selection = pickDistribution(rand_);
115 return buffer_[selection];
117 valueBuffer_ /= valueBuffer_.sum();
123 template <
typename V,
typename Gen>
127 return wrap.getActionProbability(a);
130 valueBuffer_ = (q_ / temperature_).array().exp();
132 bool isAInfinite =
false;
133 unsigned infinities = 0;
134 for (
size_t aa = 0; aa < buffer_.size(); ++aa ) {
135 if ( std::isinf(valueBuffer_(aa)) ) {
137 isAInfinite |= (aa == a);
141 if ( isAInfinite )
return 1.0 / infinities;
144 return valueBuffer_(a) / valueBuffer_.sum();
147 template <
typename V,
typename Gen>
148 template <
typename P>
152 return wrap.getPolicy(p);
155 p = (q_ / temperature_).array().exp();
157 unsigned infinities = 0;
159 for (
size_t a = 0; a < buffer_.size(); ++a ) {
161 if ( std::isinf(p[a]) )
166 p = p.array().isInf().template cast<double>() / infinities;
168 p.fill(1.0 / buffer_.size());