TORCS  1.3.9
The Open Racing Car Simulator
policy.cpp
Go to the documentation of this file.
1 // -*- Mode: c++ -*-
2 // $Id$
3 
4 // copyright (c) 2004 by Christos Dimitrakakis <dimitrak@idiap.ch>
5 /***************************************************************************
6  * *
7  * This program is free software; you can redistribute it and/or modify *
8  * it under the terms of the GNU General Public License as published by *
9  * the Free Software Foundation; either version 2 of the License, or *
10  * (at your option) any later version. *
11  * *
12  ***************************************************************************/
13 
14 #include <cstring>
15 #include <learning/learn_debug.h>
16 #include <learning/policy.h>
17 #include <learning/MathFunctions.h>
18 #ifdef WIN32
19 #include <float.h>
20 #define isnan _isnan
21 #endif // WIN32
22 
23 #undef POLICY_LOG
24 
25 #ifndef POLICY_LOG
26 #undef logmsg
27 #define logmsg empty_log
28 #endif
29 
/// No-op logger installed as `logmsg` when POLICY_LOG is disabled:
/// accepts any printf-style argument list and silently discards it.
void empty_log(const char* fmt, ...)
{
	// Intentionally empty -- all log output is swallowed.
}
// Construct a discrete-state Sarsa(lambda) policy.
// Hyper-parameters are clamped into legal ranges: lambda, gamma to
// [0, 0.99]; alpha to [0, 1]; softmax temperature kept >= 0.1, while
// e-greedy randomness is clamped to [0, 1].  Allocates the per-state
// tables P (action probabilities), Q (values), e (eligibility traces)
// and vQ (value-variance estimates).
// NOTE(review): original lines 74 and 115 are missing from this
// extract; line 115 may have initialized a further member -- confirm
// against the upstream source before editing.
42 DiscretePolicy::DiscretePolicy (int n_states, int n_actions, real alpha,
43  real gamma, real lambda, bool softmax,
44  real randomness, real init_eval)
45 {
// Clamp the trace decay, discount and learning rate.
46  if (lambda<0.0f) lambda = 0.0f;
47  if (lambda>0.99f) lambda = 0.99f;
48 
49  if (gamma<0.0f) gamma = 0.0f;
50  if (gamma>0.99f) gamma = 0.99f;
51 
52  if (alpha<0.0f) alpha = 0.0f;
53  if (alpha>1.0f) alpha = 1.0f;
54 
55  this->n_states = n_states;
56  this->n_actions = n_actions;
57  this->gamma = gamma;
58  this->lambda = lambda;
59  this->alpha = alpha;
60  smax = softmax;
61  temp = randomness;
62  //logmsg ("RR:%f", temp);
// Softmax needs a strictly positive temperature; for e-greedy the
// randomness is a probability and must lie in [0, 1].
63  if (smax) {
64  if (temp<0.1f)
65  temp = 0.1f;
66  } else {
67  if (temp<0.0f) {
68  temp = 0.0f;
69  }
70  if (temp>1.0f) {
71  temp = 1.0f;
72  }
73  }
75 
76  logmsg ("#Making Sarsa(lambda) ");
77  if (smax) {
78  logmsg ("#softmax");
79  } else {
80  logmsg ("#e-greedy");
81  }
82  logmsg (" policy with Q:[%d x %d] -> R, a:%f g:%f, l:%f, t:%f\n",
83  this->n_states, this->n_actions, this->alpha, this->gamma, this->lambda, this->temp);
84 
// Allocate the [n_states x n_actions] tables.
85  P = new real* [n_states];
86  Q = new real* [n_states];
87  e = new real* [n_states];
88  vQ = new real* [n_states];
89  for (int s=0; s<n_states; s++) {
90  P[s] = new real [n_actions];
91  Q[s] = new real [n_actions];
92  e[s] = new real [n_actions];
93  vQ[s] = new real [n_actions];
94  for (int a=0; a<n_actions; a++) {
95  P[s][a] = 1.0/((float) n_actions);
96  Q[s][a] = init_eval;
97  e[s][a] = 0.0;
98  vQ[s][a] = 1.0;
99  }
100  }
101  pQ = 0.0;
// ps/pa = -1 marks "no previous state/action yet" (start of episode).
102  ps = -1;
103  pa = -1;
104  min_el_state = 0;
105  max_el_state = n_states -1;
106  eval = new real[n_actions];
107  sample = new real[n_actions];
108  for (int a=0; a<n_actions; a++) {
109  eval[a] = 0.0;
110  sample[a] = 0.0;
111  }
112  forced_learning = false;
113  confidence = false;
114  confidence_uses_gibbs = true;
116  zeta = 0.01f;
117  tdError = 0.0f;
118  expected_r = 0.0f;
119  expected_V = 0.0f;
120  n_samples = 0;
121  replacing_traces = false;
122 }
123 
// NOTE(review): the signature line is missing from this extract; per
// the file's index this is `void DiscretePolicy::saveState(FILE* f)`
// (policy.cpp:128).  Dumps the Q, P and vQ rows of every state as
// whitespace-separated text to the already-open stream f.
// No-op when f is NULL.
129 {
130  if (!f)
131  return;
132  for (int s=0; s<n_states; s++) {
133 
134  //softMax(Q[s]);
// NOTE(review): sum2 is accumulated but never used afterwards.
135  real sum2=0.0;
136  int a;
137  for (a=0; a<n_actions; a++) {
138  sum2 += eval[a];
139  }
// One row of Q, then P, then vQ values for this state.
140  for (a=0; a<n_actions; a++) {
141  fprintf (f, "%f ", Q[s][a]);
142  }
143  for (a=0; a<n_actions; a++) {
144  fprintf (f, "%f ", P[s][a]);
145  }
146  for (a=0; a<n_actions; a++) {
147  fprintf (f, "%f ", vQ[s][a]);
148  }
149  }
150 
// Only one newline after ALL states (unlike the destructor's dump,
// which writes one line per state).
151  fprintf (f, "\n");
152 }
153 
// NOTE(review): the signature line is missing from this extract; per
// the file's index this is `DiscretePolicy::~DiscretePolicy()`
// (policy.cpp:155).  Logs the mean greedy state value, dumps the
// tables to /tmp/discrete, then frees every allocation made by the
// constructor.
156 {
157  real sum = 0.0;
158  FILE* f = fopen ("/tmp/discrete","wb");
159 
160  int s;
161  for (s=0; s<n_states; s++) {
// Accumulate each state's greedy (max-over-actions) value.
162  sum += Q[s][argMax(Q[s])];
163  if (f) {
164  //softMax(Q[s]);
// NOTE(review): sum2 is accumulated but never used afterwards.
165  real sum2=0.0;
166  int a;
167  for (a=0; a<n_actions; a++) {
168  sum2 += eval[a];
169  }
170  for (a=0; a<n_actions; a++) {
171  fprintf (f, "%f ", Q[s][a]);
172  }
173  for (a=0; a<n_actions; a++) {
174  fprintf (f, "%f ", P[s][a]);
175  }
176  for (a=0; a<n_actions; a++) {
177  fprintf (f, "%f ", vQ[s][a]);
178  }
179  fprintf (f, "\n");
180  }
181  }
182 
183  if (f) {
184  fclose (f);
185  }
186 
187  logmsg ("#Expected return of greedy policy over random distribution of states: %f\n", sum/((real) n_states));
188 
// Release the per-state rows, then the row-pointer arrays.
189  for (s=0; s<n_states; s++) {
190  delete [] P[s];
191  delete [] Q[s];
192  delete [] e[s];
193  delete [] vQ[s];
194  }
195  delete [] P;
196  delete [] Q;
197  delete [] vQ;
198  delete [] e;
199  delete [] eval;
200  delete [] sample;
201 }
202 
// Select an action for state s, given reward r for the previous
// action, and perform the TD(lambda) update on Q, e and vQ.
// @param s        current state id; out-of-range s returns action 0
// @param r        reward received for the previous action
// @param forced_a action to use when forced_learning is enabled
// @return selected action in [0, n_actions)
// NOTE(review): this extract is missing original lines 304/306
// (inside the disabled #if 0 logging), line 348 (the inner `if` of
// the confidence branch -- which is why the braces there appear
// unbalanced below) and lines 417-418.  Confirm against the
// upstream source before editing this function.
283 int DiscretePolicy::SelectAction (int s, real r, int forced_a)
284 {
285  if ((s<0)||(s>=n_states)) {
286  return 0;
287  }
288 
// Accumulate running statistics while an episode is in progress
// (ps/pa >= 0 means we have a previous state-action pair).
289  if ((ps>=0)&&(pa>=0)) {
290  expected_r += r;
291  expected_V += Q[ps][pa];
292  n_samples++;
293 
// State 0 is treated as the episode start: reset the statistics.
294  if (s==0) {
295  real max_estimate = 0.0;
296  real max_estimate_k = 0.0;
297  for (int i=0; i<n_states; i++) {
298  max_estimate += Q[i][argMax (Q[i])];
299  max_estimate_k += 1.0;
300  }
301 
302 #if 0
303  logmsg ("%f %f %f %f#rTVV\n",
305  temp,
307  max_estimate/max_estimate_k);
308 #endif
309  expected_r = 0.0;
310  expected_V= 0.0;
311  n_samples = 0;
312  }
313  }
314  int a, amax;
315  int argmax = argMax (Q[s]);
316 
// Pursuit update: nudge probability mass towards the greedy action.
317  P[s][argmax] += zeta*(1.0-P[s][argmax]);
318  for (int j=0; j<n_actions; j++) {
319  if (j!=argmax) {
320  P[s][j] += zeta*(0.0-P[s][j]);
321  }
322  }
323 
324 
325 
// Action selection: forced > pursuit > confidence > reliability
// estimate > softmax > e-greedy.
326  if (forced_learning) {
327  a = forced_a;
328  } else if (pursuit) {
// Sample an action proportionally to P[s][*].
329  real sum = 0.0;
330  a = -1;
331  int j;
332  for (j=0; j<n_actions; j++) {
333  sum += P[s][j];
334  }
335  real X = urandom()*sum;
336  real dsum=0.0;
337  for (j=0; j<n_actions; j++) {
338  dsum += P[s][j];
339  if (X<=dsum) {
340  a = j;
341  break;
342  }
343  }
344  if (a==-1) {
345  fprintf (stderr, "No action selected with pursuit!\n");
346  }
347  } else if (confidence) {
// NOTE(review): original line 348 (an inner `if`, presumably on
// confidence_uses_gibbs) is missing here.
349  a = confMax (Q[s],vQ[s]);
350  } else {
351  a = confSample (Q[s], vQ[s]);
352  if (confidence_uses_gibbs) { // and not SINGULAR distribution
353  a = softMax(sample); //use softmax on the sample values
354  }
355  }
356  } else if (reliability_estimate) {
// Temperature derived from the mean value-variance of this state.
357  temp = sqrt(Sum(vQ[s], n_actions)/((real) n_actions));
358  //temp = 0.1;
359  a = softMax(Q[s]);
360  //printf ("%f\n", temp);
361  } else if (smax) {
362  a = softMax (Q[s]);
363  //printf ("Q[%d][%d]=%f\n", s, a, Q[s][a]);
364  } else {
365  a = eGreedy (Q[s]);
366  }
367 
// Guard against a selection failure (softMax/confMax can return -1).
368  if (a<0 || a>=n_actions) {
369  fprintf (stderr, "Action %d out of bounds.. ", a);
370  a = (int) floor (urandom()*((real) n_actions));
371  fprintf (stderr, "mapping to %d\n", a);
372  }
373 
// Bootstrap target EQ_s depends on the learning method:
// Sarsa uses the taken action, Q-learning the greedy one,
// E-learning an eval[]-weighted expectation.
374  real EQ_s = 0.0;
375  int i;
376 
377  switch (learning_method) {
378 
379  case Sarsa:
380  amax = a;
381  EQ_s = Q[s][amax];
382  break;
383  case QLearning:
384  amax = argmax;
385  EQ_s = Q[s][amax];
386  break;
387  case ELearning:
388  amax = a; //? correct ?
390  EQ_s = 0.0;
391  for (i=0; i<n_actions; i++) {
392  EQ_s += eval[i] * Q[s][i];
393  }
394  break;
395  default:
396  amax = a;
397  EQ_s = Q[s][amax];
398  fprintf (stderr, "Unknown learning method\n");
399  }
400  if ((ps>=0)&&(pa>=0)) { // do not update at start of episode
// TD error for the previous state-action pair.
401  real delta = r + gamma*EQ_s - Q[ps][pa];
402  tdError = delta;
403  if (replacing_traces) {
404  e[ps][pa] = 1.0;
405  } else {
406  e[ps][pa] += 1.0;
407  }
408  real ad = alpha*delta;
409  real gl = gamma * lambda;
410  real variance_threshold = 0.0001f;
411  if (confidence_eligibility == false) {
// Exponential moving estimate of the squared update magnitude,
// floored at variance_threshold.
412  vQ[ps][pa] = (1.0 - zeta)*vQ[ps][pa] + zeta*(ad*ad);
413  if (vQ[ps][pa]<variance_threshold) {
414  vQ[ps][pa]=variance_threshold;
415  }
416  }
419 
420 
// Propagate the TD error along the eligibility traces.
421  for (i=0; i<n_states; i++) {
422  //for (int i=min_el_state; i<=max_el_state; i++) {
423  bool el=true;
424  for (int j=0; j<n_actions; j++) {
425  if (e[i][j]>0.01) {
426  Q[i][j] += ad * e[i][j];
427  if (confidence_eligibility == true) {
428  real zeta_el = zeta * e[i][j];
429  vQ[i][j] = (1.0 - zeta_el)*vQ[i][j] + zeta_el*(ad*ad);
430  if (vQ[i][j]<variance_threshold) {
431  vQ[i][j]=variance_threshold;
432  }
433  }
434  //this is the same as setting e[ps][pa] += (1-P[ps][pa])
435  // if P[][] remains unchanged between updates.
436  // -- removed because it doesn't work! --
437  //P[i][j] += 0.01*delta * e[i][j] * (1.-P[i][j]);
438  if ((fabs (Q[i][j])>1000.0)||(isnan(Q[i][j]))) {
439  printf ("u: %d %d %f %f\n", i,j,Q[i][j], ad * e[i][j]);
440  }
441 
442  //This is only needed for Qlearning, but sarsa is not
443  //affected since always amax==a;
444  if (amax==a) {
445  e[i][j] *= gl;
446  } else {
447  e[i][j] = 0.0;
448  }
449  } else {
450  e[i][j] = 0.0;
451  el = false;
452  }
453  }
// Maintain the [min_el_state, max_el_state] window of states with
// non-negligible traces (used by the commented-out fast loop).
454  if (el==false) {
455  if (min_el_state==i)
456  min_el_state++;
457  } else {
458  max_el_state = i;
459  }
460  }
461  }
462 
463  //printf ("%d %d #STATE\n", min_el_state, max_el_state);
464  // printf ("Q[%d,%d]=%f r=%f e=%f ad=%f gl=%f #QV\n",
465  // ps, pa, Q[ps][pa], r, e[ps][pa], ad, gl);
// Remember this state-action pair for the next update.
466  ps = s;
467  pa = a;
468 
469  return a;
470 }
471 
// NOTE(review): the signature line is missing from this extract; per
// the file's index this is `void DiscretePolicy::Reset()`
// (policy.cpp:474).  Clears all eligibility traces; call at the end
// of every episode, after the agent enters the absorbing state.
475 {
476  for (int s=0; s<n_states; s++) {
477  for (int a=0; a<n_actions; a++) {
478  e[s][a] = 0.0;
479  }
480  }
481 }
482 
// NOTE(review): the signature line is missing from this extract; per
// the file's index this is `void DiscretePolicy::loadFile(char* f)`
// (policy.cpp:484).  Loads a Q table written by saveFile and rebuilds
// the pursuit probabilities P.  Expected layout: "QSA\0", n_states,
// n_actions, the Q rows, then "END\0".
// NOTE(review): the early returns after a bad/missing start tag leak
// the open FILE handle (no fclose) -- candidate fix upstream.
485 {
486  FILE* fh = NULL;
487  fh = fopen (f, "rb");
488  if (fh==NULL) {
489  fprintf (stderr, "Failed to read file %s\n", f);
490  return;
491  }
492  char rtag[256];
493  const char* start_tag="QSA";
494  const char* close_tag="END";
495  int n_read_states, n_read_actions;
496 
497  fread((void *) rtag, sizeof (char), strlen (start_tag)+1, fh);
498  if (strcmp (rtag, start_tag)) {
499  fprintf (stderr, "Could not find starting tag\n");
500  return;
501  }
502  fread((void *) &n_read_states, sizeof(int), 1, fh);
503  fread((void *) &n_read_actions, sizeof(int), 1, fh);
504 
// Refuse files whose dimensions disagree with this policy.
505  if ((n_read_states!=n_states)||(n_read_actions!=n_actions)) {
506  fprintf (stderr, "File has %dx%d space! Aborting read.\n", n_read_states, n_read_actions);
507  fclose(fh);
508  return;
509  }
510 
511  int i, j;
512  for (i=0; i<n_states; i++) {
513  fread((void *) Q[i], sizeof(real), n_actions, fh);
// Zero out suspiciously large or NaN entries on load.
514  for (j=0; j<n_actions; j++) {
515  if ((fabs (Q[i][j])>100.0)||(isnan(Q[i][j]))) {
516  printf ("l: %d %d %f\n", i,j,Q[i][j]);
517  Q[i][j] = 0.0;
518  }
519  }
520  }
// Rebuild P: uniform, then slightly biased towards the greedy action.
521  for (i=0; i<n_states; i++) {
522  for (j=0; j<n_actions; j++) {
523  {
524  P[i][j] = 1.0/((real) n_actions);
525  }
526  }
527  int argmax = argMax (Q[i]);
528  P[i][argmax] += 0.001*(1.0-P[i][argmax]);
529  for (int j=0; j<n_actions; j++) {
530  if (j!=argmax) {
531  P[i][j] += 0.001*(0.0-P[i][j]);
532  }
533  }
534  }
535 
536 
537 
538  fread((void *) rtag, sizeof (char), strlen (close_tag)+1, fh);
539  if (strcmp (rtag, close_tag)) {
540  fprintf (stderr, "Could not find ending tag\n");
541  fclose (fh);
542  return;
543  }
544 
545 
546  fclose (fh);
547 }
548 
550 void DiscretePolicy::saveFile (char* f) {
551  FILE* fh = NULL;
552  fh = fopen (f, "wb");
553  if (fh==NULL) {
554  fprintf (stderr, "Failed to write to file %s\n", f);
555  return;
556  }
557 
558  const char* start_tag="QSA";
559  const char* close_tag="END";
560 
561  fwrite((void *) start_tag, sizeof (char), strlen (start_tag)+1, fh);
562  fwrite((void *) &n_states, sizeof(int), 1, fh);
563  fwrite((void *) &n_actions, sizeof(int), 1, fh);
564  for (int i=0; i<n_states; i++) {
565  fwrite((void *) Q[i], sizeof(real), n_actions, fh);
566  for (int j=0; j<n_actions; j++) {
567  if ((fabs (Q[i][j])>100.0)||(isnan(Q[i][j]))) {
568  printf ("s: %d %d %f\n", i,j,Q[i][j]);
569  }
570  }
571  }
572  fwrite((void *) close_tag, sizeof (char), strlen (start_tag)+1, fh);
573  fclose (fh);
574 }
575 
// Enable/disable confidence-based action selection, with variance
// smoothing rate zeta; optionally apply eligibility traces to the
// confidence (variance) estimates as well.  Returns the confidence
// flag that was set.
// NOTE(review): original line 585 is missing from this extract --
// presumably `if (confidence_eligibility) {`, which would balance
// the stray `}` below.  Also note the "CONDIFENCE" typo in the log
// strings (left untouched here; it is runtime output).
580 bool DiscretePolicy::useConfidenceEstimates (bool confidence, real zeta, bool confidence_eligibility) {
581  this->confidence = confidence;
582  this->zeta = zeta;
583  this->confidence_eligibility = confidence_eligibility;
584 
586  logmsg ("#+[ELIG_VAR]");
587  }
588  if (confidence) {
589  logmsg ("#+[CONDIFENCE]");
590  } else {
591  logmsg ("#-[CONDIFENCE]\n");
592  }
593 
594  return confidence;
595 }
596 
// NOTE(review): tail of setQLearning; original lines 598-599 (the
// signature and, presumably, `learning_method = QLearning;`) are
// missing from this extract -- confirm against the upstream source.
600  logmsg ("#[Q-learning]\n");
601 }
602 
// NOTE(review): tail of setELearning; original lines 603-605 (the
// signature and, presumably, `learning_method = ELearning;`) are
// missing from this extract -- confirm against the upstream source.
606  logmsg ("#[E-learning]\n");
607 }
608 
// NOTE(review): body of setSarsa; original line 611 (the signature
// `void DiscretePolicy::setSarsa()`) and line 613 (presumably
// `learning_method = Sarsa;`) are missing from this extract --
// confirm against the upstream source.
612 {
614  logmsg ("#[Sarsa]\n");
615 }
616 
618 void DiscretePolicy::setPursuit(bool pursuit)
619 {
620  this->pursuit = pursuit;
621  if (pursuit) {
622  logmsg ("#+[PURSUIT]\n");
623  } else {
624  logmsg ("#-[PURSUIT]\n");
625  }
626 }
627 
// NOTE(review): the signature line is missing from this extract; per
// the file's index this is
// `void DiscretePolicy::setReplacingTraces(bool replacing)`
// (policy.cpp:629).  Chooses replacing vs accumulating eligibility
// traces for the TD update.
630 {
631  this->replacing_traces = replacing;
632  if (replacing) {
633  logmsg ("#[REPLACING TRACES]\n");
634  } else {
635  logmsg ("#[ACCUMULATING TRACES]\n");
636  }
637 }
// NOTE(review): the signature line is missing from this extract; per
// the file's index this is
// `void DiscretePolicy::setForcedLearning(bool forced)`
// (policy.cpp:639).  When enabled, SelectAction takes the supplied
// forced_a instead of choosing an action itself.
640 {
641  forced_learning = forced;
642 }
643 
// NOTE(review): the signature line is missing from this extract; per
// the file's index this is
// `void DiscretePolicy::setRandomness(real epsilon)`
// (policy.cpp:645).  Sets the exploration parameter; a near-zero
// temperature under softmax switches selection to pure greedy.
646 {
647  temp = epsilon;
648  if (smax) {
// Softmax degenerates at tiny temperatures -- fall back to e-greedy.
649  if (temp<0.01) {
650  smax = false;
651  }
652  }
653 }
654 
// NOTE(review): the signature line is missing from this extract; per
// the file's index this is `void DiscretePolicy::setGamma(real gamma)`
// (policy.cpp:656).  Sets the discount factor of the return, without
// the [0, 0.99] clamping the constructor applies.
657 {
658  this->gamma = gamma;
659 }
660 
662 void DiscretePolicy::useSoftmax (bool softmax)
663 {
664  smax = softmax;
665  if (smax) {
666  logmsg ("#+[SMAX]\n");
667  } else {
668  logmsg ("#-[SMAX]\n");
669  }
670 }
671 
// NOTE(review): body of useReliabilityEstimate; original line 673
// (the signature, per the index
// `void DiscretePolicy::useReliabilityEstimate(bool ri)`) and line
// 675 (presumably `reliability_estimate = ri;`) are missing from
// this extract -- confirm against the upstream source.
674 {
676  if (ri) {
677  logmsg("#+[RI]\n");
678  } else {
679  logmsg("#-[RI]\n");
680  }
681 }
682 
// NOTE(review): body of setConfidenceDistribution; original line 684
// (the signature, per the index
// `void DiscretePolicy::setConfidenceDistribution(enum
// ConfidenceDistribution cd)`) and line 698 (presumably
// `confidence_distribution = cd;`) are missing from this extract --
// confirm against the upstream source.  Logs the chosen distribution
// used by confSample.
685 {
686  switch (cd) {
687  case SINGULAR:
688  logmsg("#[SINGULAR CONFIDENCE]\n"); break;
689  case BOUNDED:
690  logmsg("#[BOUNDED CONFIDENCE]\n"); break;
691  case GAUSSIAN:
692  logmsg("#[GAUSSIAN CONFIDENCE]\n"); break;
693  case LAPLACIAN:
694  logmsg("#[LAPLACIAN CONFIDENCE]\n"); break;
695  default:
696  Serror ("Unknown type %d\n", cd);
697  }
699 }
700 
// NOTE(review): the signature line is missing from this extract; per
// the file's index this is
// `void DiscretePolicy::useGibbsConfidence(bool gibbs)`
// (policy.cpp:704).  Toggles the extra Gibbs (softmax) sampling pass
// applied to confidence samples in SelectAction.
705 {
706  if (gibbs) {
707  logmsg ("#+[GIBBS CONFIDENCE]\n");
708  } else {
709  logmsg ("#-[GIBBS CONFIDENCE]\n");
710  }
711  this->confidence_uses_gibbs = gibbs;
712 }
713 
714 // ---------- action selection helpers -------------
// NOTE(review): the signature line is missing from this extract; per
// the file's index this is
// `int DiscretePolicy::confMax(real* Qs, real* vQs, real p=1.0)`
// (policy.cpp:715).  Confidence-weighted Gibbs sampling: each action
// gets weight 1/(1 + sum_j exp((Qs[j]-Qs[a])/sqrt(vQs[j]))), then an
// action is drawn proportionally to those weights.
// Returns -1 (with a diagnostic) if sampling falls through.
716  real sum=0.0;
717  int a;
718 #if 0
// Disabled alternative: direct exponential weighting by Q/sigma.
719  for (a=0; a<n_actions; a++) {
720  eval[a] = exp(pow(Qs[a]/sqrt(vQs[a]), p));
721  sum += eval[a];
722  }
723 #else
724  for (a=0; a<n_actions; a++) {
725  real Q = Qs[a];
726  real cum = 1.0;
727  //real v = sqrt(vQs[a]);
728  for (int j=0; j<n_actions; j++) {
729  if (j!=a) {
730  cum += exp ((Qs[j]-Q)/sqrt(vQs[j]));
731  }
732  }
733  eval[a] = 1.0/(cum);//#exp(Qs[a]/sqrt(vQs[a]));
734  sum += eval[a];
735  }
736 #endif
// Roulette-wheel draw over the (unnormalised) eval weights.
737  real X = urandom()*sum;
738  real dsum = 0.0;
739  for (a=0; a<n_actions; a++) {
740  dsum += eval[a];
741  if (X<=dsum)
742  return a;
743 
744  }
745  fprintf (stderr, "ConfMax: No action selected! %f %f %f\n",X,dsum,sum);
746  return -1;
747 }
748 
// NOTE(review): the signature line is missing from this extract; per
// the file's index this is
// `int DiscretePolicy::confSample(real* Qs, real* vQs)`
// (policy.cpp:749).  Draws one value per action from the configured
// confidence distribution (mean Qs[a], variance vQs[a]) into
// sample[] and returns the argmax of the samples.
750  static NormalDistribution gaussian;
751  static LaplacianDistribution laplacian;
752  static UniformDistribution uniform;
753 
754  for (int a=0; a<n_actions; a++) {
755  //eval[a] = Qs[a] + urandom(-1.0,1.0)*vQs[a];
756  switch(confidence_distribution) {
757  case SINGULAR:
// Degenerate distribution: the sample is the mean itself.
758  sample[a] = Qs[a];
759  break;
760  case BOUNDED:
761  uniform.setMean(Qs[a]);
762  uniform.setVariance(vQs[a]);
763  sample[a] = uniform.generate();
764  break;
765  case GAUSSIAN:
766  gaussian.setMean(Qs[a]);
767  gaussian.setVariance(vQs[a]);
768  sample[a] = gaussian.generate();
769  break;
770  case LAPLACIAN:
// NOTE(review): unlike the other cases, the Laplacian draw is
// added to Qs[a] even though setMean was called -- verify whether
// generate() here is zero-mean upstream.
771  laplacian.setMean(Qs[a]);
772  laplacian.setVariance(vQs[a]);
773  sample[a] = Qs[a] + laplacian.generate();
774  break;
775  default:
776  Serror ("Unknown distribution ID:%d\n", confidence_distribution);
777  break;
778  }
779  }
780  return argMax(sample);
781 }
782 
// NOTE(review): the signature line is missing from this extract; per
// the file's index this is `int DiscretePolicy::softMax(real* Qs)`
// (policy.cpp:783).  Gibbs/Boltzmann sampling: draws an action with
// probability proportional to exp(Qs[a]/temp).  Returns -1 (with a
// diagnostic) if the roulette draw falls through, e.g. when the
// exponentials overflow/underflow at extreme temperatures.
784  real sum=0.0f;
785  real beta = 1.0f/temp;
786  int a;
787  for (a=0; a<n_actions; a++) {
788  eval[a] = exp(beta * Qs[a]);
789  sum += eval[a];
790  }
// Roulette-wheel draw over the unnormalised weights.
791  real X = urandom()*sum;
792  real dsum = 0.0;
793  for (a=0; a<n_actions; a++) {
794  dsum += eval[a];
795  if (X<=dsum)
796  return a;
797 
798  }
799  fprintf (stderr, "softMax: No action selected! %f %f %f\nT:%f\n",X,dsum,sum,temp);
800  return -1;
801 }
// NOTE(review): the signature line is missing from this extract; per
// the file's index this is `int DiscretePolicy::eGreedy(real* Qs)`
// (policy.cpp:802).  e-greedy selection: with probability temp pick
// a uniformly random action, otherwise the greedy one.  Also fills
// eval[] with the resulting selection probabilities as a side effect.
// NOTE(review): mixes rand() with the urandom() generator used
// elsewhere -- possibly intentional, but worth confirming.
803  real X = urandom();
804  int amax = argMax(Qs);
805  real base_prob = temp/((real) n_actions);
806  for (int a=0; a<n_actions; a++) {
807  eval[a] = base_prob;
808  }
809  eval[amax] += 1.0-temp;
810  if (X<temp) {
811  return rand()%n_actions;
812  }
813  return argMax(Qs);
814 }
815 
// NOTE(review): the signature line is missing from this extract; per
// the file's index this is `int DiscretePolicy::argMax(real* Qs)`
// (policy.cpp:816).  Linear scan over the n_actions entries of Qs;
// returns the index of the first maximum (ties go to the lowest
// index).  Assumes n_actions >= 1 -- Qs[0] is read unconditionally.
817  real max = Qs[0];
818  int arg_max = 0;
819  for (int a=1; a<n_actions; a++) {
820  if (max<Qs[a]) {
821  max = Qs[a];
822  arg_max = a;
823  }
824  }
825  return arg_max;
826 }
827 
828 
bool forced_learning
Force agent to take supplied action.
Definition: policy.h:173
int softMax(real *Qs)
Softmax Gibbs sampling.
Definition: policy.cpp:783
virtual void setPursuit(bool pursuit)
Use Pursuit for action selection.
Definition: policy.cpp:618
bool confidence_uses_gibbs
Additional gibbs sampling for confidence.
Definition: policy.h:178
bool confidence
Confidence estimates option.
Definition: policy.h:174
real r
reward
Definition: policy.h:158
Scalar max(Scalar x, Scalar y)
Definition: Basic.h:50
real tdError
temporal difference error
Definition: policy.h:160
real * sample
sampling output
Definition: policy.h:154
virtual void setVariance(real var)
Definition: Distribution.h:95
real ** P
pursuit action probabilities
Definition: policy.h:163
virtual ~DiscretePolicy()
Kill the agent and free everything.
Definition: policy.cpp:155
virtual void setVariance(real var)
Definition: Distribution.h:122
Laplacian probability distribution.
Definition: Distribution.h:103
virtual void setMean(real mean)
Definition: Distribution.h:124
Definition: policy.h:140
int max_el_state
max state ID to search for eligibility
Definition: policy.h:171
real * eval
evaluation of the current action
Definition: policy.h:153
int ps
previous state
Definition: policy.h:156
virtual void setSarsa()
Set the algorithm to SARSA mode.
Definition: policy.cpp:611
int eGreedy(real *Qs)
e-greedy sampling
Definition: policy.cpp:802
virtual real generate()
virtual void saveFile(char *f)
Save policy to a file.
Definition: policy.cpp:550
Definition: Basic.h:58
bool pursuit
pursuit option
Definition: policy.h:162
virtual void Reset()
Use at the end of every episode, after agent has entered the absorbing state.
Definition: policy.cpp:474
int n_actions
number of actions
Definition: policy.h:150
real expected_r
Expected reward.
Definition: policy.h:167
real ** vQ
variance estimate for Q
Definition: policy.h:180
real pQ
previous Q
Definition: policy.h:155
bool replacing_traces
Replacing instead of accumulating traces.
Definition: policy.h:172
virtual real generate()
virtual void setELearning()
Set the algorithm to ELearning mode.
Definition: policy.cpp:604
virtual real generate()
int n_samples
number of samples for above expected r and V
Definition: policy.h:169
real alpha
learning rate
Definition: policy.h:166
void empty_log(const char *s,...)
Definition: policy.cpp:30
virtual void useSoftmax(bool softmax)
Set action selection to softmax.
Definition: policy.cpp:662
virtual void setMean(real mean)
Definition: Distribution.h:97
static Point p[4]
Definition: Convex.cpp:54
virtual void setForcedLearning(bool forced)
Set forced learning (force-feed actions)
Definition: policy.cpp:639
virtual void loadFile(char *f)
Load policy from a file.
Definition: policy.cpp:484
enum ConfidenceDistribution confidence_distribution
Distribution to use for confidence sampling.
Definition: policy.h:177
int n_states
number of states
Definition: policy.h:149
#define logmsg
Definition: policy.cpp:27
enum LearningMethod learning_method
learning method to use;
Definition: policy.h:148
virtual void useGibbsConfidence(bool gibbs)
Add Gibbs sampling for confidences.
Definition: policy.cpp:704
virtual void setGamma(real gamma)
Set the gamma of the sum to be maximised.
Definition: policy.cpp:656
Uniform probability distribution.
Definition: Distribution.h:81
bool smax
softmax option
Definition: policy.h:161
ConfidenceDistribution
Types of confidence distributions.
Definition: policy.h:142
int confMax(real *Qs, real *vQs, real p=1.0)
Confidence-based Gibbs sampling.
Definition: policy.cpp:715
virtual void setConfidenceDistribution(enum ConfidenceDistribution cd)
Set the distribution for direct action sampling.
Definition: policy.cpp:684
bool confidence_eligibility
Apply eligibility traces to confidence.
Definition: policy.h:175
Gaussian probability distribution.
Definition: Distribution.h:57
virtual void setQLearning()
Set the algorithm to QLearning mode.
Definition: policy.cpp:598
int argMax(real *Qs)
Get ID of maximum action.
Definition: policy.cpp:816
real Sum(real *a, int n)
real lambda
Eligibility trace decay.
Definition: policy.h:165
void Normalise(real *src, real *dst, int n_elements)
Normalise a vector to a destination vector (low level)
virtual void setRandomness(real epsilon)
Set randomness for action selection. Does not affect confidence mode.
Definition: policy.cpp:645
virtual void useReliabilityEstimate(bool ri)
Use the reliability estimate method for action selection.
Definition: policy.cpp:673
real gamma
Future discount parameter.
Definition: policy.h:164
int pa
previous action
Definition: policy.h:157
virtual bool useConfidenceEstimates(bool confidence, real zeta=0.01, bool confidence_eligibility=false)
Set to use confidence estimates for action selection, with variance smoothing zeta.
Definition: policy.cpp:580
real temp
scratch
Definition: policy.h:159
int confSample(real *Qs, real *vQs)
Directly sample from action value distribution.
Definition: policy.cpp:749
virtual void setMean(real mean)
Definition: Distribution.h:76
real ** e
eligibility trace
Definition: policy.h:152
virtual int SelectAction(int s, real r, int forced_a=-1)
Select an action a, given state s and reward from previous action.
Definition: policy.cpp:283
#define Serror
Definition: learn_debug.h:10
real expected_V
Expected state return.
Definition: policy.h:168
real urandom()
real ** Q
state-action evaluation
Definition: policy.h:151
virtual void setVariance(real var)
Definition: Distribution.h:74
float real
Definition: real.h:13
virtual void saveState(FILE *f)
Save the current evaluations in text format to a file.
Definition: policy.cpp:128
int min_el_state
min state ID to search for eligibility
Definition: policy.h:170
DiscretePolicy(int n_states, int n_actions, real alpha=0.1, real gamma=0.8, real lambda=0.8, bool softmax=false, real randomness=0.1, real init_eval=0.0)
Create a new discrete policy.
Definition: policy.cpp:42
bool reliability_estimate
reliability estimates option
Definition: policy.h:176
virtual void setReplacingTraces(bool replacing)
Use replacing instead of accumulating eligibility traces.
Definition: policy.cpp:629
real zeta
Confidence smoothing.
Definition: policy.h:179