27 #define logmsg empty_log 43 real gamma,
real lambda,
bool softmax,
76 logmsg (
"#Making Sarsa(lambda) ");
82 logmsg (
" policy with Q:[%d x %d] -> R, a:%f g:%f, l:%f, t:%f\n",
83 this->n_states, this->n_actions, this->alpha, this->gamma, this->lambda, this->
temp);
141 fprintf (f,
"%f ",
Q[s][a]);
144 fprintf (f,
"%f ",
P[s][a]);
147 fprintf (f,
"%f ",
vQ[s][a]);
158 FILE* f = fopen (
"/tmp/discrete",
"wb");
171 fprintf (f,
"%f ",
Q[s][a]);
174 fprintf (f,
"%f ",
P[s][a]);
177 fprintf (f,
"%f ",
vQ[s][a]);
187 logmsg (
"#Expected return of greedy policy over random distribution of states: %f\n", sum/((
real)
n_states));
289 if ((
ps>=0)&&(
pa>=0)) {
295 real max_estimate = 0.0;
296 real max_estimate_k = 0.0;
298 max_estimate +=
Q[i][
argMax (
Q[i])];
299 max_estimate_k += 1.0;
303 logmsg (
"%f %f %f %f#rTVV\n",
307 max_estimate/max_estimate_k);
317 P[s][argmax] +=
zeta*(1.0-
P[s][argmax]);
320 P[s][j] +=
zeta*(0.0-
P[s][j]);
345 fprintf (stderr,
"No action selected with pursuit!\n");
369 fprintf (stderr,
"Action %d out of bounds.. ", a);
371 fprintf (stderr,
"mapping to %d\n", a);
392 EQ_s +=
eval[i] *
Q[s][i];
398 fprintf (stderr,
"Unknown learning method\n");
400 if ((
ps>=0)&&(
pa>=0)) {
410 real variance_threshold = 0.0001f;
413 if (
vQ[
ps][
pa]<variance_threshold) {
414 vQ[
ps][
pa]=variance_threshold;
426 Q[i][j] += ad *
e[i][j];
429 vQ[i][j] = (1.0 - zeta_el)*
vQ[i][j] + zeta_el*(ad*ad);
430 if (
vQ[i][j]<variance_threshold) {
431 vQ[i][j]=variance_threshold;
438 if ((fabs (
Q[i][j])>1000.0)||(isnan(
Q[i][j]))) {
439 printf (
"u: %d %d %f %f\n", i,j,
Q[i][j], ad *
e[i][j]);
487 fh = fopen (f,
"rb");
489 fprintf (stderr,
"Failed to read file %s\n", f);
493 const char* start_tag=
"QSA";
494 const char* close_tag=
"END";
495 int n_read_states, n_read_actions;
497 fread((
void *) rtag,
sizeof (
char), strlen (start_tag)+1, fh);
498 if (strcmp (rtag, start_tag)) {
499 fprintf (stderr,
"Could not find starting tag\n");
502 fread((
void *) &n_read_states,
sizeof(
int), 1, fh);
503 fread((
void *) &n_read_actions,
sizeof(
int), 1, fh);
506 fprintf (stderr,
"File has %dx%d space! Aborting read.\n", n_read_states, n_read_actions);
515 if ((fabs (
Q[i][j])>100.0)||(isnan(
Q[i][j]))) {
516 printf (
"l: %d %d %f\n", i,j,
Q[i][j]);
528 P[i][argmax] += 0.001*(1.0-
P[i][argmax]);
531 P[i][j] += 0.001*(0.0-
P[i][j]);
538 fread((
void *) rtag,
sizeof (
char), strlen (close_tag)+1, fh);
539 if (strcmp (rtag, close_tag)) {
540 fprintf (stderr,
"Could not find ending tag\n");
552 fh = fopen (f,
"wb");
554 fprintf (stderr,
"Failed to write to file %s\n", f);
558 const char* start_tag=
"QSA";
559 const char* close_tag=
"END";
561 fwrite((
void *) start_tag,
sizeof (
char), strlen (start_tag)+1, fh);
562 fwrite((
void *) &
n_states,
sizeof(
int), 1, fh);
563 fwrite((
void *) &
n_actions,
sizeof(
int), 1, fh);
567 if ((fabs (
Q[i][j])>100.0)||(isnan(
Q[i][j]))) {
568 printf (
"s: %d %d %f\n", i,j,
Q[i][j]);
572 fwrite((
void *) close_tag,
sizeof (
char), strlen (start_tag)+1, fh);
589 logmsg (
"#+[CONDIFENCE]");
591 logmsg (
"#-[CONDIFENCE]\n");
600 logmsg (
"#[Q-learning]\n");
606 logmsg (
"#[E-learning]\n");
633 logmsg (
"#[REPLACING TRACES]\n");
635 logmsg (
"#[ACCUMULATING TRACES]\n");
688 logmsg(
"#[SINGULAR CONFIDENCE]\n");
break;
690 logmsg(
"#[BOUNDED CONFIDENCE]\n");
break;
692 logmsg(
"#[GAUSSIAN CONFIDENCE]\n");
break;
694 logmsg(
"#[LAPLACIAN CONFIDENCE]\n");
break;
696 Serror (
"Unknown type %d\n", cd);
707 logmsg (
"#+[GIBBS CONFIDENCE]\n");
709 logmsg (
"#-[GIBBS CONFIDENCE]\n");
720 eval[a] = exp(pow(Qs[a]/sqrt(vQs[a]),
p));
730 cum += exp ((Qs[j]-
Q)/sqrt(vQs[j]));
745 fprintf (stderr,
"ConfMax: No action selected! %f %f %f\n",
X,dsum,sum);
788 eval[a] = exp(beta * Qs[a]);
799 fprintf (stderr,
"softMax: No action selected! %f %f %f\nT:%f\n",
X,dsum,sum,
temp);
bool forced_learning
Force agent to take supplied action.
int softMax(real *Qs)
Softmax Gibbs sampling.
virtual void setPursuit(bool pursuit)
Use Pursuit for action selection.
bool confidence_uses_gibbs
Additional gibbs sampling for confidence.
bool confidence
Confidence estimates option.
Scalar max(Scalar x, Scalar y)
real tdError
temporal difference error
real * sample
sampling output
real ** P
pursuit action probabilities
virtual ~DiscretePolicy()
Kill the agent and free everything.
virtual void setVariance(real var)
Laplacian probability distribution.
virtual void setMean(real mean)
int max_el_state
max state ID to search for eligibility
real * eval
evaluation of current action
virtual void setSarsa()
Set the algorithm to SARSA mode.
int eGreedy(real *Qs)
e-greedy sampling
virtual void saveFile(char *f)
Save policy to a file.
bool pursuit
pursuit option
virtual void Reset()
Use at the end of every episode, after agent has entered the absorbing state.
int n_actions
number of actions
real expected_r
Expected reward.
real ** vQ
variance estimate for Q
bool replacing_traces
Replacing instead of accumulating traces.
virtual void setELearning()
Set the algorithm to ELearning mode.
int n_samples
number of samples for above expected r and V
void empty_log(const char *s,...)
virtual void useSoftmax(bool softmax)
Set action selection to softmax.
virtual void setForcedLearning(bool forced)
Set forced learning (force-feed actions)
virtual void loadFile(char *f)
Load policy from a file.
enum ConfidenceDistribution confidence_distribution
Distribution to use for confidence sampling.
int n_states
number of states
enum LearningMethod learning_method
Learning method to use.
virtual void useGibbsConfidence(bool gibbs)
Add Gibbs sampling for confidences.
virtual void setGamma(real gamma)
Set the gamma of the sum to be maximised.
ConfidenceDistribution
Types of confidence distributions.
int confMax(real *Qs, real *vQs, real p=1.0)
Confidence-based Gibbs sampling.
virtual void setConfidenceDistribution(enum ConfidenceDistribution cd)
Set the distribution for direct action sampling.
bool confidence_eligibility
Apply eligibility traces to confidence.
Gaussian probability distribution.
virtual void setQLearning()
Set the algorithm to QLearning mode.
int argMax(real *Qs)
Get ID of maximum action.
real lambda
Eligibility trace decay.
void Normalise(real *src, real *dst, int n_elements)
Normalise a vector to a destination vector (low level)
virtual void setRandomness(real epsilon)
Set randomness for action selection. Does not affect confidence mode.
virtual void useReliabilityEstimate(bool ri)
Use the reliability estimate method for action selection.
real gamma
Future discount parameter.
virtual bool useConfidenceEstimates(bool confidence, real zeta=0.01, bool confidence_eligibility=false)
Set to use confidence estimates for action selection, with variance smoothing zeta.
int confSample(real *Qs, real *vQs)
Directly sample from action value distribution.
virtual void setMean(real mean)
real ** e
eligibility trace
virtual int SelectAction(int s, real r, int forced_a=-1)
Select an action a, given state s and reward from previous action.
real expected_V
Expected state return.
real ** Q
state-action evaluation
virtual void setVariance(real var)
virtual void saveState(FILE *f)
Save the current evaluations in text format to a file.
int min_el_state
min state ID to search for eligibility
DiscretePolicy(int n_states, int n_actions, real alpha=0.1, real gamma=0.8, real lambda=0.8, bool softmax=false, real randomness=0.1, real init_eval=0.0)
Create a new discrete policy.
bool reliability_estimate
reliability estimates option
virtual void setReplacingTraces(bool replacing)
Use replacing instead of accumulating eligibility traces.
real zeta
Confidence smoothing.