TORCS  1.3.9
The Open Racing Car Simulator
ann_policy.cpp
Go to the documentation of this file.
1 // -*- Mode: c++ -*-
2 // copyright (c) 2004 by Christos Dimitrakakis <dimitrak@idiap.ch>
3 // $Id$
4 
5 
6 /***************************************************************************
7  * *
8  * This program is free software; you can redistribute it and/or modify *
9  * it under the terms of the GNU General Public License as published by *
10  * the Free Software Foundation; either version 2 of the License, or *
11  * (at your option) any later version. *
12  * *
13  ***************************************************************************/
14 
15 #include <learning/ann_policy.h>
16 
17 ANN_Policy::ANN_Policy (int n_states, int n_actions, int n_hidden, real alpha, real gamma, real lambda, bool eligibility, bool softmax, real randomness, real init_eval, bool separate_actions) : DiscretePolicy (n_states, n_actions, alpha, gamma, lambda, softmax, randomness, init_eval)
18 {
19  this->separate_actions = separate_actions;
20  this->eligibility = eligibility;
21  if (eligibility) {
22  message ("Using eligibility traces");
23  }
24  if (separate_actions) {
25  message ("Separate actions");
26  J = NULL;
27  Ja = new ANN* [n_actions];
28  JQs = new real [n_actions];
29  for (int i=0; i<n_actions; i++) {
30  Ja[i] = NewANN (n_states, 1);
31  if (n_hidden > 0) {
32  ANN_AddHiddenLayer (Ja[i], n_hidden);
33  }
34  ANN_Init (Ja[i]);
36  ANN_SetBatchMode(Ja[i], false);
40  }
41  } else {
42  JQs = NULL;
43  Ja = NULL;
45  if (n_hidden > 0) {
46  ANN_AddHiddenLayer (J, n_hidden);
47  }
48  ANN_Init (J);
50  ANN_SetBatchMode(J, false);
54  }
55  ps = new real [n_states];
56  delta_vector = new real [n_actions];
57  J_ps_pa = 0.0;
58 }
59 
61 {
62  delete [] ps;
63  delete [] delta_vector;
64  if (separate_actions) {
65  for (int i=0; i<n_actions; i++) {
66  DeleteANN(Ja[i]);
67  }
68  delete [] Ja;
69  } else {
70  //ANN_ShowWeights(J);
71  DeleteANN (J);
72  }
73 }
74 
75 int ANN_Policy::SelectAction (real* s, real r, int forced_a)
76 {
77  int a; // selected action
78  int amax; //maximum evaluated action
79  real* Q_s; // pointer to evaluations for state s
80  if (confidence) {
81  if (separate_actions) {
82  for (int i=0; i<n_actions; i++) {
83  ANN_StochasticInput (Ja[i], s);
84  JQs[i] = ANN_GetOutput(Ja[i])[0];
85  }
86  Q_s = JQs;
87  } else {
89  Q_s = ANN_GetOutput (J);
90  }
91  } else {
92  if (separate_actions) {
93  for (int i=0; i<n_actions; i++) {
94  ANN_Input (Ja[i], s);
95  JQs[i] = ANN_GetOutput(Ja[i])[0];
96  }
97  Q_s = JQs;
98  } else {
99  ANN_Input (J, s);
100  Q_s = ANN_GetOutput (J);
101  }
102  }
103  int argmax = argMax (Q_s);
104 
105  if (forced_learning) {
106  a = forced_a;
107  } else if (confidence) {
108  a = argmax;
109  } else if (smax) {
110  a = softMax (Q_s);
111  //printf ("Q[%d][%d]=%f\n", s, a, Q[s][a]);
112  } else {
113  a = eGreedy (Q_s);
114  }
115 
116  if (a<0 || a>=n_actions) {
117  fprintf (stderr, "Action %d out of bounds\n", a);
118  }
119 
120  switch (learning_method) {
121 
122  case Sarsa:
123  amax = a;
124  break;
125  case QLearning:
126  amax = argmax;
127  break;
128  default:
129  amax = a;
130  fprintf (stderr, "Unknown learning method\n");
131  }
132  if (pa>=0) { // do not update at start of episode
133  real delta = r + gamma*Q_s[amax] - J_ps_pa;
134  tdError = delta;
135  for (int j=0; j<n_actions; j++) {
136  delta_vector[j] = 0.0;
137  }
138  if (separate_actions) {
139  if (eligibility) {
140  delta_vector[0] = 1.0;
141  ANN_Delta_Train (Ja[pa], delta_vector, delta);
142  // Reset other actions' traces.
143  for (int i=0; i<n_actions; i++) {
144  if (i!=pa) {
145  ANN_Reset(Ja[i]);
146  }
147  }
148  } else {
149  delta_vector[0] = delta;
151  }
152  } else {
153  if (J->eligibility_traces) {
154  delta_vector[pa] = 1.0;
155  ANN_Delta_Train (J, delta_vector, delta);
156  } else {
157  delta_vector[pa] = delta;
159  }
160  }
161 
162 
163  }
164 
165  //printf ("%d %d #STATE\n", min_el_state, max_el_state);
166  // printf ("Q[%d,%d]=%f r=%f e=%f ad=%f gl=%f #QV\n",
167  // ps, pa, Q[ps][pa], r, e[ps][pa], ad, gl);
168 
169  J_ps_pa = Q_s[a];
170  pa = a;
171 
172  return a;
173 }
174 
176 {
177  if (separate_actions) {
178  for (int i=0; i<n_actions; i++) {
179  ANN_Reset (Ja[i]);
180  }
181  } else {
182  ANN_Reset(J);
183  }
184 }
185 
188 bool ANN_Policy::useConfidenceEstimates (bool confidence, real zeta) {
189  this->confidence = confidence;
190  this->zeta = zeta;
191  if (separate_actions) {
192  for (int i=0; i<n_actions; i++) {
193  ANN_SetZeta(Ja[i], zeta);
194  }
195  } else {
196  ANN_SetZeta(J, zeta);
197  }
198  if (confidence) {
199  logmsg ("#+[CONDIFENCE]\n");
200  } else {
201  logmsg ("#-[CONDIFENCE]\n");
202  }
203 
204  return confidence;
205 }
bool forced_learning
Force agent to take supplied action.
Definition: policy.h:173
int softMax(real *Qs)
Softmax Gibbs sampling.
Definition: policy.cpp:783
bool confidence
Confidence estimates option.
Definition: policy.h:174
int ANN_Init(ANN *ann)
Initialise neural network.
Definition: ANN.cpp:346
real r
reward
Definition: policy.h:158
real ANN_Input(ANN *ann, real *x)
Give an input vector to the neural network.
Definition: ANN.cpp:406
real tdError
temporal difference error
Definition: policy.h:160
bool eligibility_traces
use eligibility
Definition: ANN.h:102
virtual bool useConfidenceEstimates(bool confidence, real zeta=0.01)
Set to use confidence estimates for action selection, with variance smoothing zeta.
Definition: ann_policy.cpp:188
virtual void Reset()
Reset eligibility traces.
Definition: ann_policy.cpp:175
Definition: policy.h:140
virtual ~ANN_Policy()
Definition: ann_policy.cpp:60
int eGreedy(real *Qs)
e-greedy sampling
Definition: policy.cpp:802
virtual int SelectAction(real *s, real r, int forced_a=-1)
Select an action, given a vector of real numbers which represents the state.
Definition: ann_policy.cpp:75
ANN * J
Evaluation network.
Definition: ann_policy.h:38
void ANN_SetLearningRate(ANN *ann, real a)
Set the learning rate to a.
Definition: ANN.cpp:856
int n_actions
number of actions
Definition: policy.h:150
real * ps
Previous state vector.
Definition: ann_policy.h:40
ANN_Policy(int n_states, int n_actions, int n_hidden=0, real alpha=0.1, real gamma=0.8, real lambda=0.8, bool eligibility=false, bool softmax=false, real randomness=0.1, real init_eval=0.0, bool separate_actions=false)
Make a new policy.
Definition: ann_policy.cpp:17
bool eligibility
eligibility option
Definition: ann_policy.h:44
void ANN_SetZeta(ANN *ann, real zeta)
Set zeta, parameter variance smoothing.
Definition: ANN.cpp:890
int DeleteANN(ANN *ann)
Delete a neural network.
Definition: ANN.cpp:77
real alpha
learning rate
Definition: policy.h:166
void ANN_Reset(ANN *ann)
Resets the eligibility traces and batch updates.
Definition: ANN.cpp:379
real ANN_StochasticInput(ANN *ann, real *x)
Stochastically generate an output, depending on parameter distributions.
Definition: ANN.cpp:429
real * ANN_GetOutput(ANN *ann)
Get the output for the current input.
Definition: ANN.cpp:824
int ANN_AddHiddenLayer(ANN *ann, int n_nodes)
Add a hidden layer with n_nodes.
Definition: ANN.cpp:111
int n_states
number of states
Definition: policy.h:149
enum LearningMethod learning_method
learning method to use;
Definition: policy.h:148
bool smax
softmax option
Definition: policy.h:161
ANN * NewANN(int n_inputs, int n_outputs)
Create a new ANN.
Definition: ANN.cpp:25
int argMax(real *Qs)
Get ID of maximum action.
Definition: policy.cpp:816
real lambda
Eligibility trace decay.
Definition: policy.h:165
real * JQs
Placeholder for evaluation vector (separate_actions)
Definition: ann_policy.h:41
real gamma
Future discount parameter.
Definition: policy.h:164
void ANN_SetBatchMode(ANN *ann, bool batch)
Set batch updates.
Definition: ANN.cpp:906
int pa
previous action
Definition: policy.h:157
bool separate_actions
Single/separate evaluation option.
Definition: ann_policy.h:45
void ANN_SetLambda(ANN *ann, real lambda)
Set lambda, eligibility decay.
Definition: ANN.cpp:872
real J_ps_pa
Evaluation of last action.
Definition: ann_policy.h:42
real ANN_Delta_Train(ANN *ann, real *delta, real TD)
Minimise a custom cost function.
Definition: ANN.cpp:584
Discrete policies with reinforcement learning.
Definition: policy.h:144
real * delta_vector
Scratch vector for TD error.
Definition: ann_policy.h:43
float real
Definition: real.h:13
void ANN_SetOutputsToLinear(ANN *ann)
Set outputs to linear.
Definition: ANN.cpp:1033
ANN management structure.
Definition: ANN.h:89
ANN ** Ja
Evaluation networks (for separate_actions case)
Definition: ann_policy.h:39
#define logmsg
Definition: learn_debug.h:17
void message(const char *msg,...)
Prints a message.
real zeta
Confidence smoothing.
Definition: policy.h:179