/***************************************************************************
 *   Copyright (C) 2005-2011 LpzRobots development team                    *
 *    Georg Martius  <georg dot martius at web dot de>                     *
 *    Frank Guettler <guettler at informatik dot uni-leipzig dot de>       *
 *    Frank Hesse    <frank at nld dot ds dot mpg dot de>                  *
 *    Ralf Der       <ralfder at mis dot mpg dot de>                       *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 *                                                                         *
 ***************************************************************************/
#ifndef __QLEARNING_H
#define __QLEARNING_H

#include "matrix.h"
#include "configurable.h"
#include "storeable.h"
#include "randomgenerator.h"

#include <list>      // std::list used by the cross-product helpers
#include <utility>   // std::pair
#include <cstdio>    // FILE used by store()/restore()
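/* Usage sketch (editorial example, not part of the original API documentation):
   a minimal observe-act-learn loop. The helpers observeState(), performAction()
   and computeReward() are hypothetical placeholders for whatever the surrounding
   controller provides; the constructor arguments follow the typical values given
   in the constructor documentation below.

     QLearning ql(0.1, 0.9, 0.02, 5);     // eps, discount, exploration, eligibility
     ql.init(numStates, numActions);      // allocate the (states x actions) Q table
     unsigned int s = observeState();
     while (running) {
       unsigned int a = ql.select(s);     // highest-value action, random at exploration rate
       performAction(a);
       double r = computeReward();
       ql.learn(s, a, r);                 // update the Q-value of the (s, a) pair
       s = observeState();
     }
*/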
/// implements tabular Q-learning (or SARSA, see useSARSA) on a discrete state/action space
class QLearning : public Configurable, public Storeable {
public:
  /**
     \param eps learning rate (typically 0.1)
     \param discount discount factor for Q-values (typically 0.9)
     \param exploration exploration rate (typically 0.02)
     \param eligibility number of steps to update backwards in time
     \param random_initQ if true, the Q table is filled with small random numbers at the start (default: false)
     \param useSARSA if true, use the SARSA update rule, otherwise Q-learning (default: false)
     \param tau number of time steps over which the reward is averaged for col_rew
  */
  QLearning(double eps, double discount, double exploration, int eligibility,
            bool random_initQ = false, bool useSARSA = false, int tau = 1000);

  virtual ~QLearning();

  /** initialisation with the given number of states and actions
      @param stateDim number of states
      @param actionDim number of actions
      @param randGen random number generator to use (optional)
  */
  virtual void init(unsigned int stateDim, unsigned int actionDim, RandGen* randGen = 0);

  /** selection of an action given the current state.
      The policy is to take the action with the highest value,
      or a random action at the exploration rate.
  */
  virtual unsigned int select(unsigned int state);

  /** selection of an action given the current state.
      The policy is to sample from the above-average actions, with a bias
      towards the old action (exploration is also included).
  */
  virtual unsigned int select_sample(unsigned int state);

  /// select with preference for the old action (90% if good) and 30% for the second best
  virtual unsigned int select_keepold(unsigned int state);

  /** performs one learning step and returns the current expected reward.
      \param state current state
      \param action action we selected in the current state
      \param reward reinforcement we obtained in this state
      \param learnRateFactor can be given to modify eps for this learning step
  */
  virtual double learn(unsigned int state,
                       unsigned int action,
                       double reward,
                       double learnRateFactor = 1);

  /** returns the vector of values for all actions given the current state
  */
  matrix::Matrix getActionValues(unsigned int state);

  /** tells the Q-learning that the agent was reset, so that it forgets its
      memory. Please note that the Q-table update lags one step behind, so in
      case of a final reward you should call learn() one more time before reset().
  */
  virtual void reset();

  /// returns the number of states
  virtual unsigned int getStateDim() const;
  /// returns the number of actions
  virtual unsigned int getActionDim() const;

  /// returns the collected reward
  virtual double getCollectedReward() const;

  /// expects a list of (value, range) pairs and returns the associated state
  static int valInCrossProd(const std::list<std::pair<int,int> >& vals);

  /// expects a list of ranges and a state/action index and returns the configuration
  static std::list<int> ConfInCrossProd(const std::list<int>& ranges, int val);

  /// returns the Q table (m x n) == (states x actions)
  virtual const matrix::Matrix& getQ() const { return Q; }

  virtual bool store(FILE* f) const;

  virtual bool restore(FILE* f);


protected:
  double eps;
  double discount;
  double exploration;
  double eligibility;      // used as an integer (double only for configuration)
  bool random_initQ;
public:
  bool useSARSA;           ///< if true, use SARSA strategy, otherwise Q-learning
protected:
  int tau;                 ///< time horizon for averaging the reward
  matrix::Matrix Q;        ///< Q table (m x n) == (states x actions)


  int* actions;            // ring buffer for actions
  int* states;             // ring buffer for states
  double* rewards;         // ring buffer for rewards
  int ringbuffersize;      // size of the ring buffers (eligibility + 1)
  double* longrewards;     // long ring buffer for rewards for collectedReward
  int t;                   // time index for the ring buffers
  bool initialised;
  double collectedReward;  // sum of collected reward

  RandGen* randGen;
};


#endif
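/* Editorial sketch for the static cross-product helpers above. The exact index
   ordering is an assumption; only the general idea of flattening a tuple of
   discrete values into one state index (and back) is implied by the
   documentation. Example with two discrete sensors of 3 and 4 possible values:

     std::list<std::pair<int,int> > vals;
     vals.push_back(std::make_pair(2, 3));            // sensor 1: value 2 out of range 3
     vals.push_back(std::make_pair(1, 4));            // sensor 2: value 1 out of range 4
     int state = QLearning::valInCrossProd(vals);     // flat state index in [0, 3*4)

     std::list<int> ranges;
     ranges.push_back(3);
     ranges.push_back(4);
     std::list<int> conf = QLearning::ConfInCrossProd(ranges, state);  // back to the values (2, 1)
*/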