/***************************************************************************
 *   Copyright (C) 2005 by Robot Group Leipzig                             *
 *    martius@informatik.uni-leipzig.de                                    *
 *    fhesse@informatik.uni-leipzig.de                                     *
 *    der@informatik.uni-leipzig.de                                        *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify *
 *   it under the terms of the GNU General Public License as published by *
 *   the Free Software Foundation; either version 2 of the License, or    *
 *   (at your option) any later version.                                  *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,      *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of       *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the         *
 *   GNU General Public License for more details.                         *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License    *
 *   along with this program; if not, write to the                        *
 *   Free Software Foundation, Inc.,                                      *
 *   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.             *
 ***************************************************************************
 *                                                                         *
 *   DESCRIPTION                                                           *
 *                                                                         *
 *   $Log: qlearning.h,v $
 *   Revision 1.6  2008/05/02 17:20:04  martius
 *   *** empty log message ***
 *
 *   Revision 1.5  2008/04/17 14:54:45  martius
 *   randomGen added, which is a random generator with long period and an
 *   internal state. Each Agent has an instance and passes it to the controller
 *   and the wiring. This is good for
 *   a) repeatability on agent basis,
 *   b) parallel execution as done in ode_robots
 *
 *   Revision 1.4  2008/02/29 20:38:46  martius
 *   reward is better averaged, but uses more memory
 *
 *   Revision 1.3  2007/08/24 12:05:34  martius
 *   sarsa optional
 *   randomQ initialisation optional
 *
 *   Revision 1.2  2007/07/16 20:25:54  martius
 *   works now
 *   features:
 *    eligibility trace, meaning that updates are also done for
 *     past state/action pairs
 *    random work in case of no knowledge
 *
 *   Revision 1.1  2007/07/16 08:50:03  martius
 *   first, but not yet working implementation of q-learning
 *
 *
 *                                                                         *
 ***************************************************************************/
#ifndef __QLEARNING_H
#define __QLEARNING_H

#include "matrix.h"
#include "configurable.h"
#include "storeable.h"
#include "randomgenerator.h"
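
/* A minimal usage sketch: the typical control loop selects an action for the
   current discrete state, applies it, observes a reward and then calls
   learn(). The constants N_STATES and N_ACTIONS and the helpers
   observeState(), applyAction() and getReward() are hypothetical placeholders
   for the embedding environment; the sketch only shows the calling order of
   init(), select(), learn() and reset().

     QLearning ql(0.1, 0.9, 0.02, 3);   // eps, discount, exploration, eligibility steps
     ql.init(N_STATES, N_ACTIONS);      // sizes of the discrete state and action sets

     unsigned int s = observeState();
     for (int step = 0; step < 1000; ++step) {
       unsigned int a = ql.select(s);   // greedy action, random with rate 'exploration'
       applyAction(a);
       double r = getReward();
       ql.learn(s, a, r);               // update Q (and eligibility-traced predecessors)
       s = observeState();
     }
     ql.reset();                        // episode/agent was reset; call learn() once more
                                        // beforehand if a final reward should be counted
*/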

/// Implements tabular Q-learning (optionally SARSA) with an eligibility trace.
class QLearning : public Configurable, public Storeable {
public:

  /**
     \param eps learning rate (typically 0.1)
     \param discount discount factor for Q-values (typically 0.9)
     \param exploration exploration rate (typically 0.02)
     \param eligibility number of steps to update backwards in time
     \param random_initQ if true, the Q-table is filled with small random numbers at the start (default: false)
     \param useSARSA if true, use the SARSA update rule, otherwise Q-learning (default: false)
     \param tau number of time steps over which the reward is averaged for the collected reward
  */
  QLearning(double eps, double discount, double exploration, int eligibility,
            bool random_initQ = false, bool useSARSA = false, int tau = 1000);

  virtual ~QLearning();

  /** initialisation with the given number of states and actions
      @param stateDim number of states
      @param actionDim number of actions
      @param randGen pointer to a random number generator (optional)
  */
  virtual void init(unsigned int stateDim, unsigned int actionDim, RandGen* randGen = 0);

  /** selection of an action given the current state.
      The policy is to take the action with the highest value,
      or a random action at the rate of exploration.
  */
  virtual unsigned int select(unsigned int state);

  /** selection of an action given the current state.
      The policy is to sample from the above-average actions, with a bias
      towards the old action (exploration included as well).
  */
  virtual unsigned int select_sample(unsigned int state);

  /// select with preference for the old action (90% if it is good) and 30% for the second best
  virtual unsigned int select_keepold(unsigned int state);

  /** performs learning and returns the current expected reward.
      \param state current state
      \param action action we selected in the current state
      \param reward reinforcement obtained in this state
      \param learnRateFactor can be given to modify eps for this learning step
  */
  virtual double learn(unsigned int state,
                       unsigned int action,
                       double reward,
                       double learnRateFactor = 1);

  /** returns the vector of values for all actions given the current state
  */
  matrix::Matrix getActionValues(unsigned int state);

  /** tells the Q-learning that the agent was reset, so that it forgets
      its memory. Note that the Q-table update lags one step behind, so in
      case of a reward you should call learn() one more time before reset().
  */
  virtual void reset();

  /// returns the number of states
  virtual unsigned int getStateDim() const;
  /// returns the number of actions
  virtual unsigned int getActionDim() const;

  /// returns the collected reward
  virtual double getCollectedReward() const;

  /// expects a list of (value, range) pairs and returns the associated state
  static int valInCrossProd(const std::list<std::pair<int,int> >& vals);

  /// expects a list of ranges and a state/action and returns the associated configuration
  static std::list<int> ConfInCrossProd(const std::list<int>& ranges, int val);

  /// returns the Q-table (m x n) == (states x actions)
  virtual const matrix::Matrix& getQ() const { return Q; }

  virtual bool store(FILE* f) const;

  virtual bool restore(FILE* f);


protected:
  double eps;
  double discount;
  double exploration;
  double eligibility; // used as an integer (double only for configuration)
  bool random_initQ;
public:
  bool useSARSA;      ///< if true, use the SARSA update rule, otherwise Q-learning
protected:
  int tau;            ///< time horizon for averaging the reward
  matrix::Matrix Q;   ///< Q-table (m x n) == (states x actions)

  int* actions;       // ring buffer for actions
  int* states;        // ring buffer for states
  double* rewards;    // ring buffer for rewards
  int ringbuffersize; // size of the ring buffers, eligibility + 1
  double* longrewards;// long ring buffer of rewards for collectedReward
  int t;              // time index into the ring buffers
  bool initialised;
  double collectedReward; // sum over the collected reward

  RandGen* randGen;
};


#endif
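
/* A sketch of how the cross-product helpers above might be used: they fold a
   multi-dimensional discrete configuration (e.g. several discretised sensor
   values) into a single state index and back. The concrete ranges (4 and 3
   bins) are only an example, and the sketch assumes that ConfInCrossProd()
   is the inverse of valInCrossProd() for the same ranges.

     std::list<std::pair<int,int> > vals;   // (value, range) per dimension
     vals.push_back(std::make_pair(2, 4));  // first dimension: value 2 out of range 4
     vals.push_back(std::make_pair(1, 3));  // second dimension: value 1 out of range 3
     int state = QLearning::valInCrossProd(vals); // index in the 4*3 = 12 element state space

     std::list<int> ranges;                 // the same ranges, for decoding
     ranges.push_back(4);
     ranges.push_back(3);
     std::list<int> conf = QLearning::ConfInCrossProd(ranges, state);
     // conf should hold the original configuration (2, 1)
*/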