/***************************************************************************
 *   Copyright (C) 2005 by Robot Group Leipzig                             *
 *    martius@informatik.uni-leipzig.de                                    *
 *    fhesse@informatik.uni-leipzig.de                                     *
 *    der@informatik.uni-leipzig.de                                        *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify *
 *   it under the terms of the GNU General Public License as published by *
 *   the Free Software Foundation; either version 2 of the License, or    *
 *   (at your option) any later version.                                  *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,      *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of       *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the         *
 *   GNU General Public License for more details.                         *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License    *
 *   along with this program; if not, write to the                        *
 *   Free Software Foundation, Inc.,                                      *
 *   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.             *
 ***************************************************************************
 *                                                                         *
 *   DESCRIPTION                                                           *
 *                                                                         *
 *   $Log: qlearning.h,v $
 *   Revision 1.6  2008/05/02 17:20:04  martius
 *   *** empty log message ***
 *
 *   Revision 1.5  2008/04/17 14:54:45  martius
 *   randomGen added, which is a random generator with long period and an
 *   internal state. Each Agent has an instance and passes it to the controller
 *   and the wiring. This is good for
 *   a) repeatability on agent basis,
 *   b) parallel execution as done in ode_robots
 *
 *   Revision 1.4  2008/02/29 20:38:46  martius
 *   reward is better averaged, but uses more memory
 *
 *   Revision 1.3  2007/08/24 12:05:34  martius
 *   sarsa optional
 *   randomQ initialisation optional
 *
 *   Revision 1.2  2007/07/16 20:25:54  martius
 *   works now
 *   features:
 *    eligibility trace, meaning that updates are also done for
 *     past state/action pairs
 *    random work in case of no knowledge
 *
 *   Revision 1.1  2007/07/16 08:50:03  martius
 *   first, but not yet working implementation of q-learning
 *
 *
 *                                                                         *
 ***************************************************************************/
#ifndef __QLEARNING_H
#define __QLEARNING_H

#include "matrix.h"
#include "configurable.h"
#include "storeable.h"
#include "randomgenerator.h"
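
/* A minimal usage sketch: the typical control loop selects an action for the
   current discrete state, applies it, observes a reward and then calls
   learn(). The constants N_STATES and N_ACTIONS and the helpers
   observeState(), applyAction() and getReward() are hypothetical placeholders
   for the embedding environment; the sketch only shows the calling order of
   init(), select(), learn() and reset().

     QLearning ql(0.1, 0.9, 0.02, 3);   // eps, discount, exploration, eligibility steps
     ql.init(N_STATES, N_ACTIONS);      // sizes of the discrete state and action sets

     unsigned int s = observeState();
     for (int step = 0; step < 1000; ++step) {
       unsigned int a = ql.select(s);   // greedy action, random with rate 'exploration'
       applyAction(a);
       double r = getReward();
       ql.learn(s, a, r);               // update Q (and eligibility-traced predecessors)
       s = observeState();
     }
     ql.reset();                        // episode/agent was reset; call learn() once more
                                        // beforehand if a final reward should be counted
*/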

/// Implements tabular Q-learning (optionally SARSA) with an eligibility trace.
class QLearning : public Configurable, public Storeable {
public:

  /**
     \param eps learning rate (typically 0.1)
     \param discount discount factor for Q-values (typically 0.9)
     \param exploration exploration rate (typically 0.02)
     \param eligibility number of steps to update backwards in time
     \param random_initQ if true, the Q-table is filled with small random numbers at the start (default: false)
     \param useSARSA if true, use the SARSA update rule, otherwise Q-learning (default: false)
     \param tau number of time steps over which the reward is averaged for the collected reward
  */
  QLearning(double eps, double discount, double exploration, int eligibility,
            bool random_initQ = false, bool useSARSA = false, int tau = 1000);

  virtual ~QLearning();

  /** initialisation with the given number of states and actions
      @param stateDim number of states
      @param actionDim number of actions
      @param randGen pointer to a random number generator (optional)
  */
  virtual void init(unsigned int stateDim, unsigned int actionDim, RandGen* randGen = 0);

  /** selection of an action given the current state.
      The policy is to take the action with the highest value,
      or a random action at the rate of exploration.
  */
  virtual unsigned int select(unsigned int state);

  /** selection of an action given the current state.
      The policy is to sample from the above-average actions, with a bias
      towards the old action (exploration included as well).
  */
  virtual unsigned int select_sample(unsigned int state);

  /// select with preference for the old action (90% if it is good) and 30% for the second best
  virtual unsigned int select_keepold(unsigned int state);

  /** performs learning and returns the current expected reward.
      \param state current state
      \param action action we selected in the current state
      \param reward reinforcement obtained in this state
      \param learnRateFactor can be given to modify eps for this learning step
  */
  virtual double learn(unsigned int state,
                       unsigned int action,
                       double reward,
                       double learnRateFactor = 1);

  /** returns the vector of values for all actions given the current state
  */
  matrix::Matrix getActionValues(unsigned int state);

  /** tells the Q-learning that the agent was reset, so that it forgets
      its memory. Note that the Q-table update lags one step behind, so in
      case of a reward you should call learn() one more time before reset().
  */
  virtual void reset();

  /// returns the number of states
  virtual unsigned int getStateDim() const;
  /// returns the number of actions
  virtual unsigned int getActionDim() const;

  /// returns the collected reward
  virtual double getCollectedReward() const;

  /// expects a list of (value, range) pairs and returns the associated state
  static int valInCrossProd(const std::list<std::pair<int,int> >& vals);

  /// expects a list of ranges and a state/action and returns the associated configuration
  static std::list<int> ConfInCrossProd(const std::list<int>& ranges, int val);

  /// returns the Q-table (m x n) == (states x actions)
  virtual const matrix::Matrix& getQ() const { return Q; }

  virtual bool store(FILE* f) const;

  virtual bool restore(FILE* f);


protected:
  double eps;
  double discount;
  double exploration;
  double eligibility; // used as an integer (double only for configuration)
  bool random_initQ;
public:
  bool useSARSA;      ///< if true, use the SARSA update rule, otherwise Q-learning
protected:
  int tau;            ///< time horizon for averaging the reward
  matrix::Matrix Q;   ///< Q-table (m x n) == (states x actions)

  int* actions;       // ring buffer for actions
  int* states;        // ring buffer for states
  double* rewards;    // ring buffer for rewards
  int ringbuffersize; // size of the ring buffers, eligibility + 1
  double* longrewards;// long ring buffer of rewards for collectedReward
  int t;              // time index into the ring buffers
  bool initialised;
  double collectedReward; // sum over the collected reward

  RandGen* randGen;
};


#endif
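
/* A sketch of how the cross-product helpers above might be used: they fold a
   multi-dimensional discrete configuration (e.g. several discretised sensor
   values) into a single state index and back. The concrete ranges (4 and 3
   bins) are only an example, and the sketch assumes that ConfInCrossProd()
   is the inverse of valInCrossProd() for the same ranges.

     std::list<std::pair<int,int> > vals;   // (value, range) per dimension
     vals.push_back(std::make_pair(2, 4));  // first dimension: value 2 out of range 4
     vals.push_back(std::make_pair(1, 3));  // second dimension: value 1 out of range 3
     int state = QLearning::valInCrossProd(vals); // index in the 4*3 = 12 element state space

     std::list<int> ranges;                 // the same ranges, for decoding
     ranges.push_back(4);
     ranges.push_back(3);
     std::list<int> conf = QLearning::ConfInCrossProd(ranges, state);
     // conf should hold the original configuration (2, 1)
*/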