/***************************************************************************
 *   Copyright (C) 2005-2011 LpzRobots development team                    *
 *    Georg Martius  <georg dot martius at web dot de>                     *
 *    Frank Guettler <guettler at informatik dot uni-leipzig dot de>       *
 *    Frank Hesse    <frank at nld dot ds dot mpg dot de>                  *
 *    Ralf Der       <ralfder at mis dot mpg dot de>                       *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 *                                                                         *
 ***************************************************************************/
#ifndef __QLEARNING_H
#define __QLEARNING_H

#include "matrix.h"
#include "configurable.h"
#include "storeable.h"
#include "randomgenerator.h"

#include <list>      // std::list used by the cross-product helpers
#include <utility>   // std::pair
#include <cstdio>    // FILE used by store()/restore()
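/* Usage sketch (editorial example, not part of the original API documentation):
   a minimal observe-act-learn loop. The helpers observeState(), performAction()
   and computeReward() are hypothetical placeholders for whatever the surrounding
   controller provides; the constructor arguments follow the typical values given
   in the constructor documentation below.

     QLearning ql(0.1, 0.9, 0.02, 5);     // eps, discount, exploration, eligibility
     ql.init(numStates, numActions);      // allocate the (states x actions) Q table
     unsigned int s = observeState();
     while (running) {
       unsigned int a = ql.select(s);     // highest-value action, random at exploration rate
       performAction(a);
       double r = computeReward();
       ql.learn(s, a, r);                 // update the Q-value of the (s, a) pair
       s = observeState();
     }
*/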
/// implements tabular Q-learning (or SARSA, see useSARSA) on a discrete state/action space
class QLearning : public Configurable, public Storeable {
public:
  /**
     \param eps learning rate (typically 0.1)
     \param discount discount factor for Q-values (typically 0.9)
     \param exploration exploration rate (typically 0.02)
     \param eligibility number of steps to update backwards in time
     \param random_initQ if true, the Q table is filled with small random numbers at the start (default: false)
     \param useSARSA if true, use the SARSA update rule, otherwise Q-learning (default: false)
     \param tau number of time steps over which the reward is averaged for col_rew
  */
  QLearning(double eps, double discount, double exploration, int eligibility,
            bool random_initQ = false, bool useSARSA = false, int tau = 1000);

  virtual ~QLearning();

  /** initialisation with the given number of states and actions
      @param stateDim number of states
      @param actionDim number of actions
      @param randGen random number generator to use (optional)
  */
  virtual void init(unsigned int stateDim, unsigned int actionDim, RandGen* randGen = 0);

  /** selection of an action given the current state.
      The policy is to take the action with the highest value,
      or a random action at the exploration rate.
  */
  virtual unsigned int select(unsigned int state);

  /** selection of an action given the current state.
      The policy is to sample from the above-average actions, with a bias
      towards the old action (exploration is also included).
  */
  virtual unsigned int select_sample(unsigned int state);

  /// select with preference for the old action (90% if good) and 30% for the second best
  virtual unsigned int select_keepold(unsigned int state);

  /** performs one learning step and returns the current expected reward.
      \param state current state
      \param action action we selected in the current state
      \param reward reinforcement we obtained in this state
      \param learnRateFactor can be given to modify eps for this learning step
  */
  virtual double learn(unsigned int state,
                       unsigned int action,
                       double reward,
                       double learnRateFactor = 1);

  /** returns the vector of values for all actions given the current state
  */
  matrix::Matrix getActionValues(unsigned int state);

  /** tells the Q-learning that the agent was reset, so that it forgets its
      memory. Please note that the Q-table update lags one step behind, so in
      case of a final reward you should call learn() one more time before reset().
  */
  virtual void reset();

  /// returns the number of states
  virtual unsigned int getStateDim() const;
  /// returns the number of actions
  virtual unsigned int getActionDim() const;

  /// returns the collected reward
  virtual double getCollectedReward() const;

  /// expects a list of (value, range) pairs and returns the associated state
  static int valInCrossProd(const std::list<std::pair<int,int> >& vals);

  /// expects a list of ranges and a state/action index and returns the configuration
  static std::list<int> ConfInCrossProd(const std::list<int>& ranges, int val);

  /// returns the Q table (m x n) == (states x actions)
  virtual const matrix::Matrix& getQ() const { return Q; }

  virtual bool store(FILE* f) const;

  virtual bool restore(FILE* f);


protected:
  double eps;
  double discount;
  double exploration;
  double eligibility;      // used as an integer (double only for configuration)
  bool random_initQ;
public:
  bool useSARSA;           ///< if true, use SARSA strategy, otherwise Q-learning
protected:
  int tau;                 ///< time horizon for averaging the reward
  matrix::Matrix Q;        ///< Q table (m x n) == (states x actions)


  int* actions;            // ring buffer for actions
  int* states;             // ring buffer for states
  double* rewards;         // ring buffer for rewards
  int ringbuffersize;      // size of the ring buffers (eligibility + 1)
  double* longrewards;     // long ring buffer for rewards for collectedReward
  int t;                   // time index for the ring buffers
  bool initialised;
  double collectedReward;  // sum of collected reward

  RandGen* randGen;
};


#endif
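/* Editorial sketch for the static cross-product helpers above. The exact index
   ordering is an assumption; only the general idea of flattening a tuple of
   discrete values into one state index (and back) is implied by the
   documentation. Example with two discrete sensors of 3 and 4 possible values:

     std::list<std::pair<int,int> > vals;
     vals.push_back(std::make_pair(2, 3));            // sensor 1: value 2 out of range 3
     vals.push_back(std::make_pair(1, 4));            // sensor 2: value 1 out of range 4
     int state = QLearning::valInCrossProd(vals);     // flat state index in [0, 3*4)

     std::list<int> ranges;
     ranges.push_back(3);
     ranges.push_back(4);
     std::list<int> conf = QLearning::ConfInCrossProd(ranges, state);  // back to the values (2, 1)
*/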