Example #1
int ANN_Policy::SelectAction (real* s, real r, int forced_a)
{
	int a; // selected action
	int amax; //maximum evaluated action
	real* Q_s; // pointer to evaluations for state s
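	// Evaluate Q(s, .): with separate_actions, each action has its own
	// network Ja[i] with a single output; otherwise the single network J
	// outputs one value per action. The confidence flag selects the
	// stochastic input path.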
	if (confidence) {
		if (separate_actions) {
			for (int i=0; i<n_actions; i++) {
				ANN_StochasticInput (Ja[i], s);
				JQs[i] = ANN_GetOutput(Ja[i])[0];
			}
			Q_s = JQs;
		} else {
			ANN_StochasticInput (J, s);
			Q_s = ANN_GetOutput (J);
		}
	} else {
		if (separate_actions) {
			for (int i=0; i<n_actions; i++) {
				ANN_Input (Ja[i], s);
				JQs[i] = ANN_GetOutput(Ja[i])[0];
			}
			Q_s = JQs;
		} else {
			ANN_Input (J, s);
			Q_s = ANN_GetOutput (J);
		}
	}
	int argmax = argMax (Q_s);

	if (forced_learning) {
		a = forced_a;
	} else if (confidence) {
		a = argmax;
	} else if (smax) {
		a = softMax (Q_s);
		//printf ("Q[%d][%d]=%f\n", s, a, Q[s][a]);
	} else {
		a = eGreedy (Q_s);
	}

	if (a<0 || a>=n_actions) {
		fprintf (stderr, "Action %d out of bounds\n", a);
	}

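	// Pick the action whose value is used for bootstrapping: the selected
	// action for Sarsa, the greedy action for Q-learning.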
	switch (learning_method) {
		
	case Sarsa: 
		amax = a;
		break;
	case QLearning:
		amax = argmax;
		break;
	default:
		amax = a;
		fprintf (stderr, "Unknown learning method\n");
	}
	if (pa>=0) { // do not update at start of episode
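		// TD error for the previous state-action pair; it is fed to
		// ANN_Delta_Train either directly as the output error or, with
		// eligibility traces, as the scalar training signal.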
		real delta = r + gamma*Q_s[amax] - J_ps_pa;
		tdError = delta;
		for (int j=0; j<n_actions; j++) {
			delta_vector[j] = 0.0;
		}
		if (separate_actions) {
			if (eligibility) {
				delta_vector[0] = 1.0;
				ANN_Delta_Train (Ja[pa], delta_vector, delta);
				// Reset other actions' traces.
				for (int i=0; i<n_actions; i++) {
					if (i!=pa) {
						ANN_Reset(Ja[i]);
					}
				}
			} else {
				delta_vector[0] = delta;
				ANN_Delta_Train (Ja[pa], delta_vector, 0.0);
			}
		} else {
			if (J->eligibility_traces) {
				delta_vector[pa] = 1.0;
				ANN_Delta_Train (J, delta_vector, delta);
			} else {
				delta_vector[pa] = delta;
				ANN_Delta_Train (J, delta_vector, 0.0);
			}
		}


	}

	//printf ("%d %d #STATE\n", min_el_state, max_el_state);
	//	printf ("Q[%d,%d]=%f r=%f e=%f ad=%f gl=%f #QV\n",
	//			ps, pa, Q[ps][pa], r, e[ps][pa], ad, gl);

	J_ps_pa = Q_s[a];
	pa = a;

	return a;
}
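
For context, a minimal, hypothetical episode loop calling ANN_Policy::SelectAction might look like the sketch below. The Environment interface and RunEpisode are assumptions introduced purely for illustration; only ANN_Policy, real and SelectAction come from the code above.

// Hypothetical glue code: the Environment interface and RunEpisode are
// assumptions for illustration, not part of the original sources.
struct Environment {
	virtual real* Observe() = 0;       // current state as a feature vector
	virtual real Step(int action) = 0; // apply the action, return the reward
	virtual ~Environment() {}
};

void RunEpisode(ANN_Policy& policy, Environment& env, int n_steps)
{
	real r = 0.0;
	for (int t = 0; t < n_steps; ++t) {
		real* s = env.Observe();
		// The last argument is only read when forced learning is enabled.
		int a = policy.SelectAction(s, r, 0);
		r = env.Step(a);
	}
}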
Example #2
/** Select an action a, given state s and reward from previous action.

   The optional argument forced_a forces that action to be taken if
   setForcedLearning() has been called with true.

   Two algorithms are implemented, both of which converge. One estimates
   the value of the current policy (Sarsa), while the other estimates
   that of the optimal policy (Q-learning).

   Sarsa (\f$\lambda\f$) algorithmic description:

   1. Take action \f$a\f$, observe \f$r, s'\f$

   2. Choose \f$a'\f$ from \f$s'\f$ using some policy derived from \f$Q\f$

   3. \f$\delta = r + \gamma Q(s',a') - Q(s,a)\f$

   4. \f$e(s,a) = e(s,a)+ 1\f$, depending on trace settings

   5. for all \f$s,a\f$ :
   \f[
   Q_{t}(s,a) = Q_{t-1}(s,a) + \alpha \delta e_{t}(s,a),
   \f]
where \f$e_{t}(s,a) = \gamma \lambda e_{t-1}(s,a)\f$

	  end

   6. \f$a = a'\f$ (we will take this action at the next step)

   7. \f$s = s'\f$

   Watkins Q(\f$\lambda\f$) algorithmic description:

   1. Take action \f$a\f$, observe \f$r\f$, \f$s'\f$

   2. Choose \f$a'\f$ from \f$s'\f$ using some policy derived from \f$Q\f$

   3. \f$a^* = \arg \max_b Q(s',b)\f$

   4. \f$\delta = r + \gamma Q(s',a^*) - Q(s,a)\f$

   5. \f$e(s,a) = e(s,a) + 1\f$, depending on eligibility traces

   6. for all \f$s,a\f$ :
\f[
        Q(s,a) = Q(s,a)+\alpha \delta e(s,a)
\f]
		if \f$(a'=a^*)\f$ then \f$e(s,a) = \gamma \lambda e(s,a)\f$,
		           else \f$e(s,a) = 0\f$
	  end

   7. \f$a = a'\f$ (we will take this action at the next step)

   8. \f$s = s'\f$

   The most general algorithm is E-learning, currently under
   development, which is defined as follows:

   1. Take action \f$a\f$, observe \f$r\f$, \f$s'\f$

   2. Choose \f$a'\f$ from \f$s'\f$ using some policy derived from \f$Q\f$

   3. \f$\delta = r + \gamma E[Q(s',a^*)|\pi] - Q(s,a)\f$

   4. \f$e(s,a) = e(s,a)+ 1\f$, depending on eligibility traces

   5. for all \f$s,a\f$ :
\f[
        Q(s,a) = Q(s,a)+\alpha \delta e(s,a)
\f]
		\f$e(s,a) = \gamma \lambda e(s,a) P(a|s,\pi)\f$

   6. \f$a = a'\f$ (we will take this action at the next step)

   7. \f$s = s'\f$

   Note that we also cut off the eligibility traces that have fallen below
   0.01 (see the trace update loop below); a standalone sketch of the tabular
   Sarsa(\f$\lambda\f$) update is given after this function.


*/
int DiscretePolicy::SelectAction (int s, real r, int forced_a)
{
    if ((s<0)||(s>=n_states)) {
        return 0;
    }

    if ((ps>=0)&&(pa>=0)) {
        expected_r += r;
        expected_V += Q[ps][pa];
        n_samples++;

        if (s==0) {
            real max_estimate = 0.0;
            real max_estimate_k = 0.0;
            for (int i=0; i<n_states; i++) {
                max_estimate += Q[i][argMax (Q[i])];
                max_estimate_k += 1.0;
            }

#if 0
            logmsg ("%f %f %f %f#rTVV\n",
                    expected_r/((real) n_samples),
                    temp,
                    expected_V/((real) n_samples),
                    max_estimate/max_estimate_k);
#endif
            expected_r = 0.0;
            expected_V= 0.0;
            n_samples = 0;
        }
    }
    int a, amax;
    int argmax = argMax (Q[s]);

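    // Pursuit-style update: move the selection probabilities P[s][.] towards
    // the currently greedy action at rate zeta.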
    P[s][argmax] += zeta*(1.0f-P[s][argmax]);
    for (int j=0; j<n_actions; j++) {
        if (j!=argmax) {
            P[s][j] += zeta*(0.0f-P[s][j]);
        }
    }



    if (forced_learning) {
        a = forced_a;
    } else if (pursuit) {
        real sum = 0.0;
        a = -1;
        int j;
        for (j=0; j<n_actions; j++) {
            sum += P[s][j];
        }
        real X = urandom()*sum;
        real dsum=0.0;
        for (j=0; j<n_actions; j++) {
            dsum += P[s][j];
            if (X<=dsum) {
                a = j;
                break;
            }
        }
        if (a==-1) {
            fprintf (stderr, "No action selected with pursuit!\n");
        }
    } else if (confidence) {
        if (confidence_uses_gibbs && (confidence_distribution == SINGULAR)) {
            a = confMax (Q[s],vQ[s]);
        } else {
            a = confSample (Q[s], vQ[s]);
            if (confidence_uses_gibbs) { // and not SINGULAR distribution
                a = softMax(sample); //use softmax on the sample values
            }
        }
    } else if (reliability_estimate) {
        temp = sqrt(Sum(vQ[s], n_actions)/((real) n_actions));
        //temp = 0.1;
        a = softMax(Q[s]);
        //printf ("%f\n", temp);
    } else if (smax) {
        a = softMax (Q[s]);
        //printf ("Q[%d][%d]=%f\n", s, a, Q[s][a]);
    } else {
        a = eGreedy (Q[s]);
    }

    if (a<0 || a>=n_actions) {
        fprintf (stderr, "Action %d out of bounds.. ", a);
        a = (int) floor (urandom()*((real) n_actions));
        fprintf (stderr, "mapping to %d\n", a);
    }

    real EQ_s = 0.0;
    int i;

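    // EQ_s is the bootstrap value of state s: Q[s][a] for Sarsa, the greedy
    // value for Q-learning, and a weighted average of Q[s][.] (weights from
    // the normalised eval vector) for E-learning.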
    switch (learning_method) {

    case Sarsa:
        amax = a;
        EQ_s = Q[s][amax];
        break;
    case QLearning:
        amax = argmax;
        EQ_s = Q[s][amax];
        break;
    case ELearning:
        amax = a; //? correct ?
        Normalise(eval, eval, n_actions);
        EQ_s = 0.0;
        for (i=0; i<n_actions; i++) {
            EQ_s += eval[i] * Q[s][i];
        }
        break;
    default:
        amax = a;
        EQ_s = Q[s][amax];
        fprintf (stderr, "Unknown learning method\n");
    }
    if ((ps>=0)&&(pa>=0)) { // do not update at start of episode
        real delta = r + gamma*EQ_s - Q[ps][pa];
        tdError = delta;
        if (replacing_traces) {
            e[ps][pa] = 1.0;
        } else {
            e[ps][pa] += 1.0;
        }
        real ad = alpha*delta;
        real gl = gamma * lambda;
        real variance_threshold = 0.0001f;
        if  (confidence_eligibility == false) {
            vQ[ps][pa] = (1.0f - zeta)*vQ[ps][pa] + zeta*(ad*ad);
            if (vQ[ps][pa]<variance_threshold) {
                vQ[ps][pa]=variance_threshold;
            }
        }
        if (ps<min_el_state) min_el_state = ps;
        if (ps>max_el_state) max_el_state = ps;


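        // Propagate the TD error to every state-action pair with a
        // non-negligible trace; traces decay by gamma*lambda, are reset when
        // the selected action was not the greedy one (Watkins' cut), and are
        // zeroed once they fall below 0.01.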
        for (i=0; i<n_states; i++) {
            //for (int i=min_el_state; i<=max_el_state; i++) {
            bool el=true;
            for (int j=0; j<n_actions; j++) {
                if (e[i][j]>0.01) {
                    Q[i][j] += ad * e[i][j];
                    if (confidence_eligibility == true) {
                        real zeta_el = zeta * e[i][j];
                        vQ[i][j] = (1.0f - zeta_el)*vQ[i][j] + zeta_el*(ad*ad);
                        if (vQ[i][j]<variance_threshold) {
                            vQ[i][j]=variance_threshold;
                        }
                    }
                    //this is the same as setting e[ps][pa] += (1-P[ps][pa])
                    // if P[][] remains unchanged between updates.
                    // -- removed because it doesn't work! --
                    //P[i][j] += 0.01*delta * e[i][j] * (1.-P[i][j]);
                    if ((fabs (Q[i][j])>1000.0)||(isnan(Q[i][j]))) {
                        printf ("u: %d %d %f %f\n", i,j,Q[i][j], ad * e[i][j]);
                    }

                    // This is only needed for Q-learning; Sarsa is not
                    // affected since amax==a always.
                    if (amax==a) {
                        e[i][j] *= gl;
                    } else {
                        e[i][j] = 0.0;
                    }
                } else {
                    e[i][j] = 0.0;
                    el = false;
                }
            }
            if (el==false) {
                if (min_el_state==i)
                    min_el_state++;
            } else {
                max_el_state = i;
            }
        }
    }

    //printf ("%d %d #STATE\n", min_el_state, max_el_state);
    //	printf ("Q[%d,%d]=%f r=%f e=%f ad=%f gl=%f #QV\n",
    //			ps, pa, Q[ps][pa], r, e[ps][pa], ad, gl);
    ps = s;
    pa = a;

    return a;
}
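
As a cross-check of the Sarsa(λ) pseudocode in the comment above (steps 3 to 5), here is a standalone tabular sketch of the update for a single transition (s, a, r, s', a'). The function name SarsaLambdaUpdate, the raw std::vector tables and the TRACE_CUTOFF constant are assumptions for illustration only; DiscretePolicy performs the equivalent update internally on its own Q and e members.

#include <cstddef>
#include <vector>

// Illustrative tabular Sarsa(lambda) update for one transition; not part of
// DiscretePolicy. Q and e are n_states x n_actions tables; alpha, gamma and
// lambda are the usual step-size, discount and trace-decay parameters.
void SarsaLambdaUpdate(std::vector<std::vector<double> >& Q,
                       std::vector<std::vector<double> >& e,
                       int s, int a, double r, int s2, int a2,
                       double alpha, double gamma, double lambda)
{
    const double TRACE_CUTOFF = 0.01;               // matches the cut-off above
    double delta = r + gamma * Q[s2][a2] - Q[s][a]; // step 3: TD error
    e[s][a] += 1.0;                                 // step 4: accumulating trace

    for (std::size_t i = 0; i < Q.size(); ++i) {    // step 5: sweep all (s,a)
        for (std::size_t j = 0; j < Q[i].size(); ++j) {
            Q[i][j] += alpha * delta * e[i][j];     // value update
            e[i][j] *= gamma * lambda;              // decay the trace
            if (e[i][j] < TRACE_CUTOFF) {
                e[i][j] = 0.0;                      // cut off negligible traces
            }
        }
    }
}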