int DiscretePolicy::eGreedy(real* Qs)
{
    real X = urandom();
    int amax = argMax(Qs);
    real base_prob = temp/((real) n_actions);
    for (int a=0; a<n_actions; a++) {
        eval[a] = base_prob;
    }
    eval[amax] += 1.0f - temp;
    if (X<temp) {
        return rand()%n_actions;
    }
    return amax; // greedy action, already computed above
}
int DiscretePolicy::confSample(real* Qs, real* vQs)
{
    static NormalDistribution gaussian;
    static LaplacianDistribution laplacian;
    static UniformDistribution uniform;

    for (int a=0; a<n_actions; a++) {
        //eval[a] = Qs[a] + urandom(-1.0,1.0)*vQs[a];
        switch(confidence_distribution) {
        case SINGULAR:
            sample[a] = Qs[a];
            break;
        case BOUNDED:
            uniform.setMean(Qs[a]);
            uniform.setVariance(vQs[a]);
            sample[a] = uniform.generate();
            break;
        case GAUSSIAN:
            gaussian.setMean(Qs[a]);
            gaussian.setVariance(vQs[a]);
            sample[a] = gaussian.generate();
            break;
        case LAPLACIAN:
            laplacian.setMean(Qs[a]);
            laplacian.setVariance(vQs[a]);
            sample[a] = Qs[a] + laplacian.generate();
            break;
        default:
            Serror ("Unknown distribution ID:%d\n", confidence_distribution);
            break;
        }
    }
    return argMax(sample);
}
void argMaxReduce(int lid, size_t lsize, __local float *values, __local int *indices)
{
    // Tree reduction over the work-group: after the loop, values[0] and
    // indices[0] hold the maximum value and its index.
    for (int offset = lsize/2; offset > 0; offset >>= 1) {
        if (lid < offset) {
            argMax(values[lid], indices[lid],
                   values[lid + offset], indices[lid + offset],
                   values + lid, indices + lid);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
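/* Hypothetical sketch, not part of the original listing: the pairwise
 * argMax helper called by argMaxReduce above is not shown, so this is an
 * assumed implementation matching that call site. It compares two
 * (value, index) pairs and writes the larger pair through the __local
 * output pointers. */
void argMax(float v1, int i1, float v2, int i2,
            __local float *outValue, __local int *outIndex)
{
    if (v2 > v1) {
        *outValue = v2;
        *outIndex = i2;
    } else {
        *outValue = v1;
        *outIndex = i1;
    }
}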
void *routeAnt(void *id)
{
    /* The thread argument carries the ant index; go through intptr_t
       (from <stdint.h>) to avoid truncating a pointer on 64-bit hosts. */
    int j, i = (int)(intptr_t) id;
    for (j = 1; j < ins.dimension; j++) {
        if (hasCandidatesLeft(i, ant[i].tour[j-1]))
            moveAntTo(i, j, argMax(i, ant[i].tour[j-1]));
        else
            moveAntTo(i, j, NN(i, j-1));
    }
    return NULL;
}
/// Delete policy.
DiscretePolicy::~DiscretePolicy()
{
    real sum = 0.0;
    FILE* f = fopen ("/tmp/discrete", "wb");
    int s;

    for (s=0; s<n_states; s++) {
        sum += Q[s][argMax(Q[s])];
        if (f) {
            //softMax(Q[s]);
            real sum2 = 0.0;
            int a;
            for (a=0; a<n_actions; a++) {
                sum2 += eval[a];
            }
            for (a=0; a<n_actions; a++) {
                fprintf (f, "%f ", Q[s][a]);
            }
            for (a=0; a<n_actions; a++) {
                fprintf (f, "%f ", P[s][a]);
            }
            for (a=0; a<n_actions; a++) {
                fprintf (f, "%f ", vQ[s][a]);
            }
            fprintf (f, "\n");
        }
    }
    if (f) {
        fclose (f);
    }

    logmsg ("#Expected return of greedy policy over random distribution of states: %f\n",
            sum/((real) n_states));

    for (s=0; s<n_states; s++) {
        delete [] P[s];
        delete [] Q[s];
        delete [] e[s];
        delete [] vQ[s];
    }
    delete [] P;
    delete [] Q;
    delete [] vQ;
    delete [] e;
    delete [] eval;
    delete [] sample;
}
int ANN_Policy::SelectAction (real* s, real r, int forced_a)
{
    int a;     // selected action
    int amax;  // maximum evaluated action
    real* Q_s; // pointer to evaluations for state s

    if (confidence) {
        if (separate_actions) {
            for (int i=0; i<n_actions; i++) {
                ANN_StochasticInput (Ja[i], s);
                JQs[i] = ANN_GetOutput(Ja[i])[0];
            }
            Q_s = JQs;
        } else {
            ANN_StochasticInput (J, s);
            Q_s = ANN_GetOutput (J);
        }
    } else {
        if (separate_actions) {
            for (int i=0; i<n_actions; i++) {
                ANN_Input (Ja[i], s);
                JQs[i] = ANN_GetOutput(Ja[i])[0];
            }
            Q_s = JQs;
        } else {
            ANN_Input (J, s);
            Q_s = ANN_GetOutput (J);
        }
    }

    int argmax = argMax (Q_s);

    if (forced_learning) {
        a = forced_a;
    } else if (confidence) {
        a = argmax;
    } else if (smax) {
        a = softMax (Q_s);
        //printf ("Q[%d][%d]=%f\n", s, a, Q[s][a]);
    } else {
        a = eGreedy (Q_s);
    }

    if (a<0 || a>=n_actions) {
        fprintf (stderr, "Action %d out of bounds\n", a);
    }

    switch (learning_method) {
    case Sarsa:
        amax = a;
        break;
    case QLearning:
        amax = argmax;
        break;
    default:
        amax = a;
        fprintf (stderr, "Unknown learning method\n");
    }

    if (pa>=0) { // do not update at start of episode
        real delta = r + gamma*Q_s[amax] - J_ps_pa;
        tdError = delta;
        for (int j=0; j<n_actions; j++) {
            delta_vector[j] = 0.0;
        }
        if (separate_actions) {
            if (eligibility) {
                delta_vector[0] = 1.0;
                ANN_Delta_Train (Ja[pa], delta_vector, delta);
                // Reset other actions' traces.
                for (int i=0; i<n_actions; i++) {
                    if (i!=pa) {
                        ANN_Reset(Ja[i]);
                    }
                }
            } else {
                delta_vector[0] = delta;
                ANN_Delta_Train (Ja[pa], delta_vector, 0.0);
            }
        } else {
            if (J->eligibility_traces) {
                delta_vector[pa] = 1.0;
                ANN_Delta_Train (J, delta_vector, delta);
            } else {
                delta_vector[pa] = delta;
                ANN_Delta_Train (J, delta_vector, 0.0);
            }
        }
    }

    //printf ("%d %d #STATE\n", min_el_state, max_el_state);
    //printf ("Q[%d,%d]=%f r=%f e=%f ad=%f gl=%f #QV\n",
    //        ps, pa, Q[ps][pa], r, e[ps][pa], ad, gl);
    J_ps_pa = Q_s[a];
    pa = a;
    return a;
}
/// Load policy from a file.
void DiscretePolicy::loadFile (char* f)
{
    FILE* fh = NULL;
    size_t readSize;
    fh = fopen (f, "rb");
    if (fh==NULL) {
        fprintf (stderr, "Failed to read file %s\n", f);
        return;
    }
    char rtag[256];
    const char* start_tag = "QSA";
    const char* close_tag = "END";
    int n_read_states, n_read_actions;

    readSize = fread((void *) rtag, sizeof (char), strlen (start_tag)+1, fh);
    if (readSize < strlen(start_tag)+1) {
        fprintf(stderr, "Error when reading file\n");
    }
    if (strcmp (rtag, start_tag)) {
        fprintf (stderr, "Could not find starting tag\n");
        fclose (fh);
        return;
    }
    readSize = fread((void *) &n_read_states, sizeof(int), 1, fh);
    if (readSize < 1) {
        fprintf(stderr, "Error when reading file\n");
    }
    readSize = fread((void *) &n_read_actions, sizeof(int), 1, fh);
    if (readSize < 1) {
        fprintf(stderr, "Error when reading file\n");
    }
    if ((n_read_states!=n_states)||(n_read_actions!=n_actions)) {
        fprintf (stderr, "File has %dx%d space! Aborting read.\n",
                 n_read_states, n_read_actions);
        fclose(fh);
        return;
    }

    int i, j;
    for (i=0; i<n_states; i++) {
        readSize = fread((void *) Q[i], sizeof(real), n_actions, fh);
        if (readSize < (unsigned int) n_actions) {
            fprintf(stderr, "Error when reading file\n");
        }
        for (j=0; j<n_actions; j++) {
            if ((fabs (Q[i][j])>100.0)||(isnan(Q[i][j]))) {
                printf ("l: %d %d %f\n", i, j, Q[i][j]);
                Q[i][j] = 0.0;
            }
        }
    }

    for (i=0; i<n_states; i++) {
        for (j=0; j<n_actions; j++) {
            P[i][j] = 1.0f/((real) n_actions);
        }
        int argmax = argMax (Q[i]);
        P[i][argmax] += 0.001f*(1.0f-P[i][argmax]);
        for (j=0; j<n_actions; j++) {
            if (j!=argmax) {
                P[i][j] += 0.001f*(0.0f-P[i][j]);
            }
        }
    }

    readSize = fread((void *) rtag, sizeof (char), strlen (close_tag)+1, fh);
    if (readSize < strlen(close_tag)+1) {
        fprintf(stderr, "Error when reading file\n");
    }
    if (strcmp (rtag, close_tag)) {
        fprintf (stderr, "Could not find ending tag\n");
        fclose (fh);
        return;
    }
    fclose (fh);
}
/** Select an action a, given state s and the reward from the previous action.

    The optional argument forces an action if setForcedLearning() has been
    called with true.

    Two algorithms are implemented, both of which converge: one estimates
    the value of the current policy, the other that of the optimal policy.

    Sarsa(\f$\lambda\f$) algorithmic description:

    1. Take action \f$a\f$, observe \f$r, s'\f$.
    2. Choose \f$a'\f$ from \f$s'\f$ using some policy derived from \f$Q\f$.
    3. \f$\delta = r + \gamma Q(s',a') - Q(s,a)\f$
    4. \f$e(s,a) = e(s,a) + 1\f$, depending on trace settings.
    5. For all \f$s,a\f$:
       \f[ Q_{t}(s,a) = Q_{t-1}(s,a) + \alpha \delta e_{t}(s,a), \f]
       where \f$e_{t}(s,a) = \gamma \lambda e_{t-1}(s,a)\f$.
    6. \f$a = a'\f$ (we will take this action at the next step).
    7. \f$s = s'\f$

    Watkins Q(\f$\lambda\f$) algorithmic description:

    1. Take action \f$a\f$, observe \f$r\f$, \f$s'\f$.
    2. Choose \f$a'\f$ from \f$s'\f$ using some policy derived from \f$Q\f$.
    3. \f$a^* = \arg \max_b Q(s',b)\f$
    4. \f$\delta = r + \gamma Q(s',a^*) - Q(s,a)\f$
    5. \f$e(s,a) = e(s,a) + 1\f$, depending on eligibility trace settings.
    6. For all \f$s,a\f$:
       \f[ Q(s,a) = Q(s,a) + \alpha \delta e(s,a) \f]
       If \f$a' = a^*\f$ then \f$e(s,a) = \gamma \lambda e(s,a)\f$,
       else \f$e(s,a) = 0\f$.
    7. \f$a = a'\f$ (we will take this action at the next step).
    8. \f$s = s'\f$

    The most general algorithm is E-learning, currently under development,
    which is defined as follows:

    1. Take action \f$a\f$, observe \f$r\f$, \f$s'\f$.
    2. Choose \f$a'\f$ from \f$s'\f$ using some policy derived from \f$Q\f$.
    3. \f$\delta = r + \gamma E\{Q(s',a^*)|\pi\} - Q(s,a)\f$
    4. \f$e(s,a) = e(s,a) + 1\f$, depending on eligibility trace settings.
    5. For all \f$s,a\f$:
       \f[ Q(s,a) = Q(s,a) + \alpha \delta e(s,a) \f]
       \f$e(s,a) = \gamma \lambda e(s,a) P(a|s,\pi)\f$
    6. \f$a = a'\f$ (we will take this action at the next step).
    7. \f$s = s'\f$

    Note that we also cut off eligibility traces that have fallen below 0.01.
*/
int DiscretePolicy::SelectAction (int s, real r, int forced_a)
{
    if ((s<0)||(s>=n_states)) {
        return 0;
    }

    if ((ps>=0)&&(pa>=0)) {
        expected_r += r;
        expected_V += Q[ps][pa];
        n_samples++;
        if (s==0) {
            real max_estimate = 0.0;
            real max_estimate_k = 0.0;
            for (int i=0; i<n_states; i++) {
                max_estimate += Q[i][argMax (Q[i])];
                max_estimate_k += 1.0;
            }
#if 0
            logmsg ("%f %f %f %f#rTVV\n",
                    expected_r/((real) n_samples),
                    temp,
                    expected_V/((real) n_samples),
                    max_estimate/max_estimate_k);
#endif
            expected_r = 0.0;
            expected_V = 0.0;
            n_samples = 0;
        }
    }

    int a, amax;
    int argmax = argMax (Q[s]);

    P[s][argmax] += zeta*(1.0f-P[s][argmax]);
    for (int j=0; j<n_actions; j++) {
        if (j!=argmax) {
            P[s][j] += zeta*(0.0f-P[s][j]);
        }
    }

    if (forced_learning) {
        a = forced_a;
    } else if (pursuit) {
        real sum = 0.0;
        a = -1;
        int j;
        for (j=0; j<n_actions; j++) {
            sum += P[s][j];
        }
        real X = urandom()*sum;
        real dsum = 0.0;
        for (j=0; j<n_actions; j++) {
            dsum += P[s][j];
            if (X<=dsum) {
                a = j;
                break;
            }
        }
        if (a==-1) {
            fprintf (stderr, "No action selected with pursuit!\n");
        }
    } else if (confidence) {
        if (confidence_uses_gibbs && (confidence_distribution == SINGULAR)) {
            a = confMax (Q[s], vQ[s]);
        } else {
            a = confSample (Q[s], vQ[s]);
            if (confidence_uses_gibbs) { // and not SINGULAR distribution
                a = softMax(sample); // use softmax on the sampled values
            }
        }
    } else if (reliability_estimate) {
        temp = sqrt(Sum(vQ[s], n_actions)/((real) n_actions));
        //temp = 0.1;
        a = softMax(Q[s]);
        //printf ("%f\n", temp);
    } else if (smax) {
        a = softMax (Q[s]);
        //printf ("Q[%d][%d]=%f\n", s, a, Q[s][a]);
    } else {
        a = eGreedy (Q[s]);
    }

    if (a<0 || a>=n_actions) {
        fprintf (stderr, "Action %d out of bounds.. ", a);
        a = (int) floor (urandom()*((real) n_actions));
        fprintf (stderr, "mapping to %d\n", a);
    }

    real EQ_s = 0.0;
    int i;

    switch (learning_method) {
    case Sarsa:
        amax = a;
        EQ_s = Q[s][amax];
        break;
    case QLearning:
        amax = argmax;
        EQ_s = Q[s][amax];
        break;
    case ELearning:
        amax = a; //? correct ?
        Normalise(eval, eval, n_actions);
        EQ_s = 0.0;
        for (i=0; i<n_actions; i++) {
            EQ_s += eval[i] * Q[s][i];
        }
        break;
    default:
        amax = a;
        EQ_s = Q[s][amax];
        fprintf (stderr, "Unknown learning method\n");
    }

    if ((ps>=0)&&(pa>=0)) { // do not update at start of episode
        real delta = r + gamma*EQ_s - Q[ps][pa];
        tdError = delta;
        if (replacing_traces) {
            e[ps][pa] = 1.0;
        } else {
            e[ps][pa] += 1.0;
        }
        real ad = alpha*delta;
        real gl = gamma * lambda;
        real variance_threshold = 0.0001f;
        if (confidence_eligibility == false) {
            vQ[ps][pa] = (1.0f - zeta)*vQ[ps][pa] + zeta*(ad*ad);
            if (vQ[ps][pa]<variance_threshold) {
                vQ[ps][pa] = variance_threshold;
            }
        }
        if (ps<min_el_state) min_el_state = ps;
        if (ps>max_el_state) max_el_state = ps;

        for (i=0; i<n_states; i++) {
        //for (int i=min_el_state; i<=max_el_state; i++) {
            bool el = true;
            for (int j=0; j<n_actions; j++) {
                if (e[i][j]>0.01) {
                    Q[i][j] += ad * e[i][j];
                    if (confidence_eligibility == true) {
                        real zeta_el = zeta * e[i][j];
                        vQ[i][j] = (1.0f - zeta_el)*vQ[i][j] + zeta_el*(ad*ad);
                        if (vQ[i][j]<variance_threshold) {
                            vQ[i][j] = variance_threshold;
                        }
                    }
                    // This is the same as setting e[ps][pa] += (1-P[ps][pa])
                    // if P[][] remains unchanged between updates.
                    // -- removed because it doesn't work! --
                    //P[i][j] += 0.01*delta * e[i][j] * (1.-P[i][j]);
                    if ((fabs (Q[i][j])>1000.0)||(isnan(Q[i][j]))) {
                        printf ("u: %d %d %f %f\n", i, j, Q[i][j], ad * e[i][j]);
                    }
                    // This is only needed for Q-learning, but Sarsa is not
                    // affected since there always amax==a.
                    if (amax==a) {
                        e[i][j] *= gl;
                    } else {
                        e[i][j] = 0.0;
                    }
                } else {
                    e[i][j] = 0.0;
                    el = false;
                }
            }
            if (el==false) {
                if (min_el_state==i) min_el_state++;
            } else {
                max_el_state = i;
            }
        }
    }

    //printf ("%d %d #STATE\n", min_el_state, max_el_state);
    //printf ("Q[%d,%d]=%f r=%f e=%f ad=%f gl=%f #QV\n",
    //        ps, pa, Q[ps][pa], r, e[ps][pa], ad, gl);
    ps = s;
    pa = a;
    return a;
}
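// Hypothetical driver loop, not part of the original source: a minimal
// sketch of how SelectAction is typically called once per time step,
// feeding back the reward earned by the previous action. env_reset() and
// env_step() are assumed placeholder environment functions, and the call
// relies on the documented optional forced-action argument having a default.
void runEpisodes (DiscretePolicy& policy, int n_episodes, int horizon)
{
    for (int ep = 0; ep < n_episodes; ep++) {
        int s = env_reset();   // hypothetical: returns the initial state index
        real r = 0.0;          // no reward before the first action
        for (int t = 0; t < horizon; t++) {
            int a = policy.SelectAction (s, r); // select an action and learn from (s, r)
            if (env_step (a, &s, &r)) {         // hypothetical: fills next state/reward,
                break;                          // returns true at end of episode
            }
        }
    }
}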
int main(int argc, char *argv[])
{
    float bestKnownCost = FLT_MAX;
    int* bestKnownTour;
    FILE *file;
    int i, j, k;

    file = fopen(argv[1], "r");
    M = atoi(argv[2]);
    runParallel = atoi(argv[3]);

    /* Parse the TSPLIB header. */
    fscanf(file, "NAME : %[^\n]s", ins.name);
    fscanf(file, "\nCOMMENT : %[^\n]s", ins.comment);
    fscanf(file, "\nTYPE : %[^\n]s", ins.type);
    fscanf(file, "\nDIMENSION : %d", &ins.dimension);
    fscanf(file, "\nEDGE_WEIGHT_TYPE : %[^\n]s", ins.wtype);
    fscanf(file, "\nNODE_COORD_SECTION");
    if (strcmp(ins.wtype, "EUC_2D")) {
        return 1;
    }

    if (runParallel) {
        printf("Running parallel version\n");
    } else {
        printf("Running standard version\n");
    }

    /* Read node coordinates. */
    int** coord;
    coord = (int**) malloc(ins.dimension * sizeof(int*));
    bestKnownTour = (int*) malloc(ins.dimension * sizeof(int));
    for (i = 0; i < ins.dimension; i++) {
        coord[i] = (int*) malloc(2 * sizeof(int));
    }
    for (i = 0; i < ins.dimension; i++) {
        fscanf(file, "\n %*[^ ] %d %d", &coord[i][0], &coord[i][1]);
    }
    fclose(file);

    /* Precompute the symmetric distance matrix. */
    distanceMatrix = (double**) malloc(ins.dimension * sizeof(double*));
    for (i = 0; i < ins.dimension; i++) {
        distanceMatrix[i] = (double*) malloc(ins.dimension * sizeof(double));
    }
    for (i = 0; i < ins.dimension; i++) {
        for (j = i + 1; j < ins.dimension; j++) {
            distanceMatrix[i][j] = dist(coord[i][0], coord[i][1],
                                        coord[j][0], coord[j][1]);
            distanceMatrix[j][i] = distanceMatrix[i][j];
        }
    }
    for (i = 0; i < ins.dimension; i++) {
        free(coord[i]); /* free each row before the row-pointer array */
    }
    free(coord);

    /* Initialise pheromone levels. */
    tau = (double**) malloc(ins.dimension * sizeof(double*));
    for (i = 0; i < ins.dimension; i++) {
        tau[i] = (double*) malloc(ins.dimension * sizeof(double));
    }
    for (i = 0; i < ins.dimension; i++) {
        for (j = 0; j < ins.dimension; j++) {
            tau[i][j] = TAU0;
        }
    }

    /* Place each ant on a random start city. */
    srand(time(NULL));
    for (i = 0; i < M; i++) {
        ant[i].tour = (int*) malloc(ins.dimension * sizeof(int));
        ant[i].tour[0] = rand() % ins.dimension;
        ant[i].visited = (int*) malloc(ins.dimension * sizeof(int));
        for (j = 0; j < ins.dimension; j++) { /* start at 0 so every entry is initialised */
            ant[i].visited[j] = 0;
        }
        ant[i].visited[ant[i].tour[0]] = 1;
    }

    /* Build the candidate list of the CLSIZE nearest neighbours of each city. */
    cl = (int**) malloc(ins.dimension * sizeof(int*));
    int* visited = (int*) malloc(ins.dimension * sizeof(int));
    double nearestDistance;
    int nearestNeighbor;
    for (i = 0; i < ins.dimension; i++) {
        cl[i] = (int*) malloc(CLSIZE * sizeof(int));
    }
    for (i = 0; i < ins.dimension; i++) {
        for (j = 0; j < ins.dimension; j++) {
            visited[j] = 0;
        }
        for (j = 0; j < CLSIZE; j++) {
            nearestDistance = DBL_MAX;
            nearestNeighbor = 0;
            for (k = 0; k < ins.dimension; k++) {
                if (!visited[k] && distanceMatrix[i][k] < nearestDistance && i != k) {
                    nearestDistance = distanceMatrix[i][k];
                    nearestNeighbor = k;
                }
            }
            visited[nearestNeighbor] = 1;
            cl[i][j] = nearestNeighbor;
        }
    }

    /* Main ACO loop: route all ants, then reinforce the best tour. */
    pthread_t *thread;
    thread = (pthread_t*) malloc(M * sizeof(pthread_t));
    for (i = 0; i < ITERATIONS; i++) {
        if (runParallel) {
            for (j = 0; j < M; j++) {
                int st = pthread_create(&thread[j], NULL, routeAnt,
                                        (void *)(intptr_t) j); /* pass the ant index via intptr_t */
                if (st) {
                    printf("pthread_create() error %d\n", st);
                    exit(-1);
                }
            }
            for (j = 0; j < M; j++) {
                pthread_join(thread[j], NULL);
            }
        } else {
            for (j = 0; j < M; j++) {
                for (k = 1; k < ins.dimension; k++) {
                    if (hasCandidatesLeft(j, ant[j].tour[k-1])) {
                        moveAntTo(j, k, argMax(j, ant[j].tour[k-1]));
                    } else {
                        moveAntTo(j, k, NN(j, k-1));
                    }
                }
            }
        }

        /* Find the best ant of this iteration and update pheromones along its tour. */
        int bestAnt = 0;
        double lowerCost = DBL_MAX;
        for (j = 0; j < M; j++) {
            if (tourCost(ant[j].tour) < lowerCost) {
                bestAnt = j;
                lowerCost = tourCost(ant[j].tour);
            }
        }
        updatePheromoneLevel(ant[bestAnt].tour);
        if (lowerCost < bestKnownCost) {
            bestKnownCost = lowerCost;
            for (k = 0; k < ins.dimension; k++) {
                bestKnownTour[k] = ant[bestAnt].tour[k];
            }
        }

        /* Reset the ants for the next iteration, each on a fresh random start city. */
        for (j = 0; j < M; j++) {
            for (k = 0; k < ins.dimension; k++) {
                ant[j].visited[k] = 0;
            }
            ant[j].tour[0] = rand() % ins.dimension;
            ant[j].visited[ant[j].tour[0]] = 1;
            for (k = 1; k < ins.dimension; k++) {
                ant[j].tour[k] = 0;
            }
        }
    }

    printf("Best tour: ");
    for (j = 0; j < ins.dimension; j++) {
        printf("%d ", bestKnownTour[j]);
    }
    printf("; Length = %f\n", bestKnownCost);
    return 0;
}