Code example #1
File: policy.cpp  Project: rongzhou/speed-dreams
int DiscretePolicy::eGreedy(real* Qs) {
    real X = urandom();
    int amax = argMax(Qs);
    // Record the action distribution implied by epsilon-greedy selection:
    // temp/n_actions on every action, plus the remaining 1-temp on the
    // greedy action (eval[] is later used by the ELearning update).
    real base_prob = temp/((real) n_actions);
    for (int a=0; a<n_actions; a++) {
        eval[a] = base_prob;
    }
    eval[amax] += 1.0f-temp;
    // With probability temp explore uniformly at random, otherwise exploit.
    if (X<temp) {
        return rand()%n_actions;
    }
    return amax;
}
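Every example on this page leans on an argMax helper over the action-value array; the project's own implementation is not shown here. A minimal stand-in over a raw float array, for illustration only:

// Illustrative stand-in for the argMax(Qs) helper used throughout these
// examples (not the project's implementation): returns the index of the
// largest of n values.
int argMaxSketch(const float* values, int n) {
    int best = 0;
    for (int i = 1; i < n; i++) {
        if (values[i] > values[best]) {
            best = i;
        }
    }
    return best;
}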
Code example #2
File: policy.cpp  Project: rongzhou/speed-dreams
int DiscretePolicy::confSample(real* Qs, real* vQs) {
    static NormalDistribution gaussian;
    static LaplacianDistribution laplacian;
    static UniformDistribution uniform;

    // Draw one sample per action from a distribution centred on Qs[a] with
    // spread vQs[a]; the action with the largest sample is returned.
    for (int a=0; a<n_actions; a++) {
        //eval[a] = Qs[a] + urandom(-1.0,1.0)*vQs[a];
        switch(confidence_distribution) {
        case SINGULAR:
            sample[a] = Qs[a];
            break;
        case BOUNDED:
            uniform.setMean(Qs[a]);
            uniform.setVariance(vQs[a]);
            sample[a] = uniform.generate();
            break;
        case GAUSSIAN:
            gaussian.setMean(Qs[a]);
            gaussian.setVariance(vQs[a]);
            sample[a] = gaussian.generate();
            break;
        case LAPLACIAN:
            laplacian.setMean(Qs[a]);
            laplacian.setVariance(vQs[a]);
            // The mean is already Qs[a]; use the draw directly so the mean is
            // not added twice (consistent with the Gaussian and uniform cases).
            sample[a] = laplacian.generate();
            break;
        default:
            Serror ("Unknown distribution ID:%d\n", confidence_distribution);
            break;
        }
    }
    return argMax(sample);
}
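The sampling above is essentially Thompson-style selection: one draw per action from a distribution centred on its value estimate, then an argMax over the draws. A self-contained sketch of the same idea in standard C++, using std::normal_distribution instead of the project's distribution classes (all names here are illustrative):

#include <algorithm>
#include <cmath>
#include <random>
#include <vector>

// Sample Normal(Q[a], vQ[a]) for each action and return the index of the
// largest draw. Purely illustrative; not part of the original project.
int sampleArgMax(const std::vector<double>& Q, const std::vector<double>& vQ) {
    static std::mt19937 rng{std::random_device{}()};
    std::vector<double> draw(Q.size());
    for (std::size_t a = 0; a < Q.size(); ++a) {
        std::normal_distribution<double> dist(Q[a], std::sqrt(vQ[a]));
        draw[a] = dist(rng);
    }
    return static_cast<int>(std::max_element(draw.begin(), draw.end()) - draw.begin());
}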
Code example #3
// Parallel tree reduction over a work-group's local arrays: after the loop,
// values[0] and indices[0] hold the maximum value and its index (assuming
// lsize is a power of two).
void argMaxReduce(int lid, size_t lsize, __local float *values, __local int *indices){
    for(int offset = lsize/2; offset > 0; offset >>= 1){
        if(lid < offset){
            // Compare the element at lid with its partner at lid+offset and
            // keep the larger value (and its index) at position lid.
            argMax(values[lid], indices[lid], values[lid + offset], indices[lid + offset], values + lid, indices + lid);
        }
        // Make this round's writes visible to the whole work-group.
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
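The kernel above assumes a device-side argMax helper with a six-argument compare-and-select signature (two value/index pairs plus output pointers), which is not part of this snippet. A plausible sketch of such a helper, written in plain C/C++ syntax for illustration only (the real OpenCL version would take __local-qualified pointers):

// Hypothetical helper matching the call in argMaxReduce: keeps whichever of
// the two (value, index) pairs has the larger value. Not the project's code.
void argMax(float v0, int i0, float v1, int i1,
            float* outValue, int* outIndex) {
    if (v1 > v0) {
        *outValue = v1;
        *outIndex = i1;
    } else {
        *outValue = v0;
        *outIndex = i0;
    }
}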
Code example #4
File: tsp-acs.c  Project: piperamirez/tsp-acs
void *routeAnt(void *id) {
	int j, i = (int)(size_t)id; // ant index, passed by value through the void* argument
	// Build the tour city by city: pick from the previous city's candidate
	// list while one is available, otherwise fall back to the nearest
	// unvisited neighbour.
	for(j=1;j<ins.dimension;j++){
		if(hasCandidatesLeft(i,ant[i].tour[j-1]))
			moveAntTo(i,j,argMax(i,ant[i].tour[j-1]));
		else
			moveAntTo(i,j,NN(i,j-1));
	}
	return NULL;
}
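Here the ant index travels by value through pthread_create's single void* argument and is cast back out inside the thread. An equally common pattern, shown below purely for comparison, is to hand each thread a pointer to its own index, which avoids pointer/integer cast warnings altogether (worker and launch are illustrative names, not part of this project):

#include <pthread.h>
#include <stdlib.h>

// Illustrative alternative: pass the ant index via a small heap allocation.
static void* worker(void* arg) {
    int i = *(int*)arg;   // ant index
    free(arg);
    (void)i;              // ... route ant i here ...
    return NULL;
}

static void launch(int nAnts) {
    pthread_t* threads = (pthread_t*)malloc(nAnts * sizeof(pthread_t));
    for (int j = 0; j < nAnts; j++) {
        int* idx = (int*)malloc(sizeof(int));
        *idx = j;
        pthread_create(&threads[j], NULL, worker, idx);
    }
    for (int j = 0; j < nAnts; j++) {
        pthread_join(threads[j], NULL);
    }
    free(threads);
}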
Code example #5
File: policy.cpp  Project: rongzhou/speed-dreams
/// Delete policy.
DiscretePolicy::~DiscretePolicy()
{
    real sum = 0.0;
    // Dump the learned Q, P and vQ tables (one row of values per state).
    FILE* f = fopen ("/tmp/discrete","wb");

    int s;
    for (s=0; s<n_states; s++) {
        sum += Q[s][argMax(Q[s])];
        if (f) {
            //softMax(Q[s]);
            real sum2=0.0; // accumulated below but not otherwise used
            int a;
            for (a=0; a<n_actions; a++) {
                sum2 += eval[a];
            }
            for (a=0; a<n_actions; a++) {
                fprintf (f, "%f ", Q[s][a]);
            }
            for (a=0; a<n_actions; a++) {
                fprintf (f, "%f ", P[s][a]);
            }
            for (a=0; a<n_actions; a++) {
                fprintf (f, "%f ", vQ[s][a]);
            }
            fprintf (f, "\n");
        }
    }

    if (f) {
        fclose (f);
    }

    logmsg ("#Expected return of greedy policy over random distribution of states: %f\n", sum/((real) n_states));

    for (s=0; s<n_states; s++) {
        delete [] P[s];
        delete [] Q[s];
        delete [] e[s];
        delete [] vQ[s];
    }
    delete [] P;
    delete [] Q;
    delete [] vQ;
    delete [] e;
    delete [] eval;
    delete [] sample;
}
Code example #6
File: ann_policy.cpp  Project: COHRINT/cuTORCS
int ANN_Policy::SelectAction (real* s, real r, int forced_a)
{
	int a; // selected action
	int amax; //maximum evaluated action
	real* Q_s; // pointer to evaluations for state s
	if (confidence) {
		if (separate_actions) {
			for (int i=0; i<n_actions; i++) {
				ANN_StochasticInput (Ja[i], s);
				JQs[i] = ANN_GetOutput(Ja[i])[0];
			}
			Q_s = JQs;
		} else {
			ANN_StochasticInput (J, s);
			Q_s = ANN_GetOutput (J);
		}
	} else {
		if (separate_actions) {
			for (int i=0; i<n_actions; i++) {
				ANN_Input (Ja[i], s);
				JQs[i] = ANN_GetOutput(Ja[i])[0];
			}
			Q_s = JQs;
		} else {
			ANN_Input (J, s);
			Q_s = ANN_GetOutput (J);
		}
	}
	int argmax = argMax (Q_s);

	if (forced_learning) {
		a = forced_a;
	} else if (confidence) {
		a = argmax;
	} else if (smax) {
		a = softMax (Q_s);
		//printf ("Q[%d][%d]=%f\n", s, a, Q[s][a]);
	} else {
		a = eGreedy (Q_s);
	}

	if (a<0 || a>=n_actions) {
		fprintf (stderr, "Action %d out of bounds\n", a);
	}

	switch (learning_method) {
		
	case Sarsa: 
		amax = a;
		break;
	case QLearning:
		amax = argmax;
		break;
	default:
		amax = a;
		fprintf (stderr, "Unknown learning method\n");
	}
	if (pa>=0) { // do not update at start of episode
		real delta = r + gamma*Q_s[amax] - J_ps_pa;
		tdError = delta;
		for (int j=0; j<n_actions; j++) {
			delta_vector[j] = 0.0;
		}
		if (separate_actions) {
			if (eligibility) {
				delta_vector[0] = 1.0;
				ANN_Delta_Train (Ja[pa], delta_vector, delta);
				// Reset other actions' traces.
				for (int i=0; i<n_actions; i++) {
					if (i!=pa) {
						ANN_Reset(Ja[i]);
					}
				}
			} else {
				delta_vector[0] = delta;
				ANN_Delta_Train (Ja[pa], delta_vector, 0.0);
			}
		} else {
			if (J->eligibility_traces) {
				delta_vector[pa] = 1.0;
				ANN_Delta_Train (J, delta_vector, delta);
			} else {
				delta_vector[pa] = delta;
				ANN_Delta_Train (J, delta_vector, 0.0);
			}
		}


	}

	//printf ("%d %d #STATE\n", min_el_state, max_el_state);
	//	printf ("Q[%d,%d]=%f r=%f e=%f ad=%f gl=%f #QV\n",
	//			ps, pa, Q[ps][pa], r, e[ps][pa], ad, gl);

	J_ps_pa = Q_s[a];
	pa = a;

	return a;
}
Code example #7
File: policy.cpp  Project: rongzhou/speed-dreams
/// Load policy from a file.
void DiscretePolicy::loadFile (char* f)
{
    FILE* fh = NULL;
    size_t readSize;
    fh = fopen (f, "rb");
    if (fh==NULL) {
        fprintf (stderr, "Failed to read file %s\n", f);
        return;
    }
    char rtag[256];
    const char* start_tag="QSA";
    const char* close_tag="END";
    int n_read_states, n_read_actions;

    readSize = fread((void *) rtag, sizeof (char), strlen (start_tag)+1, fh);
    if( readSize < strlen(start_tag)+1 )
        fprintf(stderr, "Error when reading file\n");
    if (strcmp (rtag, start_tag)) {
        fprintf (stderr, "Could not find starting tag\n");
        fclose (fh);
        return;
    }
    readSize = fread((void *) &n_read_states, sizeof(int), 1, fh);
    if( readSize < 1 )
        fprintf(stderr, "Error when reading file\n");
    readSize = fread((void *) &n_read_actions, sizeof(int), 1, fh);
    if( readSize < 1 )
        fprintf(stderr, "Error when reading file\n");

    if ((n_read_states!=n_states)||(n_read_actions!=n_actions)) {
        fprintf (stderr, "File has %dx%d space! Aborting read.\n", n_read_states, n_read_actions);
        fclose(fh);
        return;
    }

    int i, j;
    for (i=0; i<n_states; i++) {
        readSize = fread((void *) Q[i], sizeof(real), n_actions, fh);
        if( readSize < (size_t) n_actions )
            fprintf(stderr, "Error when reading file\n");
        for (j=0; j<n_actions; j++) {
            // Guard against corrupt or extreme values read from the file.
            if ((fabs (Q[i][j])>100.0)||(isnan(Q[i][j]))) {
                printf ("l: %d %d %f\n", i,j,Q[i][j]);
                Q[i][j] = 0.0;
            }
        }
    }
    // Re-initialise the policy to near-uniform, nudged slightly towards the
    // greedy action in each state.
    for (i=0; i<n_states; i++) {
        for (j=0; j<n_actions; j++) {
            P[i][j] = 1.0f/((real) n_actions);
        }
        int argmax = argMax (Q[i]);
        P[i][argmax] += 0.001f*(1.0f-P[i][argmax]);
        for (j=0; j<n_actions; j++) {
            if (j!=argmax) {
                P[i][j] += 0.001f*(0.0f-P[i][j]);
            }
        }
    }



    readSize = fread((void *) rtag, sizeof (char), strlen (close_tag)+1, fh);
    if( readSize < strlen(close_tag)+1 )
        fprintf(stderr, "Error when reading file\n");
    if (strcmp (rtag, close_tag)) {
        fprintf (stderr, "Could not find ending tag\n");
        fclose (fh);
        return;
    }


    fclose (fh);
}
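The on-disk layout this loader expects is: the literal tag "QSA" including its terminating null, n_states and n_actions as raw ints, n_states rows of n_actions reals for Q, and a closing "END" tag. A hypothetical writer for that layout is sketched below; saveFile is not part of the shown source and is given only to document the format the loader reads:

// Illustrative counterpart to loadFile(); not part of the original project.
void DiscretePolicy::saveFile (char* f)
{
    FILE* fh = fopen (f, "wb");
    if (fh==NULL) {
        fprintf (stderr, "Failed to write file %s\n", f);
        return;
    }
    const char* start_tag="QSA";
    const char* close_tag="END";
    fwrite (start_tag, sizeof(char), strlen(start_tag)+1, fh);
    fwrite (&n_states, sizeof(int), 1, fh);
    fwrite (&n_actions, sizeof(int), 1, fh);
    for (int i=0; i<n_states; i++) {
        fwrite (Q[i], sizeof(real), n_actions, fh);
    }
    fwrite (close_tag, sizeof(char), strlen(close_tag)+1, fh);
    fclose (fh);
}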
Code example #8
File: policy.cpp  Project: rongzhou/speed-dreams
/** Select an action a, given state s and reward from previous action.

   Optional argument a forces an action if setForcedLearning() has
   been called with true.

   Two algorithms are implemented, both of which converge. One of them
   calculates the value of the current policy, while the other that of
   the optimal policy.

   Sarsa (\f$\lambda\f$) algorithmic description:

   1. Take action \f$a\f$, observe \f$r, s'\f$

   2. Choose \f$a'\f$ from \f$s'\f$ using some policy derived from \f$Q\f$

   3. \f$\delta = r + \gamma Q(s',a') - Q(s,a)\f$

   4. \f$e(s,a) = e(s,a)+ 1\f$, depending on trace settings

   5. for all \f$s,a\f$ :
   \f[
   Q_{t}(s,a) = Q_{t-1}(s,a) + \alpha \delta e_{t}(s,a),
   \f]
where \f$e_{t}(s,a) = \gamma \lambda e_{t-1}(s,a)\f$

	  end

   6. \f$a = a'\f$ (we will take this action at the next step)

   7. \f$s = s'\f$

   Watkins Q(\f$\lambda\f$) algorithmic description:

   1. Take action \f$a\f$, observe \f$r\f$, \f$s'\f$

   2. Choose \f$a'\f$ from \f$s'\f$ using some policy derived from \f$Q\f$

   3. \f$a^* = \arg \max_b Q(s',b)\f$

   4. \f$\delta = r + \gamma Q(s',a^*) - Q(s,a)\f$

   5. \f$e(s,a) = e(s,a)+ 1\f$, depending on eligibility traces

   6. for all \f$s,a\f$ :
\f[
        Q(s,a) = Q(s,a)+\alpha \delta e(s,a)
\f]
		if \f$a'=a^*\f$ then \f$e(s,a) = \gamma \lambda e(s,a)\f$,
		           else \f$e(s,a) = 0\f$
	  end

   7. \f$a = a'\f$ (we will take this action at the next step)

   8. \f$s = s'\f$

   The most general algorithm is E-learning, currently under
   development, which is defined as follows:

   1. Take action \f$a\f$, observe \f$r\f$, \f$s'\f$

   2. Choose \f$a'\f$ from \f$s'\f$ using some policy derived from \f$Q\f$

   3. \f$\delta = r + \gamma E[Q(s',a') | \pi] - Q(s,a)\f$

   4. \f$e(s,a) = e(s,a)+ 1\f$, depending on eligibility traces

   5. for all \f$s,a\f$ :
\f[
        Q(s,a) = Q(s,a)+\alpha \delta e(s,a)
\f]
		\f$e(s,a)\f$ = \f$\gamma \lambda e(s,a) P(a|s,\pi) \f$

   6. \f$a = a'\f$ (we will take this action at the next step)

   7. \f$s = s'\f$

   Note that we also cut off the eligibility traces that have fallen below 0.01.


*/
int DiscretePolicy::SelectAction (int s, real r, int forced_a)
{
    if ((s<0)||(s>=n_states)) {
        return 0; // ignore out-of-range states
    }

    if ((ps>=0)&&(pa>=0)) {
        expected_r += r;
        expected_V += Q[ps][pa];
        n_samples++;

        if (s==0) {
            real max_estimate = 0.0;
            real max_estimate_k = 0.0;
            for (int i=0; i<n_states; i++) {
                max_estimate += Q[i][argMax (Q[i])];
                max_estimate_k += 1.0;
            }

#if 0
            logmsg ("%f %f %f %f#rTVV\n",
                    expected_r/((real) n_samples),
                    temp,
                    expected_V/((real) n_samples),
                    max_estimate/max_estimate_k);
#endif
            expected_r = 0.0;
            expected_V= 0.0;
            n_samples = 0;
        }
    }
    int a, amax;
    int argmax = argMax (Q[s]);

    P[s][argmax] += zeta*(1.0f-P[s][argmax]);
    for (int j=0; j<n_actions; j++) {
        if (j!=argmax) {
            P[s][j] += zeta*(0.0f-P[s][j]);
        }
    }



    if (forced_learning) {
        a = forced_a;
    } else if (pursuit) {
        real sum = 0.0;
        a = -1;
        int j;
        for (j=0; j<n_actions; j++) {
            sum += P[s][j];
        }
        real X = urandom()*sum;
        real dsum=0.0;
        for (j=0; j<n_actions; j++) {
            dsum += P[s][j];
            if (X<=dsum) {
                a = j;
                break;
            }
        }
        if (a==-1) {
            fprintf (stderr, "No action selected with pursuit!\n");
        }
    } else if (confidence) {
        if (confidence_uses_gibbs && (confidence_distribution == SINGULAR)) {
            a = confMax (Q[s],vQ[s]);
        } else {
            a = confSample (Q[s], vQ[s]);
            if (confidence_uses_gibbs) { // and not SINGULAR distribution
                a = softMax(sample); //use softmax on the sample values
            }
        }
    } else if (reliability_estimate) {
        temp = sqrt(Sum(vQ[s], n_actions)/((real) n_actions));
        //temp = 0.1;
        a = softMax(Q[s]);
        //printf ("%f\n", temp);
    } else if (smax) {
        a = softMax (Q[s]);
        //printf ("Q[%d][%d]=%f\n", s, a, Q[s][a]);
    } else {
        a = eGreedy (Q[s]);
    }

    if (a<0 || a>=n_actions) {
        fprintf (stderr, "Action %d out of bounds.. ", a);
        a = (int) floor (urandom()*((real) n_actions));
        fprintf (stderr, "mapping to %d\n", a);
    }

    real EQ_s = 0.0;
    int i;

    switch (learning_method) {

    case Sarsa:
        amax = a;
        EQ_s = Q[s][amax];
        break;
    case QLearning:
        amax = argmax;
        EQ_s = Q[s][amax];
        break;
    case ELearning:
        amax = a; //? correct ?
        Normalise(eval, eval, n_actions);
        EQ_s = 0.0;
        for (i=0; i<n_actions; i++) {
            EQ_s += eval[i] * Q[s][i];
        }
        break;
    default:
        amax = a;
        EQ_s = Q[s][amax];
        fprintf (stderr, "Unknown learning method\n");
    }
    if ((ps>=0)&&(pa>=0)) { // do not update at start of episode
        real delta = r + gamma*EQ_s - Q[ps][pa];
        tdError = delta;
        if (replacing_traces) {
            e[ps][pa] = 1.0;
        } else {
            e[ps][pa] += 1.0;
        }
        real ad = alpha*delta;
        real gl = gamma * lambda;
        real variance_threshold = 0.0001f;
        if  (confidence_eligibility == false) {
            vQ[ps][pa] = (1.0f - zeta)*vQ[ps][pa] + zeta*(ad*ad);
            if (vQ[ps][pa]<variance_threshold) {
                vQ[ps][pa]=variance_threshold;
            }
        }
        if (ps<min_el_state) min_el_state = ps;
        if (ps>max_el_state) max_el_state = ps;


        for (i=0; i<n_states; i++) {
            //for (int i=min_el_state; i<=max_el_state; i++) {
            bool el=true;
            for (int j=0; j<n_actions; j++) {
                if (e[i][j]>0.01) {
                    Q[i][j] += ad * e[i][j];
                    if (confidence_eligibility == true) {
                        real zeta_el = zeta * e[i][j];
                        vQ[i][j] = (1.0f - zeta_el)*vQ[i][j] + zeta_el*(ad*ad);
                        if (vQ[i][j]<variance_threshold) {
                            vQ[i][j]=variance_threshold;
                        }
                    }
                    //this is the same as setting e[ps][pa] += (1-P[ps][pa])
                    // if P[][] remains unchanged between updates.
                    // -- removed because it doesn't work! --
                    //P[i][j] += 0.01*delta * e[i][j] * (1.-P[i][j]);
                    if ((fabs (Q[i][j])>1000.0)||(isnan(Q[i][j]))) {
                        printf ("u: %d %d %f %f\n", i,j,Q[i][j], ad * e[i][j]);
                    }

                    //The cutoff below is only needed for Q-learning; Sarsa is
                    //unaffected because amax==a always holds there.
                    if (amax==a) {
                        e[i][j] *= gl;
                    } else {
                        e[i][j] = 0.0;
                    }
                } else {
                    e[i][j] = 0.0;
                    el = false;
                }
            }
            if (el==false) {
                if (min_el_state==i)
                    min_el_state++;
            } else {
                max_el_state = i;
            }
        }
    }

    //printf ("%d %d #STATE\n", min_el_state, max_el_state);
    //	printf ("Q[%d,%d]=%f r=%f e=%f ad=%f gl=%f #QV\n",
    //			ps, pa, Q[ps][pa], r, e[ps][pa], ad, gl);
    ps = s;
    pa = a;

    return a;
}
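The eligibility-trace update inside SelectAction follows the Sarsa(λ)/Watkins Q(λ) pseudocode from the comment above. A stripped-down tabular version of that single update step, with std::vector tables standing in for the class members (names and signature are illustrative only), may make the structure easier to see:

#include <vector>

// Minimal sketch of one tabular Sarsa(lambda)/Watkins Q(lambda) step,
// mirroring the trace loop in SelectAction. Q and e stand in for the
// DiscretePolicy members of the same name.
void tdLambdaStep(std::vector<std::vector<double> >& Q,
                  std::vector<std::vector<double> >& e,
                  int ps, int pa, double r, int s, int a, int amax,
                  double alpha, double gamma, double lambda) {
    double delta = r + gamma * Q[s][amax] - Q[ps][pa]; // TD error
    e[ps][pa] += 1.0;                                  // accumulating trace
    for (std::size_t i = 0; i < Q.size(); i++) {
        for (std::size_t j = 0; j < Q[i].size(); j++) {
            Q[i][j] += alpha * delta * e[i][j];
            // Watkins cutoff: decay the trace while the greedy action was
            // actually taken, otherwise reset it (for Sarsa amax == a always).
            e[i][j] = (a == amax) ? gamma * lambda * e[i][j] : 0.0;
        }
    }
}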
Code example #9
File: tsp-acs.c  Project: piperamirez/tsp-acs
int main(int argc, char *argv[]) {
	float bestKnownCost = FLT_MAX;
	int* bestKnownTour;

	FILE *file;
	int i, j, k;
	file = fopen(argv[1], "r");
	M = atoi(argv[2]);
	runParallel = atoi(argv[3]);
	fscanf(file, "NAME : %[^\n]s", ins.name);
	fscanf(file, "\nCOMMENT : %[^\n]s", ins.comment);
	fscanf(file, "\nTYPE : %[^\n]s", ins.type);
	fscanf(file, "\nDIMENSION : %d", &ins.dimension);
	fscanf(file, "\nEDGE_WEIGHT_TYPE : %[^\n]s", ins.wtype);
	fscanf(file, "\nNODE_COORD_SECTION");
	if (strcmp(ins.wtype,"EUC_2D")) {
		// Only EUC_2D instances are supported.
		return 1;
	}
	if (runParallel) {
		printf("Running parallel version\n");
	}
	else {
		printf("Running standard version\n");
	}
	int** coord;
	coord = (int**) malloc(ins.dimension * sizeof(int*));
	bestKnownTour = (int*) malloc(ins.dimension * sizeof(int));
	for(i = 0; i <ins.dimension; i++) {
		coord[i] = (int*) malloc(2 * sizeof(int));
	}
	for (i = 0; i < ins.dimension; i++) {
		fscanf(file, "\n %*[^ ] %d %d", &coord[i][0], &coord[i][1]);
	}
	fclose(file);
	
	distanceMatrix = (double**) malloc(ins.dimension * sizeof(double*));
	for (i = 0; i < ins.dimension; i++) {
		distanceMatrix[i] = (double*) malloc(ins.dimension * sizeof(double));
	}
	
	for (i = 0; i < ins.dimension; i++){
		for ( j = i + 1; j < ins.dimension ; j++) {
			distanceMatrix[i][j] = dist(coord[i][0], coord[i][1], coord[j][0], coord[j][1]);
			distanceMatrix[j][i] = distanceMatrix[i][j];
		}
	}
	for (i = 0; i < ins.dimension; i++) {
		free(coord[i]);
	}
	free(coord);

	tau = (double**) malloc(ins.dimension*sizeof(double*));
	for (i = 0; i < ins.dimension; i++) {
		tau[i] = (double*) malloc(ins.dimension * sizeof(double));
	}
	for (i = 0; i < ins.dimension; i++) {
		for( j = 0; j < ins.dimension; j++) {
			tau[i][j] = TAU0;
		}
	}
	
	srand(time(NULL));
	for (i = 0; i < M; i++) {
		ant[i].tour = (int*) malloc(ins.dimension * sizeof(int));
		ant[i].tour[0] = rand() % ins.dimension;
		ant[i].visited = (int*) malloc(ins.dimension * sizeof(int));
		// Mark every city (including index 0) as unvisited.
		for (j = 0; j < ins.dimension; j++) {
			ant[i].visited[j] = 0;
		}
		ant[i].visited[ant[i].tour[0]] = 1;
	}
	
	// Build candidate lists: the CLSIZE nearest neighbours of each city.
	cl = (int**) malloc(ins.dimension * sizeof(int*));
	int* visited = (int*) malloc(ins.dimension * sizeof(int));
	double nearestDistance;
	int nearestNeighbor;
	for (i = 0; i < ins.dimension; i++) {
		cl[i] = (int*) malloc(CLSIZE * sizeof(int));
	}
	for (i = 0; i < ins.dimension; i++) {
		for(j = 0; j < ins.dimension; j++) {
			visited[j] = 0;
		}
		for(j = 0; j < CLSIZE; j++) {
			nearestDistance = DBL_MAX;
			nearestNeighbor = 0;
			for(k = 0; k < ins.dimension; k++) {
				if(!visited[k] && distanceMatrix[i][k] < nearestDistance && i != k) {
					nearestDistance = distanceMatrix[i][k];
					nearestNeighbor = k;
				}
			}
			visited[nearestNeighbor] = 1;
			cl[i][j] = nearestNeighbor;
		}
	}
	
	pthread_t *thread;
	thread = (pthread_t*) malloc(M * sizeof(pthread_t));
	
	for (i = 0; i < ITERATIONS; i++) {
		if (runParallel) {
			for(j = 0; j < M; j++) {
				int st = pthread_create(&thread[j], NULL, routeAnt, (void *)(size_t) j);
				if (st) {
					printf("pthread_create() error %d\n",st);
					exit(-1);
				}
			}
		
			for (j = 0; j < M; j++) {
				pthread_join(thread[j],NULL);
			}
			
		}
		else {
			for (j = 0; j < M; j++) {
				for (k = 1; k < ins.dimension; k++) {
					if (hasCandidatesLeft(j,ant[j].tour[k-1])) {
						moveAntTo(j, k, argMax(j, ant[j].tour[k-1]));
					}
					else {
						moveAntTo(j, k, NN(j,k-1));
					}
				}
			}
		}
		
		int bestAnt = 0;
		double lowerCost = DBL_MAX;
		for (j = 0; j < M; j++) {
			if (tourCost(ant[j].tour) < lowerCost) {
				bestAnt = j;
				lowerCost = tourCost(ant[j].tour);
			}
		}

		updatePheromoneLevel(ant[bestAnt].tour);
		
		if (lowerCost < bestKnownCost) {
			bestKnownCost = lowerCost;
			for (k = 0; k < ins.dimension; k++) {
				bestKnownTour[k] = ant[bestAnt].tour[k];
			}
		}

		for (j = 0; j < M; j++) {
			for (k = 0; k < ins.dimension; k++) {
				ant[j].visited[k] = 0;
			}
			ant[j].tour[0] = rand() % ins.dimension;
			ant[j].visited[ant[j].tour[0]] = 1;
			for (k = 1; k < ins.dimension; k++) {
				ant[j].tour[k] = 0;
			}
		}
	}
	
	printf("Best tour: ");
	for (j = 0; j < ins.dimension; j++) {
		printf("%d ", bestKnownTour[j]);
	}
	printf("; Length = %f\n", bestKnownCost);

	return 0;
}
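The code above relies on a dist() helper (not shown in this snippet) to fill the distance matrix for EUC_2D instances. TSPLIB defines the EUC_2D edge weight as the Euclidean distance rounded to the nearest integer; a plausible sketch under that assumption, not necessarily the project's implementation:

#include <math.h>

// Hypothetical EUC_2D distance helper matching the calls above: Euclidean
// distance between two coordinate pairs, rounded to the nearest integer as
// TSPLIB specifies for EUC_2D.
double dist(int x1, int y1, int x2, int y2) {
    double dx = (double)(x1 - x2);
    double dy = (double)(y1 - y2);
    return floor(sqrt(dx * dx + dy * dy) + 0.5);
}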