void run(int argc, char** argv) { init(argc, argv); unsigned long long cycles; int *src, *dst, *temp; int min; dst = result; src = new int[cols]; #ifdef GEM5_WORK m5_work_begin(0, 0); m5_dumpreset_stats(0, 0); #endif #ifdef TIMING double start_time = gettime(); #endif for (int t = 0; t < rows-1; t++) { temp = src; src = dst; dst = temp; #pragma omp parallel for private(min) for(int n = 0; n < cols; n++){ min = src[n]; if (n > 0) min = MIN(min, src[n-1]); if (n < cols-1) min = MIN(min, src[n+1]); dst[n] = wall[t+1][n]+min; } } #ifdef TIMING double end_time = gettime(); printf("ROI Runtime: %f\n", end_time - start_time); #endif #ifdef GEM5_WORK m5_dumpreset_stats(0, 0); m5_work_end(0, 0); #endif #ifdef OUTPUT for (int i = 0; i < cols; i++) printf("%d ",data[i]); printf("\n") ; for (int i = 0; i < cols; i++) printf("%d ",dst[i]); printf("\n") ; #endif delete [] data; delete [] wall;
/*
 * Command handler: reads two integer arguments from argv through
 * parse_int_args() and forwards them, in order, to m5_dumpreset_stats().
 *
 * NOTE(review): the meaning of the two values (presumably a delay and a
 * period) is defined by m5_dumpreset_stats, which is not visible here --
 * confirm against its declaration.
 */
void do_dump_reset_stats(int argc, char *argv[])
{
    enum { NUM_PARAMS = 2 };
    uint64_t params[NUM_PARAMS];

    parse_int_args(argc, argv, params, NUM_PARAMS);

    const uint64_t first = params[0];
    const uint64_t second = params[1];
    m5_dumpreset_stats(first, second);
}
/**
 * The implementation of the particle filter using OpenMP for many frames
 * @see http://openmp.org/wp/
 * @note Frames 1 .. Nfr-1 are processed in order; for each frame the particles
 * are moved by a random walk, weighted by an intensity-based likelihood,
 * used to estimate the object position (printed as XE/YE plus the distance
 * from the starting centroid), and then resampled. Timing of every phase is
 * printed to stdout.
 * @note If Nparticles is less than 1, or any working buffer cannot be
 * allocated, a message is written to stderr and the function returns without
 * processing any frame.
 * @param I The video to be run, read as I[a*IszY*Nfr + b*Nfr + frame]; indices
 * at or beyond IszX*IszY*Nfr are redirected to element 0
 * @param IszX The x dimension of the video
 * @param IszY The y dimension of the video
 * @param Nfr The number of frames
 * @param seed The seed array used for random number generation; it is handed
 * to randn() with every particle index and to randu() with index 0
 * (presumably it needs at least Nparticles entries -- confirm against randn)
 * @param Nparticles The number of particles to be used
 */
void particleFilter(int * I, int IszX, int IszY, int Nfr, int * seed, int Nparticles){
    /* Every buffer starts out NULL so the shared cleanup path can free() all of them unconditionally. */
    int * disk = NULL;
    double * objxy = NULL;
    double * weights = NULL;
    double * likelihood = NULL;
    double * arrayX = NULL;
    double * arrayY = NULL;
    double * xj = NULL;
    double * yj = NULL;
    double * CDF = NULL;
    double * u = NULL;
    int * ind = NULL;
    int countOnes = 0;
    int x, y, k;
    int indX, indY;
    long long get_neighbors, get_weights;

    /* The resampling step reads weights[0] and writes CDF[0], so at least one particle is required. */
    if(Nparticles < 1){
        fprintf(stderr, "particleFilter: Nparticles must be at least 1 (got %d)\n", Nparticles);
        return;
    }

    int max_size = IszX*IszY*Nfr;
    long long start = get_time();
    //original particle centroid
    double xe = roundDouble(IszY/2.0);
    double ye = roundDouble(IszX/2.0);

    //expected object locations, compared to center
    int radius = 5;
    int diameter = radius*2 - 1;
    disk = (int *)malloc(diameter*diameter*sizeof(int));
    if(disk == NULL)
        goto out_of_memory;
    strelDisk(disk, radius);
    //count the cells of the structuring element that are set
    for(x = 0; x < diameter; x++){
        for(y = 0; y < diameter; y++){
            if(disk[x*diameter + y] == 1)
                countOnes++;
        }
    }
    objxy = (double *)malloc(countOnes*2*sizeof(double));
    //a zero-byte request may legitimately yield NULL, so only a non-empty request counts as a failure
    if(objxy == NULL && countOnes > 0)
        goto out_of_memory;
    getneighbors(disk, countOnes, objxy, radius);

    get_neighbors = get_time();
    printf("TIME TO GET NEIGHBORS TOOK: %f\n", elapsed_time(start, get_neighbors));
    //initial weights are all equal (1/Nparticles)
    weights = (double *)malloc(sizeof(double)*Nparticles);
    if(weights == NULL)
        goto out_of_memory;
    #pragma omp parallel for shared(weights, Nparticles) private(x)
    for(x = 0; x < Nparticles; x++){
        weights[x] = 1/((double)(Nparticles));
    }
    get_weights = get_time();
    printf("TIME TO GET WEIGHTSTOOK: %f\n", elapsed_time(get_neighbors, get_weights));
    //per-particle working buffers
    likelihood = (double *)malloc(sizeof(double)*Nparticles);
    arrayX = (double *)malloc(sizeof(double)*Nparticles);
    arrayY = (double *)malloc(sizeof(double)*Nparticles);
    xj = (double *)malloc(sizeof(double)*Nparticles);
    yj = (double *)malloc(sizeof(double)*Nparticles);
    CDF = (double *)malloc(sizeof(double)*Nparticles);
    u = (double *)malloc(sizeof(double)*Nparticles);
    ind = (int*)malloc(sizeof(int)*countOnes*Nparticles);
    if(likelihood == NULL || arrayX == NULL || arrayY == NULL || xj == NULL
            || yj == NULL || CDF == NULL || u == NULL
            || (ind == NULL && countOnes > 0))
        goto out_of_memory;
    //every particle starts at the centroid
    #pragma omp parallel for shared(arrayX, arrayY, xe, ye) private(x)
    for(x = 0; x < Nparticles; x++){
        arrayX[x] = xe;
        arrayY[x] = ye;
    }
    printf("TIME TO SET ARRAYS TOOK: %f\n", elapsed_time(get_weights, get_time()));
    #ifdef GEM5_WORK
    m5_work_begin(0, 0);
    m5_dumpreset_stats(0, 0);
    #endif
    for(k = 1; k < Nfr; k++){
        long long set_arrays = get_time();
        //apply motion model
        //draws sample from motion model (random walk): x drifts by +1 with
        //noise scaled by 5, y drifts by -2 with noise scaled by 2
        #pragma omp parallel for shared(arrayX, arrayY, Nparticles, seed) private(x)
        for(x = 0; x < Nparticles; x++){
            arrayX[x] += 1 + 5*randn(seed, x);
            arrayY[x] += -2 + 2*randn(seed, x);
        }
        long long error = get_time();
        printf("TIME TO SET ERROR TOOK: %f\n", elapsed_time(set_arrays, error));
        //particle filter likelihood
        #pragma omp parallel for shared(likelihood, I, arrayX, arrayY, objxy, ind) private(x, y, indX, indY)
        for(x = 0; x < Nparticles; x++){
            //compute the likelihood as a ratio built from the squared
            //distances of each sampled pixel to the intensities 100 and 228
            //calc ind: fabs() folds negative offsets to non-negative indices,
            //and anything at or past the end of the video is redirected to element 0
            for(y = 0; y < countOnes; y++){
                indX = roundDouble(arrayX[x]) + objxy[y*2 + 1];
                indY = roundDouble(arrayY[x]) + objxy[y*2];
                ind[x*countOnes + y] = fabs(indX*IszY*Nfr + indY*Nfr + k);
                if(ind[x*countOnes + y] >= max_size)
                    ind[x*countOnes + y] = 0;
            }
            likelihood[x] = 0;
            for(y = 0; y < countOnes; y++)
                likelihood[x] += (pow((I[ind[x*countOnes + y]] - 100),2) - pow((I[ind[x*countOnes + y]]-228),2))/50.0;
            likelihood[x] = likelihood[x]/((double) countOnes);
        }
        long long likelihood_time = get_time();
        printf("TIME TO GET LIKELIHOODS TOOK: %f\n", elapsed_time(error, likelihood_time));
        // update & normalize weights
        // using equation (63) of Arulampalam Tutorial
        #pragma omp parallel for shared(Nparticles, weights, likelihood) private(x)
        for(x = 0; x < Nparticles; x++){
            weights[x] = weights[x] * exp(likelihood[x]);
        }
        long long exponential = get_time();
        printf("TIME TO GET EXP TOOK: %f\n", elapsed_time(likelihood_time, exponential));
        double sumWeights = 0;
        #pragma omp parallel for private(x) reduction(+:sumWeights)
        for(x = 0; x < Nparticles; x++){
            sumWeights += weights[x];
        }
        long long sum_time = get_time();
        printf("TIME TO SUM WEIGHTS TOOK: %f\n", elapsed_time(exponential, sum_time));
        #pragma omp parallel for shared(sumWeights, weights) private(x)
        for(x = 0; x < Nparticles; x++){
            weights[x] = weights[x]/sumWeights;
        }
        long long normalize = get_time();
        printf("TIME TO NORMALIZE WEIGHTS TOOK: %f\n", elapsed_time(sum_time, normalize));
        xe = 0;
        ye = 0;
        // estimate the object location by expected values
        #pragma omp parallel for private(x) reduction(+:xe, ye)
        for(x = 0; x < Nparticles; x++){
            xe += arrayX[x] * weights[x];
            ye += arrayY[x] * weights[x];
        }
        long long move_time = get_time();
        printf("TIME TO MOVE OBJECT TOOK: %f\n", elapsed_time(normalize, move_time));
        printf("XE: %lf\n", xe);
        printf("YE: %lf\n", ye);
        //distance of the estimate from the starting centroid
        double distance = sqrt( pow((double)(xe-(int)roundDouble(IszY/2.0)),2) + pow((double)(ye-(int)roundDouble(IszX/2.0)),2) );
        printf("%lf\n", distance);
        //display(hold off for now)

        //pause(hold off for now)

        //resampling: running sum of the normalized weights
        CDF[0] = weights[0];
        for(x = 1; x < Nparticles; x++){
            CDF[x] = weights[x] + CDF[x-1];
        }
        long long cum_sum = get_time();
        printf("TIME TO CALC CUM SUM TOOK: %f\n", elapsed_time(move_time, cum_sum));
        //evenly spaced thresholds sharing one random offset u1
        double u1 = (1/((double)(Nparticles)))*randu(seed, 0);
        #pragma omp parallel for shared(u, u1, Nparticles) private(x)
        for(x = 0; x < Nparticles; x++){
            u[x] = u1 + x/((double)(Nparticles));
        }
        long long u_time = get_time();
        printf("TIME TO CALC U TOOK: %f\n", elapsed_time(cum_sum, u_time));
        int j, i;
        #pragma omp parallel for shared(CDF, Nparticles, xj, yj, u, arrayX, arrayY) private(i, j)
        for(j = 0; j < Nparticles; j++){
            i = findIndex(CDF, Nparticles, u[j]);
            //findIndex reporting -1 falls back to the last particle
            if(i == -1)
                i = Nparticles-1;
            xj[j] = arrayX[i];
            yj[j] = arrayY[i];
        }
        long long xyj_time = get_time();
        printf("TIME TO CALC NEW ARRAY X AND Y TOOK: %f\n", elapsed_time(u_time, xyj_time));
        //this loop runs serially (its OpenMP pragma was disabled in the original source)
        for(x = 0; x < Nparticles; x++){
            //reassign arrayX and arrayY
            arrayX[x] = xj[x];
            arrayY[x] = yj[x];
            weights[x] = 1/((double)(Nparticles));
        }
        long long reset = get_time();
        printf("TIME TO RESET WEIGHTS TOOK: %f\n", elapsed_time(xyj_time, reset));
    }
    #ifdef GEM5_WORK
    m5_dumpreset_stats(0, 0);
    m5_work_end(0, 0);
    #endif
    goto cleanup;

out_of_memory:
    fprintf(stderr, "particleFilter: out of memory\n");
cleanup:
    //free(NULL) is a no-op, so buffers that were never allocated are safe here
    free(disk);
    free(objxy);
    free(weights);
    free(likelihood);
    free(xj);
    free(yj);
    free(arrayX);
    free(arrayY);
    free(CDF);
    free(u);
    free(ind);
}
int main(int argc, char** argv) { unsigned i; int fd = 0; pthread_t *threads; pthread_attr_t pthread_custom_attr; for (int index = 0; index < argc; index++) { if (strcmp(argv[index], "-t") == 0) { if (argc > index+1) { numThreads = atoi(argv[++index]); } else { printf("ERROR: Must specify number of threads to -t option\n"); exit(0); } } else if (strcmp(argv[index], "-a") == 0) { if (argc > index+1) { arraySize = atoi(argv[++index]); } else { printf("ERROR: Must specify number of array elements to -a option\n"); exit(0); } } else if (strcmp(argv[index], "-d") == 0) { dumpStats = true; } else if (strcmp(argv[index], "-h") == 0) { printf("Usage: %s [options]\n\nOptions:\n", argv[0]); printf("-a # Specify number of array elements\n"); printf("-i # Specify number of iterations to run\n"); printf("-p # Specify number of parallel accesses per-thread\n"); printf("-q # Specify per-thread memory-level parallelism\n"); printf("-s # Specify memory access stride in ints\n"); printf("-t # Specify number of threads\n"); printf("-u Use uncacheable memory\n"); exit(0); } else if (strcmp(argv[index], "-i") == 0) { if (argc > index+1) { numIters = atoi(argv[++index]); } else { printf("ERROR: Must specify number of iterations to -i option\n"); exit(0); } } else if (strcmp(argv[index], "-p") == 0) { if (argc > index+1) { parallelAccesses = atoi(argv[++index]); } else { printf("ERROR: Must specify number of memory parallel accesses to -p option\n"); exit(0); } } else if (strcmp(argv[index], "-q") == 0) { if (argc > index+1) { parallelism = atoi(argv[++index]); } else { printf("ERROR: Must specify number of memory-level parallelism to -q option\n"); exit(0); } } else if (strcmp(argv[index], "-s") == 0) { if (argc > index+1) { stride = atoi(argv[++index]); } else { printf("ERROR: Must specify int stride to -s option\n"); exit(0); } } else if (strcmp(argv[index], "-u") == 0) { uncached = true; } } parallelAccesses -= (parallelAccesses % parallelism); int threadIDs[numThreads]; 
pthread_attr_init(&pthread_custom_attr); arraySize *= numThreads; if (!uncached) { bigArray = (int *) malloc(arraySize * sizeof(int)); } else { fd = open("/dev/mem", O_CREAT | O_RDWR | O_SYNC, 0755); if (fd < 0) { fprintf(stderr, "Open failed uncached\n"); exit(1); } bigArray = (int*) mmap(0x0, arraySize * sizeof(int), (PROT_READ | PROT_WRITE), (MAP_SHARED), fd, 0); if (bigArray == MAP_FAILED) { fprintf (stderr, "mmap uncached\n"); unlink("/dev/mem"); exit(1); } } smallArray = (int *) malloc(numThreads * sizeof(int)); threads = (pthread_t *) malloc(numThreads * sizeof(pthread_t)); lapTimes = (double*) malloc(numIters * sizeof(double)); printf("Number of threads: %u\n", numThreads); printf("Number of iterations: %u\n", numIters); printf("Memory-level parallelism: %u\n", parallelism); printf("Number of accesses: %u\n", parallelAccesses); printf("Number of array elements: %llu\n", arraySize); printf("Size of bigArray (MB): %.3f\n", (double)(arraySize * sizeof(int))/(1024.0*1024.0)); printf("Data stride: %u int (%luB) = %luB\n", stride, sizeof(int), stride * sizeof(int)); for (i = 0; i < arraySize; i++) { bigArray[i] = i; } if (pthread_barrier_init(&barr, NULL, numThreads)) { printf("Could not create a barrier\n"); return -1; } if (dumpStats) m5_dumpreset_stats(0, 0); for (i = 0; i < numThreads; i++) { threadIDs[i] = i; pthread_create(&threads[i], &pthread_custom_attr, worker, &threadIDs[i]); } for (i = 0; i < numThreads; i++) { pthread_join(threads[i], NULL); } if (dumpStats) m5_dumpreset_stats(0, 0); double stdev = 0.0; averageLap /= numIters; fprintf(stderr, "%d %d %llu", numThreads, parallelAccesses, arraySize); for (i = 0; i < numIters; i++) { fprintf(stderr, " %.6f", lapTimes[i]); if (i > 1) { double temp = averageLap - lapTimes[i]; stdev += temp * temp; } } fprintf(stderr, "\n"); stdev = sqrt(stdev/numIters); double percent = 100 * stdev / averageLap; printf("AVG: %.5f\n", averageLap); printf("STDEV: %f (%f%%)\n", stdev, percent); if (!uncached) { 
free(bigArray); } else { munmap(bigArray, arraySize * sizeof(int)); if (fd != 0) close(fd); unlink("/dev/mem"); } free(smallArray); free(threads); free(lapTimes); return 0; }