/**
 * The implementation of the particle filter using OpenMP for many frames
 * @see http://openmp.org/wp/
 * @note This function is designed to work with a video of several frames. The
 * likelihood of every particle is computed inline from the pixel values of I.
 * For each frame after the first, the per-phase timings, the estimated
 * location (XE/YE) and its distance from the frame centre are printed to stdout.
 * @note Resampled particles are written to a second pair of buffers which is
 * then swapped with the current pair, so resampling never reads from the
 * buffer it is writing to and every allocation is released on exit.
 * @param I The video to be run, indexed as I[x*IszY*Nfr + y*Nfr + frame]
 * @param IszX The x dimension of the video
 * @param IszY The y dimension of the video
 * @param Nfr The number of frames
 * @param seed The seed array used for random number generation; it is handed
 * to randn() with each particle index and to randu() with index 0
 * @param Nparticles The number of particles to be used; must be at least 1
 * because element 0 of the per-particle arrays is accessed unconditionally
 * whenever a frame is processed
 */
void particleFilter(int * I, int IszX, int IszY, int Nfr, int * seed, int Nparticles){
	int max_size = IszX*IszY*Nfr;
	long long start = get_time();
	long long get_neighbors, get_weights;
	//original particle centroid
	double xe = roundDouble(IszY/2.0);
	double ye = roundDouble(IszX/2.0);

	//expected object locations, compared to center
	int radius = 5;
	int diameter = radius*2 - 1;
	int countOnes = 0;
	int x, y, k;
	int indX, indY;

	//every buffer starts out NULL so the shared exit path can free all of them
	int * disk = NULL;
	int * ind = NULL;
	double * objxy = NULL;
	double * weights = NULL;
	double * likelihood = NULL;
	double * arrayX = NULL;
	double * arrayY = NULL;
	double * xj = NULL;
	double * yj = NULL;
	double * CDF = NULL;
	double * u = NULL;

	disk = (int *)malloc(diameter*diameter*sizeof(int));
	if(disk == NULL)
		goto alloc_failed;
	strelDisk(disk, radius);
	for(x = 0; x < diameter; x++){
		for(y = 0; y < diameter; y++){
			if(disk[x*diameter + y] == 1)
				countOnes++;
		}
	}
	objxy = (double *)malloc(countOnes*2*sizeof(double));
	if(objxy == NULL)
		goto alloc_failed;
	getneighbors(disk, countOnes, objxy, radius);

	get_neighbors = get_time();
	printf("TIME TO GET NEIGHBORS TOOK: %f\n", elapsed_time(start, get_neighbors));

	//initial weights are all equal (1/Nparticles)
	weights = (double *)malloc(sizeof(double)*Nparticles);
	if(weights == NULL)
		goto alloc_failed;
	#pragma omp parallel for shared(weights, Nparticles) private(x)
	for(x = 0; x < Nparticles; x++){
		weights[x] = 1/((double)(Nparticles));
	}
	get_weights = get_time();
	printf("TIME TO GET WEIGHTSTOOK: %f\n", elapsed_time(get_neighbors, get_weights));

	//per-particle working arrays
	likelihood = (double *)malloc(sizeof(double)*Nparticles);
	arrayX = (double *)malloc(sizeof(double)*Nparticles);
	arrayY = (double *)malloc(sizeof(double)*Nparticles);
	xj = (double *)malloc(sizeof(double)*Nparticles);
	yj = (double *)malloc(sizeof(double)*Nparticles);
	CDF = (double *)malloc(sizeof(double)*Nparticles);
	u = (double *)malloc(sizeof(double)*Nparticles);
	//one row of countOnes pixel indices per particle, so threads never share a row
	ind = (int*)malloc(sizeof(int)*countOnes*Nparticles);
	if(likelihood == NULL || arrayX == NULL || arrayY == NULL || xj == NULL
			|| yj == NULL || CDF == NULL || u == NULL || ind == NULL)
		goto alloc_failed;

	#pragma omp parallel for shared(arrayX, arrayY, xe, ye) private(x)
	for(x = 0; x < Nparticles; x++){
		arrayX[x] = xe;
		arrayY[x] = ye;
	}
	printf("TIME TO SET ARRAYS TOOK: %f\n", elapsed_time(get_weights, get_time()));

	for(k = 1; k < Nfr; k++){
		long long set_arrays = get_time();
		//apply motion model
		//draws sample from motion model (random walk). The only prior information
		//is that the object moves 2x as fast as in the y direction
		#pragma omp parallel for shared(arrayX, arrayY, Nparticles, seed) private(x)
		for(x = 0; x < Nparticles; x++){
			arrayX[x] += 1 + 5*randn(seed, x);
			arrayY[x] += -2 + 2*randn(seed, x);
		}
		long long error = get_time();
		printf("TIME TO SET ERROR TOOK: %f\n", elapsed_time(set_arrays, error));

		//particle filter likelihood
		#pragma omp parallel for shared(likelihood, I, arrayX, arrayY, objxy, ind) private(x, y, indX, indY)
		for(x = 0; x < Nparticles; x++){
			//compute the likelihood: remember our assumption is that you know
			// foreground and the background image intensity distribution.
			// Notice that we consider here a likelihood ratio, instead of
			// p(z|x). It is possible in this case. why? a hometask for you.
			//calc ind
			for(y = 0; y < countOnes; y++){
				indX = roundDouble(arrayX[x]) + objxy[y*2 + 1];
				indY = roundDouble(arrayY[x]) + objxy[y*2];
				//negative offsets are folded to positive; anything past the end of I maps to pixel 0
				ind[x*countOnes + y] = fabs(indX*IszY*Nfr + indY*Nfr + k);
				if(ind[x*countOnes + y] >= max_size)
					ind[x*countOnes + y] = 0;
			}
			likelihood[x] = 0;
			for(y = 0; y < countOnes; y++)
				likelihood[x] += (pow((I[ind[x*countOnes + y]] - 100),2) - pow((I[ind[x*countOnes + y]]-228),2))/50.0;
			likelihood[x] = likelihood[x]/((double) countOnes);
		}
		long long likelihood_time = get_time();
		printf("TIME TO GET LIKELIHOODS TOOK: %f\n", elapsed_time(error, likelihood_time));

		// update & normalize weights
		// using equation (63) of Arulampalam Tutorial
		#pragma omp parallel for shared(Nparticles, weights, likelihood) private(x)
		for(x = 0; x < Nparticles; x++){
			weights[x] = weights[x] * exp(likelihood[x]);
		}
		long long exponential = get_time();
		printf("TIME TO GET EXP TOOK: %f\n", elapsed_time(likelihood_time, exponential));

		double sumWeights = 0;
		#pragma omp parallel for private(x) reduction(+:sumWeights)
		for(x = 0; x < Nparticles; x++){
			sumWeights += weights[x];
		}
		long long sum_time = get_time();
		printf("TIME TO SUM WEIGHTS TOOK: %f\n", elapsed_time(exponential, sum_time));

		#pragma omp parallel for shared(sumWeights, weights) private(x)
		for(x = 0; x < Nparticles; x++){
			weights[x] = weights[x]/sumWeights;
		}
		long long normalize = get_time();
		printf("TIME TO NORMALIZE WEIGHTS TOOK: %f\n", elapsed_time(sum_time, normalize));

		xe = 0;
		ye = 0;
		// estimate the object location by expected values
		#pragma omp parallel for private(x) reduction(+:xe, ye)
		for(x = 0; x < Nparticles; x++){
			xe += arrayX[x] * weights[x];
			ye += arrayY[x] * weights[x];
		}
		long long move_time = get_time();
		printf("TIME TO MOVE OBJECT TOOK: %f\n", elapsed_time(normalize, move_time));
		printf("XE: %lf\n", xe);
		printf("YE: %lf\n", ye);
		double distance = sqrt( pow((double)(xe-(int)roundDouble(IszY/2.0)),2) + pow((double)(ye-(int)roundDouble(IszX/2.0)),2) );
		printf("%lf\n", distance);
		//display(hold off for now)

		//pause(hold off for now)

		//resampling
		//the running sum is inherently sequential, so it is left unparallelised
		CDF[0] = weights[0];
		for(x = 1; x < Nparticles; x++){
			CDF[x] = weights[x] + CDF[x-1];
		}
		long long cum_sum = get_time();
		printf("TIME TO CALC CUM SUM TOOK: %f\n", elapsed_time(move_time, cum_sum));

		double u1 = (1/((double)(Nparticles)))*randu(seed, 0);
		#pragma omp parallel for shared(u, u1, Nparticles) private(x)
		for(x = 0; x < Nparticles; x++){
			u[x] = u1 + x/((double)(Nparticles));
		}
		long long u_time = get_time();
		printf("TIME TO CALC U TOOK: %f\n", elapsed_time(cum_sum, u_time));

		int j, i;
		#pragma omp parallel for shared(CDF, Nparticles, xj, yj, u, arrayX, arrayY) private(i, j)
		for(j = 0; j < Nparticles; j++){
			i = findIndex(CDF, Nparticles, u[j]);
			//findIndex reports "not found" as -1; fall back to the last particle
			if(i == -1)
				i = Nparticles-1;
			xj[j] = arrayX[i];
			yj[j] = arrayY[i];
		}
		long long xyj_time = get_time();
		printf("TIME TO CALC NEW ARRAY X AND Y TOOK: %f\n", elapsed_time(u_time, xyj_time));

		//reassign arrayX and arrayY by swapping the two buffer pairs; a plain
		//"arrayX = xj" would make both names refer to one buffer, so the next
		//resampling pass would read and write the same memory (and the old
		//arrayX allocation would be lost)
		double * swap = arrayX;
		arrayX = xj;
		xj = swap;
		swap = arrayY;
		arrayY = yj;
		yj = swap;

		for(x = 0; x < Nparticles; x++){
			weights[x] = 1/((double)(Nparticles));
		}
		long long reset = get_time();
		printf("TIME TO RESET WEIGHTS TOOK: %f\n", elapsed_time(xyj_time, reset));
	}
	goto cleanup;

alloc_failed:
	fprintf(stderr, "ERROR: particleFilter: memory allocation failed\n");

cleanup:
	//free(NULL) is a no-op, so buffers that were never allocated are safe here
	free(disk);
	free(objxy);
	free(weights);
	free(likelihood);
	free(arrayX);
	free(arrayY);
	free(xj);
	free(yj);
	free(CDF);
	free(u);
	free(ind);
}
/** * The implementation of the particle filter using OpenMP for many frames * @see http://openmp.org/wp/ * @note This function is designed to work with a video of several frames. In addition, it references a provided MATLAB function which takes the video, the objxy matrix and the x and y arrays as arguments and returns the likelihoods * @param I The video to be run * @param IszX The x dimension of the video * @param IszY The y dimension of the video * @param Nfr The number of frames * @param seed The seed array used for random number generation * @param Nparticles The number of particles to be used */ int particleFilter(int * I, int IszX, int IszY, int Nfr, int * seed, int Nparticles){ int i, c; #ifdef HW XFcuda xcore; int Status; Status = XFcuda_Initialize(&xcore, 0); if (Status != XST_SUCCESS) { printf("Initialization failed\n"); return 1; // XST_FAILURE; } #endif int max_size = IszX*IszY*Nfr; //long long start = get_time(); //original particle centroid double xe = roundDouble(IszY/2.0); double ye = roundDouble(IszX/2.0); //expected object locations, compared to center int radius = 5; int diameter = radius*2 - 1; int * disk = (int *)malloc(diameter*diameter*sizeof(int)); strelDisk(disk, radius); int countOnes = 0; int x, y; for(x = 0; x < diameter; x++){ for(y = 0; y < diameter; y++){ if(disk[x*diameter + y] == 1) countOnes++; } } double * objxy = (double *)malloc(countOnes*2*sizeof(double)); getneighbors(disk, countOnes, objxy, radius); //long long get_neighbors = get_time(); //printf("TIME TO GET NEIGHBORS TOOK: %f\n", elapsed_time(start, get_neighbors)); //initial weights are all equal (1/Nparticles) double * weights = (double *)malloc(sizeof(double)*Nparticles); for(x = 0; x < Nparticles; x++){ weights[x] = 1/((double)(Nparticles)); } //long long get_weights = get_time(); //printf("TIME TO GET WEIGHTSTOOK: %f\n", elapsed_time(get_neighbors, get_weights)); //initial likelihood to 0.0 //printf("%d\n", Nparticles); double * likelihood = (double 
*)malloc(sizeof(double)*Nparticles); double * arrayX = (double *)malloc(sizeof(double)*Nparticles); double * arrayY = (double *)malloc(sizeof(double)*Nparticles); double * xj = (double *)malloc(sizeof(double)*Nparticles); double * yj = (double *)malloc(sizeof(double)*Nparticles); double * CDF = (double *)malloc(sizeof(double)*Nparticles); //GPU copies of arrays //double * arrayX_GPU; //double * arrayY_GPU; //double * xj_GPU; //double * yj_GPU; //double * CDF_GPU; int * ind = (int*)malloc(sizeof(int)*countOnes); double * u = (double *)malloc(sizeof(double)*Nparticles); //double * u_GPU; //CUDA memory allocation //check_error(cudaMalloc((void **) &arrayX_GPU, sizeof(double)*Nparticles)); //arrayX_GPU = (double*)malloc(sizeof(double)*Nparticles); //check_error(cudaMalloc((void **) &arrayY_GPU, sizeof(double)*Nparticles)); //arrayY_GPU = (double*)malloc(sizeof(double)*Nparticles); //check_error(cudaMalloc((void **) &xj_GPU, sizeof(double)*Nparticles)); //xj_GPU = (double*)malloc(sizeof(double)*Nparticles); //check_error(cudaMalloc((void **) &yj_GPU, sizeof(double)*Nparticles)); //yj_GPU = (double*)malloc(sizeof(double)*Nparticles); //check_error(cudaMalloc((void **) &CDF_GPU, sizeof(double)*Nparticles)); //CDF_GPU = (double*)malloc(sizeof(double)*Nparticles); //check_error(cudaMalloc((void **) &u_GPU, sizeof(double)*Nparticles)); //u_GPU = (double*)malloc(sizeof(double)*Nparticles); for(x = 0; x < Nparticles; x++){ arrayX[x] = xe; arrayY[x] = ye; } //Set number of threads int num_blocks = ceil((double) Nparticles/(double) threads_per_block); //printf("%d\n", num_blocks); dim3 grids, threads; grids.x = num_blocks; grids.y = 1; grids.z = 1; threads.x = threads_per_block; threads.y = 1; threads.z = 1; #ifdef HW XFcuda_SetNparticles(&xcore, Nparticles); XFcuda_SetGriddim_x(&xcore, grids.x); XFcuda_SetGriddim_y(&xcore, grids.y); //XFcuda_SetGriddim_z(&xcore, grids.z); XFcuda_SetBlockdim_x(&xcore, threads.x); //XFcuda_SetBlockdim_y(&xcore, threads.y); 
//XFcuda_SetBlockdim_z(&xcore, threads.z); XFcuda_SetArrayx_addr(&xcore, (u32)arrayX / sizeof(double)); XFcuda_SetArrayy_addr(&xcore, (u32)arrayY / sizeof(double)); XFcuda_SetCdf_addr(&xcore, (u32)CDF / sizeof(double)); XFcuda_SetU_addr(&xcore, (u32)u / sizeof(double)); XFcuda_SetXj_addr(&xcore, (u32)xj / sizeof(double)); XFcuda_SetYj_addr(&xcore, (u32)yj / sizeof(double)); #endif int k; //double * Ik = (double *)malloc(sizeof(double)*IszX*IszY); int indX, indY; double *result = (double *)malloc(3 * (Nfr - 1) * sizeof(double)); i = 0; for(k = 1; k < Nfr; k++){ //long long set_arrays = get_time(); //printf("TIME TO SET ARRAYS TOOK: %f\n", elapsed_time(get_weights, set_arrays)); //apply motion model //draws sample from motion model (random walk). The only prior information //is that the object moves 2x as fast as in the y direction for(x = 0; x < Nparticles; x++){ arrayX[x] = arrayX[x] + 1.0 + 5.0*randn(seed, x); arrayY[x] = arrayY[x] - 2.0 + 2.0*randn(seed, x); } //particle filter likelihood //long long error = get_time(); //printf("TIME TO SET ERROR TOOK: %f\n", elapsed_time(set_arrays, error)); for(x = 0; x < Nparticles; x++){ //compute the likelihood: remember our assumption is that you know // foreground and the background image intensity distribution. // Notice that we consider here a likelihood ratio, instead of // p(z|x). It is possible in this case. why? a hometask for you. 
//calc ind for(y = 0; y < countOnes; y++){ indX = roundDouble(arrayX[x]) + objxy[y*2 + 1]; indY = roundDouble(arrayY[x]) + objxy[y*2]; ind[y] = fabs(indX*IszY*Nfr + indY*Nfr + k); if(ind[y] >= max_size) ind[y] = 0; } likelihood[x] = calcLikelihoodSum(I, ind, countOnes); likelihood[x] = likelihood[x]/countOnes; } //long long likelihood_time = get_time(); //printf("TIME TO GET LIKELIHOODS TOOK: %f\n", elapsed_time(error, likelihood_time)); // update & normalize weights // using equation (63) of Arulampalam Tutorial for(x = 0; x < Nparticles; x++){ weights[x] = weights[x] * exp(likelihood[x]); } //long long exponential = get_time(); //printf("TIME TO GET EXP TOOK: %f\n", elapsed_time(likelihood_time, exponential)); double sumWeights = 0; for(x = 0; x < Nparticles; x++){ sumWeights += weights[x]; } //long long sum_time = get_time(); //printf("TIME TO SUM WEIGHTS TOOK: %f\n", elapsed_time(exponential, sum_time)); for(x = 0; x < Nparticles; x++){ weights[x] = weights[x]/sumWeights; } //long long normalize = get_time(); //printf("TIME TO NORMALIZE WEIGHTS TOOK: %f\n", elapsed_time(sum_time, normalize)); xe = 0; ye = 0; // estimate the object location by expected values for(x = 0; x < Nparticles; x++){ //printf("%f %f %f\n", arrayX[x], arrayY[x], weights[x]); xe += arrayX[x] * weights[x]; ye += arrayY[x] * weights[x]; } //long long move_time = get_time(); //printf("TIME TO MOVE OBJECT TOOK: %f\n", elapsed_time(normalize, move_time)); //printf("XE: %lf\n", xe); //printf("YE: %lf\n", ye); double distance = sqrt( pow((double)(xe-(int)roundDouble(IszY/2.0)),2) + pow((double)(ye-(int)roundDouble(IszX/2.0)),2) ); //printf("%lf\n", distance); result[i] = xe; result[i + 1] = ye; result[i + 2] = distance; i += 3; //display(hold off for now) //pause(hold off for now) //resampling CDF[0] = weights[0]; for(x = 1; x < Nparticles; x++){ CDF[x] = weights[x] + CDF[x-1]; } //long long cum_sum = get_time(); //printf("TIME TO CALC CUM SUM TOOK: %f\n", elapsed_time(move_time, cum_sum)); 
double u1 = (1/((double)(Nparticles)))*randu(seed, 0); for(x = 0; x < Nparticles; x++){ u[x] = u1 + x/((double)(Nparticles)); } //long long u_time = get_time(); //printf("TIME TO CALC U TOOK: %f\n", elapsed_time(cum_sum, u_time)); //long long start_copy = get_time(); //CUDA memory copying from CPU memory to GPU memory //cudaMemcpy(arrayX_GPU, arrayX, sizeof(double)*Nparticles, cudaMemcpyHostToDevice); //memcpy(arrayX_GPU, arrayX, sizeof(double)*Nparticles); //cudaMemcpy(arrayY_GPU, arrayY, sizeof(double)*Nparticles, cudaMemcpyHostToDevice); //memcpy(arrayY_GPU, arrayY, sizeof(double)*Nparticles); //cudaMemcpy(xj_GPU, xj, sizeof(double)*Nparticles, cudaMemcpyHostToDevice); //memcpy(xj_GPU, xj, sizeof(double)*Nparticles); //cudaMemcpy(yj_GPU, yj, sizeof(double)*Nparticles, cudaMemcpyHostToDevice); //memcpy(yj_GPU, yj, sizeof(double)*Nparticles); //cudaMemcpy(CDF_GPU, CDF, sizeof(double)*Nparticles, cudaMemcpyHostToDevice); //memcpy(CDF_GPU, CDF, sizeof(double)*Nparticles); //cudaMemcpy(u_GPU, u, sizeof(double)*Nparticles, cudaMemcpyHostToDevice); //memcpy(u_GPU, u, sizeof(double)*Nparticles); //long long end_copy = get_time(); //Xil_DCacheDisable(); #ifdef HW Xil_DCacheDisable(); XFcuda_SetEn_fcuda1(&xcore, 1); XFcuda_Start(&xcore); while (!XFcuda_IsDone(&xcore)); Xil_DCacheEnable(); #else //KERNEL FUNCTION CALL int j; for(j = 0; j < Nparticles; j++){ x = findIndex(CDF, Nparticles, u[j]); if(x == -1) x = Nparticles-1; xj[j] = arrayX[x]; yj[j] = arrayY[x]; } #endif //cudaThreadSynchronize(); //long long start_copy_back = get_time(); //CUDA memory copying back from GPU to CPU memory //cudaMemcpy(yj, yj_GPU, sizeof(double)*Nparticles, cudaMemcpyDeviceToHost); //memcpy(yj, yj_GPU, sizeof(double)*Nparticles); //cudaMemcpy(xj, xj_GPU, sizeof(double)*Nparticles, cudaMemcpyDeviceToHost); //memcpy(xj, xj_GPU, sizeof(double)*Nparticles); //long long end_copy_back = get_time(); //printf("SENDING TO GPU TOOK: %lf\n", elapsed_time(start_copy, end_copy)); //printf("CUDA EXEC TOOK: 
%lf\n", elapsed_time(end_copy, start_copy_back)); //printf("SENDING BACK FROM GPU TOOK: %lf\n", elapsed_time(start_copy_back, end_copy_back)); //long long xyj_time = get_time(); //printf("TIME TO CALC NEW ARRAY X AND Y TOOK: %f\n", elapsed_time(u_time, xyj_time)); for(x = 0; x < Nparticles; x++){ //reassign arrayX and arrayY arrayX[x] = xj[x]; arrayY[x] = yj[x]; weights[x] = 1/((double)(Nparticles)); } //long long reset = get_time(); //printf("TIME TO RESET WEIGHTS TOOK: %f\n", elapsed_time(xyj_time, reset)); } //CUDA freeing of memory /*cudaFree(u_GPU); cudaFree(CDF_GPU); cudaFree(yj_GPU); cudaFree(xj_GPU); cudaFree(arrayY_GPU); cudaFree(arrayX_GPU);*/ /* free(u_GPU); free(CDF_GPU); free(yj_GPU); free(xj_GPU); free(arrayY_GPU); free(arrayX_GPU); */ //free memory free(disk); free(objxy); free(weights); free(likelihood); free(arrayX); free(arrayY); free(xj); free(yj); free(CDF); free(u); free(ind); /* FILE *fp = fopen("cuda/gold_output_naive.txt", "r"); if (fp == NULL) { printf("Cannot open file.\n"); free(result); return 0; } char buffer[50]; double gold_val; for (i = 0; i < 3 * (Nfr - 1); i++) { if (feof(fp)) { printf("Unexpected end of file.\n"); free(result); return 0; } fgets(buffer, sizeof(buffer), fp); sscanf(buffer, "%lf\n", &gold_val); if (gold_val - result[i] < -EPSILON || gold_val - result[i] > EPSILON) { printf("Mismatch result at %i: gold = %f, result = %f\n", i, gold_val, result[i]); free(result); return 0; } } free(result); */ #ifdef VERIFY for (i = 0; i < 3 * (Nfr - 1); i++) { printf("index %d: %f\n", i, result[i]); } for (i = 0; i < 3 * (Nfr - 1); i++) { if (fabs(gold_output[i] - result[i]) > EPSILON) { printf("Mismatch result at %i: gold = %f, result = %f\n", i, gold_output[i], result[i]); free(result); return 0; } } #endif free(result); return 1; }
/** * The implementation of the particle filter using OpenMP for a single image * @see http://openmp.org/wp/ * @note This function is designed to work with a single image. In addition, it references a provided MATLAB function which takes the video, the objxy matrix and the x and y arrays as arguments and returns the likelihoods * @warning Use the other particle filter function for videos; the accuracy of this function decreases significantly as it is called repeatedly while processing video * @param I The image to be run * @param IszX The x dimension of the image * @param IszY The y dimension of the image * @param seed The seed array used for random number generation * @param Nparticles The number of particles to be used * @param x_loc The array that will store the x locations of the desired object * @param y_loc The array that will store the y locations of the desired object * @param prevX The starting x position of the object * @param prevY The starting y position of the object */ void particleFilter1F(int * I, int IszX, int IszY, int * seed, int Nparticles, double * x_loc, double * y_loc, double prevX, double prevY){ int max_size = IszX*IszY; /*original particle centroid*/ double xe = prevX; double ye = prevY; /*expected object locations, compared to center*/ int radius = 5; int diameter = radius*2 -1; int * disk = (int *)mxCalloc(diameter*diameter, sizeof(int)); strelDisk(disk, radius); int countOnes = 0; int x, y; for(x = 0; x < diameter; x++){ for(y = 0; y < diameter; y++){ if(disk[x*diameter + y] == 1) countOnes++; } } double * objxy = (double *)mxCalloc(countOnes*2, sizeof(double)); getneighbors(disk, countOnes, objxy, radius); /*initial weights are all equal (1/Nparticles)*/ double * weights = (double *)mxCalloc(Nparticles, sizeof(double)); #pragma omp parallel for shared(weights, Nparticles) private(x) for(x = 0; x < Nparticles; x++){ weights[x] = 1/((double)(Nparticles)); } /*initial likelihood to 0.0*/ double * likelihood = (double 
*)mxCalloc(Nparticles, sizeof(double)); double * arrayX = (double *)mxCalloc(Nparticles, sizeof(double)); double * arrayY = (double *)mxCalloc(Nparticles, sizeof(double)); double * xj = (double *)mxCalloc(Nparticles, sizeof(double)); double * yj = (double *)mxCalloc(Nparticles, sizeof(double)); double * CDF = (double *)mxCalloc(Nparticles, sizeof(double)); double * u = (double *)mxCalloc(Nparticles, sizeof(double)); mxArray * arguments[4]; mxArray * mxIK = mxCreateDoubleMatrix(IszX, IszY, mxREAL); mxArray * mxObj = mxCreateDoubleMatrix(countOnes, 2, mxREAL); mxArray * mxX = mxCreateDoubleMatrix(1, Nparticles, mxREAL); mxArray * mxY = mxCreateDoubleMatrix(1, Nparticles, mxREAL); double * Ik = (double *)mxCalloc(IszX*IszY, sizeof(double)); mxArray * result = mxCreateDoubleMatrix(1, Nparticles, mxREAL); #pragma omp parallel for shared(arrayX, arrayY, xe, ye) private(x) for(x = 0; x < Nparticles; x++){ arrayX[x] = xe; arrayY[x] = ye; } int k; int indX, indY; /*apply motion model //draws sample from motion model (random walk). 
The only prior information //is that the object moves 2x as fast as in the y direction*/ #pragma omp parallel for shared(arrayX, arrayY, Nparticles, seed) private(x) for(x = 0; x < Nparticles; x++){ arrayX[x] += 1 + 5*randn(seed, x); arrayY[x] += -2 + 2*randn(seed, x); } /*particle filter likelihood*/ //get the current image for(x = 0; x < IszX; x++) { for(y = 0; y < IszY; y++) { Ik[x*IszX + y] = (double)I[x*IszY + y]; } } //copy arguments memcpy(mxGetPr(mxIK), Ik, sizeof(double)*IszX*IszY); memcpy(mxGetPr(mxObj), objxy, sizeof(double)*countOnes); memcpy(mxGetPr(mxX), arrayX, sizeof(double)*Nparticles); memcpy(mxGetPr(mxY), arrayY, sizeof(double)*Nparticles); arguments[0] = mxIK; arguments[1] = mxObj; arguments[2] = mxX; arguments[3] = mxY; mexCallMATLAB(1, &result, 4, arguments, "GetSimpleLikelihood"); memcpy(likelihood, result, sizeof(double)*Nparticles); /* update & normalize weights // using equation (63) of Arulampalam Tutorial*/ #pragma omp parallel for shared(Nparticles, weights, likelihood) private(x) for(x = 0; x < Nparticles; x++){ weights[x] = weights[x] * exp(likelihood[x]); } double sumWeights = 0; #pragma omp parallel for private(x) reduction(+:sumWeights) for(x = 0; x < Nparticles; x++){ sumWeights += weights[x]; } #pragma omp parallel for shared(sumWeights, weights) private(x) for(x = 0; x < Nparticles; x++){ weights[x] = weights[x]/sumWeights; } xe = 0; ye = 0; /* estimate the object location by expected values*/ #pragma omp parallel for private(x) reduction(+:xe, ye) for(x = 0; x < Nparticles; x++){ xe += arrayX[x] * weights[x]; ye += arrayY[x] * weights[x]; } x_loc[0] = xe+.5; y_loc[0] = ye+.5; /*display(hold off for now) //pause(hold off for now) //resampling*/ CDF[0] = weights[0]; for(x = 1; x < Nparticles; x++){ CDF[x] = weights[x] + CDF[x-1]; } double u1 = (1/((double)(Nparticles)))*randu(seed, 0); #pragma omp parallel for shared(u, u1, Nparticles) private(x) for(x = 0; x < Nparticles; x++){ u[x] = u1 + x/((double)(Nparticles)); } int j, i; 
#pragma omp parallel for shared(CDF, Nparticles, xj, yj, u, arrayX, arrayY) private(i, j) for(j = 0; j < Nparticles; j++){ i = findIndex(CDF, Nparticles, u[j]); /*i = findIndexBin(CDF, 0, Nparticles, u[j]);*/ if(i == -1) i = Nparticles-1; xj[j] = arrayX[i]; yj[j] = arrayY[i]; } /*reassign arrayX and arrayY*/ #pragma omp parallel for shared(weights, arrayX, arrayY, xj, yj, Nparticles) private(x) for(x = 0; x < Nparticles; x++){ weights[x] = 1/((double)(Nparticles)); arrayX[x] = xj[x]; arrayY[x] = yj[x]; } mxFree(disk); mxFree(weights); mxFree(objxy); mxFree(likelihood); mxFree(arrayX); mxFree(arrayY); mxFree(CDF); mxFree(u); mxFree(xj); mxFree(yj); mxFree(Ik); }
/** * The implementation of the particle filter using OpenMP for many frames * @see http://openmp.org/wp/ * @note This function is designed to work with a video of several frames. In addition, it references a provided MATLAB function which takes the video, the objxy matrix and the x and y arrays as arguments and returns the likelihoods * @param I The video to be run * @param IszX The x dimension of the video * @param IszY The y dimension of the video * @param Nfr The number of frames * @param seed The seed array used for random number generation * @param Nparticles The number of particles to be used */ int particleFilter(unsigned char * I, int IszX, int IszY, int Nfr, int * seed, int Nparticles) { int max_size = IszX * IszY*Nfr; //original particle centroid double xe = roundDouble(IszY / 2.0); double ye = roundDouble(IszX / 2.0); //expected object locations, compared to center int radius = 5; int diameter = radius * 2 - 1; int * disk = (int*) calloc(diameter * diameter, sizeof (int)); strelDisk(disk, radius); int countOnes = 0; int x, y; for (x = 0; x < diameter; x++) { for (y = 0; y < diameter; y++) { if (disk[x * diameter + y] == 1) countOnes++; } } int * objxy = (int *) calloc(countOnes * 2, sizeof(int)); getneighbors(disk, countOnes, objxy, radius); //initial weights are all equal (1/Nparticles) double * weights = (double *) calloc(Nparticles, sizeof(double)); for (x = 0; x < Nparticles; x++) { weights[x] = 1 / ((double) (Nparticles)); } /**************************************************************** ************** B E G I N A L L O C A T E ******************* ****************************************************************/ /***** kernel variables ******/ cl_kernel kernel_likelihood; cl_kernel kernel_sum; cl_kernel kernel_normalize_weights; cl_kernel kernel_find_index; int sourcesize = 2048 * 2048; char * source = (char *) calloc(sourcesize, sizeof (char)); if (!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; } // read the kernel core 
source char * tempchar = "./particle_double.cl"; FILE * fp = fopen(tempchar, "rb"); if (!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; } fread(source + strlen(source), sourcesize, 1, fp); fclose(fp); // OpenCL initialization int use_gpu = 1; if (initialize(use_gpu)) return -1; // compile kernel cl_int err = 0; const char * slist[2] = {source, 0}; cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; } err = clBuildProgram(prog, 1, device_list, "-cl-fast-relaxed-math", NULL, NULL); if (err != CL_SUCCESS) { if (err == CL_INVALID_PROGRAM) printf("CL_INVALID_PROGRAM\n"); else if (err == CL_INVALID_VALUE) printf("CL_INVALID_VALUE\n"); else if (err == CL_INVALID_DEVICE) printf("CL_INVALID_DEVICE\n"); else if (err == CL_INVALID_BINARY) printf("CL_INVALID_BINARY\n"); else if (err == CL_INVALID_BUILD_OPTIONS) printf("CL_INVALID_BUILD_OPTIONS\n"); else if (err == CL_INVALID_OPERATION) printf("CL_INVALID_OPERATION\n"); else if (err == CL_COMPILER_NOT_AVAILABLE) printf("CL_COMPILER_NOT_AVAILABLE\n"); else if (err == CL_BUILD_PROGRAM_FAILURE) printf("CL_BUILD_PROGRAM_FAILURE\n"); else if (err == CL_INVALID_OPERATION) printf("CL_INVALID_OPERATION\n"); else if (err == CL_OUT_OF_RESOURCES) printf("CL_OUT_OF_RESOURCES\n"); else if (err == CL_OUT_OF_HOST_MEMORY) printf("CL_OUT_OF_HOST_MEMORY\n"); printf("ERROR: clBuildProgram() => %d\n", err); static char log[65536]; memset(log, 0, sizeof (log)); err = clGetProgramBuildInfo(prog, device_list[0], CL_PROGRAM_BUILD_LOG, sizeof (log) - 1, log, NULL); if (err != CL_SUCCESS) { printf("ERROR: clGetProgramBuildInfo() => %d\n", err); } if (strstr(log, "warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); } // { // show warnings/errors // static char log[65536]; // memset(log, 0, sizeof (log)); // cl_device_id device_id[2] = {0}; // err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof 
(device_id), device_id, NULL); // if (err != CL_SUCCESS) { // if (err == CL_INVALID_CONTEXT) // printf("ERROR: clGetContextInfo() => CL_INVALID_CONTEXT\n"); // if (err == CL_INVALID_VALUE) // printf("ERROR: clGetContextInfo() => CL_INVALID_VALUE\n"); // } // }//*/ char * s_likelihood_kernel = "likelihood_kernel"; char * s_sum_kernel = "sum_kernel"; char * s_normalize_weights_kernel = "normalize_weights_kernel"; char * s_find_index_kernel = "find_index_kernel"; kernel_likelihood = clCreateKernel(prog, s_likelihood_kernel, &err); if (err != CL_SUCCESS) { if (err == CL_INVALID_PROGRAM) printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID PROGRAM %d\n", err); if (err == CL_INVALID_PROGRAM_EXECUTABLE) printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID PROGRAM EXECUTABLE %d\n", err); if (err == CL_INVALID_KERNEL_NAME) printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID KERNEL NAME %d\n", err); if (err == CL_INVALID_KERNEL_DEFINITION) printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID KERNEL DEFINITION %d\n", err); if (err == CL_INVALID_VALUE) printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID CL_INVALID_VALUE %d\n", err); printf("ERROR: clCreateKernel(likelihood_kernel) failed.\n"); return -1; } kernel_sum = clCreateKernel(prog, s_sum_kernel, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateKernel(sum_kernel) 0 => %d\n", err); return -1; } kernel_normalize_weights = clCreateKernel(prog, s_normalize_weights_kernel, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateKernel(normalize_weights_kernel) 0 => %d\n", err); return -1; } kernel_find_index = clCreateKernel(prog, s_find_index_kernel, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateKernel(find_index_kernel) 0 => %d\n", err); return -1; } //initial likelihood to 0.0 double * likelihood = (double *) calloc(Nparticles + 1, sizeof (double)); double * arrayX = (double *) calloc(Nparticles, sizeof (double)); double * arrayY = (double *) calloc(Nparticles, 
sizeof (double)); double * xj = (double *) calloc(Nparticles, sizeof (double)); double * yj = (double *) calloc(Nparticles, sizeof (double)); double * CDF = (double *) calloc(Nparticles, sizeof(double)); //GPU copies of arrays cl_mem arrayX_GPU; cl_mem arrayY_GPU; cl_mem xj_GPU; cl_mem yj_GPU; cl_mem CDF_GPU; cl_mem likelihood_GPU; cl_mem I_GPU; cl_mem weights_GPU; cl_mem objxy_GPU; int * ind = (int*) calloc(countOnes, sizeof(int)); cl_mem ind_GPU; double * u = (double *) calloc(Nparticles, sizeof(double)); cl_mem u_GPU; cl_mem seed_GPU; cl_mem partial_sums; //OpenCL memory allocation arrayX_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer arrayX_GPU (size:%d) => %d\n", Nparticles, err); return -1; } arrayY_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer arrayY_GPU (size:%d) => %d\n", Nparticles, err); return -1; } xj_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer xj_GPU (size:%d) => %d\n", Nparticles, err); return -1; } yj_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer yj_GPU (size:%d) => %d\n", Nparticles, err); return -1; } CDF_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) * Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer CDF_GPU (size:%d) => %d\n", Nparticles, err); return -1; } u_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer u_GPU (size:%d) => %d\n", Nparticles, err); return -1; } likelihood_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: 
clCreateBuffer likelihood_GPU (size:%d) => %d\n", Nparticles, err); return -1; } weights_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer weights_GPU (size:%d) => %d\n", Nparticles, err); return -1; } I_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (unsigned char) *IszX * IszY * Nfr, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer I_GPU (size:%d) => %d\n", IszX * IszY * Nfr, err); return -1; } objxy_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, 2*sizeof (int) *countOnes, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer objxy_GPU (size:%d) => %d\n", countOnes, err); return -1; } ind_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (int) *countOnes * Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer ind_GPU (size:%d) => %d\n", countOnes * Nparticles, err); return -1; } seed_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (int) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer seed_GPU (size:%d) => %d\n", Nparticles, err); return -1; } partial_sums = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof (double) * Nparticles + 1, likelihood, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer partial_sums (size:%d) => %d\n", Nparticles, err); return -1; } //Donnie - this loop is different because in this kernel, arrayX and arrayY // are set equal to xj before every iteration, so effectively, arrayX and // arrayY will be set to xe and ye before the first iteration. 
for (x = 0; x < Nparticles; x++) { xj[x] = xe; yj[x] = ye; } int k; //double * Ik = (double *)calloc(IszX*IszY, sizeof(double)); int indX, indY; //start send long long send_start = get_time(); //OpenCL memory copy err = clEnqueueWriteBuffer(cmd_queue, I_GPU, 1, 0, sizeof (unsigned char) *IszX * IszY*Nfr, I, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer I_GPU (size:%d) => %d\n", IszX * IszY*Nfr, err); return -1; } err = clEnqueueWriteBuffer(cmd_queue, objxy_GPU, 1, 0, 2*sizeof (int) *countOnes, objxy, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer objxy_GPU (size:%d) => %d\n", countOnes, err); return -1; } err = clEnqueueWriteBuffer(cmd_queue, weights_GPU, 1, 0, sizeof (double) *Nparticles, weights, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer weights_GPU (size:%d) => %d\n", Nparticles, err); return -1; } err = clEnqueueWriteBuffer(cmd_queue, xj_GPU, 1, 0, sizeof (double) *Nparticles, xj, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer arrayX_GPU (size:%d) => %d\n", Nparticles, err); return -1; } err = clEnqueueWriteBuffer(cmd_queue, yj_GPU, 1, 0, sizeof (double) *Nparticles, yj, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer arrayY_GPU (size:%d) => %d\n", Nparticles, err); return -1; } err = clEnqueueWriteBuffer(cmd_queue, seed_GPU, 1, 0, sizeof (int) *Nparticles, seed, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer seed_GPU (size:%d) => %d\n", Nparticles, err); return -1; } /********************************************************************** *********** E N D A L L O C A T E ******************************** *********************************************************************/ long long send_end = get_time(); printf("TIME TO SEND TO GPU: %f\n", elapsed_time(send_start, send_end)); int num_blocks = ceil((double) Nparticles / (double) threads_per_block); printf("threads_per_block=%d \n",threads_per_block); size_t 
local_work[3] = {threads_per_block, 1, 1}; size_t global_work[3] = {num_blocks*threads_per_block, 1, 1}; for (k = 1; k < Nfr; k++) { /****************** L I K E L I H O O D ************************************/ clSetKernelArg(kernel_likelihood, 0, sizeof (void *), (void*) &arrayX_GPU); clSetKernelArg(kernel_likelihood, 1, sizeof (void *), (void*) &arrayY_GPU); clSetKernelArg(kernel_likelihood, 2, sizeof (void *), (void*) &xj_GPU); clSetKernelArg(kernel_likelihood, 3, sizeof (void *), (void*) &yj_GPU); clSetKernelArg(kernel_likelihood, 4, sizeof (void *), (void*) &CDF_GPU); clSetKernelArg(kernel_likelihood, 5, sizeof (void *), (void*) &ind_GPU); clSetKernelArg(kernel_likelihood, 6, sizeof (void *), (void*) &objxy_GPU); clSetKernelArg(kernel_likelihood, 7, sizeof (void *), (void*) &likelihood_GPU); clSetKernelArg(kernel_likelihood, 8, sizeof (void *), (void*) &I_GPU); clSetKernelArg(kernel_likelihood, 9, sizeof (void *), (void*) &u_GPU); clSetKernelArg(kernel_likelihood, 10, sizeof (void *), (void*) &weights_GPU); clSetKernelArg(kernel_likelihood, 11, sizeof (cl_int), (void*) &Nparticles); clSetKernelArg(kernel_likelihood, 12, sizeof (cl_int), (void*) &countOnes); clSetKernelArg(kernel_likelihood, 13, sizeof (cl_int), (void*) &max_size); clSetKernelArg(kernel_likelihood, 14, sizeof (cl_int), (void*) &k); clSetKernelArg(kernel_likelihood, 15, sizeof (cl_int), (void*) &IszY); clSetKernelArg(kernel_likelihood, 16, sizeof (cl_int), (void*) &Nfr); clSetKernelArg(kernel_likelihood, 17, sizeof (void *), (void*) &seed_GPU); clSetKernelArg(kernel_likelihood, 18, sizeof (void *), (void*) &partial_sums); clSetKernelArg(kernel_likelihood, 19, threads_per_block * sizeof (double), NULL); //KERNEL FUNCTION CALL err = clEnqueueNDRangeKernel(cmd_queue, kernel_likelihood, 1, NULL, global_work, local_work, 0, 0, 0); clFinish(cmd_queue); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueNDRangeKernel(kernel_likelihood)=>%d failed\n", err); //check_error(err, __FILE__, __LINE__); return 
-1; } /****************** E N D L I K E L I H O O D **********************/ /*************************** S U M ************************************/ clSetKernelArg(kernel_sum, 0, sizeof (void *), (void*) &partial_sums); clSetKernelArg(kernel_sum, 1, sizeof (cl_int), (void*) &Nparticles); //KERNEL FUNCTION CALL err = clEnqueueNDRangeKernel(cmd_queue, kernel_sum, 1, NULL, global_work, local_work, 0, 0, 0); clFinish(cmd_queue); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueNDRangeKernel(kernel_sum)=>%d failed\n", err); //check_error(err, __FILE__, __LINE__); return -1; }/*************************** E N D S U M ****************************/ /**************** N O R M A L I Z E W E I G H T S *****************/ clSetKernelArg(kernel_normalize_weights, 0, sizeof (void *), (void*) &weights_GPU); clSetKernelArg(kernel_normalize_weights, 1, sizeof (cl_int), (void*) &Nparticles); clSetKernelArg(kernel_normalize_weights, 2, sizeof (void *), (void*) &partial_sums); //*/ clSetKernelArg(kernel_normalize_weights, 3, sizeof (void *), (void*) &CDF_GPU); clSetKernelArg(kernel_normalize_weights, 4, sizeof (void *), (void*) &u_GPU); clSetKernelArg(kernel_normalize_weights, 5, sizeof (void *), (void*) &seed_GPU); //KERNEL FUNCTION CALL err = clEnqueueNDRangeKernel(cmd_queue, kernel_normalize_weights, 1, NULL, global_work, local_work, 0, 0, 0); clFinish(cmd_queue); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueNDRangeKernel(normalize_weights)=>%d failed\n", err); //check_error(err, __FILE__, __LINE__); return -1; } /************* E N D N O R M A L I Z E W E I G H T S ***********/ // ocl_print_double_array(cmd_queue, partial_sums, 40); // /********* I N T E R M E D I A T E R E S U L T S ***************/ // //OpenCL memory copying back from GPU to CPU memory err = clEnqueueReadBuffer(cmd_queue, arrayX_GPU, 1, 0, sizeof (double) *Nparticles, arrayX, 0, 0, 0); err = clEnqueueReadBuffer(cmd_queue, arrayY_GPU, 1, 0, sizeof (double) *Nparticles, arrayY, 0, 0, 0); err = 
clEnqueueReadBuffer(cmd_queue, weights_GPU, 1, 0, sizeof (double) *Nparticles, weights, 0, 0, 0); xe = 0; ye = 0; double total=0.0; // estimate the object location by expected values for (x = 0; x < Nparticles; x++) { // if( 0.0000000 < arrayX[x]*weights[x]) printf("arrayX[%d]:%f, arrayY[%d]:%f, weights[%d]:%0.10f\n",x,arrayX[x], x, arrayY[x], x, weights[x]); // printf("arrayX[%d]:%f | arrayY[%d]:%f | weights[%d]:%f\n", // x, arrayX[x], x, arrayY[x], x, weights[x]); xe += arrayX[x] * weights[x]; ye += arrayY[x] * weights[x]; total+= weights[x]; } printf("total weight: %lf\n", total); printf("XE: %lf\n", xe); printf("YE: %lf\n", ye); double distance = sqrt(pow((double) (xe - (int) roundDouble(IszY / 2.0)), 2) + pow((double) (ye - (int) roundDouble(IszX / 2.0)), 2)); printf("%lf\n", distance); // /********* E N D I N T E R M E D I A T E R E S U L T S ***************/ /******************** F I N D I N D E X ****************************/ //Set number of threads clSetKernelArg(kernel_find_index, 0, sizeof (void *), (void*) &arrayX_GPU); clSetKernelArg(kernel_find_index, 1, sizeof (void *), (void*) &arrayY_GPU); clSetKernelArg(kernel_find_index, 2, sizeof (void *), (void*) &CDF_GPU); clSetKernelArg(kernel_find_index, 3, sizeof (void *), (void*) &u_GPU); clSetKernelArg(kernel_find_index, 4, sizeof (void *), (void*) &xj_GPU); clSetKernelArg(kernel_find_index, 5, sizeof (void *), (void*) &yj_GPU); clSetKernelArg(kernel_find_index, 6, sizeof (void *), (void*) &weights_GPU); clSetKernelArg(kernel_find_index, 7, sizeof (cl_int), (void*) &Nparticles); //KERNEL FUNCTION CALL err = clEnqueueNDRangeKernel(cmd_queue, kernel_find_index, 1, NULL, global_work, local_work, 0, 0, 0); clFinish(cmd_queue); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueNDRangeKernel(find_index)=>%d failed\n", err); //check_error(err, __FILE__, __LINE__); return -1; } /******************* E N D F I N D I N D E X ********************/ }//end loop //block till kernels are finished //clFinish(cmd_queue); 
long long back_time = get_time(); //OpenCL freeing of memory clReleaseProgram(prog); clReleaseMemObject(u_GPU); clReleaseMemObject(CDF_GPU); clReleaseMemObject(yj_GPU); clReleaseMemObject(xj_GPU); clReleaseMemObject(likelihood_GPU); clReleaseMemObject(I_GPU); clReleaseMemObject(objxy_GPU); clReleaseMemObject(ind_GPU); clReleaseMemObject(seed_GPU); clReleaseMemObject(partial_sums); long long free_time = get_time(); //OpenCL memory copying back from GPU to CPU memory err = clEnqueueReadBuffer(cmd_queue, arrayX_GPU, 1, 0, sizeof (double) *Nparticles, arrayX, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: Memcopy Out\n"); return -1; } long long arrayX_time = get_time(); err = clEnqueueReadBuffer(cmd_queue, arrayY_GPU, 1, 0, sizeof (double) *Nparticles, arrayY, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: Memcopy Out\n"); return -1; } long long arrayY_time = get_time(); err = clEnqueueReadBuffer(cmd_queue, weights_GPU, 1, 0, sizeof (double) *Nparticles, weights, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: Memcopy Out\n"); return -1; } long long back_end_time = get_time(); printf("GPU Execution: %lf\n", elapsed_time(send_end, back_time)); printf("FREE TIME: %lf\n", elapsed_time(back_time, free_time)); printf("SEND TO SEND BACK: %lf\n", elapsed_time(back_time, back_end_time)); printf("SEND ARRAY X BACK: %lf\n", elapsed_time(free_time, arrayX_time)); printf("SEND ARRAY Y BACK: %lf\n", elapsed_time(arrayX_time, arrayY_time)); printf("SEND WEIGHTS BACK: %lf\n", elapsed_time(arrayY_time, back_end_time)); xe = 0; ye = 0; // estimate the object location by expected values for (x = 0; x < Nparticles; x++) { xe += arrayX[x] * weights[x]; ye += arrayY[x] * weights[x]; } double distance = sqrt(pow((double) (xe - (int) roundDouble(IszY / 2.0)), 2) + pow((double) (ye - (int) roundDouble(IszX / 2.0)), 2)); //Output results FILE *fid; fid=fopen("output.txt", "w+"); if( fid == NULL ) { printf( "The file was not opened for writing\n" ); return -1; } fprintf(fid, "XE: 
%lf\n", xe); fprintf(fid, "YE: %lf\n", ye); fprintf(fid, "distance: %lf\n", distance); fclose(fid); //OpenCL freeing of memory clReleaseMemObject(weights_GPU); clReleaseMemObject(arrayY_GPU); clReleaseMemObject(arrayX_GPU); //free regular memory free(likelihood); free(arrayX); free(arrayY); free(xj); free(yj); free(CDF); free(ind); free(u); }