/**
* The implementation of the particle filter using OpenMP for many frames
* @see http://openmp.org/wp/
* @note This function is designed to work with a video of several frames. The
*       likelihood of each particle is computed inline from the pixel values
*       of I sampled at the disk-shaped neighbourhood around the particle.
* @note For every frame after the first, per-stage timings, the position
*       estimate (XE / YE) and its distance from the frame centre are printed
*       to stdout. Nothing is returned to the caller.
* @param I The video to be run; pixel (x, y) of frame k is read at I[x*IszY*Nfr + y*Nfr + k]
* @param IszX The x dimension of the video
* @param IszY The y dimension of the video
* @param Nfr The number of frames
* @param seed The seed array used for random number generation (one entry per particle is passed to randn)
* @param Nparticles The number of particles to be used
*/
void particleFilter(int * I, int IszX, int IszY, int Nfr, int * seed, int Nparticles){
	
	int max_size = IszX*IszY*Nfr;
	long long start = get_time();
	//original particle centroid
	double xe = roundDouble(IszY/2.0);
	double ye = roundDouble(IszX/2.0);
	
	//expected object locations, compared to center
	int radius = 5;
	int diameter = radius*2 - 1;
	int * disk = (int *)malloc(diameter*diameter*sizeof(int));
	if(disk == NULL){
		fprintf(stderr, "ERROR: particleFilter could not allocate the disk mask\n");
		return;
	}
	strelDisk(disk, radius);
	int countOnes = 0;
	int x, y;
	for(x = 0; x < diameter; x++){
		for(y = 0; y < diameter; y++){
			if(disk[x*diameter + y] == 1)
				countOnes++;
		}
	}
	double * objxy = (double *)malloc(countOnes*2*sizeof(double));
	if(objxy == NULL){
		fprintf(stderr, "ERROR: particleFilter could not allocate the neighbour offsets\n");
		free(disk);
		return;
	}
	getneighbors(disk, countOnes, objxy, radius);
	
	long long get_neighbors = get_time();
	printf("TIME TO GET NEIGHBORS TOOK: %f\n", elapsed_time(start, get_neighbors));
	
	//per-particle working buffers
	double * weights = (double *)malloc(sizeof(double)*Nparticles);
	double * likelihood = (double *)malloc(sizeof(double)*Nparticles);
	double * arrayX = (double *)malloc(sizeof(double)*Nparticles);
	double * arrayY = (double *)malloc(sizeof(double)*Nparticles);
	double * xj = (double *)malloc(sizeof(double)*Nparticles);
	double * yj = (double *)malloc(sizeof(double)*Nparticles);
	double * CDF = (double *)malloc(sizeof(double)*Nparticles);
	double * u = (double *)malloc(sizeof(double)*Nparticles);
	int * ind = (int*)malloc(sizeof(int)*countOnes*Nparticles);
	if(weights == NULL || likelihood == NULL || arrayX == NULL || arrayY == NULL ||
	   xj == NULL || yj == NULL || CDF == NULL || u == NULL || ind == NULL){
		fprintf(stderr, "ERROR: particleFilter could not allocate the particle buffers\n");
		//free(NULL) is a no-op, so everything can be released unconditionally
		free(disk);
		free(objxy);
		free(weights);
		free(likelihood);
		free(arrayX);
		free(arrayY);
		free(xj);
		free(yj);
		free(CDF);
		free(u);
		free(ind);
		return;
	}
	
	//initial weights are all equal (1/Nparticles)
	#pragma omp parallel for shared(weights, Nparticles) private(x)
	for(x = 0; x < Nparticles; x++){
		weights[x] = 1/((double)(Nparticles));
	}
	long long get_weights = get_time();
	printf("TIME TO GET WEIGHTSTOOK: %f\n", elapsed_time(get_neighbors, get_weights));
	
	//every particle starts at the centroid
	#pragma omp parallel for shared(arrayX, arrayY, xe, ye) private(x)
	for(x = 0; x < Nparticles; x++){
		arrayX[x] = xe;
		arrayY[x] = ye;
	}
	int k;
	
	printf("TIME TO SET ARRAYS TOOK: %f\n", elapsed_time(get_weights, get_time()));
	int indX, indY;
	for(k = 1; k < Nfr; k++){
		long long set_arrays = get_time();
		//apply motion model
		//draws sample from motion model (random walk). The only prior information
		//is that the object moves 2x as fast as in the y direction
		#pragma omp parallel for shared(arrayX, arrayY, Nparticles, seed) private(x)
		for(x = 0; x < Nparticles; x++){
			arrayX[x] += 1 + 5*randn(seed, x);
			arrayY[x] += -2 + 2*randn(seed, x);
		}
		long long error = get_time();
		printf("TIME TO SET ERROR TOOK: %f\n", elapsed_time(set_arrays, error));
		//particle filter likelihood
		#pragma omp parallel for shared(likelihood, I, arrayX, arrayY, objxy, ind) private(x, y, indX, indY)
		for(x = 0; x < Nparticles; x++){
			//compute the likelihood: remember our assumption is that you know
			// foreground and the background image intensity distribution.
			// Notice that we consider here a likelihood ratio, instead of
			// p(z|x). It is possible in this case. why? a hometask for you.
			//calc ind: flat index into I of every neighbour pixel of particle x
			for(y = 0; y < countOnes; y++){
				indX = roundDouble(arrayX[x]) + objxy[y*2 + 1];
				indY = roundDouble(arrayY[x]) + objxy[y*2];
				ind[x*countOnes + y] = fabs(indX*IszY*Nfr + indY*Nfr + k);
				//indices past the end of the video fall back to element 0
				if(ind[x*countOnes + y] >= max_size)
					ind[x*countOnes + y] = 0;
			}
			likelihood[x] = 0;
			for(y = 0; y < countOnes; y++){
				int pixel = I[ind[x*countOnes + y]];
				likelihood[x] += (pow((pixel - 100),2) - pow((pixel-228),2))/50.0;
			}
			likelihood[x] = likelihood[x]/((double) countOnes);
		}
		long long likelihood_time = get_time();
		printf("TIME TO GET LIKELIHOODS TOOK: %f\n", elapsed_time(error, likelihood_time));
		// update & normalize weights
		// using equation (63) of Arulampalam Tutorial
		#pragma omp parallel for shared(Nparticles, weights, likelihood) private(x)
		for(x = 0; x < Nparticles; x++){
			weights[x] = weights[x] * exp(likelihood[x]);
		}
		long long exponential = get_time();
		printf("TIME TO GET EXP TOOK: %f\n", elapsed_time(likelihood_time, exponential));
		double sumWeights = 0;
		#pragma omp parallel for private(x) reduction(+:sumWeights)
		for(x = 0; x < Nparticles; x++){
			sumWeights += weights[x];
		}
		long long sum_time = get_time();
		printf("TIME TO SUM WEIGHTS TOOK: %f\n", elapsed_time(exponential, sum_time));
		#pragma omp parallel for shared(sumWeights, weights) private(x)
		for(x = 0; x < Nparticles; x++){
			weights[x] = weights[x]/sumWeights;
		}
		long long normalize = get_time();
		printf("TIME TO NORMALIZE WEIGHTS TOOK: %f\n", elapsed_time(sum_time, normalize));
		xe = 0;
		ye = 0;
		// estimate the object location by expected values
		#pragma omp parallel for private(x) reduction(+:xe, ye)
		for(x = 0; x < Nparticles; x++){
			xe += arrayX[x] * weights[x];
			ye += arrayY[x] * weights[x];
		}
		long long move_time = get_time();
		printf("TIME TO MOVE OBJECT TOOK: %f\n", elapsed_time(normalize, move_time));
		printf("XE: %lf\n", xe);
		printf("YE: %lf\n", ye);
		double distance = sqrt( pow((double)(xe-(int)roundDouble(IszY/2.0)),2) + pow((double)(ye-(int)roundDouble(IszX/2.0)),2) );
		printf("%lf\n", distance);
		//display(hold off for now)
		
		//pause(hold off for now)
		
		//resampling: CDF is the running sum of the normalized weights
		CDF[0] = weights[0];
		for(x = 1; x < Nparticles; x++){
			CDF[x] = weights[x] + CDF[x-1];
		}
		long long cum_sum = get_time();
		printf("TIME TO CALC CUM SUM TOOK: %f\n", elapsed_time(move_time, cum_sum));
		double u1 = (1/((double)(Nparticles)))*randu(seed, 0);
		#pragma omp parallel for shared(u, u1, Nparticles) private(x)
		for(x = 0; x < Nparticles; x++){
			u[x] = u1 + x/((double)(Nparticles));
		}
		long long u_time = get_time();
		printf("TIME TO CALC U TOOK: %f\n", elapsed_time(cum_sum, u_time));
		int j, i;
		
		//xj/yj must be distinct from arrayX/arrayY here: every thread reads
		//arbitrary source particles while other threads write the destinations
		#pragma omp parallel for shared(CDF, Nparticles, xj, yj, u, arrayX, arrayY) private(i, j)
		for(j = 0; j < Nparticles; j++){
			i = findIndex(CDF, Nparticles, u[j]);
			if(i == -1)
				i = Nparticles-1;
			xj[j] = arrayX[i];
			yj[j] = arrayY[i];
			
		}
		long long xyj_time = get_time();
		printf("TIME TO CALC NEW ARRAY X AND Y TOOK: %f\n", elapsed_time(u_time, xyj_time));
		//reassign arrayX and arrayY by swapping buffers: the resampled set
		//becomes the current particles and the old particle arrays become the
		//scratch space for the next frame. Plain assignment (arrayX = xj) would
		//make source and destination alias from the next frame on and would
		//lose the original allocations.
		double * swap = arrayX;
		arrayX = xj;
		xj = swap;
		swap = arrayY;
		arrayY = yj;
		yj = swap;
		//#pragma omp parallel for shared(weights, Nparticles) private(x)
		for(x = 0; x < Nparticles; x++){
			weights[x] = 1/((double)(Nparticles));
		}
		long long reset = get_time();
		printf("TIME TO RESET WEIGHTS TOOK: %f\n", elapsed_time(xyj_time, reset));
	}
	free(disk);
	free(objxy);
	free(weights);
	free(likelihood);
	free(arrayX);
	free(arrayY);
	free(xj);
	free(yj);
	free(CDF);
	free(u);
	free(ind);
}
/**
 * The implementation of the particle filter for many frames, with the
 * resampling step optionally handed to an XFcuda hardware core
 * @note This function is designed to work with a video of several frames. The
 *       likelihood of each particle is obtained from calcLikelihoodSum over
 *       the disk-shaped neighbourhood around the particle.
 * @note When HW is defined the resampling loop is not run on the CPU; the
 *       XFcuda core is started and polled instead. When VERIFY is defined the
 *       per-frame results (xe, ye, distance) are printed and compared against
 *       gold_output with tolerance EPSILON (both defined outside this function).
 * @param I The video to be run; pixel (x, y) of frame k is read at I[x*IszY*Nfr + y*Nfr + k]
 * @param IszX The x dimension of the video
 * @param IszY The y dimension of the video
 * @param Nfr The number of frames
 * @param seed The seed array used for random number generation
 * @param Nparticles The number of particles to be used
 * @return 1 on normal completion; 0 when VERIFY is defined and a result
 *         differs from gold_output by more than EPSILON; 1 as well when HW is
 *         defined and the core fails to initialize (NOTE(review): that failure
 *         is indistinguishable from success for the caller)
 */
int particleFilter(int * I, int IszX, int IszY, int Nfr, int * seed, int Nparticles){

  // i walks the result array three slots per frame; c is never used in this function
  int i, c;

#ifdef HW
  XFcuda xcore;
  int Status;


  // bring up the accelerator before any buffer is allocated, so the early
  // return below has nothing to release
  Status = XFcuda_Initialize(&xcore, 0);
  if (Status != XST_SUCCESS) {
    printf("Initialization failed\n");
    return 1; // XST_FAILURE; NOTE(review): same value as the success return at the end
  }
#endif
  // number of elements in I; neighbour indices at or beyond this are redirected to 0
  int max_size = IszX*IszY*Nfr;
  //original particle centroid
  double xe = roundDouble(IszY/2.0);
  double ye = roundDouble(IszX/2.0);

  //expected object locations, compared to center
  int radius = 5;
  int diameter = radius*2 - 1;
  int * disk = (int *)malloc(diameter*diameter*sizeof(int));
  strelDisk(disk, radius);
  // count the mask entries equal to 1; that is the number of neighbour offsets
  int countOnes = 0;
  int x, y;
  for(x = 0; x < diameter; x++){
    for(y = 0; y < diameter; y++){
      if(disk[x*diameter + y] == 1)
        countOnes++;
    }
  }
  // objxy holds two doubles per neighbour; [y*2] is added to the particle's
  // Y coordinate and [y*2 + 1] to its X coordinate further down
  double * objxy = (double *)malloc(countOnes*2*sizeof(double));
  getneighbors(disk, countOnes, objxy, radius);

  //initial weights are all equal (1/Nparticles)
  double * weights = (double *)malloc(sizeof(double)*Nparticles);
  for(x = 0; x < Nparticles; x++){
    weights[x] = 1/((double)(Nparticles));
  }
  // per-particle working buffers: likelihoods, current positions
  // (arrayX/arrayY), resampled positions (xj/yj) and the cumulative weights
  double * likelihood = (double *)malloc(sizeof(double)*Nparticles);
  double * arrayX = (double *)malloc(sizeof(double)*Nparticles);
  double * arrayY = (double *)malloc(sizeof(double)*Nparticles);
  double * xj = (double *)malloc(sizeof(double)*Nparticles);
  double * yj = (double *)malloc(sizeof(double)*Nparticles);
  double * CDF = (double *)malloc(sizeof(double)*Nparticles);

  // ind is reused for every particle (one slot per neighbour); u holds the
  // resampling thresholds
  int * ind = (int*)malloc(sizeof(int)*countOnes);
  double * u = (double *)malloc(sizeof(double)*Nparticles);

  // every particle starts at the centroid
  for(x = 0; x < Nparticles; x++){
    arrayX[x] = xe;
    arrayY[x] = ye;
  }

  //Set number of threads
  // one-dimensional launch geometry: enough blocks of threads_per_block
  // (defined outside this function) to cover all particles
  int num_blocks = ceil((double) Nparticles/(double) threads_per_block);
  dim3 grids, threads;
  grids.x = num_blocks;
  grids.y = 1;
  grids.z = 1;
  threads.x = threads_per_block;
  threads.y = 1;
  threads.z = 1;

#ifdef HW
  // program the core once, before the frame loop: particle count, launch
  // geometry and the location of each buffer. Each address is the pointer
  // value divided by sizeof(double); presumably the core addresses memory in
  // double-sized words -- confirm against the XFcuda driver.
  // The buffers are never reallocated afterwards, so these values stay valid.
  XFcuda_SetNparticles(&xcore, Nparticles);
  XFcuda_SetGriddim_x(&xcore, grids.x);
  XFcuda_SetGriddim_y(&xcore, grids.y);
  XFcuda_SetBlockdim_x(&xcore, threads.x);
  XFcuda_SetArrayx_addr(&xcore, (u32)arrayX / sizeof(double));
  XFcuda_SetArrayy_addr(&xcore, (u32)arrayY / sizeof(double));
  XFcuda_SetCdf_addr(&xcore, (u32)CDF / sizeof(double));
  XFcuda_SetU_addr(&xcore, (u32)u / sizeof(double));
  XFcuda_SetXj_addr(&xcore, (u32)xj / sizeof(double));
  XFcuda_SetYj_addr(&xcore, (u32)yj / sizeof(double));

#endif

  int k;
  int indX, indY;
  // three doubles (xe, ye, distance) are recorded for each processed frame
  double *result = (double *)malloc(3 * (Nfr - 1) * sizeof(double));
  i = 0;
  // frame 0 is skipped; tracking starts at frame 1
  for(k = 1; k < Nfr; k++){
    //apply motion model
    //draws sample from motion model (random walk). The only prior information
    //is that the object moves 2x as fast as in the y direction

    for(x = 0; x < Nparticles; x++){
      arrayX[x] = arrayX[x] + 1.0 + 5.0*randn(seed, x);
      arrayY[x] = arrayY[x] - 2.0 + 2.0*randn(seed, x);
    }

    //particle filter likelihood
    for(x = 0; x < Nparticles; x++){

      //compute the likelihood: remember our assumption is that you know
      // foreground and the background image intensity distribution.
      // Notice that we consider here a likelihood ratio, instead of
      // p(z|x). It is possible in this case. why? a hometask for you.
      //calc ind: flat index into I of every neighbour pixel of particle x
      for(y = 0; y < countOnes; y++){
        indX = roundDouble(arrayX[x]) + objxy[y*2 + 1];
        indY = roundDouble(arrayY[x]) + objxy[y*2];
        ind[y] = fabs(indX*IszY*Nfr + indY*Nfr + k);
        // indices past the end of the video fall back to element 0
        if(ind[y] >= max_size)
          ind[y] = 0;
      }
      // average the summed likelihood over the neighbourhood
      likelihood[x] = calcLikelihoodSum(I, ind, countOnes);
      likelihood[x] = likelihood[x]/countOnes;
    }
    // update & normalize weights
    // using equation (63) of Arulampalam Tutorial
    for(x = 0; x < Nparticles; x++){
      weights[x] = weights[x] * exp(likelihood[x]);
    }
    double sumWeights = 0;
    for(x = 0; x < Nparticles; x++){
      sumWeights += weights[x];
    }
    for(x = 0; x < Nparticles; x++){
      weights[x] = weights[x]/sumWeights;
    }
    xe = 0;
    ye = 0;
    // estimate the object location by expected values
    for(x = 0; x < Nparticles; x++){
      xe += arrayX[x] * weights[x];
      ye += arrayY[x] * weights[x];
    }
    // distance of the estimate from the (rounded) frame centre
    double distance = sqrt( pow((double)(xe-(int)roundDouble(IszY/2.0)),2) + pow((double)(ye-(int)roundDouble(IszX/2.0)),2) );
    result[i] = xe;
    result[i + 1] = ye;
    result[i + 2] = distance;
    i += 3;
    //display(hold off for now)

    //pause(hold off for now)

    //resampling: CDF is the running sum of the normalized weights


    CDF[0] = weights[0];
    for(x = 1; x < Nparticles; x++){
      CDF[x] = weights[x] + CDF[x-1];
    }
    // thresholds are spaced 1/Nparticles apart, starting from one random
    // offset u1 drawn with randu
    double u1 = (1/((double)(Nparticles)))*randu(seed, 0);
    for(x = 0; x < Nparticles; x++){
      u[x] = u1 + x/((double)(Nparticles));
    }

#ifdef HW
    // run the core and busy-wait until it reports done. The data cache is
    // disabled for the duration; presumably so CPU and core observe the same
    // memory contents, and presumably the core fills xj/yj the way the #else
    // branch does -- confirm against the hardware design.
    Xil_DCacheDisable();
    XFcuda_SetEn_fcuda1(&xcore, 1);
    XFcuda_Start(&xcore);
    while (!XFcuda_IsDone(&xcore));
    Xil_DCacheEnable();
#else
    //KERNEL FUNCTION CALL
    // software resampling: for each threshold pick the particle index that
    // findIndex reports for it; -1 is mapped to the last particle
    int j;
    for(j = 0; j < Nparticles; j++){
      x = findIndex(CDF, Nparticles, u[j]);
      if(x == -1)
        x = Nparticles-1;
      xj[j] = arrayX[x];
      yj[j] = arrayY[x];

    }
#endif

    // copy (rather than swap pointers) so the buffers keep their addresses,
    // which were handed to the core before the loop when HW is defined
    for(x = 0; x < Nparticles; x++){
      //reassign arrayX and arrayY
      arrayX[x] = xj[x];
      arrayY[x] = yj[x];
      weights[x] = 1/((double)(Nparticles));
    }
  }

  //free memory
  free(disk);
  free(objxy);
  free(weights);
  free(likelihood);
  free(arrayX);
  free(arrayY);
  free(xj);
  free(yj);
  free(CDF);
  free(u);
  free(ind);

#ifdef VERIFY
  // dump every recorded value, then compare against the reference table
  for (i = 0; i < 3 * (Nfr - 1); i++) {
    printf("index %d: %f\n", i, result[i]);
  }

  for (i = 0; i < 3 * (Nfr - 1); i++) {
    if (fabs(gold_output[i] - result[i]) > EPSILON) {
      printf("Mismatch result at %i: gold = %f, result = %f\n",
          i, gold_output[i], result[i]);
      // result is the only buffer still allocated at this point
      free(result);
      return 0;
    }
  }
#endif
  free(result);
  return 1;
}
/**
* The implementation of the particle filter using OpenMP for a single image
* @see http://openmp.org/wp/
* @note This function is designed to work with a single image. The likelihoods are obtained by calling the MATLAB function GetSimpleLikelihood with the image, the objxy matrix and the particle x and y arrays
* @warning Use the other particle filter function for videos; the accuracy of this function decreases significantly as it is called repeatedly while processing video
* @param I The image to be run, read as I[x*IszY + y]
* @param IszX The x dimension of the image
* @param IszY The y dimension of the image
* @param seed The seed array used for random number generation
* @param Nparticles The number of particles to be used
* @param x_loc The array that will store the x locations of the desired object (element 0 is written)
* @param y_loc The array that will store the y locations of the desired object (element 0 is written)
* @param prevX The starting x position of the object
* @param prevY The starting y position of the object
*/
void particleFilter1F(int * I, int IszX, int IszY, int * seed, int Nparticles, double * x_loc, double * y_loc, double prevX, double prevY){
	/*original particle centroid*/
	double xe = prevX;
	double ye = prevY;
	/*expected object locations, compared to center*/
	int radius = 5;
	int diameter = radius*2 -1;
	int * disk = (int *)mxCalloc(diameter*diameter, sizeof(int));
	strelDisk(disk, radius);
	int countOnes = 0;
	int x, y;
	for(x = 0; x < diameter; x++){
		for(y = 0; y < diameter; y++){
			if(disk[x*diameter + y] == 1)
			countOnes++;
		}
	}
	double * objxy = (double *)mxCalloc(countOnes*2, sizeof(double));
	getneighbors(disk, countOnes, objxy, radius);
	

	/*initial weights are all equal (1/Nparticles)*/
	double * weights = (double *)mxCalloc(Nparticles, sizeof(double));
	#pragma omp parallel for shared(weights, Nparticles) private(x)
	for(x = 0; x < Nparticles; x++){
		weights[x] = 1/((double)(Nparticles));
	}

	/*initial likelihood to 0.0*/
	double * likelihood = (double *)mxCalloc(Nparticles, sizeof(double));
	double * arrayX = (double *)mxCalloc(Nparticles, sizeof(double));
	double * arrayY = (double *)mxCalloc(Nparticles, sizeof(double));
	double * xj = (double *)mxCalloc(Nparticles, sizeof(double));
	double * yj = (double *)mxCalloc(Nparticles, sizeof(double));
	double * CDF = (double *)mxCalloc(Nparticles, sizeof(double));
	double * u = (double *)mxCalloc(Nparticles, sizeof(double));
	mxArray * arguments[4];
	mxArray * mxIK = mxCreateDoubleMatrix(IszX, IszY, mxREAL);
	mxArray * mxObj = mxCreateDoubleMatrix(countOnes, 2, mxREAL);
	mxArray * mxX = mxCreateDoubleMatrix(1, Nparticles, mxREAL);
	mxArray * mxY = mxCreateDoubleMatrix(1, Nparticles, mxREAL);
	double * Ik = (double *)mxCalloc(IszX*IszY, sizeof(double));
	/* mexCallMATLAB allocates the output array itself, so nothing is
		pre-created here (a pre-created matrix would simply be orphaned) */
	mxArray * result = NULL;
	
	#pragma omp parallel for shared(arrayX, arrayY, xe, ye) private(x)
	for(x = 0; x < Nparticles; x++){
		arrayX[x] = xe;
		arrayY[x] = ye;
	}
	

	/*apply motion model
		//draws sample from motion model (random walk). The only prior information
		//is that the object moves 2x as fast as in the y direction*/
	#pragma omp parallel for shared(arrayX, arrayY, Nparticles, seed) private(x)
	for(x = 0; x < Nparticles; x++){
		arrayX[x] += 1 + 5*randn(seed, x);
		arrayY[x] += -2 + 2*randn(seed, x);
	}

	/*particle filter likelihood*/
	/* get the current image as doubles. Source and destination use the same
		stride (IszY) so the copy stays inside the IszX*IszY buffer for
		non-square images; for square images this is the same element
		mapping as before.
		NOTE(review): MATLAB matrices are column-major, so this row-major
		copy presumably reaches GetSimpleLikelihood transposed -- confirm
		what the MATLAB routine expects. */
	for(x = 0; x < IszX; x++)
	{
		for(y = 0; y < IszY; y++)
		{
			Ik[x*IszY + y] = (double)I[x*IszY + y];
		}
	}
	/* copy arguments; objxy holds two doubles per neighbour, so the whole
		countOnes*2 buffer has to go into the countOnes x 2 matrix.
		NOTE(review): objxy is written by getneighbors; whether its element
		order matches MATLAB's column-major layout is not visible here -- confirm. */
	memcpy(mxGetPr(mxIK), Ik, sizeof(double)*IszX*IszY);
	memcpy(mxGetPr(mxObj), objxy, sizeof(double)*countOnes*2);
	memcpy(mxGetPr(mxX), arrayX, sizeof(double)*Nparticles);
	memcpy(mxGetPr(mxY), arrayY, sizeof(double)*Nparticles);
	arguments[0] = mxIK;
	arguments[1] = mxObj;
	arguments[2] = mxX;
	arguments[3] = mxY;
	mexCallMATLAB(1, &result, 4, arguments, "GetSimpleLikelihood");
	/* the likelihoods live in the array's data buffer, not in the mxArray
		header that `result` itself points to.
		NOTE(review): assumes GetSimpleLikelihood returns at least Nparticles
		real doubles -- confirm against the MATLAB function. */
	memcpy(likelihood, mxGetPr(result), sizeof(double)*Nparticles);

	/* update & normalize weights
		// using equation (63) of Arulampalam Tutorial*/
	#pragma omp parallel for shared(Nparticles, weights, likelihood) private(x)
	for(x = 0; x < Nparticles; x++){
		weights[x] = weights[x] * exp(likelihood[x]);
	}

	double sumWeights = 0;
	#pragma omp parallel for private(x) reduction(+:sumWeights)
	for(x = 0; x < Nparticles; x++){
		sumWeights += weights[x];
	}

	#pragma omp parallel for shared(sumWeights, weights) private(x)
	for(x = 0; x < Nparticles; x++){
		weights[x] = weights[x]/sumWeights;
	}

	xe = 0;
	ye = 0;
	/* estimate the object location by expected values*/
	#pragma omp parallel for private(x) reduction(+:xe, ye)
	for(x = 0; x < Nparticles; x++){
		xe += arrayX[x] * weights[x];
		ye += arrayY[x] * weights[x];
	}
	

	/* report the estimate, offset by half a pixel */
	x_loc[0] = xe+.5;
	y_loc[0] = ye+.5;

	/*display(hold off for now)
		
		//pause(hold off for now)
		
		//resampling*/
	
	
	CDF[0] = weights[0];
	for(x = 1; x < Nparticles; x++){
		CDF[x] = weights[x] + CDF[x-1];
	}

	double u1 = (1/((double)(Nparticles)))*randu(seed, 0);
	#pragma omp parallel for shared(u, u1, Nparticles) private(x)
	for(x = 0; x < Nparticles; x++){
		u[x] = u1 + x/((double)(Nparticles));
	}

	int j, i;
	
	#pragma omp parallel for shared(CDF, Nparticles, xj, yj, u, arrayX, arrayY) private(i, j)
	for(j = 0; j < Nparticles; j++){
		i = findIndex(CDF, Nparticles, u[j]);
		/*i = findIndexBin(CDF, 0, Nparticles, u[j]);*/
		if(i == -1)
		i = Nparticles-1;
		xj[j] = arrayX[i];
		yj[j] = arrayY[i];
		
	}

	/*reassign arrayX and arrayY*/
	#pragma omp parallel for shared(weights, arrayX, arrayY, xj, yj, Nparticles) private(x)
	for(x = 0; x < Nparticles; x++){
		weights[x] = 1/((double)(Nparticles));
		arrayX[x] = xj[x];
		arrayY[x] = yj[x];
	}

	
	/* release the MATLAB arrays (inputs and the returned output) */
	mxDestroyArray(result);
	mxDestroyArray(mxIK);
	mxDestroyArray(mxObj);
	mxDestroyArray(mxX);
	mxDestroyArray(mxY);
	mxFree(disk);
	mxFree(weights);
	mxFree(objxy);	
	mxFree(likelihood);
	mxFree(arrayX);
	mxFree(arrayY);
	mxFree(CDF);
	mxFree(u);
	mxFree(xj);
	mxFree(yj);
	mxFree(Ik);
}
/**
 * The implementation of the particle filter using OpenMP for many frames
 * @see http://openmp.org/wp/
 * @note This function is designed to work with a video of several frames. In addition, it references a provided MATLAB function which takes the video, the objxy matrix and the x and y arrays as arguments and returns the likelihoods
 * @param I The video to be run
 * @param IszX The x dimension of the video
 * @param IszY The y dimension of the video
 * @param Nfr The number of frames
 * @param seed The seed array used for random number generation
 * @param Nparticles The number of particles to be used
 */
int particleFilter(unsigned char * I, int IszX, int IszY, int Nfr, int * seed, int Nparticles) {
    int max_size = IszX * IszY*Nfr;
    //original particle centroid
    double xe = roundDouble(IszY / 2.0);
    double ye = roundDouble(IszX / 2.0);

    //expected object locations, compared to center
    int radius = 5;
    int diameter = radius * 2 - 1;
    int * disk = (int*) calloc(diameter * diameter, sizeof (int));
    strelDisk(disk, radius);
    int countOnes = 0;
    int x, y;
    for (x = 0; x < diameter; x++) {
        for (y = 0; y < diameter; y++) {
            if (disk[x * diameter + y] == 1)
                countOnes++;
        }
    }
    int * objxy = (int *) calloc(countOnes * 2, sizeof(int));
    getneighbors(disk, countOnes, objxy, radius);
    //initial weights are all equal (1/Nparticles)
    double * weights = (double *) calloc(Nparticles, sizeof(double));
    for (x = 0; x < Nparticles; x++) {
        weights[x] = 1 / ((double) (Nparticles));
    }
    /****************************************************************
     **************   B E G I N   A L L O C A T E *******************
     ****************************************************************/

    /***** kernel variables ******/
    cl_kernel kernel_likelihood;
    cl_kernel kernel_sum;
    cl_kernel kernel_normalize_weights;
    cl_kernel kernel_find_index;

    int sourcesize = 2048 * 2048;
    char * source = (char *) calloc(sourcesize, sizeof (char));
    if (!source) {
        printf("ERROR: calloc(%d) failed\n", sourcesize);
        return -1;
    }

    // read the kernel core source
    char * tempchar = "./particle_double.cl";
    FILE * fp = fopen(tempchar, "rb");
    if (!fp) {
        printf("ERROR: unable to open '%s'\n", tempchar);
        return -1;
    }
    fread(source + strlen(source), sourcesize, 1, fp);
    fclose(fp);

    // OpenCL initialization
    int use_gpu = 1;
    if (initialize(use_gpu)) return -1;

    // compile kernel
    cl_int err = 0;
    const char * slist[2] = {source, 0};
    cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateProgramWithSource() => %d\n", err);
        return -1;
    }

    err = clBuildProgram(prog, 1, device_list, "-cl-fast-relaxed-math", NULL, NULL);

    if (err != CL_SUCCESS) {
        if (err == CL_INVALID_PROGRAM)
            printf("CL_INVALID_PROGRAM\n");
        else if (err == CL_INVALID_VALUE)
            printf("CL_INVALID_VALUE\n");
        else if (err == CL_INVALID_DEVICE)
            printf("CL_INVALID_DEVICE\n");
        else if (err == CL_INVALID_BINARY)
            printf("CL_INVALID_BINARY\n");
        else if (err == CL_INVALID_BUILD_OPTIONS)
            printf("CL_INVALID_BUILD_OPTIONS\n");
        else if (err == CL_INVALID_OPERATION)
            printf("CL_INVALID_OPERATION\n");
        else if (err == CL_COMPILER_NOT_AVAILABLE)
            printf("CL_COMPILER_NOT_AVAILABLE\n");
        else if (err == CL_BUILD_PROGRAM_FAILURE)
            printf("CL_BUILD_PROGRAM_FAILURE\n");
        else if (err == CL_INVALID_OPERATION)
            printf("CL_INVALID_OPERATION\n");
        else if (err == CL_OUT_OF_RESOURCES)
            printf("CL_OUT_OF_RESOURCES\n");
        else if (err == CL_OUT_OF_HOST_MEMORY)
            printf("CL_OUT_OF_HOST_MEMORY\n");

        printf("ERROR: clBuildProgram() => %d\n", err);

        static char log[65536];
        memset(log, 0, sizeof (log));

        err = clGetProgramBuildInfo(prog, device_list[0], CL_PROGRAM_BUILD_LOG, sizeof (log) - 1, log, NULL);
        if (err != CL_SUCCESS) {
            printf("ERROR: clGetProgramBuildInfo() => %d\n", err);
        }
        if (strstr(log, "warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);


    }
    // { // show warnings/errors
    //     static char log[65536];
    //     memset(log, 0, sizeof (log));
    //     cl_device_id device_id[2] = {0};
    //     err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof (device_id), device_id, NULL);
    //     if (err != CL_SUCCESS) {
    //         if (err == CL_INVALID_CONTEXT)
    //             printf("ERROR: clGetContextInfo() => CL_INVALID_CONTEXT\n");
    //         if (err == CL_INVALID_VALUE)
    //             printf("ERROR: clGetContextInfo() => CL_INVALID_VALUE\n");
    //     }
    // }//*/

    char * s_likelihood_kernel = "likelihood_kernel";
    char * s_sum_kernel = "sum_kernel";
    char * s_normalize_weights_kernel = "normalize_weights_kernel";
    char * s_find_index_kernel = "find_index_kernel";

    kernel_likelihood = clCreateKernel(prog, s_likelihood_kernel, &err);
    if (err != CL_SUCCESS) {
        if (err == CL_INVALID_PROGRAM)
            printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID PROGRAM %d\n", err);
        if (err == CL_INVALID_PROGRAM_EXECUTABLE)
            printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID PROGRAM EXECUTABLE %d\n", err);
        if (err == CL_INVALID_KERNEL_NAME)
            printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID KERNEL NAME %d\n", err);
        if (err == CL_INVALID_KERNEL_DEFINITION)
            printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID KERNEL DEFINITION %d\n", err);
        if (err == CL_INVALID_VALUE)
            printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID CL_INVALID_VALUE %d\n", err);
        printf("ERROR: clCreateKernel(likelihood_kernel) failed.\n");
        return -1;
    }
    kernel_sum = clCreateKernel(prog, s_sum_kernel, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateKernel(sum_kernel) 0 => %d\n", err);
        return -1;
    }
    kernel_normalize_weights = clCreateKernel(prog, s_normalize_weights_kernel, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateKernel(normalize_weights_kernel) 0 => %d\n", err);
        return -1;
    }
    kernel_find_index = clCreateKernel(prog, s_find_index_kernel, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateKernel(find_index_kernel) 0 => %d\n", err);
        return -1;
    }


    //initial likelihood to 0.0
    double * likelihood = (double *) calloc(Nparticles + 1, sizeof (double));
    double * arrayX = (double *) calloc(Nparticles, sizeof (double));
    double * arrayY = (double *) calloc(Nparticles, sizeof (double));
    double * xj = (double *) calloc(Nparticles, sizeof (double));
    double * yj = (double *) calloc(Nparticles, sizeof (double));
    double * CDF = (double *) calloc(Nparticles, sizeof(double));

    //GPU copies of arrays
    cl_mem arrayX_GPU;
    cl_mem arrayY_GPU;
    cl_mem xj_GPU;
    cl_mem yj_GPU;
    cl_mem CDF_GPU;
    cl_mem likelihood_GPU;
    cl_mem I_GPU;
    cl_mem weights_GPU;
    cl_mem objxy_GPU;

    int * ind = (int*) calloc(countOnes, sizeof(int));
    cl_mem ind_GPU;
    double * u = (double *) calloc(Nparticles, sizeof(double));
    cl_mem u_GPU;
    cl_mem seed_GPU;
    cl_mem partial_sums;


    //OpenCL memory allocation

    arrayX_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer arrayX_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    arrayY_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer arrayY_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    xj_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer xj_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    yj_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer yj_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    CDF_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) * Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer CDF_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    u_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer u_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    likelihood_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer likelihood_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    weights_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer weights_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    I_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (unsigned char) *IszX * IszY * Nfr, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer I_GPU (size:%d) => %d\n", IszX * IszY * Nfr, err);
        return -1;
    }
    objxy_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, 2*sizeof (int) *countOnes, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer objxy_GPU (size:%d) => %d\n", countOnes, err);
        return -1;
    }
    ind_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (int) *countOnes * Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer ind_GPU (size:%d) => %d\n", countOnes * Nparticles, err);
        return -1;
    }
    seed_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (int) *Nparticles, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer seed_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    partial_sums = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof (double) * Nparticles + 1, likelihood, &err);
    if (err != CL_SUCCESS) {
        printf("ERROR: clCreateBuffer partial_sums (size:%d) => %d\n", Nparticles, err);
        return -1;
    }

    //Donnie - this loop is different because in this kernel, arrayX and arrayY
    //  are set equal to xj before every iteration, so effectively, arrayX and
    //  arrayY will be set to xe and ye before the first iteration.
    for (x = 0; x < Nparticles; x++) {

        xj[x] = xe;
        yj[x] = ye;
    }

    int k;
    //double * Ik = (double *)calloc(IszX*IszY, sizeof(double));
    int indX, indY;
    //start send
    long long send_start = get_time();

    //OpenCL memory copy
    err = clEnqueueWriteBuffer(cmd_queue, I_GPU, 1, 0, sizeof (unsigned char) *IszX * IszY*Nfr, I, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: clEnqueueWriteBuffer I_GPU (size:%d) => %d\n", IszX * IszY*Nfr, err);
        return -1;
    }
    err = clEnqueueWriteBuffer(cmd_queue, objxy_GPU, 1, 0, 2*sizeof (int) *countOnes, objxy, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: clEnqueueWriteBuffer objxy_GPU (size:%d) => %d\n", countOnes, err);
        return -1;
    }
    err = clEnqueueWriteBuffer(cmd_queue, weights_GPU, 1, 0, sizeof (double) *Nparticles, weights, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: clEnqueueWriteBuffer weights_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    err = clEnqueueWriteBuffer(cmd_queue, xj_GPU, 1, 0, sizeof (double) *Nparticles, xj, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: clEnqueueWriteBuffer arrayX_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    err = clEnqueueWriteBuffer(cmd_queue, yj_GPU, 1, 0, sizeof (double) *Nparticles, yj, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: clEnqueueWriteBuffer arrayY_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    err = clEnqueueWriteBuffer(cmd_queue, seed_GPU, 1, 0, sizeof (int) *Nparticles, seed, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: clEnqueueWriteBuffer seed_GPU (size:%d) => %d\n", Nparticles, err);
        return -1;
    }
    /**********************************************************************
     *********** E N D    A L L O C A T E ********************************
     *********************************************************************/

    long long send_end = get_time();
    printf("TIME TO SEND TO GPU: %f\n", elapsed_time(send_start, send_end));
    int num_blocks = ceil((double) Nparticles / (double) threads_per_block);
    printf("threads_per_block=%d \n",threads_per_block);
    size_t local_work[3] = {threads_per_block, 1, 1};
    size_t global_work[3] = {num_blocks*threads_per_block, 1, 1};

    for (k = 1; k < Nfr; k++) {
        /****************** L I K E L I H O O D ************************************/
        clSetKernelArg(kernel_likelihood, 0, sizeof (void *), (void*) &arrayX_GPU);
        clSetKernelArg(kernel_likelihood, 1, sizeof (void *), (void*) &arrayY_GPU);
        clSetKernelArg(kernel_likelihood, 2, sizeof (void *), (void*) &xj_GPU);
        clSetKernelArg(kernel_likelihood, 3, sizeof (void *), (void*) &yj_GPU);
        clSetKernelArg(kernel_likelihood, 4, sizeof (void *), (void*) &CDF_GPU);
        clSetKernelArg(kernel_likelihood, 5, sizeof (void *), (void*) &ind_GPU);
        clSetKernelArg(kernel_likelihood, 6, sizeof (void *), (void*) &objxy_GPU);
        clSetKernelArg(kernel_likelihood, 7, sizeof (void *), (void*) &likelihood_GPU);
        clSetKernelArg(kernel_likelihood, 8, sizeof (void *), (void*) &I_GPU);
        clSetKernelArg(kernel_likelihood, 9, sizeof (void *), (void*) &u_GPU);
        clSetKernelArg(kernel_likelihood, 10, sizeof (void *), (void*) &weights_GPU);
        clSetKernelArg(kernel_likelihood, 11, sizeof (cl_int), (void*) &Nparticles);
        clSetKernelArg(kernel_likelihood, 12, sizeof (cl_int), (void*) &countOnes);
        clSetKernelArg(kernel_likelihood, 13, sizeof (cl_int), (void*) &max_size);
        clSetKernelArg(kernel_likelihood, 14, sizeof (cl_int), (void*) &k);
        clSetKernelArg(kernel_likelihood, 15, sizeof (cl_int), (void*) &IszY);
        clSetKernelArg(kernel_likelihood, 16, sizeof (cl_int), (void*) &Nfr);
        clSetKernelArg(kernel_likelihood, 17, sizeof (void *), (void*) &seed_GPU);
        clSetKernelArg(kernel_likelihood, 18, sizeof (void *), (void*) &partial_sums);
        clSetKernelArg(kernel_likelihood, 19, threads_per_block * sizeof (double), NULL);

        //KERNEL FUNCTION CALL
        err = clEnqueueNDRangeKernel(cmd_queue, kernel_likelihood, 1, NULL, global_work, local_work, 0, 0, 0);
        clFinish(cmd_queue);
        if (err != CL_SUCCESS) {
            printf("ERROR: clEnqueueNDRangeKernel(kernel_likelihood)=>%d failed\n", err);
            //check_error(err, __FILE__, __LINE__);
            return -1;
        }
        /****************** E N D    L I K E L I H O O D **********************/
        /*************************** S U M ************************************/
        clSetKernelArg(kernel_sum, 0, sizeof (void *), (void*) &partial_sums);
        clSetKernelArg(kernel_sum, 1, sizeof (cl_int), (void*) &Nparticles);

        //KERNEL FUNCTION CALL
        err = clEnqueueNDRangeKernel(cmd_queue, kernel_sum, 1, NULL, global_work, local_work, 0, 0, 0);
        clFinish(cmd_queue);
        if (err != CL_SUCCESS) {
            printf("ERROR: clEnqueueNDRangeKernel(kernel_sum)=>%d failed\n", err);
            //check_error(err, __FILE__, __LINE__);
            return -1;
        }/*************************** E N D   S U M ****************************/



        /**************** N O R M A L I Z E     W E I G H T S *****************/
        clSetKernelArg(kernel_normalize_weights, 0, sizeof (void *), (void*) &weights_GPU);
        clSetKernelArg(kernel_normalize_weights, 1, sizeof (cl_int), (void*) &Nparticles);
        clSetKernelArg(kernel_normalize_weights, 2, sizeof (void *), (void*) &partial_sums); //*/
        clSetKernelArg(kernel_normalize_weights, 3, sizeof (void *), (void*) &CDF_GPU);
        clSetKernelArg(kernel_normalize_weights, 4, sizeof (void *), (void*) &u_GPU);
        clSetKernelArg(kernel_normalize_weights, 5, sizeof (void *), (void*) &seed_GPU);

        //KERNEL FUNCTION CALL
        err = clEnqueueNDRangeKernel(cmd_queue, kernel_normalize_weights, 1, NULL, global_work, local_work, 0, 0, 0);
        clFinish(cmd_queue);
        if (err != CL_SUCCESS) {
            printf("ERROR: clEnqueueNDRangeKernel(normalize_weights)=>%d failed\n", err);
            //check_error(err, __FILE__, __LINE__);
            return -1;
        }
        /************* E N D    N O R M A L I Z E     W E I G H T S ***********/

        //	ocl_print_double_array(cmd_queue, partial_sums, 40);
        // /********* I N T E R M E D I A T E     R E S U L T S ***************/
        // //OpenCL memory copying back from GPU to CPU memory
        err = clEnqueueReadBuffer(cmd_queue, arrayX_GPU, 1, 0, sizeof (double) *Nparticles, arrayX, 0, 0, 0);
        err = clEnqueueReadBuffer(cmd_queue, arrayY_GPU, 1, 0, sizeof (double) *Nparticles, arrayY, 0, 0, 0);
        err = clEnqueueReadBuffer(cmd_queue, weights_GPU, 1, 0, sizeof (double) *Nparticles, weights, 0, 0, 0);

        xe = 0;
        ye = 0;
        double total=0.0;
        // estimate the object location by expected values
        for (x = 0; x < Nparticles; x++) {
            // if( 0.0000000 < arrayX[x]*weights[x]) printf("arrayX[%d]:%f, arrayY[%d]:%f, weights[%d]:%0.10f\n",x,arrayX[x], x, arrayY[x], x, weights[x]);
            //	printf("arrayX[%d]:%f | arrayY[%d]:%f | weights[%d]:%f\n",
            //		x, arrayX[x], x, arrayY[x], x, weights[x]);
            xe += arrayX[x] * weights[x];
            ye += arrayY[x] * weights[x];
            total+= weights[x];
        }
        printf("total weight: %lf\n", total);
        printf("XE: %lf\n", xe);
        printf("YE: %lf\n", ye);
        double distance = sqrt(pow((double) (xe - (int) roundDouble(IszY / 2.0)), 2) + pow((double) (ye - (int) roundDouble(IszX / 2.0)), 2));
        printf("%lf\n", distance);
        // /********* E N D    I N T E R M E D I A T E     R E S U L T S ***************/

        /******************** F I N D    I N D E X ****************************/
        //Set number of threads

        clSetKernelArg(kernel_find_index, 0, sizeof (void *), (void*) &arrayX_GPU);
        clSetKernelArg(kernel_find_index, 1, sizeof (void *), (void*) &arrayY_GPU);
        clSetKernelArg(kernel_find_index, 2, sizeof (void *), (void*) &CDF_GPU);
        clSetKernelArg(kernel_find_index, 3, sizeof (void *), (void*) &u_GPU);
        clSetKernelArg(kernel_find_index, 4, sizeof (void *), (void*) &xj_GPU);
        clSetKernelArg(kernel_find_index, 5, sizeof (void *), (void*) &yj_GPU);
        clSetKernelArg(kernel_find_index, 6, sizeof (void *), (void*) &weights_GPU);
        clSetKernelArg(kernel_find_index, 7, sizeof (cl_int), (void*) &Nparticles);
        //KERNEL FUNCTION CALL
        err = clEnqueueNDRangeKernel(cmd_queue, kernel_find_index, 1, NULL, global_work, local_work, 0, 0, 0);
        clFinish(cmd_queue);
        if (err != CL_SUCCESS) {
            printf("ERROR: clEnqueueNDRangeKernel(find_index)=>%d failed\n", err);
            //check_error(err, __FILE__, __LINE__);
            return -1;
        }
        /******************* E N D    F I N D    I N D E X ********************/

    }//end loop

    //block till kernels are finished
    //clFinish(cmd_queue);
    long long back_time = get_time();

    //OpenCL freeing of memory
    clReleaseProgram(prog);
    clReleaseMemObject(u_GPU);
    clReleaseMemObject(CDF_GPU);
    clReleaseMemObject(yj_GPU);
    clReleaseMemObject(xj_GPU);
    clReleaseMemObject(likelihood_GPU);
    clReleaseMemObject(I_GPU);
    clReleaseMemObject(objxy_GPU);
    clReleaseMemObject(ind_GPU);
    clReleaseMemObject(seed_GPU);
    clReleaseMemObject(partial_sums);

    long long free_time = get_time();

    //OpenCL memory copying back from GPU to CPU memory
    err = clEnqueueReadBuffer(cmd_queue, arrayX_GPU, 1, 0, sizeof (double) *Nparticles, arrayX, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: Memcopy Out\n");
        return -1;
    }
    long long arrayX_time = get_time();
    err = clEnqueueReadBuffer(cmd_queue, arrayY_GPU, 1, 0, sizeof (double) *Nparticles, arrayY, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: Memcopy Out\n");
        return -1;
    }
    long long arrayY_time = get_time();
    err = clEnqueueReadBuffer(cmd_queue, weights_GPU, 1, 0, sizeof (double) *Nparticles, weights, 0, 0, 0);
    if (err != CL_SUCCESS) {
        printf("ERROR: Memcopy Out\n");
        return -1;
    }
    long long back_end_time = get_time();

    printf("GPU Execution: %lf\n", elapsed_time(send_end, back_time));
    printf("FREE TIME: %lf\n", elapsed_time(back_time, free_time));
    printf("SEND TO SEND BACK: %lf\n", elapsed_time(back_time, back_end_time));
    printf("SEND ARRAY X BACK: %lf\n", elapsed_time(free_time, arrayX_time));
    printf("SEND ARRAY Y BACK: %lf\n", elapsed_time(arrayX_time, arrayY_time));
    printf("SEND WEIGHTS BACK: %lf\n", elapsed_time(arrayY_time, back_end_time));

    xe = 0;
    ye = 0;
    // estimate the object location by expected values
    for (x = 0; x < Nparticles; x++) {
        xe += arrayX[x] * weights[x];
        ye += arrayY[x] * weights[x];
    }
    double distance = sqrt(pow((double) (xe - (int) roundDouble(IszY / 2.0)), 2) + pow((double) (ye - (int) roundDouble(IszX / 2.0)), 2));

    //Output results
    FILE *fid;
    fid=fopen("output.txt", "w+");
    if( fid == NULL ) {
        printf( "The file was not opened for writing\n" );
        return -1;
    }
    fprintf(fid, "XE: %lf\n", xe);
    fprintf(fid, "YE: %lf\n", ye);
    fprintf(fid, "distance: %lf\n", distance);
    fclose(fid);


    //OpenCL freeing of memory
    clReleaseMemObject(weights_GPU);
    clReleaseMemObject(arrayY_GPU);
    clReleaseMemObject(arrayX_GPU);

    //free regular memory
    free(likelihood);
    free(arrayX);
    free(arrayY);
    free(xj);
    free(yj);
    free(CDF);
    free(ind);
    free(u);
}