/*
 * Computes the per-element step factor that time_step() later scales the
 * fluxes by.
 *
 * nelr         - number of elements to process
 * variables    - flow state, NVAR values per element (read only)
 * areas        - one value per element (read only)
 * step_factors - output, one value per element
 *
 * Side effect: prints the wall-clock duration of the parallel loop to stdout.
 */
void compute_step_factor(int nelr, double* variables, double* areas, double* step_factors)
{
	const unsigned long long parallel_for_start = current_time_ns();
#pragma omp parallel for default(shared) schedule(static)
	for(int i = 0; i < nelr; i++)
	{
		double density = variables[NVAR*i + VAR_DENSITY];

		cfd_double3 momentum;
		momentum.x = variables[NVAR*i + (VAR_MOMENTUM+0)];
		momentum.y = variables[NVAR*i + (VAR_MOMENTUM+1)];
		momentum.z = variables[NVAR*i + (VAR_MOMENTUM+2)];

		double density_energy = variables[NVAR*i + VAR_DENSITY_ENERGY];

		cfd_double3 velocity;
		compute_velocity(density, momentum, velocity);
		double speed_sqd      = compute_speed_sqd(velocity);
		double pressure       = compute_pressure(density, density_energy, speed_sqd);
		double speed_of_sound = compute_speed_of_sound(density, pressure);

		// dt = double(0.5) * std::sqrt(areas[i]) / (||v|| + c), but the time
		// stepping would later divide by the area again, so both are folded
		// into a single division here.
		step_factors[i] = double(0.5) / (std::sqrt(areas[i]) * (std::sqrt(speed_sqd) + speed_of_sound));
	}
	const unsigned long long parallel_for_end = current_time_ns();
	printf("pragma155_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start);
}

Exemple #2
/*
 * Verifies the correctness of the sort.
 * Ensures all keys are within a PE's bucket boundaries.
 * Ensures the final number of keys is equal to the initial.
 */
/*
 * Runs three checks on this PE's result and returns 0 if all pass, 1 otherwise.
 *
 * my_local_key_counts - per-key counts, BUCKET_WIDTH entries (read only)
 * my_local_keys       - the my_bucket_size keys held by this PE (read only)
 *
 * Checks performed:
 *   1. every key lies in [my_rank * BUCKET_WIDTH, (my_rank+1) * BUCKET_WIDTH - 1];
 *   2. the counts sum to my_bucket_size;
 *   3. the sum of my_bucket_size over all PEs equals NUM_KEYS_PER_PE * NUM_PES.
 *
 * Check 3 uses shmem_longlong_sum_to_all, so presumably every PE must call
 * this function together -- confirm against the caller.
 */
static int verify_results(int const * const my_local_key_counts,
                           KEY_TYPE const * const my_local_keys)
{
  int error = 0;

  const int my_rank = shmem_my_pe();

  const int my_min_key = my_rank * BUCKET_WIDTH;
  const int my_max_key = (my_rank+1) * BUCKET_WIDTH - 1;

  unsigned long long start = current_time_ns();

  // Verify all keys are within bucket boundaries.
  // NOTE(review): there is no early exit, so every offending key is reported.
  for(long long int i = 0; i < my_bucket_size; ++i){
    const int key = my_local_keys[i];
    if((key < my_min_key) || (key > my_max_key)){
      printf("Rank %d Failed Verification!\n",my_rank);
      printf("Key: %d is outside of bounds [%d, %d]\n", key, my_min_key, my_max_key);
      error = 1;
    }
  }

  unsigned long long end = current_time_ns();
  if (shmem_my_pe() == 0) {
    printf("Verifying took %llu ns\n", end - start);
  }

  // Verify the sum of the key population equals the expected bucket size
  long long int bucket_size_test = 0;
  for(uint64_t i = 0; i < BUCKET_WIDTH; ++i){
    bucket_size_test +=  my_local_key_counts[i];
  }
  if(bucket_size_test != my_bucket_size){
    printf("Rank %d Failed Verification!\n",my_rank);
    printf("Actual Bucket Size: %lld Should be %lld\n", bucket_size_test, my_bucket_size);
    error = 1;
  }

  // Verify the final number of keys equals the initial number of keys.
  // The reduction target is static because it is passed to a SHMEM collective.
  static long long int total_num_keys = 0;
  shmem_longlong_sum_to_all(&total_num_keys, &my_bucket_size, 1, 0, 0, NUM_PES, llWrk, pSync);

  if(total_num_keys != (long long int)(NUM_KEYS_PER_PE * NUM_PES)){
    // Only the root PE reports (and flags) a global key-count mismatch.
    if(my_rank == ROOT_PE){
      printf("Verification Failed!\n");
      printf("Actual total number of keys: %lld Expected %" PRIu64 "\n", total_num_keys, NUM_KEYS_PER_PE * NUM_PES );
      error = 1;
    }
  }

  return error;
}
/*
 * Copies N doubles from src to dst with an OpenMP parallel loop.
 *
 * dst - destination array, at least N elements
 * src - source array, at least N elements
 * N   - number of elements to copy
 *
 * Side effect: prints the wall-clock duration of the loop to stdout.
 */
void copy(double *dst, double *src, int N)
{
	const unsigned long long parallel_for_start = current_time_ns();
#pragma omp parallel for default(shared) schedule(static)
	for(int i = 0; i < N; i++)
	{
		dst[i] = src[i];
	}
	const unsigned long long parallel_for_end = current_time_ns();
	printf("pragma53_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start);
}

/*
 * Sets every element's NVAR state values to the far-field values held in
 * ff_variable.
 *
 * nelr      - number of elements
 * variables - output array of nelr * NVAR doubles
 *
 * Side effect: prints the wall-clock duration of the loop to stdout.
 */
void initialize_variables(int nelr, double* variables)
{
	const unsigned long long parallel_for_start = current_time_ns();
#pragma omp parallel for default(shared) schedule(static)
	for(int i = 0; i < nelr; i++)
	{
		for(int j = 0; j < NVAR; j++) variables[i*NVAR + j] = ff_variable[j];
	}
	const unsigned long long parallel_for_end = current_time_ns();
	printf("pragma102_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start);
}

Exemple #5
/*
 * Counts the occurrence of each key in my bucket.
 * Key indices into the count array are the key's value minus my bucket's
 * minimum key value to allow indexing from 0.
 * my_bucket_keys: All keys in my bucket unsorted [my_rank * BUCKET_WIDTH, (my_rank+1)*BUCKET_WIDTH)
 */
/*
 * Builds a histogram of the keys in this PE's bucket.
 *
 * my_bucket_keys - the my_bucket_size keys of this bucket (read only)
 *
 * Returns a malloc'd array of BUCKET_WIDTH ints; entry k holds the number of
 * keys equal to my_rank * BUCKET_WIDTH + k. The caller owns the array.
 *
 * Side effect: PE 0 prints the loop duration to stdout.
 */
static int * count_local_keys(KEY_TYPE const * const my_bucket_keys)
{
  int * const my_local_key_counts = malloc(BUCKET_WIDTH * sizeof(int));
  memset(my_local_key_counts, 0, BUCKET_WIDTH * sizeof(int));

  const int my_rank = shmem_my_pe();
  const int my_min_key = my_rank * BUCKET_WIDTH;

  unsigned long long start = current_time_ns();

  // Count the occurrences of each key in my bucket
  for(long long int i = 0; i < my_bucket_size; ++i){
    const unsigned int key_index = my_bucket_keys[i] - my_min_key;

    assert(my_bucket_keys[i] >= my_min_key);
    assert(key_index < BUCKET_WIDTH);

    // The increment was absent, which left every count at zero.
    my_local_key_counts[key_index]++;
  }

  unsigned long long end = current_time_ns();
  if (shmem_my_pe() == 0) {
    // my_bucket_size is printed with %lld elsewhere in this file, so %u was
    // a format/argument mismatch; both values are cast to match their
    // conversion specifiers.
    printf("Counting local took %llu ns, my_bucket_size = %lld, BUCKET_WIDTH = "
            "%llu\n", end - start, (long long)my_bucket_size,
            (unsigned long long)BUCKET_WIDTH);
  }

#ifdef DEBUG
  char msg[4096];
  sprintf(msg,"Rank %d: Bucket Size %lld | Local Key Counts:", my_rank, my_bucket_size);
  // Only the first PRINT_MAX counts are appended to keep the message short.
  for(uint64_t i = 0; i < BUCKET_WIDTH; ++i){
    if(i < PRINT_MAX)
      sprintf(msg + strlen(msg),"%d ", my_local_key_counts[i]);
  }
  sprintf(msg + strlen(msg),"\n");
  // The message was assembled but never emitted.
  printf("%s", msg);
#endif

  return my_local_key_counts;
}
Exemple #6
/*
 * Places local keys into their corresponding local bucket.
 * The contents of each bucket are not sorted.
 */
/*
 * Scatters this PE's keys into a new array grouped by destination bucket.
 *
 * my_keys              - NUM_KEYS_PER_PE input keys (read only)
 * local_bucket_offsets - next free slot for each bucket; entry
 *                        key / BUCKET_WIDTH is post-incremented for every key
 *                        placed, so the array is modified in place
 *
 * Returns a malloc'd array of NUM_KEYS_PER_PE keys owned by the caller.
 *
 * Side effect: PE 0 prints the loop duration to stdout.
 */
static KEY_TYPE * bucketize_local_keys(KEY_TYPE const * const my_keys,
                                              int * const local_bucket_offsets)
{
  KEY_TYPE * const my_local_bucketed_keys = malloc(NUM_KEYS_PER_PE * sizeof(KEY_TYPE));

  unsigned long long start = current_time_ns();

  for(uint64_t i = 0; i < NUM_KEYS_PER_PE; ++i){
    const KEY_TYPE key = my_keys[i];
    const uint32_t bucket_index = key / BUCKET_WIDTH;
    uint32_t index;
    assert(local_bucket_offsets[bucket_index] >= 0);
    index = local_bucket_offsets[bucket_index]++;
    assert(index < NUM_KEYS_PER_PE);
    my_local_bucketed_keys[index] = key;
  }

  unsigned long long end = current_time_ns();
  if (shmem_my_pe() == 0) {
    printf("Bucketizing took %llu ns\n", end - start);
  }

#ifdef DEBUG
  char msg[1024];
  const int my_rank = shmem_my_pe();
  sprintf(msg,"Rank %d: local bucketed keys: ", my_rank);
  // Only the first PRINT_MAX keys are appended to keep the message short.
  for(uint64_t i = 0; i < NUM_KEYS_PER_PE; ++i){
    if(i < PRINT_MAX)
      sprintf(msg + strlen(msg),"%d ", my_local_bucketed_keys[i]);
  }
  sprintf(msg + strlen(msg),"\n");
  // The message was assembled but never emitted.
  printf("%s", msg);
#endif

  return my_local_bucketed_keys;
}
/*
 * Advances every element by one Runge-Kutta stage:
 *   variables = old_variables + (step_factors[i] / (RK+1-j)) * fluxes
 * applied to the density, density-energy and three momentum components.
 *
 * j             - stage index used in the RK+1-j divisor
 * nelr          - number of elements
 * old_variables - state at the start of the iteration (read only)
 * variables     - state to overwrite
 * step_factors  - one factor per element (read only)
 * fluxes        - NVAR flux values per element (read only)
 *
 * Side effect: prints the wall-clock duration of the loop to stdout.
 */
void time_step(int j, int nelr, double* old_variables, double* variables, double* step_factors, double* fluxes)
{
	const unsigned long long parallel_for_start = current_time_ns();
#pragma omp parallel for default(shared) schedule(static)
	for(int i = 0; i < nelr; i++)
	{
		double factor = step_factors[i]/double(RK+1-j);

		variables[NVAR*i + VAR_DENSITY] = old_variables[NVAR*i + VAR_DENSITY] + factor*fluxes[NVAR*i + VAR_DENSITY];
		variables[NVAR*i + VAR_DENSITY_ENERGY] = old_variables[NVAR*i + VAR_DENSITY_ENERGY] + factor*fluxes[NVAR*i + VAR_DENSITY_ENERGY];
		variables[NVAR*i + (VAR_MOMENTUM+0)] = old_variables[NVAR*i + (VAR_MOMENTUM+0)] + factor*fluxes[NVAR*i + (VAR_MOMENTUM+0)];
		variables[NVAR*i + (VAR_MOMENTUM+1)] = old_variables[NVAR*i + (VAR_MOMENTUM+1)] + factor*fluxes[NVAR*i + (VAR_MOMENTUM+1)];
		variables[NVAR*i + (VAR_MOMENTUM+2)] = old_variables[NVAR*i + (VAR_MOMENTUM+2)] + factor*fluxes[NVAR*i + (VAR_MOMENTUM+2)];
	}
	const unsigned long long parallel_for_end = current_time_ns();
	printf("pragma317_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start);
}

Exemple #8
/*
 * Computes the size of each bucket by iterating all keys and incrementing
 * their corresponding bucket's size
 */
/*
 * Counts how many of this PE's keys fall into each bucket.
 *
 * my_keys - NUM_KEYS_PER_PE input keys (read only)
 *
 * Returns a malloc'd array of NUM_BUCKETS ints owned by the caller; entry b
 * is the number of keys whose value divided by BUCKET_WIDTH equals b.
 *
 * Side effect: PE 0 prints the loop duration to stdout.
 */
static int * count_local_bucket_sizes(KEY_TYPE const * const my_keys)
{
  int * const local_bucket_sizes = malloc(NUM_BUCKETS * sizeof(int));

  init_array(local_bucket_sizes, NUM_BUCKETS);

  unsigned long long start = current_time_ns();

  for(uint64_t i = 0; i < NUM_KEYS_PER_PE; ++i){
    const uint32_t bucket_index = my_keys[i]/BUCKET_WIDTH;
    // The increment was absent, which left the computed index unused.
    local_bucket_sizes[bucket_index]++;
  }

  unsigned long long end = current_time_ns();
  if (shmem_my_pe() == 0) {
    printf("Counting local bucket sizes took %llu ns\n", end - start);
  }

#ifdef DEBUG
  char msg[1024];
  const int my_rank = shmem_my_pe();
  sprintf(msg,"Rank %d: local bucket sizes: ", my_rank);
  // Only the first PRINT_MAX sizes are appended to keep the message short.
  for(uint64_t i = 0; i < NUM_BUCKETS; ++i){
    if(i < PRINT_MAX)
      sprintf(msg + strlen(msg),"%d ", local_bucket_sizes[i]);
  }
  sprintf(msg + strlen(msg),"\n");
  // The message was assembled but never emitted.
  printf("%s", msg);
#endif

  return local_bucket_sizes;
}
Exemple #9
/*
 * Initializes the runtime and spawns fct_ptr(arg) as an async task at the
 * closest locale.
 *
 * fct_ptr - task body to run
 * arg     - opaque argument handed to fct_ptr
 * deps    - forwarded to hclib_init
 * ndeps   - number of entries in deps
 *
 * Instrumentation is requested from hclib_init when the HCLIB_INSTRUMENT
 * environment variable is set. When profile_launch_body is non-zero the
 * elapsed time around the spawn is printed to stdout.
 *
 * NOTE(review): no finalize/teardown call is visible in this excerpt --
 * confirm whether one belongs after hclib_async.
 */
void hclib_launch(generic_frame_ptr fct_ptr, void *arg, const char **deps,
        int ndeps) {
    unsigned long long start_time = 0;
    unsigned long long end_time;

    const int instrument = (getenv("HCLIB_INSTRUMENT") != NULL);

    hclib_init(deps, ndeps, instrument);

    if (profile_launch_body) {
        start_time = current_time_ns();
    }
    hclib_async(fct_ptr, arg, NULL, 0, hclib_get_closest_locale());
    if (profile_launch_body) {
        end_time = current_time_ns();
        printf("\nHCLIB TIME %llu ns\n", end_time - start_time);
    }
}
Exemple #10
void sim_village_main_par(struct Village *top)
    long i;
const unsigned long long full_program_start = current_time_ns();
#pragma omp parallel 
#pragma omp single 
#pragma omp task untied
                    for (i = 0; i < sim_time; i++) sim_village_par(top);   
    } ; 
const unsigned long long full_program_end = current_time_ns();
printf("full_program %llu ns\n", full_program_end - full_program_start);

Exemple #11
/*
 * Generates uniformly random keys [0, MAX_KEY_VAL] on each rank using the time and rank
 * number as a seed
 */
static KEY_TYPE * make_input(void)

  KEY_TYPE * const my_keys = malloc(NUM_KEYS_PER_PE * sizeof(KEY_TYPE));

  pcg32_random_t rng = seed_my_rank();

  unsigned long long start = current_time_ns();

  for(uint64_t i = 0; i < NUM_KEYS_PER_PE; ++i) {
    my_keys[i] = pcg32_boundedrand_r(&rng, MAX_KEY_VAL);

  unsigned long long end = current_time_ns();
  if (shmem_my_pe() == 0)
  printf("Making input took %llu ns\n", end - start);


#ifdef DEBUG
  char msg[1024];
  const int my_rank = shmem_my_pe();
  sprintf(msg,"Rank %d: Initial Keys: ", my_rank);
  for(uint64_t i = 0; i < NUM_KEYS_PER_PE; ++i){
    if(i < PRINT_MAX)
    sprintf(msg + strlen(msg),"%d ", my_keys[i]);
  sprintf(msg + strlen(msg),"\n");
  return my_keys;
Exemple #12
/*---< main() >-------------------------------------------------------------*/
/*
 * Driver excerpt: parses command-line options, loads an attribute matrix
 * from a binary or text file, then times a loop of nloops iterations and
 * prints summary counts.
 *
 * NOTE(review): this excerpt has lost lines (closing braces, break
 * statements, error exits and the clustering call). The notes below mark
 * the gaps that are visible; the code itself is left untouched.
 */
int main(int argc, char **argv) {
           int     opt;
    extern char   *optarg;
    extern int     optind;
           int     nclusters=5;
           char   *filename = 0;           
           float  *buf;
           float *attributes;
           float *cluster_centres=NULL;
           int     i, j;
           int     numAttributes;
           int     numObjects;        
           char    line[1024];           
           int     isBinaryFile = 0;
           int     nloops = 1;
           float   threshold = 0.001;
		   double  timing;		   

	// NOTE(review): no break follows any case, so as written each option
	// falls through into the next one and finally into usage(). The option
	// string also accepts n, but no matching case is visible.
	while ( (opt=getopt(argc,argv,"i:k:t:b:n:?"))!= EOF) {
		switch (opt) {
            case 'i': filename=optarg;
            case 'b': isBinaryFile = 1;
            case 't': threshold=atof(optarg);
            case 'k': nclusters = atoi(optarg);
            case '?': usage(argv[0]);
            default: usage(argv[0]);

    // An input file is mandatory.
    if (filename == 0) usage(argv[0]);

    numAttributes = numObjects = 0;

    /* from the input file, get the numAttributes and numObjects ------------*/
    if (isBinaryFile) {
        int infile;
        // NOTE(review): the third argument of open() is a numeric mode;
        // a string literal is passed here -- confirm the intent.
        if ((infile = open(filename, O_RDONLY, "0600")) == -1) {
            fprintf(stderr, "Error: no such file (%s)\n", filename);
        // Binary layout as read here: two ints (object count, attribute
        // count) followed by the float matrix. Return values of read() are
        // not checked.
        read(infile, &numObjects,    sizeof(int));
        read(infile, &numAttributes, sizeof(int));

        /* allocate space for attributes[] and read attributes of all objects */
        attributes    = (float*) malloc(numObjects * numAttributes * sizeof(float));

        read(infile, attributes, numObjects*numAttributes*sizeof(float));

    else {
        FILE *infile;
        if ((infile = fopen(filename, "r")) == NULL) {
            fprintf(stderr, "Error: no such file (%s)\n", filename);
        // NOTE(review): the body of this first counting pass is missing;
        // presumably it incremented numObjects for each non-empty line.
        while (fgets(line, 1024, infile) != NULL)
            if (strtok(line, " \t\n") != 0)
        // Second pass: count the tokens that follow the first one on a
        // non-empty line to obtain numAttributes.
        while (fgets(line, 1024, infile) != NULL) {
            if (strtok(line, " \t\n") != 0) {
                /* ignore the id (first attribute): numAttributes = 1; */
                while (strtok(NULL, " ,\t\n") != NULL) numAttributes++;

        /* allocate space for attributes[] and read attributes of all objects */
        attributes           = (float*) malloc(numObjects*numAttributes*sizeof(float));
        i = 0;
        // Third pass: skip empty lines and convert each remaining token.
        // NOTE(review): no statement advancing i is visible, and the result
        // of strtok is passed to atof without a NULL check.
        while (fgets(line, 1024, infile) != NULL) {
            if (strtok(line, " \t\n") == NULL) continue; 
            for (j=0; j<numAttributes; j++) {
                attributes[i] = atof(strtok(NULL, " ,\t\n"));
	printf("I/O completed\n");	

// Timed region. NOTE(review): only one argument line of the clustering call
// survives below; the call itself is missing from this excerpt.
const unsigned long long full_program_start = current_time_ns();
for (i=0; i<nloops; i++) {
        cluster_centres = NULL;
                attributes,           /* [numObjects][numAttributes] */                
    } ; 
const unsigned long long full_program_end = current_time_ns();
printf("full_program %llu ns\n", full_program_end - full_program_start);


	printf("number of Clusters %d\n",nclusters); 
	printf("number of Attributes %d\n\n",numAttributes); 
  /*  	printf("Cluster Centers Output\n"); 
	printf("The first number is cluster number and the following data is arribute value\n");
    for (i=0; i< nclusters; i++) {
		printf("%d: ", i);
        for (j=0; j<numAttributes; j++)
            printf("%.2f ", cluster_centres[i][j]);

Exemple #13
/*
 * Benchmark driver: for each of INPUTS subdivision counts read from the file
 * final_inputs, sums the lengths of n straight segments along fun() over
 * [0, acos(-1)], records each result, and writes the elapsed time of the
 * whole loop to score.cov.
 *
 * Returns 0 on success and 1 when a file cannot be opened or the read
 * buffer cannot be allocated.
 */
int main()
{
  int l;
  uint64_t start, end;
  long int diff = 0;

  int i, j, k;
  long double h, t1, t2, dppi, ans = 5.795776322412856L;
  long double s1;

  // variables for logging/checking
  long double log[INPUTS];
  long double threshold = 0.0;
  long double epsilon = -4.0;

  // 0. read input from the file final_inputs
  int finputs[INPUTS];

  FILE* infile = fopen("final_inputs", "r");
  if (!infile)
  {
    printf("Could not open final_inputs\n");
    // Continuing would hand a NULL stream to the read loop below.
    return 1;
  }

  char *s = malloc(10);
  if (!s)
  {
    fclose(infile);
    return 1;
  }
  for (i = 0; i < INPUTS; i++)
  {
    // %9s bounds the read to the 10-byte buffer. Testing the fscanf result
    // replaces the earlier feof() pre-check, which cannot detect a read
    // that fails at end of file.
    if (fscanf(infile, "%9s", s) == 1)
    {
      finputs[i] = (int)cov_deserialize(s, 10);
    }
    else
    {
      // Missing entries become 0 so the slot is never read uninitialized;
      // n == 0 makes the inner loop below run zero times.
      finputs[i] = 0;
    }
  }
  free(s);
  fclose(infile);

  // dummy calls
  // NOTE(review): no calls follow this comment in the excerpt.

  start = current_time_ns();
  for (l = 0; l < INPUTS; l++)
  {
    int n = finputs[l];
    t1 = -1.0;
    dppi = acos(t1);
    s1 = 0.0;
    t1 = 0.0;
    h = dppi / n;

    // Accumulate the length of each straight segment between samples.
    for( i = 1; i <= n; i++ )
    {
      t2 = fun (i * h);
      s1 = s1 + sqrt (h*h + (t2 - t1)*(t2 - t1));
      t1 = t2;
    }

    // 1. compute threshold and record result
    log[l] = (long double) s1;
    if (s1*pow(10, epsilon) > threshold)
    {
      threshold = s1*pow(10, epsilon);
    }
  }
  end = current_time_ns();

  diff = (end-start);

  // 2. create spec, or checking results
  cov_arr_spec_log("spec.cov", threshold, INPUTS, log);
  cov_arr_log(log, INPUTS, "result", "log.cov");
  cov_check("log.cov", "spec.cov", INPUTS);

  // 3. print score (diff) to a file
  FILE* file;
  file = fopen("score.cov", "w");
  if (!file)
  {
    printf("Could not open score.cov\n");
    return 1;
  }
  fprintf(file, "%ld\n", diff);
  fclose(file);

  return 0;
}

/*
 * Main function
 */
int main(int argc, char** argv)
	if (argc < 2)
		std::cout << "specify data file name" << std::endl;
		return 0;
	const char* data_file_name = argv[1];

const unsigned long long full_program_start = current_time_ns();
	// set far field conditions
		const double angle_of_attack = double(3.1415926535897931 / 180.0) * double(deg_angle_of_attack);

		ff_variable[VAR_DENSITY] = double(1.4);

		double ff_pressure = double(1.0);
		double ff_speed_of_sound = sqrt(GAMMA*ff_pressure / ff_variable[VAR_DENSITY]);
		double ff_speed = double(ff_mach)*ff_speed_of_sound;

		cfd_double3 ff_velocity;
		ff_velocity.x = ff_speed*double(cos((double)angle_of_attack));
		ff_velocity.y = ff_speed*double(sin((double)angle_of_attack));
		ff_velocity.z = 0.0;

		ff_variable[VAR_MOMENTUM+0] = ff_variable[VAR_DENSITY] * ff_velocity.x;
		ff_variable[VAR_MOMENTUM+1] = ff_variable[VAR_DENSITY] * ff_velocity.y;
		ff_variable[VAR_MOMENTUM+2] = ff_variable[VAR_DENSITY] * ff_velocity.z;

		ff_variable[VAR_DENSITY_ENERGY] = ff_variable[VAR_DENSITY]*(double(0.5)*(ff_speed*ff_speed)) + (ff_pressure / double(GAMMA-1.0));

		cfd_double3 ff_momentum;
		ff_momentum.x = *(ff_variable+VAR_MOMENTUM+0);
		ff_momentum.y = *(ff_variable+VAR_MOMENTUM+1);
		ff_momentum.z = *(ff_variable+VAR_MOMENTUM+2);
		compute_flux_contribution(ff_variable[VAR_DENSITY], ff_momentum, ff_variable[VAR_DENSITY_ENERGY], ff_pressure, ff_velocity, ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y, ff_flux_contribution_momentum_z, ff_flux_contribution_density_energy);
	int nel;
	int nelr;

	// read in domain geometry
	double* areas;
	int* elements_surrounding_elements;
	double* normals;
		std::ifstream file(data_file_name);

		file >> nel;
		nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length));

		areas = new double[nelr];
		elements_surrounding_elements = new int[nelr*NNB];
		normals = new double[NDIM*NNB*nelr];

		// read in data
		for(int i = 0; i < nel; i++)
			file >> areas[i];
			for(int j = 0; j < NNB; j++)
				file >> elements_surrounding_elements[i*NNB + j];
				if(elements_surrounding_elements[i*NNB+j] < 0) elements_surrounding_elements[i*NNB+j] = -1;
				elements_surrounding_elements[i*NNB + j]--; //it's coming in with Fortran numbering

				for(int k = 0; k < NDIM; k++)
					file >>  normals[(i*NNB + j)*NDIM + k];
					normals[(i*NNB + j)*NDIM + k] = -normals[(i*NNB + j)*NDIM + k];

		// fill in remaining data
		int last = nel-1;
		for(int i = nel; i < nelr; i++)
			areas[i] = areas[last];
			for(int j = 0; j < NNB; j++)
				// duplicate the last element
				elements_surrounding_elements[i*NNB + j] = elements_surrounding_elements[last*NNB + j];
				for(int k = 0; k < NDIM; k++) normals[(i*NNB + j)*NDIM + k] = normals[(last*NNB + j)*NDIM + k];

	// Create arrays and set initial conditions
	double* variables = alloc<double>(nelr*NVAR);
	initialize_variables(nelr, variables);

	double* old_variables = alloc<double>(nelr*NVAR);
	double* fluxes = alloc<double>(nelr*NVAR);
	double* step_factors = alloc<double>(nelr);

	// these need to be computed the first time in order to compute time step
	std::cout << "Starting..." << std::endl;

	// Begin iterations
	for(int i = 0; i < iterations; i++)
		copy(old_variables, variables, nelr*NVAR);

		// for the first iteration we compute the time step
		compute_step_factor(nelr, variables, areas, step_factors);

		for(int j = 0; j < RK; j++)
			compute_flux(nelr, elements_surrounding_elements, normals, variables, fluxes);
			time_step(j, nelr, old_variables, variables, step_factors, fluxes);

	std::cout << "Saving solution..." << std::endl;
	dump(variables, nel, nelr);
	std::cout << "Saved solution..." << std::endl;

	std::cout << "Cleaning up..." << std::endl;

    } ; 
const unsigned long long full_program_end = current_time_ns();
printf("full_program %llu ns\n", full_program_end - full_program_start);

	std::cout << "Done..." << std::endl;

	return 0;
Exemple #15
/*
 * Sample-sort driver on OpenSHMEM: the root PE generates SIZE random values,
 * the data is distributed in equal blocks, each PE sorts locally and picks
 * splitters, global splitters are broadcast, the keys are exchanged in
 * per-destination buckets, sorted again, gathered at the root and checked
 * for ordering.
 *
 * NOTE(review): this excerpt has lost lines (closing braces, else branches,
 * error exits, part of a declaration). The notes below mark the visible
 * gaps; the code itself is left untouched.
 */
int main (int argc, char *argv[]) {
  /**** Initialising ****/
const unsigned long long full_program_start = current_time_ns();
  shmem_init (); 
  /* Variable Declarations */

  // NOTE(review): the second declaration below ends with a comma, so its
  // continuation line is missing; NoElementsToSort, used further down, is
  // not declared anywhere in view.
  int 	     Numprocs,MyRank, Root = 0;
  int 	     i,j,k, NoofElements, NoofElements_Bloc,
  int 	     count, temp;
  TYPE 	     *Input, *InputData;
  TYPE 	     *Splitter, *AllSplitter;
  TYPE 	     *Buckets, *BucketBuffer, *LocalBucket;
  TYPE 	     *OutputBuffer, *Output;
  MyRank = shmem_my_pe ();
  Numprocs = shmem_n_pes ();
  NoofElements = SIZE;

  // The element count must divide evenly among the PEs.
  // NOTE(review): no exit is visible after shmem_finalize on this path.
  if(( NoofElements % Numprocs) != 0){
    if(MyRank == Root)
      printf("Number of Elements are not divisible by Numprocs \n");
    shmem_finalize ();
  /**** Reading Input ****/
  // NOTE(review): every allocation check in this function prints a message,
  // but no exit or return is visible after it.
  Input = (TYPE *) shmem_malloc (NoofElements*sizeof(*Input));
  if(Input == NULL) {
    printf("Error : Can not allocate memory \n");

  if (MyRank == Root){
    /* Initialise random number generator  */ 
    printf ("Generating input Array for Sorting %d uint64_t numbers\n",SIZE);
    for(i=0; i< NoofElements; i++) {
      Input[i] = rand();

  /**** Sending Data ****/

  NoofElements_Bloc = NoofElements / Numprocs;
  InputData = (TYPE *) shmem_malloc (NoofElements_Bloc * sizeof (*InputData));
  if(InputData == NULL) {
    printf("Error : Can not allocate memory \n");
  //MPI_Scatter(Input, NoofElements_Bloc, TYPE_MPI, InputData, 
  //				  NoofElements_Bloc, TYPE_MPI, Root, MPI_COMM_WORLD);

  // The root writes block i of Input into InputData on PE i.
  if(MyRank == Root) {
    for(i=0; i<Numprocs; i++) {
      TYPE* start = &Input[i * NoofElements_Bloc];
      shmem_put64(InputData, start, NoofElements_Bloc, i);

  /**** Sorting Locally ****/
  sorting(InputData, NoofElements_Bloc);

  /**** Choosing Local Splitters ****/
  Splitter = (TYPE *) shmem_malloc (sizeof (TYPE) * (Numprocs-1));
  if(Splitter == NULL) {
    printf("Error : Can not allocate memory \n");
  // Numprocs-1 evenly spaced samples are taken from the local sorted block.
  for (i=0; i< (Numprocs-1); i++){
        Splitter[i] = InputData[NoofElements/(Numprocs*Numprocs) * (i+1)];

  /**** Gathering Local Splitters at Root ****/
  AllSplitter = (TYPE *) shmem_malloc (sizeof (TYPE) * Numprocs * (Numprocs-1));
  if(AllSplitter == NULL) {
    printf("Error : Can not allocate memory \n");
  //MPI_Gather (Splitter, Numprocs-1, TYPE_MPI, AllSplitter, Numprocs-1, 
  //				  TYPE_MPI, Root, MPI_COMM_WORLD);
  // Each PE writes its splitters into its own slot of AllSplitter on the root.
  TYPE* target_index = &AllSplitter[MyRank * (Numprocs-1)];
  shmem_put64(target_index, Splitter, Numprocs-1, Root);

  /**** Choosing Global Splitters ****/
  if (MyRank == Root){
    sorting (AllSplitter, Numprocs*(Numprocs-1));

    for (i=0; i<Numprocs-1; i++)
      Splitter[i] = AllSplitter[(Numprocs-1)*(i+1)];
  /**** Broadcasting Global Splitters ****/
  //MPI_Bcast (Splitter, Numprocs-1, TYPE_MPI, 0, MPI_COMM_WORLD);
  // pSync is reset and all PEs synchronize before the broadcast is issued.
  { int _i; for(_i=0; _i<_SHMEM_BCAST_SYNC_SIZE; _i++) { pSync[_i] = _SHMEM_SYNC_VALUE; } shmem_barrier_all(); }
  shmem_broadcast64(Splitter, Splitter, Numprocs-1, 0, 0, 0, Numprocs, pSync);

  /**** Creating Numprocs Buckets locally ****/
  Buckets = (TYPE *) shmem_malloc (sizeof (TYPE) * (NoofElements + Numprocs));  
  if(Buckets == NULL) {
    printf("Error : Can not allocate memory \n");
  j = 0;
  k = 1;

  // Each bucket occupies NoofElements_Bloc + 1 slots: slot 0 holds the
  // element count and the elements follow from slot 1.
  // NOTE(review): the else branches of the two conditionals below are
  // missing from this excerpt.
  for (i=0; i<NoofElements_Bloc; i++){
    if(j < (Numprocs-1)){
       if (InputData[i] < Splitter[j]) 
			 Buckets[((NoofElements_Bloc + 1) * j) + k++] = InputData[i]; 
	       Buckets[(NoofElements_Bloc + 1) * j] = k-1;
       Buckets[((NoofElements_Bloc + 1) * j) + k++] = InputData[i];
  Buckets[(NoofElements_Bloc + 1) * j] = k - 1;
  /**** Sending buckets to respective processors ****/

  BucketBuffer = (TYPE *) shmem_malloc (sizeof (TYPE) * (NoofElements + Numprocs));
  if(BucketBuffer == NULL) {
    printf("Error : Can not allocate memory \n");

  //MPI_Alltoall (Buckets, NoofElements_Bloc + 1, TYPE_MPI, BucketBuffer, 
  //					 NoofElements_Bloc + 1, TYPE_MPI, MPI_COMM_WORLD);
  // Local bucket i is written into slot MyRank of BucketBuffer on PE i.
  for(i=0; i<Numprocs; i++) {
    shmem_put64(&BucketBuffer[MyRank*(NoofElements_Bloc + 1)], &Buckets[i*(NoofElements_Bloc + 1)],  NoofElements_Bloc + 1, i);   

  /**** Rearranging BucketBuffer ****/
  LocalBucket = (TYPE *) shmem_malloc (sizeof (TYPE) * 2 * NoofElements / Numprocs);
  if(LocalBucket == NULL) {
    printf("Error : Can not allocate memory \n");

  count = 1;

  // Received buckets are concatenated into LocalBucket from index 1;
  // LocalBucket[0] then receives the total element count.
  for (j=0; j<Numprocs; j++) {
  k = 1;
    for (i=0; i<BucketBuffer[(NoofElements/Numprocs + 1) * j]; i++) 
      LocalBucket[count++] = BucketBuffer[(NoofElements/Numprocs + 1) * j + k++];
  LocalBucket[0] = count-1;
  /**** Sorting Local Buckets using Bubble Sort ****/
  /*sorting (InputData, NoofElements_Bloc, sizeof(int), intcompare); */

  NoElementsToSort = LocalBucket[0];
  sorting (&LocalBucket[1], NoElementsToSort); 

  /**** Gathering sorted sub blocks at root ****/
  OutputBuffer = (TYPE *) shmem_malloc (sizeof(TYPE) * 2 * NoofElements);
  if(OutputBuffer == NULL) {
    printf("Error : Can not allocate memory \n");

  //MPI_Gather (LocalBucket, 2*NoofElements_Bloc, TYPE_MPI, OutputBuffer, 
  //				  2*NoofElements_Bloc, TYPE_MPI, Root, MPI_COMM_WORLD);
  // Each PE writes its LocalBucket into its own slot of OutputBuffer on the root.
  target_index = &OutputBuffer[MyRank * (2*NoofElements_Bloc)];
  shmem_put64(target_index, LocalBucket, 2*NoofElements_Bloc, Root);

  /**** Rearranging output buffer ****/
  if (MyRank == Root){
    Output = (TYPE *) malloc (sizeof (TYPE) * NoofElements);
    count = 0;
    // Each slot of OutputBuffer starts with its element count; the elements
    // are appended to Output in PE order.
    for(j=0; j<Numprocs; j++){
      k = 1;
      for(i=0; i<OutputBuffer[(2 * NoofElements/Numprocs) * j]; i++) 
        Output[count++] = OutputBuffer[(2*NoofElements/Numprocs) * j + k++];
       printf ( "Number of Elements to be sorted : %d \n", NoofElements);
       // Ordering check: every element must be at least as large as the
       // one before it.
       TYPE prev = 0;
       int fail = 0;
       for (i=0; i<NoofElements; i++){
         if(Output[i] < prev) { printf("Failed at index %d\n",i); fail = 1; }
         prev = Output[i];
       if(fail) printf("Sorting FAILED\n");  
       else  printf("Sorting PASSED\n");
  }/* MyRank==0*/


   /**** Finalize ****/
   // NOTE(review): no finalize call or return statement is visible after
   // this point in the excerpt.
  } ; 
const unsigned long long full_program_end = current_time_ns();
printf("full_program %llu ns\n", full_program_end - full_program_start);

void compute_flux(int nelr, int* elements_surrounding_elements, double* normals, double* variables, double* fluxes)
	double smoothing_coefficient = double(0.2f);

 { const unsigned long long parallel_for_start = current_time_ns();
#pragma omp parallel for default(shared) schedule(static)
for(int i = 0; i < nelr; i++)
		int j, nb;
		cfd_double3 normal; double normal_len;
		double factor;

		double density_i = variables[NVAR*i + VAR_DENSITY];
		cfd_double3 momentum_i;
		momentum_i.x = variables[NVAR*i + (VAR_MOMENTUM+0)];
		momentum_i.y = variables[NVAR*i + (VAR_MOMENTUM+1)];
		momentum_i.z = variables[NVAR*i + (VAR_MOMENTUM+2)];

		double density_energy_i = variables[NVAR*i + VAR_DENSITY_ENERGY];

		cfd_double3 velocity_i;             				 compute_velocity(density_i, momentum_i, velocity_i);
		double speed_sqd_i                          = compute_speed_sqd(velocity_i);
		double speed_i                              = std::sqrt(speed_sqd_i);
		double pressure_i                           = compute_pressure(density_i, density_energy_i, speed_sqd_i);
		double speed_of_sound_i                     = compute_speed_of_sound(density_i, pressure_i);
		cfd_double3 flux_contribution_i_momentum_x, flux_contribution_i_momentum_y, flux_contribution_i_momentum_z;
		cfd_double3 flux_contribution_i_density_energy;
		compute_flux_contribution(density_i, momentum_i, density_energy_i, pressure_i, velocity_i, flux_contribution_i_momentum_x, flux_contribution_i_momentum_y, flux_contribution_i_momentum_z, flux_contribution_i_density_energy);

		double flux_i_density = double(0.0);
		cfd_double3 flux_i_momentum;
		flux_i_momentum.x = double(0.0);
		flux_i_momentum.y = double(0.0);
		flux_i_momentum.z = double(0.0);
		double flux_i_density_energy = double(0.0);

		cfd_double3 velocity_nb;
		double density_nb, density_energy_nb;
		cfd_double3 momentum_nb;
		cfd_double3 flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y, flux_contribution_nb_momentum_z;
		cfd_double3 flux_contribution_nb_density_energy;
		double speed_sqd_nb, speed_of_sound_nb, pressure_nb;

		for(j = 0; j < NNB; j++)
			nb = elements_surrounding_elements[i*NNB + j];
			normal.x = normals[(i*NNB + j)*NDIM + 0];
			normal.y = normals[(i*NNB + j)*NDIM + 1];
			normal.z = normals[(i*NNB + j)*NDIM + 2];
			normal_len = std::sqrt(normal.x*normal.x + normal.y*normal.y + normal.z*normal.z);

			if(nb >= 0) 	// a legitimate neighbor
				density_nb =        variables[nb*NVAR + VAR_DENSITY];
				momentum_nb.x =     variables[nb*NVAR + (VAR_MOMENTUM+0)];
				momentum_nb.y =     variables[nb*NVAR + (VAR_MOMENTUM+1)];
				momentum_nb.z =     variables[nb*NVAR + (VAR_MOMENTUM+2)];
				density_energy_nb = variables[nb*NVAR + VAR_DENSITY_ENERGY];
													compute_velocity(density_nb, momentum_nb, velocity_nb);
				speed_sqd_nb                      = compute_speed_sqd(velocity_nb);
				pressure_nb                       = compute_pressure(density_nb, density_energy_nb, speed_sqd_nb);
				speed_of_sound_nb                 = compute_speed_of_sound(density_nb, pressure_nb);
													compute_flux_contribution(density_nb, momentum_nb, density_energy_nb, pressure_nb, velocity_nb, flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y, flux_contribution_nb_momentum_z, flux_contribution_nb_density_energy);

				// artificial viscosity
				factor = -normal_len*smoothing_coefficient*double(0.5)*(speed_i + std::sqrt(speed_sqd_nb) + speed_of_sound_i + speed_of_sound_nb);
				flux_i_density += factor*(density_i-density_nb);
				flux_i_density_energy += factor*(density_energy_i-density_energy_nb);
				flux_i_momentum.x += factor*(momentum_i.x-momentum_nb.x);
				flux_i_momentum.y += factor*(momentum_i.y-momentum_nb.y);
				flux_i_momentum.z += factor*(momentum_i.z-momentum_nb.z);

				// accumulate cell-centered fluxes
				factor = double(0.5)*normal.x;
				flux_i_density += factor*(momentum_nb.x+momentum_i.x);
				flux_i_density_energy += factor*(flux_contribution_nb_density_energy.x+flux_contribution_i_density_energy.x);
				flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.x+flux_contribution_i_momentum_x.x);
				flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.x+flux_contribution_i_momentum_y.x);
				flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.x+flux_contribution_i_momentum_z.x);

				factor = double(0.5)*normal.y;
				flux_i_density += factor*(momentum_nb.y+momentum_i.y);
				flux_i_density_energy += factor*(flux_contribution_nb_density_energy.y+flux_contribution_i_density_energy.y);
				flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.y+flux_contribution_i_momentum_x.y);
				flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.y+flux_contribution_i_momentum_y.y);
				flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.y+flux_contribution_i_momentum_z.y);

				factor = double(0.5)*normal.z;
				flux_i_density += factor*(momentum_nb.z+momentum_i.z);
				flux_i_density_energy += factor*(flux_contribution_nb_density_energy.z+flux_contribution_i_density_energy.z);
				flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.z+flux_contribution_i_momentum_x.z);
				flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.z+flux_contribution_i_momentum_y.z);
				flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.z+flux_contribution_i_momentum_z.z);
			else if(nb == -1)	// a wing boundary
				flux_i_momentum.x += normal.x*pressure_i;
				flux_i_momentum.y += normal.y*pressure_i;
				flux_i_momentum.z += normal.z*pressure_i;
			else if(nb == -2) // a far field boundary
				factor = double(0.5)*normal.x;
				flux_i_density += factor*(ff_variable[VAR_MOMENTUM+0]+momentum_i.x);
				flux_i_density_energy += factor*(ff_flux_contribution_density_energy.x+flux_contribution_i_density_energy.x);
				flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x.x + flux_contribution_i_momentum_x.x);
				flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y.x + flux_contribution_i_momentum_y.x);
				flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z.x + flux_contribution_i_momentum_z.x);

				factor = double(0.5)*normal.y;
				flux_i_density += factor*(ff_variable[VAR_MOMENTUM+1]+momentum_i.y);
				flux_i_density_energy += factor*(ff_flux_contribution_density_energy.y+flux_contribution_i_density_energy.y);
				flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x.y + flux_contribution_i_momentum_x.y);
				flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y.y + flux_contribution_i_momentum_y.y);
				flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z.y + flux_contribution_i_momentum_z.y);

				factor = double(0.5)*normal.z;
				flux_i_density += factor*(ff_variable[VAR_MOMENTUM+2]+momentum_i.z);
				flux_i_density_energy += factor*(ff_flux_contribution_density_energy.z+flux_contribution_i_density_energy.z);
				flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x.z + flux_contribution_i_momentum_x.z);
				flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y.z + flux_contribution_i_momentum_y.z);
				flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z.z + flux_contribution_i_momentum_z.z);


		fluxes[i*NVAR + VAR_DENSITY] = flux_i_density;
		fluxes[i*NVAR + (VAR_MOMENTUM+0)] = flux_i_momentum.x;
		fluxes[i*NVAR + (VAR_MOMENTUM+1)] = flux_i_momentum.y;
		fluxes[i*NVAR + (VAR_MOMENTUM+2)] = flux_i_momentum.z;
		fluxes[i*NVAR + VAR_DENSITY_ENERGY] = flux_i_density_energy;
	} ; 
const unsigned long long parallel_for_end = current_time_ns();
printf("pragma186_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start); } 

Exemple #17
/**
* Thin wrapper that exposes current_time_ns() under the hclib_ prefix
* @return The value reported by current_time_ns(), unchanged (going by the name, a timestamp in nanoseconds -- confirm against its definition)
*/
unsigned long long hclib_current_time_ns() {
    return current_time_ns();
}
/**
* The implementation of the particle filter using OpenMP for many frames
* @see http://openmp.org/wp/
* @note This function is designed to work with a video of several frames. For every frame after the first it
* moves the particles with a random walk, scores each particle from the pixels of I around it, normalizes the
* weights, prints the weighted position estimate (XE/YE) and its distance from the initial centre, and then
* resamples the particles from the cumulative weight distribution.
* @note Wall-clock timings of each stage and of each OpenMP loop are printed to stdout.
* @note NOTE(review): the results of malloc are not checked here; an allocation failure will crash.
* @param I The video to be run (indexed as x*IszY*Nfr + y*Nfr + frame)
* @param IszX The x dimension of the video
* @param IszY The y dimension of the video
* @param Nfr The number of frames
* @param seed The seed array used for random number generation
* @param Nparticles The number of particles to be used
*/
void particleFilter(int * I, int IszX, int IszY, int Nfr, int * seed, int Nparticles){
	int max_size = IszX*IszY*Nfr;
	long long start = get_time();
	//original particle centroid
	double xe = roundDouble(IszY/2.0);
	double ye = roundDouble(IszX/2.0);

	//expected object locations, compared to center
	int radius = 5;
	int diameter = radius*2 - 1;
	int * disk = (int *)malloc(diameter*diameter*sizeof(int));
	strelDisk(disk, radius);
	//count the cells of the structuring element that are set; this sizes objxy and ind below
	int countOnes = 0;
	int x, y;
	for(x = 0; x < diameter; x++){
		for(y = 0; y < diameter; y++){
			if(disk[x*diameter + y] == 1)
				countOnes++;
		}
	}
	double * objxy = (double *)malloc(countOnes*2*sizeof(double));
	getneighbors(disk, countOnes, objxy, radius);

	long long get_neighbors = get_time();
	printf("TIME TO GET NEIGHBORS TOOK: %f\n", elapsed_time(start, get_neighbors));

	//initial weights are all equal (1/Nparticles)
	double * weights = (double *)malloc(sizeof(double)*Nparticles);
	{
		const unsigned long long parallel_for_start = current_time_ns();
#pragma omp parallel for shared(weights, Nparticles) private(x)
		for(x = 0; x < Nparticles; x++){
			weights[x] = 1/((double)(Nparticles));
		}
		const unsigned long long parallel_for_end = current_time_ns();
		printf("pragma373_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start);
	}

	long long get_weights = get_time();
	printf("TIME TO GET WEIGHTSTOOK: %f\n", elapsed_time(get_neighbors, get_weights));

	//per-particle work buffers
	double * likelihood = (double *)malloc(sizeof(double)*Nparticles);
	double * arrayX = (double *)malloc(sizeof(double)*Nparticles);
	double * arrayY = (double *)malloc(sizeof(double)*Nparticles);
	double * xj = (double *)malloc(sizeof(double)*Nparticles);
	double * yj = (double *)malloc(sizeof(double)*Nparticles);
	double * CDF = (double *)malloc(sizeof(double)*Nparticles);
	double * u = (double *)malloc(sizeof(double)*Nparticles);
	int * ind = (int*)malloc(sizeof(int)*countOnes*Nparticles);

	//every particle starts at the initial centroid
	{
		const unsigned long long parallel_for_start = current_time_ns();
#pragma omp parallel for shared(arrayX, arrayY, xe, ye) private(x)
		for(x = 0; x < Nparticles; x++){
			arrayX[x] = xe;
			arrayY[x] = ye;
		}
		const unsigned long long parallel_for_end = current_time_ns();
		printf("pragma388_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start);
	}

	int k;
	printf("TIME TO SET ARRAYS TOOK: %f\n", elapsed_time(get_weights, get_time()));
	int indX, indY;
	for(k = 1; k < Nfr; k++){
		long long set_arrays = get_time();
		//apply motion model
		//draws sample from motion model (random walk). The only prior information
		//is that the object moves 2x as fast as in the y direction
		{
			const unsigned long long parallel_for_start = current_time_ns();
#pragma omp parallel for shared(arrayX, arrayY, Nparticles, seed) private(x)
			for(x = 0; x < Nparticles; x++){
				arrayX[x] += 1 + 5*randn(seed, x);
				arrayY[x] += -2 + 2*randn(seed, x);
			}
			const unsigned long long parallel_for_end = current_time_ns();
			printf("pragma402_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start);
		}

		long long error = get_time();
		printf("TIME TO SET ERROR TOOK: %f\n", elapsed_time(set_arrays, error));

		//particle filter likelihood
		{
			const unsigned long long parallel_for_start = current_time_ns();
#pragma omp parallel for shared(likelihood, I, arrayX, arrayY, objxy, ind) private(x, y, indX, indY)
			for(x = 0; x < Nparticles; x++){
				//compute the likelihood: remember our assumption is that you know
				// foreground and the background image intensity distribution.
				// Notice that we consider here a likelihood ratio, instead of
				// p(z|x). It is possible in this case. why? a hometask for you.
				//calc ind
				for(y = 0; y < countOnes; y++){
					indX = roundDouble(arrayX[x]) + objxy[y*2 + 1];
					indY = roundDouble(arrayY[x]) + objxy[y*2];
					//fabs folds negative offsets back to non-negative; offsets past the end of I fall back to 0
					ind[x*countOnes + y] = fabs((double)(indX*IszY*Nfr + indY*Nfr + k));
					if(ind[x*countOnes + y] >= max_size)
						ind[x*countOnes + y] = 0;
				}
				likelihood[x] = 0;
				for(y = 0; y < countOnes; y++)
					likelihood[x] += (pow((I[ind[x*countOnes + y]] - 100),2) - pow((I[ind[x*countOnes + y]]-228),2))/50.0;
				likelihood[x] = likelihood[x]/((double) countOnes);
			}
			const unsigned long long parallel_for_end = current_time_ns();
			printf("pragma410_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start);
		}

		long long likelihood_time = get_time();
		printf("TIME TO GET LIKELIHOODS TOOK: %f\n", elapsed_time(error, likelihood_time));

		// update & normalize weights
		// using equation (63) of Arulampalam Tutorial
		{
			const unsigned long long parallel_for_start = current_time_ns();
#pragma omp parallel for shared(Nparticles, weights, likelihood) private(x)
			for(x = 0; x < Nparticles; x++){
				weights[x] = weights[x] * exp(likelihood[x]);
			}
			const unsigned long long parallel_for_end = current_time_ns();
			printf("pragma433_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start);
		}

		long long exponential = get_time();
		printf("TIME TO GET EXP TOOK: %f\n", elapsed_time(likelihood_time, exponential));

		double sumWeights = 0;
		{
			const unsigned long long parallel_for_start = current_time_ns();
#pragma omp parallel for private(x) reduction(+:sumWeights)
			for(x = 0; x < Nparticles; x++){
				sumWeights += weights[x];
			}
			const unsigned long long parallel_for_end = current_time_ns();
			printf("pragma440_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start);
		}

		long long sum_time = get_time();
		printf("TIME TO SUM WEIGHTS TOOK: %f\n", elapsed_time(exponential, sum_time));

		{
			const unsigned long long parallel_for_start = current_time_ns();
#pragma omp parallel for shared(sumWeights, weights) private(x)
			for(x = 0; x < Nparticles; x++){
				weights[x] = weights[x]/sumWeights;
			}
			const unsigned long long parallel_for_end = current_time_ns();
			printf("pragma446_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start);
		}

		long long normalize = get_time();
		printf("TIME TO NORMALIZE WEIGHTS TOOK: %f\n", elapsed_time(sum_time, normalize));

		xe = 0;
		ye = 0;
		// estimate the object location by expected values
		{
			const unsigned long long parallel_for_start = current_time_ns();
#pragma omp parallel for private(x) reduction(+:xe, ye)
			for(x = 0; x < Nparticles; x++){
				xe += arrayX[x] * weights[x];
				ye += arrayY[x] * weights[x];
			}
			const unsigned long long parallel_for_end = current_time_ns();
			printf("pragma455_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start);
		}

		long long move_time = get_time();
		printf("TIME TO MOVE OBJECT TOOK: %f\n", elapsed_time(normalize, move_time));
		printf("XE: %lf\n", xe);
		printf("YE: %lf\n", ye);
		//distance of the estimate from the initial centre of the frame
		double distance = sqrt( pow((double)(xe-(int)roundDouble(IszY/2.0)),2) + pow((double)(ye-(int)roundDouble(IszX/2.0)),2) );
		printf("%lf\n", distance);
		//display(hold off for now)
		//pause(hold off for now)

		//cumulative sum of the weights; sequential because each entry depends on the previous one
		CDF[0] = weights[0];
		for(x = 1; x < Nparticles; x++){
			CDF[x] = weights[x] + CDF[x-1];
		}

		long long cum_sum = get_time();
		printf("TIME TO CALC CUM SUM TOOK: %f\n", elapsed_time(move_time, cum_sum));

		//evenly spaced sampling points, shifted by one random offset u1
		double u1 = (1/((double)(Nparticles)))*randu(seed, 0);
		{
			const unsigned long long parallel_for_start = current_time_ns();
#pragma omp parallel for shared(u, u1, Nparticles) private(x)
			for(x = 0; x < Nparticles; x++){
				u[x] = u1 + x/((double)(Nparticles));
			}
			const unsigned long long parallel_for_end = current_time_ns();
			printf("pragma480_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start);
		}

		long long u_time = get_time();
		printf("TIME TO CALC U TOOK: %f\n", elapsed_time(cum_sum, u_time));

		//resample: each new particle copies the particle that findIndex selects for u[j]
		int j, i;
		{
			const unsigned long long parallel_for_start = current_time_ns();
#pragma omp parallel for shared(CDF, Nparticles, xj, yj, u, arrayX, arrayY) private(i, j)
			for(j = 0; j < Nparticles; j++){
				i = findIndex(CDF, Nparticles, u[j]);
				//findIndex reports -1 when it finds nothing; fall back to the last particle
				if(i == -1)
					i = Nparticles-1;
				xj[j] = arrayX[i];
				yj[j] = arrayY[i];
			}
			const unsigned long long parallel_for_end = current_time_ns();
			printf("pragma488_omp_parallel %llu ns\n", parallel_for_end - parallel_for_start);
		}

		long long xyj_time = get_time();
		printf("TIME TO CALC NEW ARRAY X AND Y TOOK: %f\n", elapsed_time(u_time, xyj_time));

		//#pragma omp parallel for shared(weights, Nparticles) private(x)
		for(x = 0; x < Nparticles; x++){
			//reassign arrayX and arrayY
			arrayX[x] = xj[x];
			arrayY[x] = yj[x];
			weights[x] = 1/((double)(Nparticles));
		}

		long long reset = get_time();
		printf("TIME TO RESET WEIGHTS TOOK: %f\n", elapsed_time(xyj_time, reset));
	}

	//release every buffer allocated above
	free(disk);
	free(objxy);
	free(weights);
	free(likelihood);
	free(xj);
	free(yj);
	free(arrayX);
	free(arrayY);
	free(CDF);
	free(u);
	free(ind);
}
/**
* Program entry point: validates the command line, builds the input video and runs the particle filter
* Expected invocation: openmp.out -x <dimX> -y <dimY> -z <Nfr> -np <Nparticles>
* Prints the usage string or an error message and returns 0 when the arguments are malformed;
* returns 1 if the seed or video buffers cannot be allocated.
* @param argc The number of command line arguments (must be 9)
* @param argv The command line arguments
* @return 0 on success or on an argument error, 1 on allocation failure
*/
int main(int argc, char * argv[]){
	const char* usage = "openmp.out -x <dimX> -y <dimY> -z <Nfr> -np <Nparticles>";
	//check number of arguments
	if(argc != 9){
		printf("%s\n", usage);
		return 0;
	}
	//check args deliminators
	if( strcmp( argv[1], "-x" ) ||  strcmp( argv[3], "-y" ) || strcmp( argv[5], "-z" ) || strcmp( argv[7], "-np" ) ) {
		printf( "%s\n",usage );
		return 0;
	}

	int IszX = 0, IszY = 0, Nfr = 0, Nparticles = 0;

	//converting a string to a integer
	//sscanf returns the number of converted fields: anything other than 1 means no integer was read
	if( sscanf( argv[2], "%d", &IszX ) != 1 ) {
	   printf("ERROR: dimX input is incorrect");
	   return 0;
	}
	if( IszX <= 0 ) {
		printf("dimX must be > 0\n");
		return 0;
	}

	//converting a string to a integer
	if( sscanf( argv[4], "%d", &IszY ) != 1 ) {
	   printf("ERROR: dimY input is incorrect");
	   return 0;
	}
	if( IszY <= 0 ) {
		printf("dimY must be > 0\n");
		return 0;
	}

	//converting a string to a integer
	if( sscanf( argv[6], "%d", &Nfr ) != 1 ) {
	   printf("ERROR: Number of frames input is incorrect");
	   return 0;
	}
	if( Nfr <= 0 ) {
		printf("number of frames must be > 0\n");
		return 0;
	}

	//converting a string to a integer
	if( sscanf( argv[8], "%d", &Nparticles ) != 1 ) {
	   printf("ERROR: Number of particles input is incorrect");
	   return 0;
	}
	if( Nparticles <= 0 ) {
		printf("Number of particles must be > 0\n");
		return 0;
	}

	//establish seed
	int * seed = (int *)malloc(sizeof(int)*Nparticles);
	if(seed == NULL){
		fprintf(stderr, "ERROR: unable to allocate the seed array\n");
		return 1;
	}
	int i;
	for(i = 0; i < Nparticles; i++)
		seed[i] = time(0)*i;

	//malloc matrix
	int * I = (int *)malloc(sizeof(int)*IszX*IszY*Nfr);
	if(I == NULL){
		fprintf(stderr, "ERROR: unable to allocate the video matrix\n");
		free(seed);
		return 1;
	}

	long long start = get_time();
	//call video sequence
	videoSequence(I, IszX, IszY, Nfr, seed);
	long long endVideoSequence = get_time();
	printf("VIDEO SEQUENCE TOOK %f\n", elapsed_time(start, endVideoSequence));

	//call particle filter
	const unsigned long long full_program_start = current_time_ns();
	particleFilter(I, IszX, IszY, Nfr, seed, Nparticles);
	const unsigned long long full_program_end = current_time_ns();
	printf("full_program %llu ns\n", full_program_end - full_program_start);

	long long endParticleFilter = get_time();
	printf("PARTICLE FILTER TOOK %f\n", elapsed_time(endVideoSequence, endParticleFilter));
	printf("ENTIRE PROGRAM TOOK %f\n", elapsed_time(start, endParticleFilter));

	free(seed);
	free(I);
	return 0;
}
Exemple #20
unsigned long long hclib_current_time_ms() {
    return current_time_ns() / 1000000;