Beispiel #1
0
/*
 * Benchmark driver for the sense-reversing spin barrier.
 *
 * Usage: spin_sense <Number-of-OpenMP-threads> <Num-of-Barriers>
 *
 * Every thread calls sense_barrier() num_barriers times (capped at 1000) and
 * records a wall-clock timestamp immediately before and after each call.
 * The time of one barrier episode is taken as (latest exit - latest entry)
 * over all threads; the program prints the average over all episodes as
 * "Barrier_Time=<seconds>".
 *
 * Exits with -1 on a wrong argument count or on a non-positive thread or
 * barrier count.
 */
int main(int argc, char **argv) {
  // Serial code
  int a=0;            // shared counter, incremented once per thread per barrier
  int local_sense=1;  // each thread gets its own copy via firstprivate
  int i,j;

  if(argc!=3) {
	printf("Usage: spin_sense <Number-of-OpenMP-threads> <Num-of-Barriers>\n");
	exit(-1);
  }

  num_threads=atoi( argv[1] );
  num_barriers=atoi(argv[2]);
  // Both values size the variable-length arrays below and num_barriers is the
  // divisor of the final average, so zero or negative input must be rejected
  // (atoi also returns 0 for non-numeric text).
  if (num_threads < 1 || num_barriers < 1) {
	printf("Usage: spin_sense <Number-of-OpenMP-threads> <Num-of-Barriers>\n");
	exit(-1);
  }
  if (num_barriers > 1000) num_barriers = 1000;
  count=num_threads;

  // Per-barrier, per-thread entry/exit timestamps plus per-barrier maxima.
  double start[num_barriers][num_threads],end[num_barriers][num_threads],barrier_time[num_barriers],maxstart[num_barriers],maxend[num_barriers], avg_time = 0.0;

  for(i=0; i<num_barriers; i++) {
	maxstart[i]=0.0;
	maxend[i]=0.0;
	barrier_time[i]=0.0;
  }

  if(DEBUG == 1)	printf("This is the serial section\n");
  omp_set_num_threads(num_threads);

  #pragma omp parallel shared(a) private(i) firstprivate(local_sense)
  {
    // Now we're in the parallel section
    int thread_num = omp_get_thread_num();

	for (i=0; i<num_barriers; i++) {
		#pragma omp critical
		{
			a++;
		}
		// NOTE(review): the DEBUG prints read 'a' outside the critical section,
		// so the printed value is only an unsynchronized snapshot.
		if(DEBUG == 1)	printf("a=%d in thread %d before barrier # %d.\n",a,thread_num,i+1);
		start[i][thread_num] = omp_get_wtime();
		sense_barrier(&local_sense,thread_num);
		end[i][thread_num] = omp_get_wtime();
		if(DEBUG == 1)	printf("a=%d in thread %d after barrier # %d.\n",a,thread_num,i+1);
	}
  }

  // Reduce the timestamps: latest arrival and latest departure per barrier.
  for(i=0;i<num_barriers;i++) {
	  for(j=0;j<num_threads;j++) {
		if(start[i][j]>maxstart[i])	maxstart[i]=start[i][j];
		if(end[i][j]>maxend[i])	maxend[i]=end[i][j];
	  }
	  barrier_time[i]=maxend[i]-maxstart[i];
	  if(DEBUG == 1)	printf("Barrier Time of barrier #%d = %lf\n",i+1,barrier_time[i]);
  }
  for(i=0;i<num_barriers;i++) {
	avg_time += barrier_time[i];
  }
  avg_time = avg_time/num_barriers;
  printf("Barrier_Time=%lf\n",avg_time);

  // Resume serial code
  if(DEBUG == 1)	printf("Back in the serial section again\n");
  return 0;
}
Beispiel #2
0
int main(int argc, char const *argv[])
{
	char* s;
	std::srand(std::time(0)); //use current time as seed for random generator
	int r = rand() % 1000;
	for(int i = 0; i < r; i++)
	{
		rand();
	}
	if(argc < 3)
	{
		return 1;
	}
	
	int forestSize = strtol(argv[1], &s, 10);
	int iterations = strtol(argv[2], &s, 10);

	double SIDE = std::sqrt(forestSize);
	SIDE = fRand(std::sqrt(SIDE),std::sqrt(2)*SIDE);
	double R = 1;

	double begin, end;

	std::vector<int> empty;

	std::vector<Tree*> Forest;
	std::vector< std::vector<int> > neighbors(forestSize,empty);
	std::vector<double> metrics(forestSize,0.0);

	int num_threads;

	std::vector<int> systems_processed; // DEBUG
    std::vector<int> symbols_translated; // DEBUG

///// PARALLEL BLOCK
	begin = omp_get_wtime();
	#pragma omp parallel shared(Forest,neighbors,metrics,num_threads)
	{
		#pragma omp master
		{
			// INIT VARIABLES
			std::vector<Point> positions;

			num_threads = omp_get_num_threads();
			std::cout << "Running " << forestSize << " trees for " << iterations << " iterations on " << num_threads << " processors" << std::endl;
			
			for(int i = 0; i < forestSize; i++)
			{
				double x = fRand(0,SIDE);
				double y = fRand(0,SIDE);
				Point p = {x,y};
				Tree *T = new MonopodialTree();
				Forest.push_back(T);
				positions.push_back(p);
				for(int j = 0 ; j < i ; j++)
				{
					Point q = positions[j];
					if(pointDistance(p,q) < R)
					{
						neighbors[j].push_back(i);
						neighbors[i].push_back(j);
					}
				}
			}

			systems_processed = std::vector<int>(num_threads,0); //DEBUG
			symbols_translated = std::vector<int>(num_threads,0); //DEBUG
		}

		#pragma omp barrier

		int thread_num = omp_get_thread_num();
		// ITERATE
		for(int j = 0 ; j < iterations ; j++)
		{
			#pragma omp for schedule(dynamic)
			for(int i = 0; i < Forest.size() ; i++)
			{
				Forest[i]->next();
				double metric = Forest[i]->calculateMetric();
				metrics[i] = metric;
				systems_processed[thread_num]++; //DEBUG
				symbols_translated[thread_num] += Forest[i]->getState().size(); //DEBUG
			}

			#pragma omp for schedule(dynamic)
			for(int i = 0; i < Forest.size() ; i++)
			{
				Forest[i]->updateMetric(metrics,neighbors[i]);
			}
		}
	}
///// PARALLEL BLOCK
	end = omp_get_wtime();

	std::vector< std::vector<int> > connected_components = get_connected_components(neighbors);
	// print_forest(Forest, neighbors, metrics); // VERBOSE
	// print_connected_components( connected_components); // VERBOSE


	char buffer[80];

	FILE *f = fopen("Results_naive.txt", "a");
	if(f != NULL)
	{
	    fprintf(f, "%s\n", gettime(buffer));
	    fprintf(f,"%d threads\n",num_threads);
	    fprintf(f,"%d trees\n",forestSize);
	    fprintf(f,"%d iterations\n",iterations);
	    fprintf(f,"%lf %lf\n",SIDE,R);
	    for(int i = 0; i < connected_components.size(); i++)
	    {
	    	fprintf(f, "%d ", connected_components[i].size());
	    }
	    fprintf(f, "\n");
	    fprintf(f,"Proc   Systems   Symbols\n");//DEBUG
	    for(int i = 0; i < num_threads; i++)//DEBUG
	    {//DEBUG
	        fprintf(f,"  %02d  %03d  %03d\n",i,systems_processed[i],symbols_translated[i]);//DEBUG
	    }//DEBUG
	    fprintf(f,"Time : %f seconds\n", end-begin);
	    fprintf(f,"\n=====================\n");
	}

	for(int i = 0; i < Forest.size() ; i++)
	{
		delete Forest[i];
	}


	return 0;
}
int main ( int argc, char *argv[] )

/******************************************************************************/
/*
  Purpose:

    Estimates the steady-state values on an M x N grid by random walks.

  Discussion:

    The boundary is fixed: the i = 0 row is 0.0, the other three edges are
    100.0. Interior cells start at the mean of the boundary values. In each
    sweep, every interior cell launches ITER random walks; each walk adds the
    value of the boundary it reaches to a running sum, and the cell's value is
    updated as a running average over all walks so far. Sweeps repeat until
    no cell changes by more than epsilon.

    The program prints the boundary mean, one progress line per sweep, and
    the elapsed wall-clock time.
*/
{
# define M 12
# define N 12
# define ITER 1000

  int i, j, cur, temp_i, temp_j;
  
  double epsilon = 0.001;
  double mean = 0.0;
  double diff, my_diff;
  

  double u[M][N];

/*
* Begin setup of the array. 
*/
  #pragma omp parallel shared( u ) private(i, j) reduction(+ : mean)
  {
    #pragma omp for
    for ( i = 1; i < M - 1; i++ )
    {
      u[i][0] = 100.0;
    }
    #pragma omp for
    for ( i = 1; i < M - 1; i++ )
    {
      u[i][N-1] = 100.0;
    }
    #pragma omp for
    for ( j = 0; j < N; j++ )
    {
      u[M-1][j] = 100.0;
    }
    #pragma omp for
    for ( j = 0; j < N; j++ )
    {
      u[0][j] = 0.0;
    }
/*
  Average the boundary values, to come up with a reasonable
  initial value for the interior.
*/
    #pragma omp for
    for ( i = 1; i < M - 1; i++ )
    {
      mean = mean + u[i][0];
    }
    #pragma omp for
    for ( i = 1; i < M - 1; i++ )
    {
      mean = mean + u[i][N-1];
    }
    #pragma omp for
    for ( j = 0; j < N; j++ )
    {
      mean = mean + u[M-1][j];
    }
    #pragma omp for
    for ( j = 0; j < N; j++ )
    {
      mean = mean + u[0][j];
    }
  }
  mean = mean / ( double ) ( 2 * M + 2 * N - 4 );
  printf ( "\n" );
  printf ( "  MEAN = %f\n", mean );
/* 
  Initialize the interior solution to the mean value.
*/
  #pragma omp parallel shared( u ) private(i, j)
  {
    #pragma omp for
    for(i = 1; i < M -1; i++)
    {
      for(j = 1; j < N -1; j++)
      {
        u[i][j] = mean;
      }
    }
  }

  printf(" MEAN = %f\n", mean);

  /*
  * End array setup so at this point our array contains the values
  * that it starts with.
  */
  diff = epsilon;  
  int iteration_number = 0;
  int run = 1;
  /*
   * Seed the generator once. Re-seeding from time(NULL) on every sweep, as
   * was done before, restarts (nearly) the same random sequence for all
   * sweeps that fall within the same second, so the walks are not
   * independent samples.
   * NOTE(review): rand() is not guaranteed to be thread-safe by the C
   * standard; confirm the target C library serializes it.
   */
  srand((unsigned int)time(NULL));
  double wtime = omp_get_wtime();
  while(run)
  {
    /* Number of cells that moved by more than epsilon in this sweep. */
    int cont = 0;
    printf("Currently running on iteration number %d with diff %f\n", iteration_number, diff);
    diff = 0.0;
    iteration_number++;
    /*
     * my_diff is the per-thread maximum change. It must be private: as a
     * shared variable it was read and written by all threads without
     * synchronization.
     */
    #pragma omp parallel shared(u, diff) private(i, j, cur, mean, temp_i, temp_j, my_diff) reduction(+ : cont)
    {
      my_diff = 0.0;
      for(i = 1; i < M-1; i++)
      {
        /* Every thread walks the rows; the cells of a row are shared out. */
        #pragma omp for
        for(j = 1; j < N-1; j++)
        {
          /* Here 'mean' accumulates the boundary values hit by ITER walks. */
          mean = 0.0;
          for(cur = 0; cur < ITER; cur++)
          {
            temp_i = i;
            temp_j = j;
            while(1)
            {
              int direction = rand()%4;
              //Go towards the i = 0 row 
              if(direction == 0)
              {
                temp_i--;
                if(temp_i == 0){mean += 0.0; break;}
              }
              //Go towards the j = 0 col 
              else if(direction == 1)
              {
                temp_j--;
                if(temp_j == 0){mean += 100.0; break;}
              }
              //Go towards the i = M row 
              else if(direction == 2)
              {
                temp_i++;
                if(temp_i == (M-1)){mean += 100.0; break;}
              }
              //Go towards the j = N col 
              else
              {
                temp_j++;
                if(temp_j == (N-1)){mean += 100.0; break;}
              }
            }
          }
          double old = u[i][j];
          /*
           * NOTE(review): iteration_number is incremented before this region,
           * so it is always >= 1 here and the first branch is never taken.
           */
          if(iteration_number == 0)
          {
             u[i][j] = (double) (u[i][j] + mean)/(ITER + 1);
          }
          else
          {
            /* Running average: the previous value counts as cur_iter samples. */
            double cur_iter = (double) iteration_number * ITER;
            double prev_avg = (double) cur_iter * u[i][j];
            u[i][j] = (double) (prev_avg + mean) / (cur_iter + ITER); 
          }
          if( fabs(old - u[i][j])  > epsilon)
          {
            if( fabs(old - u[i][j]) > my_diff)
            {
              my_diff = fabs(old - u[i][j]);
            }
            cont++;
          }
        }
      }
    /* Merge the per-thread maxima into the shared maximum. */
    #pragma omp critical
    {
      if(my_diff > diff){diff = my_diff;}
    }
    }
    if(cont == 0){run = 0;}
  }
  wtime = omp_get_wtime() - wtime;
  printf("Time taken %f\n", wtime);
  return 0;

# undef M
# undef N
}
/*
 * Matrix-vector product benchmark: v2 = M * v1 for an N x N matrix.
 *
 * Usage: <program> <N>
 *
 * M[i][j] is initialized to i+j and v1[i] to i. Initialization and the
 * product are both work-shared over rows with OpenMP; only the product is
 * timed. Prints the elapsed time, N, the first and last components of v2 and,
 * for N < 20, every component.
 *
 * Exits with -1 when the argument is missing or not a positive integer and
 * with -2 when an allocation fails.
 */
int main(int argc, char** argv)
{
	int i, j;
	double t1, t2, total;

	// Read the input argument (number of vector components)
	if (argc<2){
		printf("Falta tamaño de matriz y vector\n");
		exit(-1);
	}

	// Parse as a signed value first: zero would make the v2[0] / v2[N-1]
	// reads below go out of bounds, and a negative value would wrap around to
	// a huge unsigned size.
	int n_arg = atoi(argv[1]);
	if (n_arg <= 0){
		printf("El tamaño debe ser un entero mayor que 0\n");
		exit(-1);
	}
	unsigned int N = (unsigned int) n_arg;

	double *v1, *v2, **M;
	v1 = (double*) malloc(N*sizeof(double)); // malloc takes the size in bytes
	v2 = (double*) malloc(N*sizeof(double)); // malloc returns NULL when out of memory
	M = (double**) malloc(N*sizeof(double *));
	if ( (v1==NULL) || (v2==NULL) || (M==NULL) ){
		printf("Error en la reserva de espacio para los vectores\n");
		exit(-2);
	}

	for (i=0; i<N; i++){
		M[i] = (double*) malloc(N*sizeof(double));
		if ( M[i]==NULL ){
			printf("Error en la reserva de espacio para los vectores\n");
			exit(-2);
		}
	}
	// From here on the matrix components can be accessed as M[i][j]

	// Initialize matrix and vectors
	#pragma omp parallel
	{
		#pragma omp for private(j)
		for (i=0; i<N;i++)
		{
			v1[i] = i;
			v2[i] = 0;
			for(j=0;j<N;j++)
				M[i][j] = i+j;
		}

		// Start of the timed section; one thread takes the timestamp
		#pragma omp single
		t1 = omp_get_wtime();

		// Compute the matrix-vector product v2 = M · v1
		// (each row i is owned by exactly one thread, so v2[i] needs no locking)

		#pragma omp for private(j)
		for (i=0; i<N;i++)
			for(j=0;j<N;j++)
				v2[i] += M[i][j] * v1[j];

		// End of the timed section (after the implicit barrier of the loop)
		#pragma omp single
		t2 = omp_get_wtime();
	}

	total = t2 - t1;

	// Print the result and the execution time; N-1 is unsigned, hence %u
	printf("Tiempo(seg.):%11.9f\t / Tamaño:%u\t/ V2[0]=%8.6f V2[%u]=%8.6f\n", total,N,v2[0],N-1,v2[N-1]);

	// Print every component of v2 (only when the size is reasonable)
	if (N<20)
		for (i=0; i<N;i++)
			printf(" V2[%d]=%5.2f\n", i, v2[i]);



	free(v1); // release the storage reserved for v1
	free(v2); // release the storage reserved for v2
	for (i=0; i<N; i++)
		free(M[i]);
	free(M);

	return 0;
}
/*
 * Strassen vs. standard matrix multiplication benchmark.
 *
 * Usage: <program> <M> <N> <P>   (falls back to MM, NN, PP otherwise)
 *
 * Fills A (M x P) and B (P x N) with pseudo-random values, multiplies them
 * once with seqMatMult and once with matmultS, times both calls, compares the
 * two results with CheckResults and prints the speedup when they agree.
 */
int main(int argc, char* argv[])
{
  int M = MM;
  int N = NN;
  int P = PP;

  if (argc == 4) {
     M = atoi(argv[1]);
     N = atoi(argv[2]);
     P = atoi(argv[3]);
  }
  else {
     printf("Suggested Usage: %s <M> <N> <P> \n", argv[0]);
     printf("Using default values\n");
  }

  double **A = Allocate2DArray< double >(M, P);
  double **B = Allocate2DArray< double >(P, N);
  double **C = Allocate2DArray< double >(M, N);
  double **C4 = Allocate2DArray< double >(M, N);

  // Operands: values in steps of 0.1, filled row by row (A first, then B).
  for (int row = 0; row < M; ++row) {
    double *a_row = A[row];
    for (int col = 0; col < P; ++col) {
      a_row[col] = 5.0 - ((double)(rand()%100) / 10.0);
    }
  }

  for (int row = 0; row < P; ++row) {
    double *b_row = B[row];
    for (int col = 0; col < N; ++col) {
      b_row[col] = 5.0 - ((double)(rand()%100) / 10.0);
    }
  }

  // Both result matrices start out zeroed.
  for (int row = 0; row < M; ++row) {
    for (int col = 0; col < N; ++col) {
      C4[row][col] = 0.0;
      C[row][col] = 0.0;
    }
  }

  // Reference run: standard multiplication.
  printf("Execute Standard matmult  M = %d  N = %d  P = %d\n\n", M, N, P);
  double tick = omp_get_wtime();
  seqMatMult(M, N, P, A, B, C);
  const double standard_secs = omp_get_wtime() - tick;
  printf("Standard matrix function done in %7.2f secs\n\n\n",(float)standard_secs);

  // Candidate run: Strassen multiplication.
  tick = omp_get_wtime();
  matmultS(M, N, P, A, B, C4);
  const double strassen_secs = omp_get_wtime() - tick;
  printf("Strassen matrix function done in %7.2f secs\n\n\n",strassen_secs);

  // CheckResults reports a mismatch with a non-zero value.
  printf("Checking...");
  if (!CheckResults(M, N, C, C4)) {
    printf("OKAY\n\n");
    printf("Speedup = %5.1fX\n", standard_secs/strassen_secs);
  }
  else {
    printf("Error in Strassen Matrix Multiplication\n\n");
  }

  Free2DArray< double >(A);
  Free2DArray< double >(B);
  Free2DArray< double >(C);
  Free2DArray< double >(C4);

  return 0;
}
int main(int argc, char **argv)
{
  size_t size;
  fftwf_complex *data;
  fftwf_plan plan;

  if(argc >= 2)
  {
    size = atoi(argv[1]);
    if (size <= 0)
    {
      fprintf(stderr, "ERROR, matrix size <= 0 !\n");
      return EXIT_FAILURE;
    }
  }
  else
  {
    fprintf(stderr, "ERROR, pass matrix size as 1st parameter !\n");
    return EXIT_FAILURE;
  }

  const size_t N = size * size * size;

  data = (fftwf_complex*)_mm_malloc(sizeof(fftw_complex) * N, 64);
  if (data == NULL)
  {
    fprintf(stderr, "ERROR, _mm_malloc() !\n");
    return EXIT_FAILURE;
  }

  PapiCounterList papi_routines;
  papi_routines.AddRoutine("fftw");

  // NUMA First touch
  #pragma omp parallel for
  for (size_t i = 0; i < N; ++i)
    data[i][0] = data[i][1] = 1.0;


  fprintf(stdout, "** FFTW 3D OMP **\n");
  fprintf(stdout, "* OMP_NUM_THREADS: %d\n", omp_get_max_threads());
  fprintf(stdout, "* Size of Matrix: %dx%dx%d\n", (int)size, (int)size, (int)size);

  // fftw threads plan
  fftwf_plan_with_nthreads(omp_get_max_threads());
  // fftw compute plan
  plan = fftwf_plan_dft_3d(size, size, size,
                                      data, data,
                                      FFTW_FORWARD, FFTW_MEASURE);
  papi_routines["fftw"].Start();
  // compute results
  const double tstart = omp_get_wtime();
  fftwf_execute(plan);
  const double tend = omp_get_wtime();
  papi_routines["fftw"].Stop();

  printf("* Wall time: %fs\n\n", tend - tstart);
  papi_routines.PrintScreen();
  // free memory
  _mm_free(data);
  fftwf_destroy_plan(plan);

  return EXIT_SUCCESS;
}
Beispiel #7
0
/*
 * Repeatedly removes non-conform triangles from `data`.
 *
 * Each pass walks every triangle stored in `check_list`. A triangle that is
 * still present in `data` and fails facet_conform() is removed from `data`,
 * and facets_tetra_list() is called with `check_list_new` so that related
 * triangles are queued for the next pass. After a pass, `check_list` is
 * emptied and the two lists are swapped.
 *
 * The loop stops when `check_list` is empty, when `iter` reaches `iterations`,
 * or when triangulation_consistent() reports that `triang` is no longer
 * consistent.
 *
 * data            triangle collection that is pruned in place
 * triang          triangulation passed to triangulation_consistent()
 * iterations      pass limit; the loop condition is `iter != iterations`
 * check_list      triangles to examine in the current pass
 * check_list_new  receives the triangles to examine in the next pass
 * locks           forwarded unchanged to facets_tetra_list()
 */
void facets_conform_dynamic_remove(data_list * data, ptriangulation triang, int iterations, tri_list * check_list, tri_list * check_list_new, omp_lock_t ** locks) {
  int dim = data_list_dim(data);
  //tri_mem_list * list = &data->mem_list;

  cube_points cube = gen_cube_points(dim);
  // Initialize the parameter struct. It is threadprivate, so every thread
  // fills in (and allocates the scratch buffer of) its own copy.
  // NOTE(review): the copies are reused in the later parallel regions and
  // freed in the last one; this presumably relies on the same thread team
  // being used for every region - confirm.
  static facet_acute_data parameters;
#pragma omp threadprivate(parameters)
#pragma omp parallel 
  {
    parameters.cube = &cube;
    parameters.boundary_func = &triangle_boundary_cube;
    parameters.data = data;
    parameters.store_acute_ind = 0;
    parameters.acute_ind  = malloc(sizeof(vert_index) * cube.len);
  }


  int iter = 0;
  double time_start, time_check;
  size_t count = 0; // size of `data` at the start of the previous pass
  int triang_consistent = 1;
  while (tri_list_count(check_list) && triang_consistent && (iter != iterations)) //While we have triangles to be removed
  {
    time_start = omp_get_wtime();
    triangle cur_tri;
    tri_index cur_idx;
    int l,k;
    size_t i,j;
    size_t facets_add_total = 0;


    #pragma omp parallel shared(locks) private(cur_tri, cur_idx, i,j,k,l)
    {
      // facets_add_cnt is declared outside this function; it is reset here by
      // every thread and summed into facets_add_total at the end of the
      // region. NOTE(review): presumably a per-thread counter updated by
      // facets_tetra_list - confirm.
      facets_add_cnt = 0;
      // Thread 0 prints the progress report and advances the pass counter.
      if (omp_get_thread_num() == 0) {
        size_t new_count = data_list_count(data);
        if (count)
          printf("Removed %zu triangles\n", count - new_count);
        printf("\n\nLoop %d of conform dynamic\n", iter++);
        printf("Size of entire list    %zu\n", new_count);
        printf("Size of check list     %zu\n", tri_list_count(check_list));
        tri_list_validate(check_list);
        count = new_count;
      }
      /*
       * Loop over all the triangles in the check list. Check if they are not conform
       * if so, add all the possible new non-conform edges to the tmp_check_list.
       */
      #pragma omp for  schedule(dynamic,dim) 
      for (i = 0; i < cube.len; i++) {
        if (!triang_consistent) 
          continue; //We want break, but that is not possible with openMP

        for (j = i; j < cube.len; j++) {
          if (!triang_consistent)
            break;
          // Entries are stored relative to j: slot [i][j - i] holds offsets
          // that give the third index k after adding j.
          for (l = check_list->t_arr[i][j- i].len - 1; l >= 0; l--) {  //Loop over all triangles (i,j,*)
            k = check_list->t_arr[i][j - i].p_arr[l] + j;
            cur_idx[0] = i;
            cur_idx[1] = j;
            cur_idx[2] = k;
            cur_tri = triangle_from_index_cube(cur_idx, dim);
            //Cur_tri now holds the triangle we should check
            if (!data_list_contains(data, &cur_tri))
              continue;//This triangle was already removed.. Skip :-)

            if (!facet_conform(&cur_tri, &parameters)) { //This triangle is not conform, delete!
              // Run the check a second time with store_acute_ind set, so that
              // parameters.acute_ind / acute_ind_len are filled in for the
              // facets_tetra_list call below.
              parameters.store_acute_ind = 1;
              facet_conform(&cur_tri, &parameters);
              parameters.store_acute_ind = 0;
              //Add all the sides of conform tetrahedrons with cur_tri as base to the possible non-conform list.
              facets_tetra_list(check_list_new, cur_idx, parameters.acute_ind, parameters.acute_ind_len, locks);
              //Cur_tri is not conform, remove from the data structure.
              if (data->mode == DATA_MEM_LIST_CUBE)
                mem_list_cube_clear(&data->mem_list, &cur_tri);
              else
                tri_list_remove(&data->list, &cur_tri, TRI_LIST_NO_RESIZE);
            }
          }
        }

        // Only thread 0 re-checks the triangulation, once per outer index it
        // processes.
        // NOTE(review): triang_consistent is written here and read by the
        // other threads without synchronization.
        if (omp_get_thread_num() == 0)
          triang_consistent = triangulation_consistent(triang, &parameters);
      }

      #pragma omp atomic
      facets_add_total += facets_add_cnt;
    }

    if (triang_consistent) {
      printf("Amount of triangles in new_check_list: +/-   %zu\n", facets_add_total);
      printf("Amount of triangles in new_check_list: exact %zu\n", tri_list_count(check_list_new));
      //Checked all the triangles from check_list. Empty it and swap the lists.
      tri_list_empty(check_list);

      // The swap is skipped on the final permitted pass.
      if (iter != iterations) {
        tri_list tmp = *check_list;
        *check_list = *check_list_new;
        *check_list_new = tmp;
      }
    } else
      printf("Triangulation not consistent anymore\n");

    time_check = omp_get_wtime();
    printf("\nTook %f seconds to construct new check list\n",time_check - time_start);
  }

  free(cube.points);
  // Release each thread's scratch buffer allocated in the first region.
  #pragma omp parallel
  {
    free(parameters.acute_ind);
  }
}
Beispiel #8
0
static void solve(double* density, double& time)
{
	PREV_DENSITY = new double[XY_LEN];
	for (int j = 0; j < OY_LEN + 1; j++)
	{
		for (int i = 0; i < OX_LEN_1; i++)
		{
			PREV_DENSITY[OX_LEN_1 * j + i] = analytical_solution(0, OX[i], OY[j]);
		}
	}

	int i = 0, j = 0, tl = 0;
	double timeStart = 0, timeEnd=0;
#ifdef _OPENMP
	// printf("OPENMP THREADS COUNT = %d\n", omp_get_max_threads());
	long count = 0;
	// dummy parallel section to get all threads running
	#pragma omp parallel private(i,j)
	{
		_InterlockedIncrement(&count);
	}
#endif

#ifdef _OPENMP
//	printf("OPENMP timer function is used!\n");
	timeStart = omp_get_wtime();
#else
//	printf("Standart timer function is used!\n");
	StartTimer();
#endif
	fflush(stdout);
	for (tl = 1; tl <= TIME_STEP_CNT; tl++)
	{
		PREV_TIME = TIME;
		TIME = TAU * tl;
		for (int k = 0; k <= OX_LEN; k++)
		{
			density[k] = analytical_solution(OX[k], BB, TIME);
			density[OX_LEN_1 * OY_LEN + k] = analytical_solution(OX[k], UB, TIME);
		}
		for (int u = 0; u <= OY_LEN; u++)
		{
			density[OX_LEN_1 * u] = analytical_solution(LB, OY[u], TIME);
			density[OX_LEN_1 * u + OX_LEN] = analytical_solution(RB, OY[u], TIME);
		}
#ifdef _OPENMP
	#pragma omp parallel for collapse(2) private(i, j)
#endif
		for (j = 1; j < OY_LEN; ++j)
		{
			for (i = 1; i < OX_LEN; ++i)
			{
				density[OX_LEN_1 * j + i] = integrate(i, j);
				density[OX_LEN_1 * j + i] += TAU * func_f(B, TIME, UB, BB, LB, RB, OX[i], OY[j]);
			}
		}
		memcpy(PREV_DENSITY, density, XY_LEN * sizeof(double));// заменить на быструю версию из agnerasmlib
	}
#ifdef _OPENMP
	timeEnd = omp_get_wtime();
	time = (timeEnd-timeStart);
//	printf("time %f s.\n", time);
#else
	time = GetTimer()/1000;
//	printf("time %f s.\n", time/1000);
#endif
	delete [] PREV_DENSITY;
}
Beispiel #9
0
int main ( int argc, char *argv[] )

/******************************************************************************/
/*
  Purpose:

    HELLO has each thread print out its ID.

  Discussion:

    HELLO is a "Hello, World" program for OpenMP. It reports the processor
    and thread counts, says hello once from outside the parallel region and
    once per thread from inside it, then prints the elapsed wall clock time.

  Licensing:

    This code is distributed under the GNU LGPL license.

  Modified:

    23 June 2010

  Author:

    John Burkardt
*/
{
    printf ( "\nHELLO_OPENMP\n  C/OpenMP version\n" );

    printf ( "\n  Number of processors available = %d\n", omp_get_num_procs ( ) );
    printf ( "  Number of threads =              %d\n", omp_get_max_threads ( ) );

    /* Start the clock; everything from here to the end of the region is timed. */
    const double t_start = omp_get_wtime ( );

    printf ( "\n  OUTSIDE the parallel region.\n\n" );

    /* Outside a parallel region only the initial thread exists. */
    const int outer_id = omp_get_thread_num ( );
    printf ( "  HELLO from process %d\n", outer_id ) ;

    printf ( "\n  Going INSIDE the parallel region:\n\n" );
    /*
      Each thread of the team greets with its own ID; the ID lives in a
      block-local variable and is therefore private to the thread.
    */
    # pragma omp parallel
    {
        const int tid = omp_get_thread_num ( );
        printf ("  Hello from process %d\n", tid );
    }
    /*
      Stop the clock.
    */
    const double elapsed = omp_get_wtime ( ) - t_start;

    printf ( "\n  Back OUTSIDE the parallel region.\n" );
    /*
      Terminate.
    */
    printf ( "\nHELLO_OPENMP\n  Normal end of execution.\n" );

    printf ( "\n  Elapsed wall clock time = %f\n", elapsed );

    return 0;
}
Beispiel #10
0
/*
 * Price-history simulation offloaded to the coprocessor.
 *
 * For each of TE iterations a Gaussian noise vector Epsilon is drawn and
 * added to a quadratic trend to form HISTORY prices; the prices are averaged
 * over all iterations into ResultPrices. The averaged history is written to
 * "res_history.txt" as "<10*i> <price>" lines and the total wall time is
 * printed.
 *
 * Returns 1 if the host result buffer cannot be allocated, 0 otherwise.
 */
int main()
{

	SetThreads();
	PrintInfo();

	double Start = omp_get_wtime();

	double * restrict ResultPrices;
	ResultPrices = malloc(sizeof(double) * HISTORY);
	if (ResultPrices == NULL)
	{
		fprintf(stderr, "Unable to allocate the result history buffer\n");
		return 1;
	}

#pragma offload target(mic) out(ResultPrices:length(HISTORY))
	{
		SetMICThreads();

		double * restrict Prices;
		double * restrict Epsilon;

		Prices = malloc(sizeof(double) * HISTORY);
		Epsilon = malloc(sizeof(double) * HISTORY);

		// ResultPrices is an out() buffer: nothing is copied in, and the
		// iterations below accumulate with +=, so it has to be zeroed here
		// first.
		#pragma omp parallel for shared(ResultPrices)
		for (unsigned long long int z = 0; z < HISTORY; z++)
		{
			ResultPrices[z] = 0.0;
		}

		//Creating random stream
		VSLStreamStatePtr RndStream;
		vslNewStream(&RndStream, VSL_BRNG_SFMT19937, (int)time(NULL));

		// Trend coefficients; loop-invariant, so computed once.
		const long double QuadCoef = powl(10, (-24.65));
		const long double LinCoef = powl(10, (-13.65));

		for (unsigned int iter = 0; iter < TE; iter++)
		{

			//Randomize volumes
			vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, RndStream, HISTORY, Epsilon, 0, 0.002);

			#pragma omp parallel for shared(Prices, ResultPrices)
			for (unsigned long long int i = 0; i < HISTORY; i++)
			{
				// Quadratic trend plus this iteration's noise sample.
				Prices[i] = (  (  i * i * QuadCoef - i * 4.5 * LinCoef + 1.095 ) + Epsilon[i]);
				ResultPrices[i] += Prices[i];
			}

		}

		// Turn the accumulated sums into averages.
		#pragma omp parallel for shared(ResultPrices)
		for (unsigned long long int j = 0; j < HISTORY; j++)
		{
			ResultPrices[j] = ResultPrices[j] / TE;
		}

		// Release the random stream created above.
		vslDeleteStream(&RndStream);

		free(Prices);
		free(Epsilon);
		Prices = NULL;
		Epsilon = NULL;
	}

	double End = omp_get_wtime();
	printf("%lf\n", (End - Start));
	FILE *FpResultHistory;
	FpResultHistory = fopen("res_history.txt", "wb");
	if (FpResultHistory)
	{
		printf("//================================================================\n");
		printf("||	Result history file status : open\n");
		for (unsigned long long int i = 0; i < HISTORY; i++)
		{
			fprintf(FpResultHistory, "%llu %lf\n", (i * 10), ResultPrices[i]);
		}
		fclose(FpResultHistory);
		printf("||	Result history file status : close\n||\n");
		printf("\\================================================================\n\n");
	}

	free(ResultPrices);
	ResultPrices = NULL;

	return 0;
}
Beispiel #11
0
/**
 * Searches for grasp hypotheses around every quadric sample.
 *
 * For each quadric, the points of `cloud` within nn_radius_hands_ of the
 * sample are collected, centred on the sample and handed to a RotatingHand,
 * which is then evaluated. The per-quadric results are concatenated in
 * quadric order.
 *
 * @param cloud             point cloud that the neighbour indices refer to
 * @param pts_cam_source    camera source index per cloud point
 * @param quadric_list      quadrics whose samples are used as hand positions
 * @param hands_cam_source  camera source index per quadric
 * @param kdtree            search structure used for the radius search
 * @return all grasp hypotheses found, ordered by quadric index
 */
std::vector<GraspHypothesis> HandSearch::findHands(const PointCloud::Ptr cloud,
	const Eigen::VectorXi& pts_cam_source, const std::vector<Quadric>& quadric_list,
	const Eigen::VectorXi& hands_cam_source, const pcl::KdTreeFLANN<pcl::PointXYZ>& kdtree)
{
	double t1 = omp_get_wtime();
	// Scratch buffers; listed as private below so every thread works on its own.
	std::vector<int> nn_indices;
	std::vector<float> nn_dists;
	Eigen::Matrix3Xd nn_normals(3, nn_indices.size());
	Eigen::VectorXi nn_cam_source(nn_indices.size());
	Eigen::Matrix3Xd centered_neighborhood(3, nn_indices.size());
	// Profiling accumulators (only consumed by the commented-out prints below).
	double time_eval_hand = 0.0;
	double time_iter = 0.0;
	double time_nn = 0.0;
	double time_tf = 0.0;

	// One result slot per quadric keeps the output order independent of
	// thread scheduling and needs no locking.
	std::vector< std::vector<GraspHypothesis> > grasp_lists(quadric_list.size(), std::vector<GraspHypothesis>(0));

	// The timing accumulators are combined with a reduction: updating them
	// with += as plain shared variables is a data race.
#ifdef _OPENMP // parallelization using OpenMP
#pragma omp parallel for private(nn_indices, nn_dists, nn_normals, nn_cam_source, centered_neighborhood) reduction(+:time_eval_hand, time_iter, time_nn, time_tf) num_threads(num_threads_)
#endif
	for (std::size_t i = 0; i < quadric_list.size(); i++)
	{
		double timei = omp_get_wtime();
		pcl::PointXYZ sample;
		sample.x = quadric_list[i].getSample()(0);
		sample.y = quadric_list[i].getSample()(1);
		sample.z = quadric_list[i].getSample()(2);
//    std::cout << "i: " << i << ", sample: " << sample << std::endl;

		if (kdtree.radiusSearch(sample, nn_radius_hands_, nn_indices, nn_dists) > 0)
		{
			time_nn += omp_get_wtime() - timei;
			nn_normals.setZero(3, nn_indices.size());
			nn_cam_source.setZero(nn_indices.size());
			centered_neighborhood.setZero(3, nn_indices.size());

			// Gather the neighbourhood, expressed relative to the sample point.
			for (std::size_t j = 0; j < nn_indices.size(); j++)
			{
				nn_cam_source(j) = pts_cam_source(nn_indices[j]);
				centered_neighborhood.col(j) = (cloud->points[nn_indices[j]].getVector3fMap()
						- sample.getVector3fMap()).cast<double>();
				nn_normals.col(j) = cloud_normals_.col(nn_indices[j]);
			}

			FingerHand finger_hand(finger_width_, hand_outer_diameter_, hand_depth_);

			Eigen::Vector3d sample_eig = sample.getVector3fMap().cast<double>();
			RotatingHand rotating_hand(cam_tf_left_.block<3, 1>(0, 3) - sample_eig,
				cam_tf_right_.block<3, 1>(0, 3) - sample_eig, finger_hand, tolerant_antipodal_, hands_cam_source(i));
			const Quadric& q = quadric_list[i];
			double time_tf1 = omp_get_wtime();
			rotating_hand.transformPoints(centered_neighborhood, q.getNormal(), q.getCurvatureAxis(), nn_normals,
				nn_cam_source, hand_height_);
			time_tf += omp_get_wtime() - time_tf1;
			double time_eval1 = omp_get_wtime();
			std::vector<GraspHypothesis> grasps = rotating_hand.evaluateHand(init_bite_, sample_eig, true);
			time_eval_hand += omp_get_wtime() - time_eval1;

			if (grasps.size() > 0)
			{
				// Hand the buffer over instead of copying every hypothesis.
				grasp_lists[i].swap(grasps);
			}
		}

		time_iter += omp_get_wtime() - timei;
	}
	time_eval_hand /= quadric_list.size();
	time_nn /= quadric_list.size();
	time_iter /= quadric_list.size();
	time_tf /= quadric_list.size();
	//std::cout << " avg time for transforming point neighborhood: " << time_tf << " sec.\n";
	//std::cout << " avg time for NN search: " << time_nn << " sec.\n";
	//std::cout << " avg time for rotating_hand.evaluate(): " << time_eval_hand << " sec.\n";
	//std::cout << " avg time per iteration: " << time_iter << " sec.\n";

	// Flatten the per-quadric lists in index order.
	std::vector<GraspHypothesis> grasp_list;
	for (std::size_t i = 0; i < grasp_lists.size(); i++)
	{
		// std::cout << i << " " << grasp_lists[i].size() << "\n";
		if (grasp_lists[i].size() > 0)
			grasp_list.insert(grasp_list.end(), grasp_lists[i].begin(), grasp_lists[i].end());
	}

	double t2 = omp_get_wtime();
	//std::cout << " Found " << grasp_list.size() << " robot hand poses in " << t2 - t1 << " sec.\n";

	return grasp_list;
}
Beispiel #12
0
/** call: ./main <matrix_dimension> <number_of_tests> <use_gpu>*/
/*
 * Benchmark of a weighted Jacobi-style solver for Ax = b on the CPU or
 * the GPU.
 *
 * For each of `tests` runs a dataset of dimension M is generated, the system
 * is iterated until successive iterates differ by at most min_diff or
 * max_iter iterations have been done, and the run's wall time and the RMSE of
 * A*x against b are recorded. Mean and standard deviation of both are printed
 * at the end.
 *
 * Exits with status 1 on a wrong argument count, a non-positive dimension or
 * test count, or an allocation failure.
 */
int main(int argc, char* argv[])
{
	cuda_identify();

	if (argc != 4) {
		printf("program must be called with arguments: matrix_dimension tests_number use_gpu(0/1)\n");
		exit(1);
	}
	const int M = atoi(argv[1]);
	printf("Using matrix dimension: %d\n", M);
	const int tests = atoi(argv[2]);
	const bool cpu = !atoi(argv[3]);

	// M sizes every vector and `tests` sizes the result arrays and is the
	// sample count of the final statistics; both must be positive.
	if (M <= 0 || tests <= 0) {
		printf("matrix_dimension and tests_number must be positive integers\n");
		exit(1);
	}

	// always use the same seed to get the same matrices during tests
	srand(0);

	#ifdef DOUBLE
		const fp_t min_diff = 0.00000001;	//for double, fails with 8192 and floats on both cpu and gpu
	#else
		const fp_t min_diff = 0.000001;
	#endif
	const fp_t alpha = 0.9;   // relaxation weight of the new iterate
	const int max_iter = 50;  // a negative value would disable the limit

	fp_t* exec_times = malloc(tests * sizeof(fp_t));
	fp_t* all_rmse = malloc(tests * sizeof(fp_t));
	if (exec_times == NULL || all_rmse == NULL) {
		printf("unable to allocate the result arrays\n");
		exit(1);
	}
	for (int k = 0; k < tests; k++) {

		const DataSet dataset = generate_dataset(M);

		Matrix* last_x = aligned_vector(M, true);
		Matrix* x = aligned_vector(M, true);

		int iterations = 0;

		// solve Ax = b
		// Timestamps stay in double: omp_get_wtime() returns seconds since an
		// arbitrary point in the past, and storing such a value in a
		// single-precision fp_t would discard the sub-second digits before
		// the subtraction.
		const double start_time = omp_get_wtime();

		fp_t sum = 0;
		int j = 0;
		int i = 0;
		const Matrix* A = dataset.A;
		const Matrix* b = dataset.b;
		assert(x != last_x);

		if (cpu) {
			//#pragma omp parallel shared(last_x, x, iterations) private(i, j, sum)
			while ((matrix_diff(x, last_x) > min_diff) && (max_iter < 0 || iterations < max_iter)) {
				//fp_t st_time0 = omp_get_wtime();
				//#pragma omp single
				{
					swap(last_x, x);
				}

				// A, M, alpha and b are constant, so they cannot be declared as shared
				//#pragma omp for schedule(dynamic)
				for (i = 0; i < M; i++) {
					sum = 0;

					//#pragma omp simd aligned(A, last_x: 16) reduction(+:sum) linear(j)
					for (j = 0; j < M; j++) {
						sum += A->elements[i * M + j] * last_x->elements[j];
					}

					// Remove the diagonal term again, leaving the off-diagonal sum.
					sum -= A->elements[i * M + i] * last_x->elements[i];	// opt: outside the loop for sse optimizer
					x->elements[i] = (1 - alpha) * last_x->elements[i] + alpha * (b->elements[i] - sum) / A->elements[i * M + i];
				}

				//#pragma omp single nowait
				{
					iterations++;
				}
				//printf("%dus spent\n", (int)((omp_get_wtime() - st_time0) * 1000000));
			}
		} else {
			// Mirror A, b, x and last_x on the device.
			Matrix* d_A = device_matrix_from(A);
			#ifndef DOUBLE
				#ifdef TEXTURE
					texbind(d_A->elements, d_A->size * sizeof(fp_t));
				#endif
			#endif
			cudaMemcpy(d_A->elements, A->elements, A->size * sizeof(fp_t), cudaMemcpyHostToDevice);

			Matrix* d_b = device_matrix_from(b);
			cudaMemcpy(d_b->elements, b->elements, b->size * sizeof(fp_t), cudaMemcpyHostToDevice);

			Matrix* d_last_x = device_matrix_from(last_x);
			Matrix* d_c = device_matrix_from(b);
			Matrix* d_x = device_matrix_from(x);
			cudaMemcpy(d_x->elements, x->elements, x->size * sizeof(fp_t), cudaMemcpyHostToDevice);
			cudaMemcpy(d_last_x->elements, last_x->elements, last_x->size * sizeof(fp_t), cudaMemcpyHostToDevice);

			// Start above the threshold so that the loop runs at least once.
			fp_t x_diff = 2 * min_diff;
			fp_t* d_x_diff;
			cudaMalloc((void**)&d_x_diff, sizeof(fp_t));

			//fp_t stime;
			while ((x_diff > min_diff) && (max_iter < 0 || iterations < max_iter)) {
				//stime = omp_get_wtime();
				cuda_multiply(*d_A, *d_last_x, *d_c);
				//print_cuda_elapsed(stime);

				//stime = omp_get_wtime();
				cuda_reduce(*d_A, *d_b, *d_c, d_x, d_last_x, alpha); //performs swap
				//print_cuda_elapsed(stime);

				//stime = omp_get_wtime();
				cuda_diff(*d_x, *d_last_x, d_x_diff);
				//print_cuda_elapsed(stime);

				iterations++;
				//cudaMemcpyFromSymbol(&x_diff, "d_x_diff", sizeof(x_diff), 0, cudaMemcpyDeviceToHost);
				//stime = omp_get_wtime();
				cudaMemcpy(&x_diff, d_x_diff, sizeof(fp_t), cudaMemcpyDeviceToHost);
				//print_cuda_elapsed(stime);
			}
			// copy last_x instead, as it was swapped
			cudaMemcpy(x->elements, d_last_x->elements, x->size * sizeof(fp_t), cudaMemcpyDeviceToHost);

			#ifndef DOUBLE
				#ifdef TEXTURE
					texunbind();
				#endif
			#endif
			cudaFree(d_A->elements);
			cudaFree(d_b->elements);
			cudaFree(d_last_x->elements);
			cudaFree(d_c->elements);
			cudaFree(d_x->elements);
			cudaFree(d_x_diff);

			free(d_A);
			free(d_b);
			free(d_c);
			free(d_last_x);
			free(d_x);
		}
		const double end_time = omp_get_wtime();
		// Narrow to fp_t only after the subtraction.
		const fp_t seconds_spent = (fp_t)(end_time - start_time);
		exec_times[k] = seconds_spent;

		if (verbose) {
			printf("x: ");
			print_matrix(x);
			printf("expected_x: ");
			print_matrix(dataset.x);
			//print_matrix(dataset.A);
			//print_matrix(dataset.b);
		}
		// Verify the solution: bx = A * x should reproduce b.
		// NOTE(review): the += below assumes aligned_vector(M, false) returns
		// zeroed elements - confirm.
		Matrix* bx = aligned_vector(M, false);
		for (int row = 0; row < M; row++) {
			for (int col = 0; col < M; col++) {
				bx->elements[row] += A->elements[row * M + col] * x->elements[col];
			}
		}
		if (verbose) {
			printf("resulting b: ");
			print_matrix(bx);
		}
		all_rmse[k] = rmse(bx, b);
		printf("RMSE: %0.10f\n", all_rmse[k]);
		printf("iterations: %d\nseconds: %0.10f\n", iterations, seconds_spent);

		assert(x != last_x);

		free(bx->elements);
		free(x->elements);
		free(last_x->elements);
		free(dataset.x->elements);
		free(dataset.A->elements);
		free(dataset.b->elements);

		free(bx);
		free(x);
		free(last_x);
		free(dataset.x);
		free(dataset.A);
		free(dataset.b);
	}
	printf("Time: mean %0.10f std %0.10f\n", array_mean(exec_times, tests), array_std(exec_times, tests));
	printf("RMSE: mean %0.10f std %0.10f\n", array_mean(all_rmse, tests), array_std(all_rmse, tests));
	free(all_rmse);
	free(exec_times);

	return 0;
}
Beispiel #13
0
/**
 * Blocks until the CUDA device is idle, then prints the wall-clock time that
 * has passed since start_time (an omp_get_wtime() reading) in whole
 * milliseconds.
 */
void print_cuda_elapsed(fp_t start_time)
{
	// Synchronize before reading the clock so the interval includes device work.
	cudaDeviceSynchronize();

	const int elapsed_ms = (int)((omp_get_wtime() - start_time) * 1000);
	printf("%dms spent\n", elapsed_ms);
}
Beispiel #14
0
void SilhouetteExtractor::computeVisibleFrontFacingStatus()
{
    int terrain_width = terrain_->width();
    int terrain_height = terrain_->height();

    delete front_facing_;
    front_facing_ = new FacingMode [(terrain_width-1)*(terrain_height-1)];

    bool use_intersections = true ;

    if (!use_intersections)
    {
        setupPixelBuffer();
        pixelbuffer_->makeCurrent();
        glEnable(GL_DEPTH_TEST);
        glEnable(GL_LIGHT0);
        glEnable(GL_LIGHTING);
        GLfloat lightpos[] = {.5, 1., 1., 0.};
        glLightfv(GL_LIGHT0, GL_POSITION, lightpos);

        glClearColor(1.0f, 1.0f, 1.0f, 1.0f);
        glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
        drawTerrain();
        updateMatrices();

        saveBuffer("test_dpth.png");
        pixelbuffer_->makeCurrent();
    }

    double begin = omp_get_wtime();

    int i = 0;
#pragma omp parallel for private(i)
    for (int j = 0; j < terrain_->height()-1; ++j)
        for (i = 0; i < terrain_->width()-1; ++i)
        {
            front_facing_[j*(terrain_width-1)+i] = kInvisible;

            Eigen::Vector3f center = getFaceCentroid(i, j);
            Eigen::Vector3f projector = center - camera_info_.position;
            //projector = camera_info_.direction;

            float theta = acos(camera_info_.direction.normalized().dot(projector.normalized()));
            if (theta > camera_info_.fov_in_rads/2)
                continue;

            front_facing_[j*(terrain_width-1)+i] = kBackFacing;

            if (terrain_->getGridNormal(i, j).dot(projector) <= -FLT_EPSILON)
            {
                if (use_intersections)
                {
                    if (checkVisibility(center))
                        front_facing_[j*(terrain_width-1)+i] = kFrontFacing;
                } else
                {
                    Eigen::Vector3d window_coords;
                    gluProject(center[0], center[1], center[2],
                               modelview_matrix_, projection_matrix_, viewport_,
                               &window_coords[0], &window_coords[1], &window_coords[2]);

                    if (window_coords[0] < 0 || window_coords[1] < 0 || window_coords[0] >= width() || window_coords[1] >= height())
                        continue;
                    float depth = 0.0;
                    glReadPixels(window_coords[0], window_coords[1], 1, 1, GL_DEPTH_COMPONENT, GL_FLOAT, &depth);

                    if (std::abs(depth-window_coords[2]) < 1e-3)
                        front_facing_[j*(terrain_width-1)+i] = kFrontFacing;
                }
            }
        }

    double end = omp_get_wtime();
    double elapsed_secs = double(end - begin);
    fprintf(stdout, "Elapsed time for checking front/back facing: %.2f secs\n", elapsed_secs);
    fprintf(stdout, "Num of threads: %d threads\n", omp_get_thread_num());
    fflush(stdout);

    if (pixelbuffer_)
    {
        pixelbuffer_->doneCurrent();
        cleanupPixelBuffer();
    }
}
Beispiel #15
0
/*
 * Jacobi iterative solver (OpenMP "parallel for" version).
 *
 * Usage: prog [Ndim]
 *   Ndim - order of the square system A x = b (defaults to DEF_SIZE).
 *
 * Builds a diagonally dominant matrix A and a random right-hand side b,
 * iterates until the squared change between successive iterates drops below
 * TOLERANCE^2 or MAX_ITERS is reached, then verifies the result by computing
 * the 2-norm of A*x - b. Returns 0 on success; exits with -1 if memory cannot
 * be allocated.
 */
int main(int argc, char **argv)
{
   int Ndim;           // A[Ndim][Ndim]
   int i,j, iters;
   double start_time, elapsed_time;
   TYPE conv, tmp, err, chksum;
   TYPE *A, *b, *x1, *x2, *xnew, *xold, *xtmp; 

// set matrix dimensions and allocate memory for matrices
   if(argc ==2){
      Ndim = atoi(argv[1]);
   }
   else{
      Ndim = DEF_SIZE;
   }

   printf(" jacobi solver parallel for version: ndim = %d\n",Ndim);

   A    = (TYPE *) malloc(Ndim*Ndim*sizeof(TYPE));
   b    = (TYPE *) malloc(Ndim*sizeof(TYPE));
   x1   = (TYPE *) malloc(Ndim*sizeof(TYPE));
   x2   = (TYPE *) malloc(Ndim*sizeof(TYPE));

   if (!A || !b || !x1 || !x2)
   {
        printf("\n memory allocation error\n");
        exit(-1);
   }

   // generate our diagonally dominant matrix, A
   init_diag_dom_near_identity_matrix(Ndim, A);

#ifdef VERBOSE
   mm_print(Ndim, Ndim, A);
#endif

//
// Initialize x and just give b some non-zero random values
//
   for(i=0; i<Ndim; i++){
     x1[i] = (TYPE)0.0;
     x2[i] = (TYPE)0.0;
     b[i]  = (TYPE)(rand()%51)/100.0;
   }

   start_time = omp_get_wtime();
// 
// jacobi iterative solver
//
   conv  = LARGE;
   iters = 0;
   xnew  = x1;
   xold  = x2;

   // note: the loop compares against the convergence threshold squared.
   // This saves a sqrt per iteration.
   while((conv > TOLERANCE*TOLERANCE) && (iters<MAX_ITERS))
   {
     iters++;
     conv = 0.0;
     xtmp  = xnew;   // don't copy arrays.
     xnew  = xold;   // just swap pointers.
     xold  = xtmp;

     // One Jacobi sweep: every row is independent, so rows are split
     // across threads. The (i != j) factor masks out the diagonal term
     // without a branch.
     #pragma omp parallel for private(i,j)
     for (i=0; i<Ndim; i++){
         xnew[i] = (TYPE) 0.0;
         for (j=0; j<Ndim;j++){
               xnew[i]+= A[i*Ndim + j]*xold[j] * (i != j);
         }
         xnew[i] = (b[i]-xnew[i])/A[i*Ndim+i];

     }
     //  
     // test convergence: sum of squared differences between iterates
     //
     #pragma omp parallel for private(tmp) reduction(+:conv)
     for (i=0; i<Ndim; i++){
         tmp  = xnew[i]-xold[i];
         conv += tmp*tmp;
     }
#ifdef DEBUG
     printf(" conv = %f \n",(float)conv);
#endif

   }
   conv = sqrt((double)conv);
   elapsed_time = omp_get_wtime() - start_time;
   printf(" Convergence = %g with %d iterations and %f seconds\n",
         (float)conv, iters, (float)elapsed_time);
   
   //
   // test answer by multiplying my computed value of x by
   // the input A matrix and comparing the result with the 
   // input b vector.
   //
   err    = (TYPE) 0.0;
   chksum = (TYPE) 0.0;

   for(i=0;i<Ndim;i++){
      xold[i] = (TYPE) 0.0;
      for(j=0; j<Ndim; j++)
         xold[i] += A[i*Ndim+j]*xnew[j];
      tmp = xold[i] - b[i];
#ifdef DEBUG
      printf(" i=%d, diff = %f,  computed b = %f, input b= %f \n",
                    i, (float)tmp, (float)xold[i], (float)b[i]);
#endif
      chksum += xnew[i];
      err += tmp*tmp;
   }
   // err now holds the 2-norm of the residual; it must not be square-rooted
   // a second time when printed.
   err = sqrt((double)err);
   printf("jacobi solver: err = %f, solution checksum = %f \n",
                               (float)err, (float)chksum);

  free(A);
  free(b);
  free(x1);
  free(x2);

  return 0;
}
/*
 * Reconstructs all snapshots block by block and writes the result to
 * FILENAME_WR as variable "x_rec_all".
 *
 * For every 3D snapshot the first plane and the last plane of each block are
 * copied from "Udiff_all"; the planes in between are filled by forward and
 * backward propagation (plus a two-plane step for the middle plane when
 * SCALE_FACTOR_TIME is even).
 *
 * Returns 0 on success and 1 if a buffer cannot be allocated.
 */
int main()
{
    /* Create the file to save results */
    char *varnames[NUM_VARS] = {"x_rec_all"};
    create_netcdf(FILENAME_WR, NUM_VARS, varnames);

    /* Allocate memory (sizes computed in size_t to avoid int overflow) */
    const size_t all_bytes = (size_t)NUM_3DSNAPS * NUM_2DSNAPS * N_HR * N_HR * sizeof(double);
    double *x_fusion_lf_all = (double*)malloc(all_bytes);
    double *x_fusion_hf_all = (double*)malloc(all_bytes);
    double *x_rec_all = (double*)malloc(all_bytes);
    if (!x_fusion_lf_all || !x_fusion_hf_all || !x_rec_all)
    {
        fprintf(stderr, "\n memory allocation error\n");
        free(x_rec_all); free(x_fusion_lf_all); free(x_fusion_hf_all);
        return 1;
    }

    /* read all snapshots */
    size_t start_ids[4] = {0, 0, 0, 0};
    size_t count_ids[4] = {NUM_3DSNAPS, NUM_2DSNAPS, N_HR, N_HR };
    read_netcdf(FILENAME_RD, "Uinterp_all", start_ids, count_ids, x_fusion_lf_all);
    read_netcdf(FILENAME_RD, "Udiff_all", start_ids, count_ids, x_fusion_hf_all);

    double time_all_start = omp_get_wtime();

    /* Scratch buffer holding one plane while it is copied between arrays */
    double *x_current_hf = (double*)malloc((size_t)N_HR * N_HR * sizeof(double));

    /* Widen before multiplying so the product is not evaluated in int */
    long int grid_size = (long int)N_HR * N_HR * NEIGHBOR_FULLSIZE * NEIGHBOR_FULLSIZE * SIM_FULLSIZE * SIM_FULLSIZE;
    int *gridpatches_y = (int*)malloc(grid_size * sizeof(int));
    int *gridpatches_z = (int*)malloc(grid_size * sizeof(int));
    int *acc_ids = (int*)malloc(ACC_FULLSIZE * ACC_FULLSIZE * sizeof(int));
    if (!x_current_hf || !gridpatches_y || !gridpatches_z || !acc_ids)
    {
        fprintf(stderr, "\n memory allocation error\n");
        free(x_current_hf);
        free(x_rec_all); free(x_fusion_lf_all); free(x_fusion_hf_all);
        free(gridpatches_y); free(gridpatches_z); free(acc_ids);
        return 1;
    }
    generate_grids(gridpatches_y, gridpatches_z, acc_ids);


    for(int snap3d_id = 0; snap3d_id < NUM_3DSNAPS; snap3d_id++)
    {
        int t_offset = snap3d_id * NUM_2DSNAPS * N_HR*N_HR;

        // put first PIV
        get_onesnap(x_fusion_hf_all, x_current_hf, t_offset + 0 * N_HR * N_HR, t_offset + 1 * N_HR * N_HR - 1);
        put_onesnap(x_rec_all, x_current_hf, t_offset + 0 * N_HR * N_HR, t_offset + 1 * N_HR * N_HR - 1);

        int block_id;
        for(block_id = 0; block_id < NUM_BLOCKS; block_id++)
        {
            double time_start = omp_get_wtime();

            int t_first = SCALE_FACTOR_TIME*block_id;
            int t_last = SCALE_FACTOR_TIME*(block_id+1);

            // Put last PIV of the block
            get_onesnap(x_fusion_hf_all, x_current_hf, t_offset + t_last * N_HR * N_HR, t_offset + (t_last + 1) * N_HR * N_HR - 1);
            put_onesnap(x_rec_all, x_current_hf, t_offset + t_last * N_HR * N_HR, t_offset + (t_last + 1) * N_HR * N_HR - 1);

            if (SCALE_FACTOR_TIME % 2)
            {
                // Odd factor: the two propagation fronts meet between two planes.
                int t_bound1 = t_first + (int)SCALE_FACTOR_TIME/2;
                int t_bound2 = t_bound1 + 1;

                propag_forward(x_rec_all, x_fusion_lf_all, gridpatches_y, gridpatches_z, acc_ids, t_first, t_bound1, t_offset);
                propag_backward(x_rec_all, x_fusion_lf_all, gridpatches_y, gridpatches_z, acc_ids, t_last, t_bound2, t_offset);
            }
            else
            {
                // Even factor: the middle plane is handled by propag_2planes.
                int t_mid = t_first + (int)SCALE_FACTOR_TIME/2;
                int t_bound1 = t_mid - 1;
                int t_bound2 = t_mid + 1;
                propag_forward(x_rec_all, x_fusion_lf_all, gridpatches_y, gridpatches_z, acc_ids, t_first, t_bound1, t_offset);
                propag_backward(x_rec_all, x_fusion_lf_all, gridpatches_y, gridpatches_z, acc_ids, t_last, t_bound2, t_offset);
                propag_2planes(x_rec_all, x_fusion_lf_all, gridpatches_y, gridpatches_z, acc_ids, t_mid, t_offset);
            }

            // Report progress for every block, regardless of which branch ran.
            printf("\n Estimated block %i (total 23) in 3D snapshot %i (total 37) in %f seconds \n", block_id, snap3d_id, (double)omp_get_wtime() - time_start);
        }
    }

    // Write to file
    write_netcdf(FILENAME_WR, "x_rec_all", start_ids, count_ids, x_rec_all);

    /* free memory */
    free(x_current_hf);
    free(x_rec_all); free(x_fusion_lf_all); free(x_fusion_hf_all);
    free(gridpatches_y); free(gridpatches_z); free(acc_ids);
    printf("\n FINISH ALL COMPUTATION IN %f SECONDS \n", (double)omp_get_wtime() - time_all_start);

    /* 0 signals success to the calling shell */
    return 0;
}
Beispiel #17
0
/*
 * Jacobi relaxation on the file-level grids A and Anew (n x m).
 *
 * The first column of both grids is set to 1.0 and everything else to 0.
 * Each sweep replaces every interior cell by the average of its four
 * neighbours, tracks the largest change, and copies the result back into A.
 * Sweeps repeat until the largest change is at most tol or iter_max sweeps
 * were done. Progress is printed every 100 sweeps and the total wall time at
 * the end.
 */
int main(int argc, char** argv)
{
    const int n = NN;
    const int m = NM;
    const int iter_max = 1000;
    const double tol = 1.0e-6;

    int use_gpu = 1;
    double error = 1.0;
    int iter = 0;

    memset(A, 0, n * m * sizeof(double));
    memset(Anew, 0, n * m * sizeof(double));

    /* Boundary condition: first column held at 1.0 in both grids. */
    for (int row = 0; row < n; row++)
    {
        A[row][0]    = 1.0;
        Anew[row][0] = 1.0;
    }

    printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m);

    const double t_begin = omp_get_wtime();

#pragma omp target data map(to:Anew) map(A) if(use_gpu)
    while ( error > tol && iter < iter_max )
    {
        error = 0.0;

        /* Relaxation sweep; the reduction keeps the largest per-cell change. */
#pragma omp target teams distribute parallel for reduction(max:error) map(error) if(target:use_gpu)
        for (int row = 1; row < n-1; row++)
        {
            for (int col = 1; col < m-1; col++)
            {
                Anew[row][col] = 0.25 * ( A[row][col+1] + A[row][col-1]
                                        + A[row-1][col] + A[row+1][col]);
                const double delta = fabs(Anew[row][col] - A[row][col]);
                error = fmax( error, delta );
            }
        }

        /* Copy the new interior values back for the next sweep. */
#pragma omp target teams distribute parallel for if(target:use_gpu)
        for (int row = 1; row < n-1; row++)
        {
            for (int col = 1; col < m-1; col++)
            {
                A[row][col] = Anew[row][col];
            }
        }

        if (iter % 100 == 0)
        {
            printf("%5d, %0.6f\n", iter, error);
        }

        ++iter;
    }

    const double t_end = omp_get_wtime();

    printf(" total: %f s\n", t_end - t_begin);

    return 0;
}
/*
 * Runs the computation on several threads.
 * Parameters:
 *   threads_num - number of OpenMP threads to use
 *   b           - right boundary condition
 *   str_b       - string representation of b; used as the prefix of the
 *                 output file name "<str_b>par_result.txt"
 * Returns the wall-clock time in seconds averaged over REPEATS runs.
 * The grid solution is written to the output file; if that file cannot be
 * opened a message is printed to stderr and only the timing is returned.
 */
double run(int threads_num, double b, char* str_b)
{
    int i, j, step, k;
    double* y = calloc(N + 1, sizeof(double)); /* grid solution */
    double* dy = calloc(N + 1, sizeof(double)); /* difference y^n - y^(n+1) between two successive Newton iterates */
    double *A[R], *B[R], *C[R], *G[R]; /* coefficients of the tridiagonal system for every reduction level */
    double begin, end;
    char str_dest[50];
    FILE* fp;

    omp_set_dynamic(0); /* the number of threads must not change dynamically */
    omp_set_num_threads(threads_num);
    for(i = 0; i < R; i++)
    {
        A[i] = calloc(N + 1, sizeof(double));
        B[i] = calloc(N + 1, sizeof(double));
        C[i] = calloc(N + 1, sizeof(double));
        G[i] = calloc(N + 1, sizeof(double));
    }
    begin = omp_get_wtime(); /* start of the timed section */
    for( k = 0; k < REPEATS; k++){
        /* step is private: every thread computes it before each reduction
         * loop, so sharing it would be a data race. */
        #pragma omp parallel private(i, j, step)
        {
                #pragma omp for
                for(i = 0; i <= N; i++) y[i] = 1.0 + (b - 1.0) * i / N; /* initial guess */
                #pragma omp single
                {
                    dy[0] = dy[N] = 0.0;
                    for(j = 0; j < R; j++) B[j][0] = B[j][N] = 1.0; /* the boundary entries of the matrix are the same at every reduction level and Newton iteration */
                }
                while(1) /* Newton iterations */
                {
                    #pragma omp for
                    for(i = 1; i < N; i++) /* initial coefficient values */
                    {
                        B[0][i] = (-2.0 / (h * h) - 5 * exp(y[i]) / 6);
                        A[0][i] = (1.0 / (h * h) - exp(y[i - 1]) / 12);
                        C[0][i] = (1.0 / (h * h) - exp(y[i + 1]) / 12);
                        G[0][i] = F(y, b, i);
                    }
                    for(j = 1; j < R; j++) /* coefficient values after reduction */
                    {
                        step = 1 << j; /* stride of the sweep at this reduction level (2^j) */
                        #pragma omp for
                        for(i = step; i < N; i += step)
                        {
                            B[j][i] = B[j - 1][i] - A[j - 1][i] * C[j - 1][i - step / 2] / B[j - 1][i - step / 2] - C[j - 1][i] * A[j - 1][i + step / 2] / B[j - 1][i + step / 2];
                            A[j][i] = - A[j - 1][i] * A[j - 1][i - step / 2] / B[j - 1][i - step / 2];
                            C[j][i] = - C[j - 1][i] * C[j - 1][i + step / 2] / B[j - 1][i + step / 2];
                            G[j][i] = G[j - 1][i] - A[j - 1][i] * G[j - 1][i - step / 2] / B[j - 1][i - step / 2] - C[j - 1][i] * G[j - 1][i + step / 2] / B[j - 1][i + step / 2];
                        }
                    } /* forward reduction finished */
                    #pragma omp single
                    {
                        dy[N / 2] = G[R - 1][N / 2] / B[R - 1][N / 2]; /* first back-substitution step */
                        dy[N / 4] = (G[R - 2][N / 4] - C[R - 2][N / 4] * dy[N / 2]) / B[R - 2][N / 4];
                        dy[N * 3 / 4] = (G[R - 2][N * 3 / 4] - A[R - 2][N * 3 / 4] * dy[N / 2] ) / B[R - 2][N * 3 / 4]; /* second back-substitution step */
                    }
                    for(j = R - 3; j >= 0; j--)
                    {
                        step = 1 << j;
                        #pragma omp for
                        for(i = step; i < N; i += 2 * step) dy[i] = (G[j][i] - C[j][i] * dy[i + step] - A[j][i] * dy[i - step]) / B[j][i];
                    } /* remaining back-substitution steps */
                    #pragma omp for
                    for(i = 0; i <= N; i++) y[i] -= dy[i]; /* one Newton update */
                    if (norm(dy) < epsilon) break; /* Newton stopping criterion */
                }
        }
    }
    end = omp_get_wtime(); /* end of the timed section */
    for(i = 0; i < R; i++)
    {
        free(A[i]);
        free(B[i]);
        free(C[i]);
        free(G[i]);
    }

    /* Build the file name with a bounded write; a long str_b is truncated
     * instead of overflowing str_dest. */
    snprintf(str_dest, sizeof(str_dest), "%spar_result.txt", str_b);
    fp = fopen(str_dest, "w"); /* write the computed function to a file */
    if (fp != NULL)
    {
        fprintf(fp, "X\tY\r\n");
        for(i = 0; i <= N; i++) fprintf(fp, "%e\t%e\r\n", ((double) i / N), y[i]);
        fclose(fp);
    }
    else
    {
        fprintf(stderr, "cannot open %s for writing\n", str_dest);
    }

    free(y);
    free(dy);
    return (end - begin)/REPEATS;
}
 /**
 * Main function
 *
 * Evolves a population of 100 Tree individuals until the individual at
 * index 0 reaches fitness 0. Each generation creates 100 offspring from
 * randomly chosen parents, ranks all 200 individuals with sort_pred and keeps
 * the 100 taken from the back of the sorted list. Statistics are printed
 * every 10000 generations.
 **/
 int main() {
     srand (time(NULL));
     /*
     Tree x;
     x.set(0,4); x.set(1,1); x.set(2,9); x.set(3,2); x.set(4,14);
     x.set(5,8); x.set(6,13); x.set(7,0); x.set(8,3); x.set(9,12);
     x.set(10,10); x.set(11,5); x.set(12,7); x.set(13,6); x.set(14,11);
     //getMax at 1 -> 14
     std::cout << x.getMax(0) << std::endl;
     std::cout << x.getMin(0) << std::endl;
     std::cout << x.fitness(0) << std::endl;
     std::cout << x.fitness(3) << std::endl;
     */
     mc=0;
     // Slots 0..99 hold the current generation, 100..199 its offspring.
     std::array<Tree, 200> population;
     
     //init population
     for(int i=0; i<100; i++) {
         Tree x;
         x.init();
         population[i] = x;
     }
     
     int count = 0;
     MUTATION_RATE=5;
     
     //alter population
     double start = omp_get_wtime();
     while(population[0].fitness(0) > 0.000) {
         count++;
         std::list<std::pair<int,double>> fitnesses;
         
         //create offspring
         for(int j=0; j<100; j++) {
             int p1 = rand() % 100;
             int p2 = rand() % 100;
             Tree kid = population[p1].combine(population[p2]);
             kid.mutate();
             population[100+j] = kid;
         }
         
         //calc fitness
         for(int k=0; k<200; k++) {
             auto x = std::make_pair(k, population[k].fitness(0));
             fitnesses.push_back(x);
         }
         
         //sort by value
         fitnesses.sort(sort_pred());
         
         //remove old population: survivors are taken from the back of the list
         std::array<Tree, 100> newpop;
         for(int j=0; j<100; j++) {
             int newPos = fitnesses.back().first;
             fitnesses.pop_back();
             newpop[j] = population[newPos];
         }
         
         //periodic report; the timer restarts so each report covers 10000 generations
         if(count%10000==0) {
             double end = omp_get_wtime();
             std::cout << "Iteration count: " << count << " in " << (end-start) <<  std::endl;
             std::cout << "Best fitness: " << population[0].fitness(0) << std::endl;
             std::cout << "Weakest fitness: " << population[199].fitness(0) << std::endl;
             std::cout << "Mutation rate: " << MUTATION_RATE << std::endl;
             std::cout << "Mutation count: " << mc << std::endl;
             population[0].print();
             std::cout << std::endl;
             start = omp_get_wtime();
         }
         //clear old population and take the new ones
         for(int i=0; i<100; i++) {
             population[i] = newpop[i];
         }
     }
     
     // The message previously ended in a literal 'n' instead of a newline.
     std::cout << "found solution with " << count << " iterations.\n";
     population[0].print();
     return 0;
 }
Beispiel #20
0
/*
 * FEM1D driver (hybrid MPI + OpenMP version).
 *
 * Usage: mpirun -np [TASKS] new [SUB_SIZE] [NL] [NUM_THREADS]
 *   SUB_SIZE    - number of subintervals (NSUB)
 *   NL          - number of basis functions per element
 *   NUM_THREADS - OpenMP threads per MPI task
 *
 * Every task initialises the data, computes the geometry and assembles the
 * system; the master task additionally prints, solves and outputs the
 * system. The master appends the elapsed wall time to "times.txt".
 * Returns 0 on success and 1 if the MPI library cannot provide at least
 * MPI_THREAD_FUNNELED; exits with EXIT_FAILURE on bad arguments, unopenable
 * output files or allocation failure.
 */
int main(int argc, char **argv){

    bool error = false;

    //get NSUB, NL and threads from the arguments
    if(argc != 4){
        error = true;
    } else if((NSUB = atoi(argv[1])) == 0) {
        printf("Invalid subdivison size.\n");
        error = true;
    } else if ((NL = atoi(argv[2])) == 0){
        printf("Invalid base function degree.\n");
        error = true;
    } else if ((THREADS = atoi(argv[3])) == 0){
        printf("Invalid number of threads.\n");
        error = true;
    }

    if(error){
        printf("Usage: mpirun -np [TASKS] new [SUB_SIZE] [NL] [NUM_THREADS]\n");
        exit(EXIT_FAILURE);
    }

    if((fp_out = fopen("new_out.txt", "a")) == NULL || 
        (fp_sol = fopen("new_sol.txt", "a")) == NULL){
            printf("New Version files not found.\n");
            exit(EXIT_FAILURE);
    }

    //Allocate array memory (element counts are integers, not doubles)
    adiag = (double *)malloc(sizeof(double)*(size_t)(NSUB+1));
    aleft = (double *)malloc(sizeof(double)*(size_t)(NSUB+1));
    arite = (double *)malloc(sizeof(double)*(size_t)(NSUB+1));
    f = (double *)malloc(sizeof(double)*(size_t)(NSUB+1));
    h = (double *)malloc(sizeof(double)*(size_t)(NSUB));
    indx = (int *)malloc(sizeof(int)*(size_t)(NSUB+1));
    node = (int *)malloc(sizeof(int)*((size_t)NL*(size_t)NSUB));
    xn = (double *)malloc(sizeof(double)*(size_t)(NSUB+1));
    xquad = (double *)malloc(sizeof(double)*(size_t)(NSUB));

    if(!adiag || !aleft || !arite || !f || !h || !indx || !node || !xn || !xquad){
        printf("Memory allocation failed.\n");
        exit(EXIT_FAILURE);
    }

    //START TIMER//
    double begin, end, time_spent;
    begin = omp_get_wtime();

    //set number of threads
    omp_set_num_threads(THREADS);

    /****************** MPI Initialisations ***************/
    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
    // Thread-support levels are ordered, so any level at or above FUNNELED
    // is acceptable; only a lower level is an error.
    if(provided < MPI_THREAD_FUNNELED){
        printf("MPI library does not provide MPI_THREAD_FUNNELED.\n");
        MPI_Finalize();
        return 1;
    }
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* set up block sizes for MPI work; the master takes the remainder */
    slaveSize1 = (NSUB+1) / numprocs;
    masterSize1 = slaveSize1 + ((NSUB+1) % numprocs);
    slaveSize2 = NSUB / numprocs;
    masterSize2 = slaveSize2 + (NSUB % numprocs);

    printf("MPI: Process %d of %d\n", rank, numprocs);

    /*  If we are the master process
        Master coordinates the slaves */
    if (rank == MASTER){
        printf("MASTER: Number of processes is: %d\n",numprocs);


        timestamp ();

        fprintf (fp_out, "\n" );
        fprintf (fp_out, "FEM1D\n" );
        fprintf (fp_out, "  C version\n" );
        fprintf (fp_out, "\n" );
        fprintf (fp_out, "  Solve the two-point boundary value problem\n" );
        fprintf (fp_out, "\n" );
        fprintf (fp_out, "  - d/dX (P dU/dX) + Q U  =  F\n" );
        fprintf (fp_out, "\n" );
        fprintf (fp_out, "  on the interval [XL,XR], specifying\n" );
        fprintf (fp_out,"  the value of U or U' at each end.\n" );
        fprintf (fp_out, "\n" );
        fprintf (fp_out,"  The interval [XL,XR] is broken into NSUB = %ld subintervals\n", NSUB );
        fprintf (fp_out, "  Number of basis functions per element is NL = %ld\n", NL );
    }

    //Initialize the data.
    init ();

    //Compute the geometric quantities.
    geometry ();
    
    //Assemble the linear system.
    assemble ();

    if(rank == MASTER){
        //Print out the linear system.
        prsys ();

        //Solve the linear system.
        solve ();

        //Print out the solution.
        output ();
    }

    //Terminate.
    fprintf (fp_out, "\n" );
    fprintf (fp_out,"FEM1D:\n" );
    fprintf (fp_out, "  Normal end of execution.\n" );

    fprintf ( fp_out,"\n" );

    //END TIMER//
    end = omp_get_wtime();
    time_spent = end - begin;
    timestamp ( );

    //CLOSE STREAMS
    fclose(fp_out);
    fclose(fp_sol);

    //FREE MEMORY
    free(adiag); 
    free(aleft);
    free(arite); 
    free(f); 
    free(h); 
    free(indx); 
    free(node); 
    free(xn); 
    free(xquad);


    MPI_Finalize();

    // Only the master records the timing; the file is closed so the line is
    // flushed and the handle is not leaked.
    if(rank == MASTER){
        FILE *fp_time = fopen("times.txt","a");
        if(fp_time != NULL){
            fprintf(fp_time, "%f\n", time_spent);
            fclose(fp_time);
        } else {
            printf("Could not open times.txt.\n");
        }
    }

    return 0;
}
Beispiel #21
0
/*
 * Grows a triangulation of the cube one tetrahedron at a time.
 *
 * Starting from the facet returned by triangulation_start_facet, the loop
 * repeatedly picks a random boundary triangle, collects the tetrahedra that
 * facet_conform reports for it, discards those that are not disjoint with
 * the current triangulation, adds a random survivor and then removes from
 * `data` the triangles that conflict with the new tetrahedron. The loop stops
 * when no boundary is left, when a dead end is reached or when a consistency
 * check fails.
 *
 * Parameters:
 *   data            - triangle set that is filtered while the triangulation grows
 *   tmp_triang_file - file name, only printed here; may be NULL
 *   tmp_data_file   - file name, only printed here; may be NULL
 * Returns the triangulation built so far.
 */
triangulation triangulate_cube(data_list * data,  char * tmp_triang_file, char * tmp_data_file) {
  // Passing NULL to %s is undefined behaviour, so substitute a placeholder.
  printf("%s %s\n",
         tmp_triang_file ? tmp_triang_file : "(null)",
         tmp_data_file ? tmp_data_file : "(null)");
  triangulation result = triangulation_init(data_list_dim(data));

  cube_points cube = gen_cube_points(result.dim);
  facet_acute_data parameters;
  parameters.cube = &cube;
  parameters.boundary_func = &triangle_boundary_cube;
  parameters.data = data;
  parameters.store_acute_ind = 1;
  parameters.acute_ind = malloc(sizeof(unsigned short) * cube.len);

  //This list holds all conform tetrahedrons for a given triangle, max size = cube.len
  ptetra tet_list = malloc(sizeof(tetra) * cube.len);
  unsigned short tet_list_len = 0;

  //Lists needed for the dynamic_remove loop
  tri_list check_list, check_list_new;

  check_list     = tri_list_init(result.dim, MEM_LIST_FALSE);
  check_list_new = tri_list_init(result.dim, MEM_LIST_FALSE);

  //Start triangle (0,0,0), (rand,0,0), (rand,rand,0)
  result.bound_len = 1;
  result.bound_tri = triangulation_start_facet(data);
  printf("Starting triangulation with facet:\n");
  print_triangle(result.bound_tri);
  /*
   * During this method we are going to operate data that is not thread-safe.
   * To avoid race conditions we need an array of locks. We use a lock for the
   * first two points of a triangle (so need 2d array of locks).
   * Row i holds cube.len - i locks.
   */
  omp_lock_t ** locks = malloc(sizeof(omp_lock_t *) * cube.len);
  //Initalize the locks
  for (size_t i = 0; i < cube.len; i++){
    locks[i] = malloc(sizeof(omp_lock_t) * (cube.len - i));
    for (size_t j = 0; j < cube.len - i; j++)
      omp_init_lock(&locks[i][j]);
  }
  //While we have triangles on the boundary..
  while (result.bound_len > 0) {
    tri_list_empty(&check_list);
    tri_list_empty(&check_list_new);
    /*
     * We are going to add a tetrahedron on the boundary triangle.
     * To do so, we select a random triangle on the boundary. Then we generate all the
     * acute tetrahedra (above and below) with facets in our possible list.
     * From this list we remove all the tetrahedrons that intersect with our current triangulation.
     * Then we add a random tetrahedron to our triangulation, update the conform list and repeat.
     */
    int rand_bound = rand() % result.bound_len;
    printf("\n\nTotal amount of triangles left:%zu\nExpanding triangulation at boundary triangle: \n", data_list_count(data));
    print_triangle(result.bound_tri + rand_bound);

    //Calculate the conform tetrahedrons above and below
    if (!facet_conform(&result.bound_tri[rand_bound], &parameters))
    {
      printf("We have a triangle on the boundary that is not conform anymore.\n");
      printf("Whatthefuck? Breaking!\n");
      break;
    }

    tet_list_len = parameters.acute_ind_len;
    printf("Total amount of conform tetrahedrons found for this boundary: %hu\n", tet_list_len);
    //Form explicit list of the tetrahedrons: boundary triangle + one apex each
    for (unsigned short i = 0; i < tet_list_len; i++) 
    {
      copyArr3(tet_list[i].vertices[0], result.bound_tri[rand_bound].vertices[0]);
      copyArr3(tet_list[i].vertices[1], result.bound_tri[rand_bound].vertices[1]);
      copyArr3(tet_list[i].vertices[2], result.bound_tri[rand_bound].vertices[2]);
      copyArr3(tet_list[i].vertices[3], cube.points[parameters.acute_ind[i]]);
    }

    //Remove all the tetrahedrons that intersect with current triangulation.
    filter_tet_list_disjoint_triangulation(tet_list, &tet_list_len, &result);

    printf("Amount of tetrahedrons left after filtering: %hu\n\n",tet_list_len);
    if (tet_list_len == 0) {
      printf("Waarom is deze lijst nu al fucking leeggefilterd?\n");
      printf("Dead end, helaas pindakaas. Got to %zu\n", result.tetra_len);
      break;
    }

    //Select random tetrahedron disjoint with the current triangulation
    int rand_tet = rand() % tet_list_len;
    /*
     * Add the above tetra to the triangulation.
     * This removes all the boundary triangles that are covered by this tetrahedron
     */
    printf("Adding the following tetra to the triangulation\n");
    print_tetra(tet_list + rand_tet);
    printf("\n\n");
    add_tet_triangulation(tet_list + rand_tet, &result);
    triangulation_print(&result);

    if (!result.bound_len) //If we have no boundaries left, we must be done!!
    {
      printf("No more boundaries left.. WE FINNISHED!??\n");
      break;
    }
    //Consistency check
    if (!triangulation_consistent(&result, &parameters))
    {
      printf("Triangulation not consistent after adding the tetrahedron. Breaking.\n");
      break;
    }
    /*
     * Calculate a list of all the triangles we are going to remove
     */
    double time_removed = omp_get_wtime();
    printf("Removing triangles not disjoint with new tetrahedron\n");
    size_t removed = filter_intersection_data_list_tet(data,  &check_list, tet_list + rand_tet, locks);
    printf("Removed %zu triangles that are not disjoint with the new tetrahedron\n", removed);
    printf("The check_list has size %zu\n", tri_list_count(&check_list));
    printf("Time took to removed triangles: %g seconds\n", omp_get_wtime()-time_removed);

    if (!triangulation_consistent(&result, &parameters)) {
      printf("After filtering the memory list we have a non consistent triangulation. Break\n");
      break;
    }
    //NOTE(review): an earlier comment here said "two iterations", but the
    //argument passed is 1 - confirm which is intended.
    facets_conform_dynamic_remove(data, &result, 1, &check_list, &check_list_new, locks);

    if (!triangulation_consistent(&result, &parameters)) {
      printf("Triangulation not consistent anymore after conforming the data set.. Breaking\n");
      break;
    }

    /*mem_list_cube_compress(&data->mem_list);


      if (tmp_triang_file && tmp_data_file) {
      triangulation_to_file(&result, tmp_triang_file);
      data_list_to_file(data, tmp_data_file, MEM_LIST_SAVE_CLEAN);
      }
      */
  }
  //Destroy every lock before releasing the row that holds it
  for (size_t i = 0; i < cube.len; i++){
    for (size_t j = 0; j < cube.len - i; j++)
      omp_destroy_lock(&locks[i][j]);
    free(locks[i]);
  }

  free(locks);
  free(cube.points);
  free(parameters.acute_ind);
  free(tet_list);
  tri_list_free(&check_list);
  tri_list_free(&check_list_new);
  printf("Triangulation has length of %zu\n", result.tetra_len);
  return result;
}
/*
 * Interactive OpenMP matrix multiplication demo.
 *
 * Reads the dimensions of two matrices from stdin (asking again until the
 * column count of the first equals the row count of the second), fills
 * a[i][j] = i+j and b[i][j] = i*j, computes c = a * b in parallel and prints
 * the elapsed wall time. Returns 0 on success and 1 on unreadable input or
 * allocation failure.
 */
int main()
{
int **a,**b,**c;    // Row-pointer arrays of the two inputs and the product
int a_r,a_c,b_r,b_c, nthreads, tid, chunk =10;  //Variables for Matrix dimensions and for OpenMP functions
double dif;       //For time difference calculation
int i,j,k;

// Ask for dimensions until the matrices can be multiplied
for(;;)
 {
  printf("Enter number of Rows & Columns for Matrix 1: \n");
  if(scanf("%d%d",&a_r,&a_c)!=2)
   {
     fprintf(stderr,"\ninvalid input");
     return 1;
   }
  printf("Enter number of Rows & Columns for Matrix 2: \n");
  if(scanf("%d%d",&b_r,&b_c)!=2)
   {
     fprintf(stderr,"\ninvalid input");
     return 1;
   }
  if(a_r>0 && a_c>0 && b_r>0 && b_c>0 && a_c==b_r)
   {
     break;
   }
  printf("\ncan not multiply");
 }

// Memory allocation for Matrix 1: a_r rows of a_c ints
a=(int **) malloc((size_t)a_r*sizeof(*a));
// Memory allocation for Matrix 2: b_r rows of b_c ints
b=(int **) malloc((size_t)b_r*sizeof(*b));
// Memory allocation for Product Matrix: a_r rows of b_c ints
c=(int **) malloc((size_t)a_r*sizeof(*c));
if(!a || !b || !c)
 {
  fprintf(stderr,"\nmemory allocation error");
  return 1;
 }
for( i=0;i<a_r; i++)
 {
  a[i]=(int *) malloc((size_t)a_c*sizeof(**a));
  c[i]=(int *) malloc((size_t)b_c*sizeof(**c));
  if(!a[i] || !c[i])
   {
    fprintf(stderr,"\nmemory allocation error");
    return 1;
   }
 }
for( i=0;i<b_r; i++)
 {
  b[i]=(int *) malloc((size_t)b_c*sizeof(**b));
  if(!b[i])
   {
    fprintf(stderr,"\nmemory allocation error");
    return 1;
   }
 }
printf("Matrix default Initialization\n");
printf("Clock Started\n");
//Setting up Clock
double start = omp_get_wtime( );
                                                                        //Code Parallelization Initiated
    #pragma omp parallel shared(a,b,c,nthreads,chunk) private(tid,i,j,k)
     {
       tid = omp_get_thread_num();
       if (tid == 0)
       {
         nthreads = omp_get_num_threads();
         printf("Starting matrix multiple example with %d threads\n",nthreads);
        }
                                                                      //Initialization of Matrix 1
       #pragma omp for schedule (static, chunk)
        for(i=0;i<a_r; i++)
        {
          for(j=0;j<a_c; j++)
           {
            a[i][j] = i+j;
            }
         }
                                                                    //Initialization of Matrix 2
      #pragma omp for schedule (static, chunk)
       for(i=0;i<b_r; i++)
       {
        for(j=0;j<b_c; j++)
         {
           b[i][j] = i*j;
          }
        }
                                                                    //Initialization of Product Matrix 3
      #pragma omp for schedule (static, chunk)
       for(i=0;i<a_r; i++)
        {
         for(j=0;j< b_c; j++)
          {
           c[i][j]=0;
           }
         }
                                                                   //Matrix Multiplication
      printf("Thread %d starting Matrix Multiply\n",tid);
      // c is a_r x b_c; the inner sum runs over the shared dimension a_c (== b_r)
      #pragma omp for schedule (static, chunk)
       for(i=0; i<a_r; i++)
        {
         printf("Thread %d performed row = %d \n",tid,i);
         for(j=0;j<b_c; j++)
          {
           for(k=0;k<a_c; k++)
            {
              c[i][j]=c[i][j]+a[i][k]*b[k][j];
             }
           }
         }
    }

    printf("Matrix Multiplication Done\n");
//Stop Timer
    double end = omp_get_wtime( );
    dif = end - start;
    printf("Parallelization took %f Seconds\n", dif);
// Releasing Memory: free exactly the rows that were allocated
  for(i=0;i<a_r; i++)
   {
    free(a[i]);
    free(c[i]);
    }
  free(a);
  free(c);
  for(i=0;i<b_r; i++)
   {
    free(b[i]);
    }
  free(b);
  return 0;
}
Beispiel #23
0
/**
 * Run the variant-effect tool over the VCF file named in shared_options_data.
 *
 * After opening the input files and preparing the output directory, the work
 * is split over three OpenMP sections that run concurrently:
 *   1. a reader that parses the VCF file in batches (vcf_read) and then
 *      signals the end of parsing;
 *   2. a processor that fetches each batch, applies the configured filters,
 *      splits the passing records in chunks and queries the web services for
 *      every chunk from a nested parallel loop, retrying up to 3 times with a
 *      4 second pause when any service reports an error;
 *   3. a writer that drains output_list and appends every line to the matching
 *      output file.
 * Once the sections finish, the summary, genes-with-variants and result files
 * are written and the resources acquired here are released.
 *
 * @param urls                 Web service URLs: urls[0] is passed to the effect
 *                             service, urls[1] to the SNP phenotype service and
 *                             urls[2] to the mutation phenotype service (the
 *                             last two only when options_data->no_phenotypes
 *                             is not set).
 * @param shared_options_data  Options shared between tools (file names, output
 *                             directory, batch sizes, thread count, filters).
 *                             entries_per_thread is overwritten here.
 * @param options_data         Effect-specific options (excludes, no_phenotypes).
 * @return A non-zero code if cleaning the output directory, initialising the
 *         HTTP environment or creating the output files fails; otherwise the
 *         code returned by vcf_read (0 when it reports no error).
 */
int run_effect(char **urls, shared_options_data_t *shared_options_data, effect_options_data_t *options_data) {
    int ret_code = 0;
    double start, stop, total;
    
    vcf_file_t *vcf_file = vcf_open(shared_options_data->vcf_filename, shared_options_data->max_batches);
    if (!vcf_file) {
        LOG_FATAL("VCF file does not exist!\n");
    }
    
    ped_file_t *ped_file = NULL;
    if (shared_options_data->ped_filename) {
        ped_file = ped_open(shared_options_data->ped_filename);
        if (!ped_file) {
            LOG_FATAL("PED file does not exist!\n");
        }
        LOG_INFO("About to read PED file...\n");
        // Read PED file before doing any processing
        ret_code = ped_read(ped_file);
        if (ret_code != 0) {
            LOG_FATAL_F("Can't read PED file: %s\n", ped_file->filename);
        }
    }
    
    char *output_directory = shared_options_data->output_directory;
    size_t output_directory_len = strlen(output_directory);
    
    // An already existing directory is not an error
    ret_code = create_directory(output_directory);
    if (ret_code != 0 && errno != EEXIST) {
        LOG_FATAL_F("Can't create output directory: %s\n", output_directory);
    }
    
    // Remove all .txt files in folder
    ret_code = delete_files_by_extension(output_directory, "txt");
    if (ret_code != 0) {
        return ret_code;
    }
    
    // Initialize environment for connecting to the web service
    ret_code = init_http_environment(0);
    if (ret_code != 0) {
        return ret_code;
    }
    
    // Output file descriptors
    static cp_hashtable *output_files = NULL;
    // Lines of the output data in the main .txt files
    static list_t *output_list = NULL;
    // Consequence type counters (for summary, must be kept between web service calls)
    static cp_hashtable *summary_count = NULL;
    // Gene list (for genes-with-variants, must be kept between web service calls)
    static cp_hashtable *gene_list = NULL;

    // Initialize collections of file descriptors and summary counters
    ret_code = initialize_output_files(output_directory, output_directory_len, &output_files);
    if (ret_code != 0) {
        return ret_code;
    }
    initialize_output_data_structures(shared_options_data, &output_list, &summary_count, &gene_list);
    initialize_ws_buffers(shared_options_data->num_threads);
    
    // Create job.status file.
    // Room needed besides the directory: '/' (1) + "job.status" (10) + NUL (1)
    char job_status_filename[output_directory_len + 12];
    snprintf(job_status_filename, sizeof(job_status_filename), "%s/job.status", output_directory);
    FILE *job_status = new_job_status_file(job_status_filename);
    if (!job_status) {
        LOG_FATAL("Can't create job status file\n");
    } else {
        update_job_status_file(0, job_status);
    }
    
 
#pragma omp parallel sections private(start, stop, total)
    {
#pragma omp section
        {
            // Section 1: reads the VCF file and publishes batches for the processing section
            LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num());
            
            start = omp_get_wtime();
            
            // Batches are sized in bytes when batch_bytes is positive, in lines otherwise
            ret_code = vcf_read(vcf_file, 1,
                                (shared_options_data->batch_bytes > 0) ? shared_options_data->batch_bytes : shared_options_data->batch_lines,
                                shared_options_data->batch_bytes <= 0);

            stop = omp_get_wtime();
            total = stop - start;

            if (ret_code) {
                LOG_ERROR_F("Error %d while reading the file %s\n", ret_code, vcf_file->filename);
            }

            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            notify_end_parsing(vcf_file);
        }
        
#pragma omp section
        {
            // Section 2: filters each batch and queries the web services
            
            // Enable nested parallelism and set the number of threads the user has chosen
            omp_set_nested(1);
            
            LOG_DEBUG_F("Thread %d processes data\n", omp_get_thread_num());
            
            // Filters and files for filtering output
            filter_t **filters = NULL;
            int num_filters = 0;
            if (shared_options_data->chain != NULL) {
                filters = sort_filter_chain(shared_options_data->chain, &num_filters);
            }
            FILE *passed_file = NULL, *failed_file = NULL, *non_processed_file = NULL;
            get_filtering_output_files(shared_options_data, &passed_file, &failed_file);
            
            // Pedigree information (used in some filters)
            individual_t **individuals = NULL;
            khash_t(ids) *sample_ids = NULL;
            
            // Filename structure outdir/vcfname.errors
            // One extra byte so that a name as long as the whole path still gets its NUL terminator
            char *prefix_filename = calloc(strlen(shared_options_data->vcf_filename) + 1, sizeof(char));
            get_filename_from_path(shared_options_data->vcf_filename, prefix_filename);
            // Room needed besides both names: '/' (1) + ".errors" (7) + NUL (1)
            char *non_processed_filename = malloc((strlen(shared_options_data->output_directory) + strlen(prefix_filename) + 9) * sizeof(char));
            sprintf(non_processed_filename, "%s/%s.errors", shared_options_data->output_directory, prefix_filename);
            non_processed_file = fopen(non_processed_filename, "w");
            free(non_processed_filename);
            free(prefix_filename);
            
            // Maximum size processed by each thread (never allow more than 1000 variants per query)
            if (shared_options_data->batch_lines > 0) {
                shared_options_data->entries_per_thread = MIN(MAX_VARIANTS_PER_QUERY, 
                            ceil((float) shared_options_data->batch_lines / shared_options_data->num_threads));
            } else {
                shared_options_data->entries_per_thread = MAX_VARIANTS_PER_QUERY;
            }
            LOG_DEBUG_F("entries-per-thread = %d\n", shared_options_data->entries_per_thread);
    
            int i = 0;
            vcf_batch_t *batch = NULL;
            int ret_ws_0 = 0, ret_ws_1 = 0, ret_ws_2 = 0;
            
            start = omp_get_wtime();

            while ((batch = fetch_vcf_batch(vcf_file)) != NULL) {
                if (i == 0) {
                    // Add headers associated to the defined filters
                    vcf_header_entry_t **filter_headers = get_filters_as_vcf_headers(filters, num_filters);
                    for (int j = 0; j < num_filters; j++) {
                        add_vcf_header_entry(filter_headers[j], vcf_file);
                    }
                        
                    // Write file format, header entries and delimiter
                    if (passed_file != NULL) { write_vcf_header(vcf_file, passed_file); }
                    if (failed_file != NULL) { write_vcf_header(vcf_file, failed_file); }
                    if (non_processed_file != NULL) { write_vcf_header(vcf_file, non_processed_file); }
                    
                    LOG_DEBUG("VCF header written\n");
                    
                    if (ped_file) {
                        // Create map to associate the position of individuals in the list of samples defined in the VCF file
                        sample_ids = associate_samples_and_positions(vcf_file);
                        // Sort individuals in PED as defined in the VCF file
                        individuals = sort_individuals(vcf_file, ped_file);
                    }
                }
                
                LOG_INFO_F("Batch %d reached by thread %d - %zu/%zu records \n", 
                        i, omp_get_thread_num(),
                        batch->records->size, batch->records->capacity);

                int reconnections = 0;
                int max_reconnections = 3; // TODO allow to configure?

                // Write records that passed to a separate file, and query the WS with them as args
                array_list_t *failed_records = NULL;
                int num_variables = ped_file? get_num_variables(ped_file): 0;
                array_list_t *passed_records = filter_records(filters, num_filters, individuals, sample_ids, num_variables, batch->records, &failed_records);
                if (passed_records->size > 0) {
                    // Divide the list of passed records in ranges of size defined in config file
                    int num_chunks;
                    int *chunk_sizes;
                    int *chunk_starts = create_chunks(passed_records->size, shared_options_data->entries_per_thread, &num_chunks, &chunk_sizes);
                    
                    do {
                        // OpenMP: Launch a thread for each range.
                        // On the first pass every service is invoked; on a retry only
                        // the services whose last return code was non-zero are invoked.
                        // NOTE(review): ret_ws_0/1/2 are shared and written by every
                        // iteration without synchronisation, so the value seen after
                        // the loop is whichever chunk wrote last -- confirm intended.
                        #pragma omp parallel for num_threads(shared_options_data->num_threads)
                        for (int j = 0; j < num_chunks; j++) {
                            int tid = omp_get_thread_num();
                            LOG_DEBUG_F("[%d] WS invocation\n", tid);
                            LOG_DEBUG_F("[%d] -- effect WS\n", tid);
                            if (!reconnections || ret_ws_0) {
                                ret_ws_0 = invoke_effect_ws(urls[0], (vcf_record_t**) (passed_records->items + chunk_starts[j]), 
                                                            chunk_sizes[j], options_data->excludes);
                                parse_effect_response(tid, output_directory, output_directory_len, output_files, output_list, summary_count, gene_list);
                                free(effect_line[tid]);
                                effect_line[tid] = (char*) calloc (max_line_size[tid], sizeof(char));
                            }
                            
                            if (!options_data->no_phenotypes) {
                                if (!reconnections || ret_ws_1) {
                                    LOG_DEBUG_F("[%d] -- snp WS\n", omp_get_thread_num());
                                    ret_ws_1 = invoke_snp_phenotype_ws(urls[1], (vcf_record_t**) (passed_records->items + chunk_starts[j]), chunk_sizes[j]);
                                    parse_snp_phenotype_response(tid, output_list);
                                    free(snp_line[tid]);
                                    snp_line[tid] = (char*) calloc (snp_max_line_size[tid], sizeof(char));
                                }
                                 
                                if (!reconnections || ret_ws_2) {
                                    LOG_DEBUG_F("[%d] -- mutation WS\n", omp_get_thread_num());
                                    ret_ws_2 = invoke_mutation_phenotype_ws(urls[2], (vcf_record_t**) (passed_records->items + chunk_starts[j]), chunk_sizes[j]);
                                    parse_mutation_phenotype_response(tid, output_list);
                                    free(mutation_line[tid]);
                                    mutation_line[tid] = (char*) calloc (mutation_max_line_size[tid], sizeof(char));
                                }
                            }
                        }
                        
                        LOG_DEBUG_F("*** %dth web services invocation finished\n", i);
                        
                        if (ret_ws_0 || ret_ws_1 || ret_ws_2) {
                            if (ret_ws_0) {
                                LOG_ERROR_F("Effect web service error: %s\n", get_last_http_error(ret_ws_0));
                            }
                            if (ret_ws_1) {
                                LOG_ERROR_F("SNP phenotype web service error: %s\n", get_last_http_error(ret_ws_1));
                            }
                            if (ret_ws_2) {
                                LOG_ERROR_F("Mutations phenotype web service error: %s\n", get_last_http_error(ret_ws_2));
                            }
                            
                            // In presence of errors, wait 4 seconds before retrying
                            reconnections++;
                            LOG_ERROR_F("Some errors ocurred, reconnection #%d\n", reconnections);
                            sleep(4);
                        }
                    } while (reconnections < max_reconnections && (ret_ws_0 || ret_ws_1 || ret_ws_2));
                    
                    // Release the chunk layout whether the queries succeeded or the
                    // retries were exhausted
                    free(chunk_starts);
                    free(chunk_sizes);
                }
                
                // If the maximum number of reconnections was reached still with errors, 
                // write the non-processed batch to the corresponding file
                if (reconnections == max_reconnections && (ret_ws_0 || ret_ws_1 || ret_ws_2)) {
                #pragma omp critical
                    {
                        write_vcf_batch(batch, non_processed_file);
                    }
                }
                
                // Write records that passed and failed filters to separate files, and free them
                write_filtering_output_files(passed_records, failed_records, passed_file, failed_file);
                free_filtered_records(passed_records, failed_records, batch->records);
                
                // Free batch and its contents
                vcf_batch_free(batch);
                
                i++;
            }

            stop = omp_get_wtime();

            total = stop - start;

            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            // Free resources
            if (passed_file) { fclose(passed_file); }
            if (failed_file) { fclose(failed_file); }
            if (non_processed_file) { fclose(non_processed_file); }
            
            // Free filters
            for (i = 0; i < num_filters; i++) {
                filter_t *filter = filters[i];
                filter->free_func(filter);
            }
            free(filters);
            
            // Decrease list writers count, once per configured thread
            // (presumably this is what lets the writer section below stop waiting -- verify against list_t)
            for (i = 0; i < shared_options_data->num_threads; i++) {
                list_decr_writers(output_list);
            }
        }
        
#pragma omp section
        {
            // Section 3: thread which writes the results to all_variants, summary and one file per consequence type
            int ret = 0;
            char *line;
            list_item_t* item = NULL;
            FILE *fd = NULL;
            
            FILE *all_variants_file = cp_hashtable_get(output_files, "all_variants");
            FILE *snp_phenotype_file = cp_hashtable_get(output_files, "snp_phenotypes");
            FILE *mutation_phenotype_file = cp_hashtable_get(output_files, "mutation_phenotypes");
            
            while ((item = list_remove_item(output_list)) != NULL) {
                line = item->data_p;
                
                // Type greater than 0: consequence type identified by its SO code
                // Type equals to -1: SNP phenotype
                // Type equals to -2: mutation phenotype
                if (item->type > 0) {
                    // Write entry in the consequence type file
                    fd = cp_hashtable_get(output_files, &(item->type));
                    ret = fprintf(fd, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to file: '%s'\n", line);
                    }
                    
                    // Write in all_variants
                    ret = fprintf(all_variants_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to all_variants: '%s'\n", line);
                    }
                    
                } else if (item->type == SNP_PHENOTYPE) {
                    ret = fprintf(snp_phenotype_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to snp_phenotypes: '%s'\n", line);
                    }
                    
                } else if (item->type == MUTATION_PHENOTYPE) {
                    ret = fprintf(mutation_phenotype_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to mutation_phenotypes: '%s'\n", line);
                    }
                }
                
                free(line);
                list_item_free(item);
            }
            
        }
    }

    write_summary_file(summary_count, cp_hashtable_get(output_files, "summary"));
    write_genes_with_variants_file(gene_list, output_directory);
    write_result_file(shared_options_data, options_data, summary_count, output_directory);

    free_output_data_structures(output_files, summary_count, gene_list);
    free_ws_buffers(shared_options_data->num_threads);
    free(output_list);
    vcf_close(vcf_file);
    
    update_job_status_file(100, job_status);
    close_job_status_file(job_status);
    
    return ret_code;
}
int main ( int argc, char *argv[] )

/******************************************************************************/
/*
  Purpose:

    MAIN drives the SCHEDULE_OPENMP demonstration.

  Discussion:

    The prime-counting loop is timed under three OpenMP scheduling policies
    (default, static and dynamic) for N = 1, 2, 4, ..., 131072, and one
    table row is printed per value of N.

    The work per iteration is unbalanced: when the loop is split in the
    default manner between two threads, the second thread receives about
    3 times the work of the first.  Static or dynamic scheduling evens out
    that load, which could be expected to cut the loop run time by about 1/3.

    Only the count returned by the last of the three routines is printed
    in the Pi(N) column.

  Licensing:

    This code is distributed under the GNU LGPL license. 

  Modified:

    10 July 2010

  Author:

    John Burkardt
*/
{
  const int first_n = 1;
  const int last_n = 131072;
  const int growth = 2;
  int n;
  int prime_count;
  double tic;
  double t_default;
  double t_static;
  double t_dynamic;

  /* Banner and execution environment. */
  fputs ( "\n", stdout );
  fputs ( "SCHEDULE_OPENMP\n", stdout );
  fputs ( "  C/OpenMP version\n", stdout );
  fputs ( "  Count the primes from 1 to N.\n", stdout );
  fputs ( "  This is an unbalanced work load, particular for two threads.\n", stdout );
  fputs ( "  Demonstrate default, static and dynamic scheduling.\n", stdout );
  fputs ( "\n", stdout );
  printf ( "  Number of processors available = %d\n", omp_get_num_procs ( )  );
  printf ( "  Number of threads =              %d\n", omp_get_max_threads ( )  );

  /* Table header. */
  fputs ( "\n", stdout );
  fputs ( "                           Default        Static       Dynamic\n", stdout );
  fputs ( "         N     Pi(N)          Time          Time          Time\n", stdout );
  fputs ( "\n", stdout );

  /* One table row per problem size; N grows geometrically. */
  for ( n = first_n; n <= last_n; n *= growth )
  {
    tic = omp_get_wtime ( );
    prime_count = prime_default ( n );
    t_default = omp_get_wtime ( ) - tic;

    tic = omp_get_wtime ( );
    prime_count = prime_static ( n );
    t_static = omp_get_wtime ( ) - tic;

    tic = omp_get_wtime ( );
    prime_count = prime_dynamic ( n );
    t_dynamic = omp_get_wtime ( ) - tic;

    printf ( "  %8d  %8d  %12f  %12f  %12f\n", n, prime_count, t_default, t_static, t_dynamic );
  }

  /* Terminate. */
  fputs ( "\n", stdout );
  fputs ( "SCHEDULE_OPENMP\n", stdout );
  fputs ( "  Normal end of execution.\n", stdout );

  return 0;
}
Beispiel #25
0
// Solves a linear system by Gaussian elimination with partial pivoting,
// using OpenMP for the annihilation step, and reports the elapsed time.
//
// Usage: <program> <matrix-size> <thread-count>
//   argv[1]  number of rows of the system (must be >= 1)
//   argv[2]  number of OpenMP threads to use (must be >= 1)
//
// The matrix comes from Read_matrix when DEBUG == 1 and from Gen_matrix
// otherwise. Forward elimination is done here on a_augmented; the back
// substitution and the norm are delegated to backSubstitution and
// iSquaredNorm.
//
// Returns 0 on success and 1 when the arguments are missing or invalid or
// when the initial allocations fail.
int main(int argc, char const *argv[])
{
	int matrixSize, coreCount, threadCount;
	double startTime, finishTime;
	double **a_augmented, **a;	// n x n Matrix as a 2D array
	double diagonalElement, bestElement, factor;
	int bestRowIndex = 0; 	// used in partial pivoting (index of row having greatest absolute value)
	int i, j, k;			// for loop counters
	double *x;				// Solutions
	double *b;

	// Both arguments are mandatory: reading argv[1]/argv[2] without this
	// check would dereference a NULL pointer
	if (argc < 3)
	{
		fprintf(stderr, "Usage: %s <matrix-size> <thread-count>\n",
			(argc > 0 && argv[0] != NULL) ? argv[0] : "program");
		return 1;
	}

	matrixSize = (int) strtol(argv[1], NULL, 10);
	coreCount = omp_get_num_procs();
	threadCount = (int) strtol(argv[2], NULL, 10);

	// A non-positive size would index a_augmented[-1] below, and the
	// num_threads clause requires a positive value
	if (matrixSize < 1 || threadCount < 1)
	{
		fprintf(stderr, "Matrix size and thread count must be positive integers\n");
		return 1;
	}

	printf("Matrix Size: %d\n", matrixSize);
	printf("Number of Cores: %d\n", coreCount);

	#pragma omp parallel num_threads(threadCount)
	{
		if (omp_get_thread_num() == 0)
			printf("Thread Count: %d\n", omp_get_num_threads());
	}

	// Start Timer
	startTime = omp_get_wtime();

	// Allocate memory
	// a_augmented will be the augmented matrix
	a_augmented = (double **) malloc(matrixSize * sizeof(double *));
	// a will be the randomly generated matrix
	a = (double **) malloc(matrixSize * sizeof(double *));
	x = (double *) malloc(matrixSize * sizeof(double));
	b = (double *) malloc(matrixSize * sizeof(double));

	if (a_augmented == NULL || a == NULL || x == NULL || b == NULL)
	{
		fprintf(stderr, "Not enough memory for a system of size %d\n", matrixSize);
		free(a_augmented);
		free(a);
		free(x);
		free(b);
		return 1;
	}
	
	if (DEBUG == 1)
		Read_matrix(&a, &a_augmented, matrixSize);
	else
		Gen_matrix(&a, &a_augmented, matrixSize, threadCount);

	// a will not be modified after this point
	// Only the a_augmented will be modified 

	// Display generated matrix:
	displayMatrix(a, matrixSize);

	for (i = 0; i < matrixSize - 1; ++i)
	{
		// Partial Pivoting: 
		// the algorithm selects the entry with largest absolute value from 
		// the column of the matrix that is currently being considered as 
		// the pivot element. 

		// Diagonal Element
		diagonalElement = a_augmented[i][i];

		// Find the best row (the one with the largest absolute value in the 
		// column being worked on)
		bestRowIndex = i;
		bestElement = diagonalElement;
		for (j = i + 1; j < matrixSize; ++j)
		{
			if (fabs(a_augmented[j][i]) > fabs(bestElement))
			{
				bestRowIndex = j;
				bestElement = a_augmented[j][i];
			}
		}

		// Swap the rows
		if (i != bestRowIndex)
		{
			swapRow(&a_augmented[i], &a_augmented[bestRowIndex]);	
			// Update the diagonal element
			diagonalElement = a_augmented[i][i];
		}

		// End of Partial Pivoting

		// To make the diagonal element 1, 
		// divide the whole row with the diagonal element
		// NOTE(review): a singular matrix leaves diagonalElement == 0 here and
		// the division yields inf/NaN -- confirm whether that must be reported
		for (j = 0; j < matrixSize + 1; ++j)
		{
			a_augmented[i][j] = a_augmented[i][j] / diagonalElement;
		}

		// Force the diagonal to be 1 (to avoid any roundoff errors in dividing above)
		a_augmented[i][i] = 1;
		diagonalElement = 1;

		// Annihilation: Zero all the elements in the column below the diagonal element
		// Each iteration writes only its own row j and reads the pivot row i
		#pragma omp parallel for num_threads(threadCount) \
			default(none) private(j, factor, k) shared(i, matrixSize, a_augmented)
		for (j = i + 1; j < matrixSize; ++j)
		{
			factor = a_augmented[j][i];
			if (factor != 0)
			{
				for (k = i; k < matrixSize + 1; ++k)
				{
					a_augmented[j][k] = a_augmented[j][k] - factor * a_augmented[i][k];
				}
			}
		}
	}

	// Make the diagonal element of the last row 1
	a_augmented[matrixSize-1][matrixSize] = a_augmented[matrixSize-1][matrixSize] / a_augmented[matrixSize-1][matrixSize-1];
	a_augmented[matrixSize-1][matrixSize-1] = 1;

	// Display augmented matrix:
	displayMatrix(a_augmented, matrixSize);

	// Back substitution (parallelized)
	backSubstitution(&a_augmented, matrixSize, threadCount);

	// Record the finish time
	finishTime = omp_get_wtime();

	displayMatrix(a_augmented, matrixSize);

	// Matrix X from augmented matrix
	// Vector b from matrix A
	// (assumes every row of a holds matrixSize + 1 values -- TODO confirm
	// against Gen_matrix / Read_matrix)
	for (i = 0; i < matrixSize; ++i)
	{
		x[i] = a_augmented[i][matrixSize];
		b[i] = a[i][matrixSize];
	}

	// Find I^2 norm
	iSquaredNorm(&a, x, b, matrixSize, threadCount);

	// Print the time taken
	printf("Time taken = %f\n", finishTime - startTime);


	// Free memory
	for (i = 0; i < matrixSize; ++i)
	{
		free(a[i]);
		free(a_augmented[i]);
	}
	free(a);
	free(a_augmented);
	free(x);
	free(b);
	return 0;
}
Beispiel #26
0
int main(int argc, char* argv[])
{
  size_t ndevices = 0;
  if (LIBXSTREAM_ERROR_NONE != libxstream_get_ndevices(&ndevices) || 0 == ndevices) {
    LIBXSTREAM_PRINT0(2, "No device found or device not ready!");
  }

  size_t filesize = 0;
  FILE *const file = 1 < argc ? fileopen(argv[1], "rb", &filesize) : 0;
  const size_t nitems = (1 < argc && 0 == filesize && 0 < atoi(argv[1])) ? (atoi(argv[1]) * (1ULL << 20)/*MB*/) : (0 < filesize ? filesize : (512 << 20));
  const size_t mbatch = LIBXSTREAM_MIN(2 < argc ? strtoul(argv[2], 0, 10) : 0/*auto*/, nitems >> 20) << 20;
  const size_t mstreams = LIBXSTREAM_MIN(LIBXSTREAM_MAX(3 < argc ? atoi(argv[3]) : 2, 0), LIBXSTREAM_MAX_NSTREAMS);
#if !defined(_OPENMP)
  LIBXSTREAM_PRINT0(1, "OpenMP support needed for performance results!");
#endif
  const size_t nstreams = LIBXSTREAM_MAX(mstreams, 1) * LIBXSTREAM_MAX(ndevices, 1), nbatch = (0 == mbatch) ? (nitems / nstreams) : mbatch, hsize = 256;
  size_t histogram[256/*hsize*/];
  memset(histogram, 0, sizeof(histogram));

  char* data;
  { /*allocate and initialize host memory*/
    size_t i;
    LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_allocate(-1/*host*/, (void**)&data, nitems, 0));
    if (0 == filesize || nitems > fread(data, 1, filesize, file)) {
      for (i = 0; i < nitems; ++i) data[i] = (char)LIBXSTREAM_MOD(rand(), hsize/*POT*/);
    }
  }

  struct {
    libxstream_stream* handle;
#if defined(SYNCMETHOD) && 2 <= (SYNCMETHOD)
    libxstream_event* event;
#endif
    size_t* histogram;
    char* data;
  } stream[(LIBXSTREAM_MAX_NDEVICES)*(LIBXSTREAM_MAX_NSTREAMS)];

  { /*allocate and initialize streams and device memory*/
    size_t i;
    for (i = 0; i < nstreams; ++i) {
#if defined(NDEBUG) /*no name*/
      const char *const name = 0;
#else
      char name[128];
      LIBXSTREAM_SNPRINTF(name, sizeof(name), "stream %i", (int)(i + 1));
#endif
      const int device = (0 < ndevices) ? ((int)(i % ndevices)) : -1;
      stream[i].handle = 0;
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_create(0 < mstreams ? &stream[i].handle : 0, device, 0, name));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_allocate(device, (void**)&stream[i].data, nbatch, 0));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_allocate(device, (void**)&stream[i].histogram, hsize * sizeof(size_t), 0));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memset_zero(stream[i].histogram, hsize * sizeof(size_t), stream[i].handle));
#if defined(SYNCMETHOD) && 2 <= (SYNCMETHOD)
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_create(&stream[i].event));
#endif
    }

    /*start benchmark with no pending work*/
    LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait(0));
  }

  /*process data in chunks of size nbatch*/
  const size_t nstep = nbatch * nstreams;
  const int end = (int)((nitems + nstep - 1) / nstep);
  int i;
  libxstream_type sizetype = LIBXSTREAM_TYPE_U32;
  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_get_autotype(sizeof(size_t), sizetype, &sizetype));
#if defined(_OPENMP)
  /*if (0 == ndevices) omp_set_nested(1);*/
  const double start = omp_get_wtime();
#endif
  for (i = 0; i < end; ++i) {
    const size_t ibase = i * nstep, n = LIBXSTREAM_MIN(nstreams, nitems - ibase);
    libxstream_argument* signature;
    size_t j;

    for (j = 0; j < n; ++j) { /*enqueue work into streams*/
      const size_t base = ibase + j * nbatch, size = base < nitems ? LIBXSTREAM_MIN(nbatch, nitems - base) : 0;
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_h2d(data + base, stream[j].data, size, stream[j].handle));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_signature(&signature));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_input(signature, 0, stream[j].data, LIBXSTREAM_TYPE_CHAR, 1, &size));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_output(signature, 1, stream[j].histogram, sizetype, 1, &hsize));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_call((libxstream_function)makehist, signature, stream[j].handle, LIBXSTREAM_CALL_DEFAULT));
#if defined(SYNCMETHOD) && (2 <= SYNCMETHOD) /*record event*/
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_record(stream[j].event, stream[j].handle));
#endif
    }

#if defined(SYNCMETHOD)
    for (j = 0; j < n; ++j) { /*synchronize streams*/
      const size_t k = n - j - 1; /*j-reverse*/
# if (3 <= (SYNCMETHOD))
      /*wait for an event within a stream*/
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait_event(stream[k].handle, stream[(j+nstreams-1)%n].event));
# elif (2 <= (SYNCMETHOD))
      /*wait for an event on the host*/
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_wait(stream[k].event));
# else
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait(stream[k].handle));
# endif
    }
#endif
  }

  { /*reduce stream-local histograms*/
    LIBXSTREAM_ALIGNED(size_t local[256/*hsize*/], LIBXSTREAM_MAX_SIMD);
    size_t i, j;
    for (j = 0; j < nstreams; ++j) {
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_d2h(stream[j].histogram, local, sizeof(local), stream[j].handle));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait(stream[j].handle)); /*wait for pending work*/
      for (i = 0; i < hsize; ++i) histogram[i] += local[i];
    }
  }

#if defined(_OPENMP)
  const double duration = omp_get_wtime() - start;
#endif
  const double kilo = 1.0 / (1 << 10), mega = 1.0 / (1 << 20);
  double entropy = 0;
  { /*calculate entropy*/
    const double log2_nitems = log2((double)nitems);
    size_t i;
    for (i = 0; i < hsize; ++i) {
      const double h = (double)histogram[i], log2h = 0 < h ? log2(h) : log2_nitems;
      entropy -= h * LIBXSTREAM_MIN(log2h - log2_nitems, 0);
    }
    entropy /= nitems;
  }

  if (0 < entropy) {
    if ((1 << 20) <= nitems) { /*mega*/
      fprintf(stdout, "Compression %gx: %.1f -> %.1f MB", 8.0 / entropy, mega * nitems, mega * entropy * nitems / 8.0);
    }
    else if ((1 << 10) <= nitems) { /*kilo*/
      fprintf(stdout, "Compression %gx: %.1f -> %.1f KB", 8.0 / entropy, kilo * nitems, kilo * entropy * nitems / 8.0);
    }
    else  {
      fprintf(stdout, "Compression %gx: %.0f -> %0.f B", 8.0 / entropy, 1.0 * nitems, entropy * nitems / 8.0);
    }
    fprintf(stdout, " (redundancy %0.f%%, entropy %.0f bit)\n", 100.0 - 12.5 * entropy, entropy);
  }

#if defined(_OPENMP)
  if (0 < duration) {
    fprintf(stdout, "Finished after %.1f s", duration);
  }
  else {
    fprintf(stdout, "Finished");
  }
#endif

  { /*validate result*/
    size_t check = 0, i;
    for (i = 0; i < hsize; ++i) check += histogram[i];
    if (nitems != check) {
      size_t expected[256/*hsize*/];
      memset(expected, 0, sizeof(expected));
      LIBXSTREAM_CONCATENATE(histogram,HISTOGRAM)(data, nitems, expected); check = 0;
      for (i = 0; i < hsize; ++i) check += expected[i] == histogram[i] ? 0 : 1;
      fprintf(stdout, " with %llu error%s\n", (unsigned long long)check, 1 != check ? "s" : "");
    }
    else {
      fprintf(stdout, "\n");
    }
  }

  { /*release resources*/
    size_t i;
    for (i = 0; i < nstreams; ++i) {
      int device = -1;
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_device(stream[i].handle, &device));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(device, stream[i].histogram));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(device, stream[i].data));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_destroy(stream[i].handle));
#if defined(SYNCMETHOD) && 2 <= (SYNCMETHOD)
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_destroy(stream[i].event));
#endif
    }
    LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(-1/*host*/, data));
  }

  return EXIT_SUCCESS;
}
Beispiel #27
0
/*
 * Time and validate one matrix-multiply implementation on two data sets.
 *
 * Parameters:
 *   NTRIALS - number of timed repetitions per data set.
 *   Ndim, Mdim, Pdim - matrix dimensions; the DEBUG dump below prints A as
 *             Ndim x Pdim, B as Pdim x Mdim and C/Cref as Ndim x Mdim.
 *   A, B    - caller-owned input buffers, (re)filled by the init_* helpers.
 *   C       - caller-owned output buffer, cleared before every trial.
 *   mm_func - the multiply routine under test, called as
 *             mm_func(Ndim, Mdim, Pdim, A, B, C).
 *
 * For each data set ("constant" then "progression") the routine runs
 * NTRIALS multiplications, records min/max/average wall-clock time from
 * omp_get_wtime(), counts the trials whose errsqr() against the reference
 * matrix exceeds TOL, and hands the numbers to output_results().
 *
 * The reference matrix is a scratch buffer owned by this function; it is
 * released before returning. If it cannot be allocated, a message is
 * written to stderr and the function returns without running any trial.
 */
void mm_tst_cases(int NTRIALS, int Ndim, int Mdim, int Pdim, 
              TYPE* A, TYPE* B, TYPE* C, 
              void (*mm_func)(int, int, int, TYPE *, TYPE *, TYPE *))
{
   int    nerr, itrials;
   double errsq;
   double start_time, run_time;
   double min_t, max_t, ave_t;
   TYPE *Cref;

   /* Do the size arithmetic in size_t so Ndim*Mdim cannot overflow int. */
   Cref = (TYPE *) malloc ((size_t)Ndim * (size_t)Mdim * sizeof(TYPE));
   if (Cref == NULL) {
      fprintf(stderr, "mm_tst_cases: failed to allocate %d x %d reference matrix\n",
              Ndim, Mdim);
      return;
   }

   /* ---- Data set 1: constant matrices ---- */

   init_const_matrix (Ndim, Mdim, Pdim, A, B, Cref);

   printf("\n constant matrices  %d %d %d\n", Ndim, Mdim, Pdim);
   nerr = 0; min_t = BIG;  max_t = SMALL; ave_t = (double) 0.0;
   for (itrials = 0; itrials<NTRIALS; itrials++){

      mm_clear(Ndim, Mdim, C);

      /* Only the multiply itself is inside the timed window. */
      start_time = omp_get_wtime(); 
      mm_func(Ndim, Mdim, Pdim, A, B, C);
      run_time = omp_get_wtime() - start_time;
  
      errsq = errsqr(Ndim, Mdim, C, Cref);
      if (errsq > TOL) nerr++;
      if(run_time < min_t) min_t = run_time;
      if(run_time > max_t) max_t = run_time;
      ave_t += run_time;
   }

   ave_t = ave_t/(double)NTRIALS;
   output_results(Ndim, Mdim, Pdim, nerr, ave_t, min_t, max_t);

   /* ---- Data set 2: progression matrices ---- */

   init_progression_matrix (Ndim, Mdim, Pdim, A, B, Cref);

#ifdef DEBUG
   printf(" A progression Matrix input\n");
   mm_print(Ndim, Pdim, A);

   printf(" B progression Matrix input\n");
   mm_print(Pdim, Mdim, B);

   printf(" C Reference Matrix\n");
   mm_print(Ndim, Mdim, Cref);
#endif

   printf("\n progression matrices  %d %d %d\n", Ndim, Mdim, Pdim);
   nerr = 0; min_t = BIG;  max_t = SMALL; ave_t = (double) 0.0;
   for (itrials = 0; itrials<NTRIALS; itrials++){

      mm_clear(Ndim, Mdim, C);

      start_time = omp_get_wtime(); 
      mm_func(Ndim, Mdim, Pdim, A, B, C);
      run_time = omp_get_wtime() - start_time;

#ifdef DEBUG
      printf(" C progression Matrix result\n");
      mm_print(Ndim, Mdim, C);
#endif
      errsq = errsqr(Ndim, Mdim, C, Cref);
      if (errsq > TOL) nerr++;
      if(run_time < min_t) min_t = run_time;
      if(run_time > max_t) max_t = run_time;
      ave_t += run_time;
   }

   ave_t = ave_t/(double)NTRIALS;
   output_results(Ndim, Mdim, Pdim, nerr, ave_t, min_t, max_t);

   /* The reference matrix is private to this routine; release it. */
   free(Cref);
}
Beispiel #28
0
/*
 * Multiply a (NRA x NCA) by b (NCA x NCB) into c (NRA x NCB) inside a single
 * OpenMP parallel region, then repeat the product sequentially into res,
 * print both timings and report every element where the two results differ.
 *
 * Each matrix is one contiguous block plus an array of row pointers into it,
 * so elements can be addressed as m[row][col].
 */
int main (int argc, char *argv[]) {
  int tid, nthreads, i, j, k;
  double **a, **b, **c;
  double *a_block, *b_block, *c_block;
  double **res;
  double *res_block;
  double starttime, stoptime;

  /* Row-pointer tables */
  a = (double **) malloc(NRA*sizeof(double *)); /* matrix a to be multiplied */
  b = (double **) malloc(NCA*sizeof(double *)); /* matrix b to be multiplied */
  c = (double **) malloc(NRA*sizeof(double *)); /* result matrix c */

  /* Contiguous storage for the matrices; sizes are computed in size_t. */
  a_block = (double *) malloc((size_t)NRA*NCA*sizeof(double));
  b_block = (double *) malloc((size_t)NCA*NCB*sizeof(double));
  c_block = (double *) malloc((size_t)NRA*NCB*sizeof(double));

  /* Result matrix for the sequential algorithm */
  res = (double **) malloc(NRA*sizeof(double *));
  res_block = (double *) malloc((size_t)NRA*NCB*sizeof(double));

  if (a == NULL || b == NULL || c == NULL || res == NULL ||
      a_block == NULL || b_block == NULL || c_block == NULL || res_block == NULL) {
    fprintf(stderr, "Memory allocation failed\n");
    exit(1);
  }

  /* The distance between consecutive rows is the number of COLUMNS of the
     matrix. Using the row count instead is only correct for square matrices
     and makes rows overlap or run past the block otherwise. */
  for (i=0; i<NRA; i++)   /* a has NRA rows of NCA elements */
    a[i] = a_block+i*NCA;

  for (i=0; i<NCA; i++)   /* b has NCA rows of NCB elements */
    b[i] = b_block+i*NCB;

  for (i=0; i<NRA; i++)   /* c has NRA rows of NCB elements */
    c[i] = c_block+i*NCB;

  for (i=0; i<NRA; i++)   /* res has NRA rows of NCB elements */
    res[i] = res_block+i*NCB;

  /* A static allocation of the matrices would be done like this */
  /* double a[NRA][NCA], b[NCA][NCB], c[NRA][NCB];  */

  /*** Spawn a parallel region explicitly scoping all variables ***/
#pragma omp parallel shared(a,b,c,nthreads) private(tid,i,j,k) num_threads(NR_THREADS)
  {
    tid = omp_get_thread_num();
    if (tid == 0) {  /* Only thread 0 prints */
      nthreads = omp_get_num_threads();
      printf("Starting matrix multiplication with %d threads\n",nthreads);
      printf("Initializing matrices...\n");
    }

    /*** Initialize matrices ***/
    /* No need to synchronize the threads before the last matrix has been
       initialized, hence nowait on the first two loops. */
#pragma omp for nowait
    for (i=0; i<NRA; i++)
      for (j=0; j<NCA; j++)
        a[i][j]= (double) (i+j);
#pragma omp for nowait
    for (i=0; i<NCA; i++)
      for (j=0; j<NCB; j++)
        b[i][j]= (double) (i*j);
    /* No nowait here: the implicit barrier at the end of this loop makes
       sure a, b and c are fully initialized before the multiplication. */
#pragma omp for
    for (i=0; i<NRA; i++)
      for (j=0; j<NCB; j++)
        c[i][j]= 0.0;

    if (tid == 0) /* Thread zero measures the execution time */
      starttime = omp_get_wtime();

    /* Do matrix multiply sharing iterations on outer loop */
    /* If DEBUG is TRUE display who does which iterations */
    /* printf("Thread %d starting matrix multiply...\n",tid); */
#pragma omp for
    for (i=0; i<NRA; i++) {
      if (DEBUG) printf("Thread=%d did row=%d\n",tid,i);
      for(j=0; j<NCB; j++) {
        for (k=0; k<NCA; k++) {
          c[i][j] += a[i][k] * b[k][j];
        }
      }
    }

    /* The loop above ends with an implicit barrier, so all rows are done
       when thread 0 reads the clock. */
    if (tid == 0) {
      stoptime = omp_get_wtime();
      printf("Time for parallel matrix multiplication: %3.2f s\n", 
             stoptime-starttime);
    }
  }   /*** End of parallel region ***/

  starttime = omp_get_wtime();
  /* Do a sequential matrix multiplication and compare the results */
  for (i=0; i<NRA; i++) {
    for (j=0; j<NCB; j++) {
      res[i][j] = 0.0;
      for (k=0; k<NCA; k++)
        res[i][j] += a[i][k]*b[k][j];
    }
  }
  stoptime = omp_get_wtime();
  printf("Time for sequential matrix multiplication: %3.2f s\n", stoptime-starttime);

  /* Check that the results are the same as in the parallel solution.
     Actually, you should not compare floating point values for equality like this
     but instead compute the difference between the two values and check that it
     is smaller than a very small value epsilon. However, since all values in the
     matrices here are integer values, this will work.
  */
  for (i=0; i<NRA; i++) {
    for (j=0; j<NCB; j++) {
      if (res[i][j] != c[i][j]) {
        printf("Different result %5.1f != %5.1f in %d %d\n ", res[i][j], c[i][j], i, j);
      }
    }
  }

  /* If DEBUG is true, print the results. Use smaller matrices for this */
  if (DEBUG) {
    printf("Result Matrix:\n");
    for (i=0; i<NRA; i++) {
      for (j=0; j<NCB; j++) 
        printf("%6.1f ", c[i][j]);
      printf("\n"); 
    }
  }

  printf ("Done.\n");

  /* Release the storage blocks and the row-pointer tables. */
  free(res_block);
  free(res);
  free(c_block);
  free(b_block);
  free(a_block);
  free(c);
  free(b);
  free(a);

  exit(0);
}
// Driver that times matmult1 against matmultr on random M x P and P x N
// inputs, checks that both produce the same M x N result and prints the
// ratio of the two timings.
//
// Command line: <M> <N> <P>. With any other argument count the defaults
// MM, NN and PP are used and a usage hint is printed.
int main(int argc, char* argv[])
{
    int M = MM;
    int N = NN;
    int P = PP;

    if (argc == 4) {
        M = atoi(argv[1]);
        N = atoi(argv[2]);
        P = atoi(argv[3]);
    }
    else {
        printf("Suggested Usage: %s <M> <N> <P> \n", argv[0]);
        printf("Using default values\n");
    }

    // Inputs first, then one output matrix per implementation.
    double **A  = Allocate2DArray< double >(M, P);
    double **B  = Allocate2DArray< double >(P, N);
    double **C1 = Allocate2DArray< double >(M, N);
    double **C4 = Allocate2DArray< double >(M, N);

    // Fill A completely before B so the rand() sequence is consumed in a
    // fixed order.
    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < P; ++col) {
            A[row][col] = (double)(rand()%100) / 10.0;
        }
    }

    for (int row = 0; row < P; ++row) {
        for (int col = 0; col < N; ++col) {
            B[row][col] = (double)(rand()%100) / 10.0;
        }
    }

    printf("Matrix Dimensions: M = %d  P = %d  N = %d\n\n", M, P, N);

    // First implementation.
    printf("Execute matmult1\n");
    double start = omp_get_wtime();
    matmult1(M, N, P, A, B, C1);
    const double time1 = omp_get_wtime() - start;
    printf("Time = %f seconds\n\n",time1);

    // Second (recursive) implementation.
    printf("Execute matmultr\n");
    start = omp_get_wtime();
    matmultr(M, N, P, A, B, C4);
    const double time2 = omp_get_wtime() - start;
    printf("Time = %f seconds\n\n",time2);

    // A non-zero result from CheckResults is reported as a mismatch.
    printf("Checking...");
    if (CheckResults(M, N, C1, C4)) {
        printf("Error in Recursive Matrix Multiplication\n\n");
    }
    else {
        printf("OKAY\n\n");
        printf("Speedup = %5.1fX\n", time1/time2);
    }

    Free2DArray< double >(A);
    Free2DArray< double >(B);
    Free2DArray< double >(C1);
    Free2DArray< double >(C4);

    return 0;
}
Beispiel #30
0
/* Return the current reading of the OpenMP wall clock (omp_get_wtime),
   which the OpenMP API specifies in seconds. Subtract two readings to get
   an elapsed interval. */
double
timer (void)
{
  const double now = omp_get_wtime ();
  return now;
}