Example no. 1
0
void fastSineTransformInv(ColumnMatrix localMatrix, Vector fstBuffer)
{
	int problemSize = localMatrix->globalSize+1;

	for (int column = 0; column < localMatrix->localSize; column++)
	{
		fstinv_(&localMatrix->data[column*localMatrix->globalSize], &problemSize, fstBuffer->data, &fstBuffer->localSize);
	}
}
Example no. 2
0
int main(int argc, char **argv)
{
  double wall_start = MPI_Wtime();
  Real *diag, **b, **bt, **z;
  Real pi, h, omp_local_max, local_max, global_max;
  int i, j, omp_id;

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);

  omp_tot_threads = omp_get_max_threads();

  /* the total number of grid points in each spatial direction is (n+1) */
  /* the total number of degrees-of-freedom in each spatial direction is (n-1) */
  /* this version requires n to be a power of 2 */

  if (argc < 2) {
    if (mpi_rank == 0){
      printf("need a problem size\n");
    }
    MPI_Finalize();
    return 0;
  }

  n  = atoi(argv[1]);
  m  = n-1;
  // mpi_work is the number of columns assigned to each MPI rank (a ceiling
  // division). The last rank may end up with slightly fewer real columns than
  // the others; that is the closest we get to proper load balancing here.
  mpi_work = 1 + ((m - 1) / mpi_size);
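  // Illustrative values (not from the original): n = 16, m = 15 and mpi_size = 4
  // give mpi_work = 1 + 14/4 = 4, so ranks 0-2 own columns 0-11 and rank 3 owns
  // columns 12-14, leaving one padded slot on the last rank unused.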
  nn = 4*n;

  diag = createRealArray (m);
  b    = createReal2DArray (mpi_work, mpi_size*mpi_work);
  bt   = createReal2DArray (mpi_work, mpi_size*mpi_work);
  z    = createReal2DArray (omp_tot_threads, nn);

  h    = 1./(Real)n;
  pi   = 4.*atan(1.);
  
  #pragma omp parallel for private(i)
  for (i=0; i < m; i++) {
    diag[i] = 2.*(1.-cos((i+1)*pi/(Real)n));
  }

  #pragma omp parallel for private(j, i)
  for (j=0; j < mpi_work; j++) { // each MPI rank fills only the columns it owns
    int k = j + mpi_work * mpi_rank; // global column index
    if (k >= m) continue;            // padded slot on the last rank: nothing to do
    for (i=0; i < m; i++) {
      b[j][i] = exp((Real) k) * sin(2.0 * pi * k) * sin(2.0 * i);
    }
  }

  #pragma omp parallel for private(omp_id, i)
  for (j=0; j < mpi_work; j++) { // MPI cut + OMP
    omp_id = omp_get_thread_num();
    fst_(b[j], &n, z[omp_id], &nn);
  }
  
  transpose (bt,b);

  #pragma omp parallel for private(i, omp_id) schedule(static)
  for (i=0; i < mpi_work; i++) { // MPI cut + OMP
    omp_id = omp_get_thread_num();
    fstinv_(bt[i], &n, z[omp_id], &nn);
  }
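
  // Solve Lambda*Utilde + Utilde*Lambda = Btilde elementwise: each transformed
  // entry is divided by the eigenvalue sum lambda_i + lambda_k, where
  // lambda_i = 2*(1 - cos((i+1)*pi/n)) is stored in diag[] above.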

  #pragma omp parallel for private(j, i)
  for (j=0; j < mpi_work; j++) { // MPI
    for (i=0; i < m; i++) {
      bt[j][i] = bt[j][i]/(diag[i]+diag[j + mpi_work * mpi_rank]);
    }
  }

  #pragma omp parallel for private(i, omp_id) schedule(static)
  for (i=0; i < mpi_work; i++) { // MPI cut + OMP
    omp_id = omp_get_thread_num();
    fst_(bt[i], &n, z[omp_id], &nn);
  }

  transpose (b,bt);

  #pragma omp parallel for private(j, omp_id)
  for (j=0; j < mpi_work; j++) { // MPI cut + OMP
    omp_id = omp_get_thread_num();
    fstinv_(b[j], &n, z[omp_id], &nn);
  }

  local_max = 0.0;
  omp_local_max = 0.0;

  #pragma omp parallel shared(local_max) private(j,i) firstprivate(omp_local_max)
  {
    // scan only the columns this rank owns (skip padded slots on the last rank)
    #pragma omp for nowait
    for (j=0; j < mpi_work; j++) {
      if (j + mpi_work * mpi_rank >= m) continue;
      for (i=0; i < m; i++) {
        if (b[j][i] > omp_local_max) omp_local_max = b[j][i];
      }
    }
    #pragma omp critical
    {
      if (omp_local_max > local_max) {
        local_max = omp_local_max;
      }
    }
  }

  MPI_Reduce(&local_max, &global_max, 1,
             MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
               
  free(diag);
  free(b[0]);
  free(b);
  free(bt[0]);
  free(bt);
  free(z[0]);
  free(z);
  double wall_end = MPI_Wtime();

  MPI_Finalize();

  if (mpi_rank == 0) {
    printf (" umax = %e, time = %.3fs \n", global_max,wall_end-wall_start);
    printf(" mpi_size = %d, omp_max_threads = %d, n = %d\n", mpi_size, omp_tot_threads, n);
  }
}
Example no. 3
0
int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int nprocs, rank;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int numthreads = omp_get_max_threads();

    if (argc < 2) {
        if (rank == 0) {
            printf("Usage:\n");
            printf("  poisson n\n\n");
            printf("Arguments:\n");
            printf("  n: the problem size (must be a power of 2)\n");
        }
        MPI_Finalize();
        return 1;
    }

    double time_start;
    if (rank == 0) {
        time_start = MPI_Wtime();
    }

    // The number of grid points in each direction is n+1
    // The number of degrees of freedom in each direction is n-1 = m
    int n = atoi(argv[1]);
    int m = n - 1;
    int nn = 4 * n;
    real h = 1.0 / n;

    // Splitting the matrix into column strips:
    int exact = n/nprocs;
    int rem = m - (nprocs - 1)*exact;
    // Each process owns a strip of size exact*m (rem*m for the last process).
    // Each strip is viewed as 'nprocs' blocks stacked vertically.
    int block_col = exact;
    int block_uk = exact*exact;
    int rem_uk = exact*rem;
    // The last strip has rem columns, so:
    if (rank == nprocs-1){
        block_col = rem;
        block_uk = rem*exact;
        rem_uk = rem*rem;
    }
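    // Illustrative values (not from the original): n = 16, m = 15, nprocs = 4
    // give exact = 4 and rem = 15 - 3*4 = 3, so ranks 0-2 own 4 columns each
    // and the last rank owns the remaining 3.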

    // Grid points
    real *grid = mk_1D_array(n+1, false);
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < n+1; i++) {
        grid[i] = i * h;
    }

    // The diagonal of the eigenvalue matrix of T
    real *diag = mk_1D_array(m, false);
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < m; i++) {
        diag[i] = 2.0 * (1.0 - cos((i+1) * PI / n));
    }

    // Initialize the right-hand-side data.
    // B is the column strip that this process owns.
    real **B = mk_2D_array(block_col, m, false);
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < block_col; i++) {
        for (size_t j = 0; j < m; j++) {
            B[i][j] = h * h * rhs(grid[i+1+(rank*exact)], grid[j+1]);
        }
    }

    // For the Sine Transform:
    real **z = mk_2D_array(numthreads, nn, false);

    // Calculate Btilde^T = S^-1 * (S * B)^T
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < block_col; i++) {
        fst_(B[i], &n, z[omp_get_thread_num()], &nn);
    }
    transpose(B, block_col, m, nprocs, block_uk, rem_uk, rank);
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < block_col; i++) {
        fstinv_(B[i], &n, z[omp_get_thread_num()], &nn);
    }

    // Solve Lambda * Xtilde = Btilde
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < block_col; i++) {
        for (size_t j = 0; j < m; j++) {
            B[i][j] = B[i][j] / (diag[i+(rank*exact)] + diag[j]);
        }
    }

    // Calculate X = S^-1 * (S * Xtilde^T) ^ T
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < block_col; i++) {
        fst_(B[i], &n, z[omp_get_thread_num()], &nn);
    }
    transpose(B, block_col, m, nprocs, block_uk, rem_uk, rank);
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < block_col; i++) {
        fstinv_(B[i], &n, z[omp_get_thread_num()], &nn);
    }

    // Calculate maximal value of solution
    double U_max = 0.0, e_max = 0.0, global_max, global_emax, error;
    for (size_t i = 0; i < block_col; i++){
        for (size_t j = 0; j < m; j++){
            error = fabs(B[i][j] - sin(PI*(i+1+(rank*exact))*h)*sin(2*PI*(j+1)*h));
            U_max = U_max > B[i][j] ? U_max : B[i][j];
            e_max = e_max > error ? e_max : error;
        }
    }

    // MPI_Max to find the true maximum:
    MPI_Reduce(&U_max, &global_max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    MPI_Reduce(&e_max, &global_emax, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    // Print the Global Maximum on process 0:
    if (rank == 0){
        printf("Problem Size = %d\tNumprocs = %d\tNumthreads = %d\n", n, nprocs, numthreads);
        printf("U_max = %0.16f\n", global_max);
        printf("E_max = %0.16f\n", global_emax);
        double duration = MPI_Wtime() - time_start;
        printf("Execution Time: %0.16f \n", duration);
    }

    MPI_Finalize();
    return 0;
}
Example no. 4
void runPoisson(int rank, int size, int n){
  double time=MPI_Wtime();
  Real **b, *diag, *RecvBuf,*z, h, maxError;
  int i, j, m, nn, *len, *disp;

  m  = n-1;
  nn = 4*n;
  splitVector(m, size, &len, &disp);
  diag = createRealArray (m);
  b    = createReal2DArray (len[rank],m);
  z    = createRealArray (nn);
  h    = 1./(Real)n;

  #pragma omp parallel for schedule(static)
  for (i=0; i < m; i++) {
    diag[i] = 2.*(1.-cos((i+1)*M_PI/(Real)n));
  }

  #pragma omp parallel for schedule(static) private(i)
  for (j=0; j < len[rank]; j++) {
    Real x = (Real)(j+1+disp[rank])/n;
    for (i=0; i < m; i++) {
      Real y = (Real)(i+1)/n;
      b[j][i] = h*h * funcf(x,y);
    }
  }

  #pragma omp parallel for schedule(static)
  for (j=0; j < len[rank]; j++) {
    Real* zt = createRealArray (nn);
    fst_(b[j], &n, zt, &nn);
    free(zt);
  }

  transpose(b, size, len, disp, rank, m);

  #pragma omp parallel for schedule(static)
  for (i=0; i < len[rank]; i++) {
    Real* zt  = createRealArray (nn);
    fstinv_(b[i], &n, zt, &nn);
    free(zt);
  }

  #pragma omp parallel for schedule(static) private(i)
  for (j=0; j < len[rank]; j++) {
    for (i=0; i < m; i++) {
      b[j][i] = b[j][i]/(diag[i]+diag[j+disp[rank]]);
    }
  }

  #pragma omp parallel for schedule(static)
  for (i=0; i < len[rank]; i++) {
    Real* zt  = createRealArray (nn);
    fst_(b[i], &n, zt, &nn);
    free(zt);
  }

  transpose(b, size, len, disp, rank, m);

  #pragma omp parallel for schedule(static)
  for (j=0; j < len[rank]; j++) {
    Real* zt  = createRealArray (nn);
    fstinv_(b[j], &n, zt, &nn);
    free(zt);
  }




  RecvBuf = NULL; /* only the root rank needs the gather buffer */
  if (rank==0)
  {
    RecvBuf = createRealArray (m*m);
  }
  gatherMatrix(b, m, RecvBuf, len, disp, 0);

  if (rank==0)
  {
    for (int j=0; j < m; j++) {
      for (int i=0; i < m; i++) {
        printf("%e %e %e \n",(Real)i/m,(Real)j/m,RecvBuf[j*m+i] );
      }
    }
  }
}
Example no. 5
0
int main(int argc, char **argv)
{
	Real *diag, **b, **bt, *z;
	Real pi, h, umax;
	int i, j, n, m, nn;

	/* the total number of grid points in each spatial direction is (n+1) */
	/* the total number of degrees-of-freedom in each spatial direction is (n-1) */
	/* this version requires n to be a power of 2 */

	if( argc < 2 ) {
		printf("need a problem size\n");
		return 1;
	}

	n  = atoi(argv[1]);
	m  = n-1;
	nn = 4*n;

	diag = createRealArray (m);
	b    = createReal2DArray (m,m);
	bt   = createReal2DArray (m,m);
	z    = createRealArray (nn);

	h    = 1./(Real)n;
	pi   = 4.*atan(1.);

	for (i=0; i < m; i++) {
		diag[i] = 2.*(1.-cos((i+1)*pi/(Real)n));
	}
	for (j=0; j < m; j++) {
		for (i=0; i < m; i++) {
			b[j][i] = h*h;
		}
	}
	for (j=0; j < m; j++) {
		fst_(b[j], &n, z, &nn);
	}

	transpose (bt,b,m);

	for (i=0; i < m; i++) {
		fstinv_(bt[i], &n, z, &nn);
	}

	for (j=0; j < m; j++) {
		for (i=0; i < m; i++) {
			bt[j][i] = bt[j][i]/(diag[i]+diag[j]);
		}
	}

	for (i=0; i < m; i++) {
		fst_(bt[i], &n, z, &nn);
	}

	transpose (b,bt,m);

	for (j=0; j < m; j++) {
		fstinv_(b[j], &n, z, &nn);
	}

	umax = 0.0;
	for (j=0; j < m; j++) {
		for (i=0; i < m; i++) {
			if (b[j][i] > umax) umax = b[j][i];
		}
	}
	printf (" umax = %e \n",umax);
}
Example no. 6
0
int main(int argc, char **argv)
{
 init_mpi(argc, argv,rank,grid_size,dims,coords,periods);
 if(grid_size==0) grid_size = 1; /* for debugging with only one processor */
 if( argc < 2 ) {
    printf("need a problem size\n");
    return 1;
  }
  
  double pi, h, l_umax;
  int n, m, nn, i, j, cols, l_diag_rows; 
  n = atoi(argv[1]);
  m = n-1; 
  nn = 4*n; 
  
  struct Array *len	= newArray(grid_size,1);
  struct Array *displ	= newArray(grid_size,1);

  splitVector(m, grid_size, len, displ); /* assumed usage of the helper: fill len/displ with per-rank counts and offsets */
  cols 	= m/grid_size;	/* number of columns of b assigned to each processor */
  l_diag_rows = m/grid_size;
  struct Array *diag = newArray(m,1);
  struct Array *l_diag = newArray(l_diag_rows,1);
  struct Array *b 	= newArray(m,cols);
  struct Array *bt	= newArray(m,cols);
  struct Array *z	= newArray(nn,1); 

  createDatatype(b,&grid_size);

  h 	= 1./(double)n;
  pi	= 4.*atan(1.);

  /*struct timeval start, end;
  gettimeofday(&start,NULL);
*/
  for(i=0;i<diag->rows;i++){
    diag->data[i] = 2.*(1.-cos((i+1)*pi/(double)n));
  }

/*
 #pragma omp for schedule(static)
  for(i=0+rank*l_diag->rows;i<((rank+1)*l_diag->rows);i++){
    *(l_diag->data+i) = 2.*(1.-cos((i+1)*pi/(double)n));
  }
  MPI_Allgather(l_diag->data,l_diag->rows,MPI_DOUBLE,diag->data,diag->size,MPI_DOUBLE,cart_comm);
*/
  #pragma omp parallel for schedule(static) 
  for(i=0;i<b->size;i++){
    b->data[i] = h*h;
  }
  
  #pragma omp parallel for schedule(static)
  for(i=0;i<b->cols;i++){
    struct Array *zt = newArray(nn,1); /* per-iteration scratch so concurrent FST calls do not share one buffer */
    fst_(b->data + i*b->rows, &n, zt->data, &nn);
    freeArray(zt);
  }
  /*Transpose b by sending/receiving rows of the columns in each local b; eg (l_b_0 = {1,2,a,b})+(l_b_1 = {3,4,c,d}) -> b_all = {1,a,3,c,2,b,4,d}*/
  MPI_Alltoall(b->data,grid_size,transpose_select_t,bt->data,1,transpose_insert_t,cart_comm);

  #pragma omp parallel for schedule(static)
  for(i=0;i<bt->cols;i++){
    struct Array *zt = newArray(nn,1);
    fstinv_(bt->data + i*bt->rows, &n, zt->data, &nn);
    freeArray(zt);
  }
  #pragma omp parallel for schedule(static) private(i)
  for(j=0;j<cols;j++){ /* local columns of the transposed strip */
    for(i=0;i<m;i++){
      *(bt->data + j*bt->rows + i) = *(bt->data + j*bt->rows + i)/(*(diag->data+i) + *(diag->data + j + rank*cols));
    }
  }
  #pragma omp parallel for schedule(static)
  for(i=0;i<cols;i++){
    struct Array *zt = newArray(nn,1);
    fst_(bt->data + i*bt->rows, &n, zt->data, &nn);
    freeArray(zt);
  }

  MPI_Alltoall(bt->data,grid_size,transpose_select_t,b->data,1,transpose_insert_t,cart_comm);
  
  #pragma omp parallel for schedule(static)
  for(i=0;i<b->cols;i++){
    struct Array *zt = newArray(nn,1);
    fstinv_(b->data + i*b->rows, &n, zt->data, &nn);
    freeArray(zt);
  }
 
  l_umax = 0.0;
  
  #pragma omp parallel for schedule(static) reduction(max:l_umax)
  for(i=0;i<b->size;i++){
    if(l_umax<*(b->data +i)) l_umax=*(b->data+i);
  }
  double umax = 0.0;
  struct Array *umaxArray = newArray(grid_size,1);
  /* MPI_Gather is collective: every rank must call it, and the receive count is per sender */
  MPI_Gather(&l_umax,1,MPI_DOUBLE,umaxArray->data,1,MPI_DOUBLE,0,cart_comm);
  if(rank==0){
    #pragma omp parallel for schedule(static) reduction(max:umax)
    for(i=0;i<grid_size;i++){
       if(umax<*(umaxArray->data +i)) umax=*(umaxArray->data+i);
    }
    printf (" umax = %e \n",umax);
    /*gettimeofday(&end,NULL);
    print_time(start,end);
    */
  }
  freeArray(umaxArray);

  freeArray(b);
  freeArray(bt);
  freeArray(l_diag);
  freeArray(diag);
  freeArray(z);
  freeDatatype();


  MPI_Finalize();

 return 0;
}
Example no. 7
0
int main(int argc, char **argv)
{

    int size, rank, number;

    MPI_Init(&argc, &argv); 
    MPI_Comm_size(MPI_COMM_WORLD , &size);
    MPI_Comm_rank(MPI_COMM_WORLD , &rank);

    double start =  MPI_Wtime(); // start the timer
    double umaxglob=0; // maximum error over all ranks

    if (argc < 2) {
        if (rank == 0) {
            printf("Usage:\n");
            printf("  poisson n\n\n");
            printf("Arguments:\n");
            printf("  n: the problem size (must be a power of 2)\n");
        }
        MPI_Finalize();
        return 1;
    }

    // The number of grid points in each direction is n+1
    // The number of degrees of freedom in each direction is n-1
    int n = atoi(argv[1]);
    
    int m = n - 1;  // number of points in each direction of B

    int *cnt = (int *) malloc(size * sizeof(int)); // local number of columns per processor
    int *displs = (int *) malloc((size+1) * sizeof(int)); // displacement of each processor's points in sendbuf
    displs[size] = m;
    displs[0]=0; // displacement of the first processor is always 0


   int overflow = m % size; // number of leftover columns

    for(int i = 0;i<size;i++){
        cnt[i] = m / size; // number of columns for each processor
        if (overflow != 0){
            cnt[i]++; // distribute the extra columns
            overflow--;
        }
        if (i < size-1){
            displs[i+1] = displs[i]+cnt[i];
        }

    }
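    // Illustrative values (not from the original): m = 10 and size = 4 give
    // overflow = 2, so cnt = {3, 3, 2, 2} and displs = {0, 3, 6, 8, 10}.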
 
    int nrColon = cnt[rank]; // number of columns this rank owns
    int pros_dof = nrColon*m;  // number of elements this rank owns


    int nn = 4 * n;
    double h = 1.0 / n;


    // Grid points
    double *grid = mk_1D_array(n+1, false);
    double **b = mk_2D_array(nrColon, m, false);
    double **bt = mk_2D_array(nrColon, m,false);

    int trad = omp_get_max_threads(); // number of threads
    double **z = mk_2D_array(trad,nn, false); // z is 2D because of the OpenMP parallelization: each thread gets its own FST buffer so it does not overwrite another thread's z

    double *diag = mk_1D_array(m, false);     
    double *sendbuf = mk_1D_array(nrColon*m, false);
    double *recbuf = mk_1D_array(nrColon*m, false); 


    int *sendcnt = (int *) malloc((size+1) * sizeof(int)); // number of elements to send to each processor
    int *sdispls = (int *) malloc((size+1) * sizeof(int)); // index into sendbuf for each processor

 

    sdispls[0]=0; // processor 0 always starts at index 0
    for(int i = 0;i<size;i++){
        sendcnt[i] = cnt[i]*cnt[rank]; // number of columns it owns * number of columns I own
        sdispls[i] = displs[i]*cnt[rank]; // displacement for each processor
    }
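    // Continuing the illustrative values above (m = 10, size = 4) for rank 0,
    // where cnt[rank] = 3: sendcnt = {9, 9, 6, 6} and sdispls = {0, 9, 18, 24},
    // i.e. cnt[i]*cnt[rank] elements are packed contiguously per destination.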

    // GRID
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < n+1; i++) {
        grid[i] = i * h;
    }




    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < m; i++) {
        diag[i] = 2.0 * (1.0 - cos((i+1) * PI / n)); //Eigenvalue
      }

  // Initialize the right hand side data 
    
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < nrColon; i++) {
        for (size_t j = 0; j < m; j++) {
        //  b[i][j] = h * h;
            b[i][j] = h * h * func1(grid[i+displs[rank]], grid[j]); // evaluate the source function, scaled by h*h
        }
    }

    // Calculate Btilde^T = S^-1 * (S * B)^T 
 
    #pragma omp parallel for schedule(guided, 5)
    for (size_t i = 0; i < nrColon; i++) {
        fst_(b[i], &n, z[omp_get_thread_num()], &nn);
    }
    MPItranspose (b, bt,nrColon,m, sendbuf,recbuf,sendcnt,sdispls, size, rank, displs);

    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < nrColon; i++) {
        fstinv_(bt[i], &n, z[omp_get_thread_num()], &nn);
    }

    // Solve Lambda * Xtilde = Btilde

    #pragma omp parallel for schedule(static)
    for (int j=0; j < nrColon; j++) {
        for (int i=0; i < m; i++) {
            bt[j][i] /= (diag[j+displs[rank]]+diag[i]);
        }
    }

    // Calculate X = S^-1 * (S * Xtilde^T)^T
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < nrColon; i++) {
        fst_(bt[i], &n, z[omp_get_thread_num()], &nn);
    }
    MPItranspose (bt, b, nrColon,m, sendbuf,recbuf,sendcnt,sdispls, size, rank, displs);

    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < nrColon; i++) {
        fstinv_(b[i], &n, z[omp_get_thread_num()], &nn);
    }

    // Calculate maximal value of solution
    double u_max = 0.0;

    #pragma omp parallel for schedule(static) reduction(max:u_max)
    for (size_t i = 0; i < nrColon; i++) {
        for (size_t j = 0; j < m; j++) {
            double temp = b[i][j] - func2(grid[displs[rank]+i], grid[j]); // difference from the known solution; should be ~0
            if (temp > u_max){
                u_max = temp;
            }
        }
    }
    MPI_Reduce (&u_max, &umaxglob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); // find the largest u_max across the processors and store it in umaxglob

    if (rank == 0) {
        printf("Nodes = %d \n", size);
        printf("Threads per node = %d \n", omp_get_max_threads());
        printf("u_max = %e\n", umaxglob);  // print the max error
        double times = MPI_Wtime()-start; // stop the timer
        printf("Time elapsed = %1.16f \n", times); // print the elapsed time
    }

    MPI_Finalize();
    return 0;
}
Example no. 8
0
int main(int argc, char **argv)
{
  Real *diag, **b, **bt, **z;
  Real pi, h, umax, f;
  int i,j,n, m, nn, numthreads, currentthread;

  

  /* the total number of grid points in each spatial direction is (n+1) */
  /* the total number of degrees-of-freedom in each spatial direction is (n-1) */
  /* this version requires n to be a power of 2 */

 if( argc < 2 ) {
    printf("need a problem size\n");
    return 1;
  }
  
 Real starttime, endtime, runtime, maxtime, mintime, temptime, avgruntime;
 Real timer1, timer2, timer3, timer4,timer5,timer6,timer7,timer8;
  int size, rank;
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD,&size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);


  n  = atoi(argv[1]);
  m  = n-1;
  nn = 4*n;
#ifdef HAVE_OPENMP
  numthreads = omp_get_max_threads();
#else
  numthreads = 1;
#endif
  
  
  // --- Distribute Work ---
  int *gsize;
  int *ssize;
  gsize = (int *)malloc(size*sizeof(int));
  ssize = (int *)malloc(size*sizeof(int));
  int remain = m % size;
  ssize[0] = 0;
  for (int q = 0;q<size;++q)
  {
    gsize[q] = m/size;
    if (q<remain) gsize[q]++;
    if (q<(size-1)) ssize[q+1] = ssize[q]+gsize[q];
  }
  // -----------------------
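  // Illustrative values (not from the original): m = 13 and size = 4 give
  // remain = 1, so gsize = {4, 3, 3, 3} and column offsets ssize = {0, 4, 7, 10}.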
  

  diag = createRealArray (m);
  b    = createReal2DArray (gsize[rank],m);
  bt   = createReal2DArray (gsize[rank],m);
  z    = createReal2DArray (numthreads,nn); // one FST work buffer per thread, so concurrent transforms do not clash

  h    = 1./(Real)n;
  pi   = 4.*atan(1.);

  starttime = MPI_Wtime();

  #pragma omp parallel for schedule(static)
  for ( i=0; i < m; i++) {
    diag[i] = 2.*(1.-cos((i+1)*pi/(Real)n));
  }
  
  #pragma omp parallel for private(i,f) schedule(static)
  for ( j=0; j < gsize[rank]; j++) {
    for ( i=0; i < m; i++) {

      f = 5*pi*pi*sin(pi*h*(j+ssize[rank]+1))*sin(2*pi*h*(i+1));
      //f = 1.0;
      //f =2*( ((j+1+ssize[rank])*h)*((j+1+ssize[rank])*h) + (i+1)*(i+1)*h*h  );
      
      b[j][i] = h*h*f;
      //b[j][i] = (j+ssize[rank])*m+i; transpose test
    }
  }
  timer1 = MPI_Wtime();

  
  
  // ---------- DEBUG PART ---------------
    /*if (rank == 0)
  {
    printVector(b[0],m*gsize[rank]);
  }
  */
  /*
  if(rank ==0)
  {
    printf("\n\n\n");
    printMatrix(b,m,gsize[rank]);
    printf("\n\n\n");
  }
  
    transpose (bt,b,&m,m,size,rank,gsize,ssize);
  
  
  if(rank ==0)
  {
    printMatrix(bt,m,gsize[rank]);
    printf("\n\n\n");
  }
  */
  // ----------- DEBUG END ---------------
  
  // Alternatively: put the #ifdef outside the loop and use two separate loops
  #pragma omp parallel for private(currentthread) schedule(static)
  for ( j=0; j < gsize[rank]; j++) {
    currentthread = 0;
    #ifdef HAVE_OPENMP
      currentthread = omp_get_thread_num();
    #endif
    fst_(b[j], &n, z[currentthread], &nn);
  }
  timer2 = MPI_Wtime();
/*
  if (rank == 0)
  {
    printf("\n\n\n");
    printMatrix(b,m);
  }
*/  

  transpose (bt,b,&m,m,size,rank,gsize,ssize);
  timer3 = MPI_Wtime();
/*
  if (rank == 0)
    {
      printf("\n\n\n");
      printMatrix(bt,m);
      printf("\n\n\n");
  }
*/

  

  #pragma omp parallel for private(currentthread) schedule(static)
  for ( i=0; i < gsize[rank]; i++) {
    currentthread = 0;
    #ifdef HAVE_OPENMP
      currentthread = omp_get_thread_num();
    #endif
    fstinv_(bt[i], &n, z[currentthread], &nn);
  }
  timer4 = MPI_Wtime();
  #pragma omp parallel for private(i) schedule(static)
  for ( j=0; j < gsize[rank]; j++) {
    for ( i=0; i < m; i++) {
      bt[j][i] = bt[j][i]/(diag[i]+diag[j+ssize[rank]]); //offset implemented
    }
  }
  timer5 = MPI_Wtime();
  #pragma omp parallel for private(currentthread) schedule(static)
  for ( i=0; i < gsize[rank]; i++) {
    currentthread = 0;
    #ifdef HAVE_OPENMP
      currentthread = omp_get_thread_num();
    #endif
    fst_(bt[i], &n, z[currentthread], &nn);
  }
  timer6 = MPI_Wtime();
  
  transpose (b,bt,&m,m,size,rank,gsize,ssize);
  timer7 = MPI_Wtime();
  #pragma omp parallel for private(currentthread) schedule(static)
  for ( j=0; j < gsize[rank]; j++) {
    currentthread = 0;
    #ifdef HAVE_OPENMP
      currentthread = omp_get_thread_num();
    #endif
    fstinv_(b[j], &n, z[currentthread], &nn);
  }
  timer8 = MPI_Wtime();
  endtime = MPI_Wtime();
  
  runtime = endtime-starttime;
  timer8 -=timer7;timer7 -=timer6;timer6 -=timer5;timer5 -=timer4;
  timer4 -=timer3;timer3 -=timer2;timer2 -=timer1;timer1 -=starttime;
  
  MPI_Allreduce(&runtime, &maxtime, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  MPI_Allreduce(&runtime, &mintime, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
  MPI_Allreduce(&runtime, &avgruntime, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  avgruntime /= (Real) size;

  MPI_Allreduce(MPI_IN_PLACE, &timer1, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(MPI_IN_PLACE, &timer2, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(MPI_IN_PLACE, &timer3, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(MPI_IN_PLACE, &timer4, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(MPI_IN_PLACE, &timer5, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(MPI_IN_PLACE, &timer6, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(MPI_IN_PLACE, &timer7, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(MPI_IN_PLACE, &timer8, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  timer1/=(Real)size;timer2/=(Real)size;timer3/=(Real)size;timer4/=(Real)size;
  timer5/=(Real)size;timer6/=(Real)size;timer7/=(Real)size;timer8/=(Real)size;

  if (rank == 0) printf("Slowest runtime: %e s\n", maxtime);
  if (rank == 0) printf("Fastest runtime: %e s\n", mintime);
  if (rank == 0) printf("Average runtime: %e s\n", avgruntime);
  
  if (rank == 0) printf("Init: \t\t %f p.c.\n", timer1/avgruntime);
  if (rank == 0) printf("FST1: \t\t %f p.c.\n", timer2/avgruntime);
  if (rank == 0) printf("Transpose1: \t %f p.c.\n", timer3/avgruntime);
  if (rank == 0) printf("FSTINV1: \t %f p.c.\n", timer4/avgruntime);
  if (rank == 0) printf("Diag: \t\t %f p.c.\n", timer5/avgruntime);
  if (rank == 0) printf("FST2: \t\t %f p.c.\n", timer6/avgruntime);
  if (rank == 0) printf("Transpose2: \t %f p.c.\n", timer7/avgruntime);
  if (rank == 0) printf("FSTINV2: \t %f p.c.\n", timer8/avgruntime);
  if (rank == 0) printf("\nFST TOTAL: \t\t %f p.c.\t%es\n",(timer2+timer4+timer6+timer8)/avgruntime,timer2+timer4+timer6+timer8);
  if (rank == 0) printf("\nTranspose TOTAL: \t %f p.c.\t%es\n",(timer3+timer7)/avgruntime,timer3+timer7);





  umax = 0.0;

 
  for ( j=0; j < gsize[rank]; j++) {
    for ( i=0; i < m; i++) {
      if (b[j][i] > umax) umax = b[j][i];
    }
  }
  
  
  
  
  MPI_Allreduce(MPI_IN_PLACE, &umax, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  
  if (rank == 0) printf (" umax = %e \n",umax);
  
  //printVector(b[m/4],m);
  
  
  Real locerr, err;
  
  locerr = checkError( b,m, h,size, rank, gsize, ssize);
  
  MPI_Allreduce(&locerr ,&err , 1 ,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD);
  
  if (rank == 0) printf("\nMaximal error: %e\n", err);
  
  
  //printMatrix(b,m,m);
  
  /***********************************************************/
  /***********************************************************/
  /*                       FREE MEMORY                       */
  /***********************************************************/
  /***********************************************************/
  
  free(b[0]);
  free(bt[0]);
  free(b);
  free(bt);
  free(z[0]);
  free(z);
  free(diag);
  free(gsize);
  free(ssize);
  /***********************************************************/
  /***********************************************************/
  /***********************************************************/
  MPI_Finalize();

  return 0;
}
Example no. 9
0
int main(int argc, char **argv )
{
  Real *diag, **A, *z;
  Real pi, h, umax, globalumax, emax, globalemax, error, time;
  int i, j, n, m, nn, b, re, l, bb, bre, rank, size;

  MPI_Init (&argc, &argv);
  MPI_Comm_rank (MPI_COMM_WORLD, &rank);
  MPI_Comm_size (MPI_COMM_WORLD, &size);

  /* the total number of grid points in each spatial direction is (n+1) */
  /* the total number of degrees-of-freedom in each spatial direction is (n-1) */
  /* this version requires n to be a power of 2 */

 if( argc < 2 ) {
    printf("need a problem size\n");
    MPI_Finalize();
    return 0;
  }

  n  = atoi(argv[1]);
  m  = n-1;
  nn = 4*n;

  h    = 1./(Real)n;
  pi   = 4.*atan(1.);

  b   = m/size;
  re  = m - (size-1)*b;
  l   = b;
  bb  = b*b;
  bre = b*re;

  if(rank+1 == size) {
    l   = re;
    bb  = bre;
    bre = re*re;
  }
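  // Illustrative values (not from the original): m = 15 and size = 4 give
  // b = 3 and re = 15 - 3*3 = 6, so ranks 0-2 own 3 columns each while the
  // last rank owns l = re = 6; bb and bre are the block sizes passed to transpose().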

  diag = createRealArray (m);
  A    = createReal2DArray (l,m);
  z    = createRealArray (nn);

  time = MPI_Wtime();

  for (i=0; i < m; i++) {
    diag[i] = 2.*(1.-cos((i+1)*pi/(Real)n));
  }

  for (j=0; j < l; j++) {
    for (i=0; i < m; i++) {
      //        h^2 * f(x,y)
      A[j][i] = h*h*5*pi*pi*sin(pi*(i+1)*h)*sin(2*pi*(j+1 + rank*b)*h);
    }
  }
  
  for (j=0; j < l; j++) {
    fst_(A[j], &n, z, &nn);
  }

  transpose(A, l, m, size, bb, bre);

  for (i=0; i < l; i++) {
    fstinv_(A[i], &n, z, &nn);
  }  

  for (j=0; j < l; j++) {
    for (i=0; i < m; i++) {
      A[j][i] = A[j][i]/(diag[i]+diag[j + rank*b]);
    }
  }
  
  for (i=0; i < l; i++) {
    fst_(A[i], &n, z, &nn);
  }

  transpose(A, l, m, size, bb, bre);

  for (j=0; j < l; j++) {
    fstinv_(A[j], &n, z, &nn);
  }

  umax = 0.0;
  emax = 0.0;
  for (j=0; j < l; j++) {
    for (i=0; i < m; i++) {
      // error =  abs( numerical u(x,y) - exact u(x,y) )
      error = fabs(A[j][i] - sin(pi*(i+1)*h)*sin(2*pi*(j+1 + rank*b)*h));
      if (A[j][i] > umax) umax = A[j][i];
      if (error > emax) emax = error;
    }
  }

  MPI_Reduce (&umax, &globalumax, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
  MPI_Reduce (&emax, &globalemax, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
  if (rank == 0)
  {
    printf("elapsed: %f\n", MPI_Wtime()-time);
    printf ("umax = %e \n",globalumax);
    printf ("emax = %e \n",globalemax);
  }

  MPI_Finalize();
  return 0;
}