void ImageDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const int new_height = this->layer_param_.image_data_param().new_height();
  const int new_width  = this->layer_param_.image_data_param().new_width();
  const bool is_color  = this->layer_param_.image_data_param().is_color();
  string root_folder = this->layer_param_.image_data_param().root_folder();

  CHECK((new_height == 0 && new_width == 0) ||
      (new_height > 0 && new_width > 0)) << "Current implementation requires "
      "new_height and new_width to be set at the same time.";
  // Read the file with filenames and labels
  const string& source = this->layer_param_.image_data_param().source();
  LOG(INFO) << "Opening file " << source;
  std::ifstream infile(source.c_str());
  CHECK(infile.good()) << "Could not open file " << source;
  string filename;
  int label;
  while (infile >> filename >> label) {
    lines_.push_back(std::make_pair(filename, label));
  }

  if (this->layer_param_.image_data_param().shuffle()) {
    // randomly shuffle data
    LOG(INFO) << "Shuffling data";
    const unsigned int prefetch_rng_seed = caffe_rng_rand();
    prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed));
    ShuffleImages();
  }
  LOG(INFO) << "A total of " << lines_.size() << " images.";

  lines_id_ = 0;
#ifdef USE_MPI
  // For multi-GPU testing, each rank starts at its own contiguous
  // slice of the validation set.
  int rank = 0, size = 1;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  lines_id_ = (lines_.size()/size)*rank;
#endif
  // Check if we would need to randomly skip a few data points
  if (this->layer_param_.image_data_param().rand_skip()) {
    unsigned int skip = caffe_rng_rand() %
        this->layer_param_.image_data_param().rand_skip();
    LOG(INFO) << "Skipping first " << skip << " data points.";
    CHECK_GT(lines_.size(), skip) << "Not enough points to skip";
    lines_id_ += skip;
  }
  // Read an image, and use it to initialize the top blob.
  cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
                                    new_height, new_width, is_color);
  CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first;
  // Use data_transformer to infer the expected blob shape from a cv_image.
  vector<int> top_shape = this->data_transformer_->InferBlobShape(cv_img);
  this->transformed_data_.Reshape(top_shape);
  // Reshape prefetch_data and top[0] according to the batch_size.
  const int batch_size = this->layer_param_.image_data_param().batch_size();
  CHECK_GT(batch_size, 0) << "Positive batch size required";
  top_shape[0] = batch_size;
  for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
    this->prefetch_[i].data_.Reshape(top_shape);
  }
  top[0]->Reshape(top_shape);

  LOG(INFO) << "output data size: " << top[0]->num() << ","
      << top[0]->channels() << "," << top[0]->height() << ","
      << top[0]->width();
  // label
  vector<int> label_shape(1, batch_size);
  top[1]->Reshape(label_shape);
  for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
    this->prefetch_[i].label_.Reshape(label_shape);
  }
}
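This layer is configured through its ImageDataParameter. A minimal prototxt sketch matching the fields read above (the list file, folder and sizes are placeholders):

layer {
  name: "data"
  type: "ImageData"
  top: "data"
  top: "label"
  image_data_param {
    source: "train_list.txt"   # one "<image path> <integer label>" per line
    root_folder: "images/"     # prepended to every path in source
    batch_size: 32
    new_height: 256            # resize: set both or neither (see the CHECK above)
    new_width: 256
    is_color: true
    shuffle: true
  }
}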
Example #2
int main(int argc,char **argv)
{
  PetscMPIInt    rank,size;
  PetscErrorCode ierr;
  PetscInt       M = 14,time_steps = 1000,w=1,s=1,localsize,j,i,mybase,myend;
  DA             da;
  PetscViewer    viewer;
  PetscDraw      draw;
  Vec            local,global,copy;
  PetscScalar    *localptr,*copyptr;
  PetscReal       h,k;
 
  ierr = PetscInitialize(&argc,&argv,(char*)0,help);CHKERRQ(ierr); 

  ierr = PetscOptionsGetInt(PETSC_NULL,"-M",&M,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscOptionsGetInt(PETSC_NULL,"-time",&time_steps,PETSC_NULL);CHKERRQ(ierr);
    
  /* Set up the array */ 
  ierr = DACreate1d(PETSC_COMM_WORLD,DA_NONPERIODIC,M,w,s,PETSC_NULL,&da);CHKERRQ(ierr);
  ierr = DACreateGlobalVector(da,&global);CHKERRQ(ierr);
  ierr = DACreateLocalVector(da,&local);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);

  /* Make copy of local array for doing updates */
  ierr = VecDuplicate(local,&copy);CHKERRQ(ierr);

  /* Set Up Display to Show Heat Graph */
  ierr = PetscViewerDrawOpen(PETSC_COMM_WORLD,0,"",80,480,500,160,&viewer);CHKERRQ(ierr);
  ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
  ierr = PetscDrawSetDoubleBuffer(draw);CHKERRQ(ierr);

  /* determine starting point of each processor */
  ierr = VecGetOwnershipRange(global,&mybase,&myend);CHKERRQ(ierr);

  /* Initialize the Array */
  ierr = VecGetLocalSize(local,&localsize);CHKERRQ(ierr);
  ierr = VecGetArray(local,&localptr);CHKERRQ(ierr);
  ierr = VecGetArray(copy,&copyptr);CHKERRQ(ierr);
  localptr[0] = copyptr[0] = 0.0;
  localptr[localsize-1] = copyptr[localsize-1] = 1.0;
  for (i=1; i<localsize-1; i++) {
    j=(i-1)+mybase; 
    localptr[i] = sin((PETSC_PI*j*6)/((PetscReal)M) 
                        + 1.2 * sin((PETSC_PI*j*2)/((PetscReal)M))) * 4+4;
  }

  ierr = VecRestoreArray(local,&localptr);CHKERRQ(ierr);
  ierr = VecRestoreArray(copy,&copyptr);CHKERRQ(ierr);
  ierr = DALocalToGlobal(da,local,INSERT_VALUES,global);CHKERRQ(ierr);

  /* Assign parameters: mesh spacing h and time step k */
  h = 1.0/M;
  k = h*h/2.2;
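  /* The explicit (FTCS) update below is stable only for k <= h*h/2;
     k = h*h/2.2 satisfies this with a small safety margin. */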

  for (j=0; j<time_steps; j++) {  

    /* Global to Local */
    ierr = DAGlobalToLocalBegin(da,global,INSERT_VALUES,local);CHKERRQ(ierr);
    ierr = DAGlobalToLocalEnd(da,global,INSERT_VALUES,local);CHKERRQ(ierr);

    /* Extract the local arrays */
    ierr = VecGetArray(local,&localptr);CHKERRQ(ierr);
    ierr = VecGetArray(copy,&copyptr);CHKERRQ(ierr);

    /* Update locally - compute the new values into the copy array.    */
    /* Note: the first and last local entries (boundary/ghost points)  */
    /* are deliberately left untouched.                                 */
    for (i=1; i< localsize-1; i++) {
      copyptr[i] = localptr[i] + (k/(h*h)) *
                           (localptr[i+1]-2.0*localptr[i]+localptr[i-1]);
    }
  
    ierr = VecRestoreArray(copy,&copyptr);CHKERRQ(ierr);
    ierr = VecRestoreArray(local,&localptr);CHKERRQ(ierr);

    /* Local to Global */
    ierr = DALocalToGlobal(da,copy,INSERT_VALUES,global);CHKERRQ(ierr);
  
    /* View Wave */ 
    ierr = VecView(global,viewer);CHKERRQ(ierr);

  }

  ierr = PetscViewerDestroy(viewer);CHKERRQ(ierr);
  ierr = VecDestroy(copy);CHKERRQ(ierr);
  ierr = VecDestroy(local);CHKERRQ(ierr);
  ierr = VecDestroy(global);CHKERRQ(ierr);
  ierr = DADestroy(da);CHKERRQ(ierr);
  ierr = PetscFinalize();CHKERRQ(ierr);
  return 0;
}
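A typical invocation (the executable name is a placeholder for whatever the build produces) would be

   mpiexec -n 4 ./heat -M 60 -time 500

where -M sets the global grid size and -time the number of explicit steps, as parsed by the PetscOptionsGetInt calls above.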
Example #3
int main (int argc, char *argv[])
{
   int myid, num_procs;
   int n, N, pi, pj, pk;
   double h;

   double tol, theta;
   int maxit, cycle_type;
   int rlx_type, rlx_sweeps;
   double rlx_weight, rlx_omega;
   int amg_coarsen_type, amg_agg_levels, amg_rlx_type;
   int amg_interp_type, amg_Pmax;
   int singular_problem;
   /* coefficient choices; presumably file-scope in the full source,
      declared here so this excerpt is self-contained */
   int optionAlpha, optionBeta;

   HYPRE_Int time_index;

   HYPRE_SStructGrid     edge_grid;
   HYPRE_SStructGraph    A_graph;
   HYPRE_SStructMatrix   A;
   HYPRE_SStructVector   b;
   HYPRE_SStructVector   x;
   HYPRE_SStructGrid     node_grid;
   HYPRE_SStructGraph    G_graph;
   HYPRE_SStructStencil  G_stencil[3];
   HYPRE_SStructMatrix   G;
   HYPRE_SStructVector   xcoord, ycoord, zcoord;

   HYPRE_Solver          solver, precond;

   /* Initialize MPI */
   MPI_Init(&argc, &argv);
   MPI_Comm_rank(MPI_COMM_WORLD, &myid);
   MPI_Comm_size(MPI_COMM_WORLD, &num_procs);

   /* Set default parameters */
   n                = 10;
   optionAlpha      = 0;
   optionBeta       = 0;
   maxit            = 100;
   tol              = 1e-6;
   cycle_type       = 13;
   rlx_type         = 2;
   rlx_sweeps       = 1;
   rlx_weight       = 1.0;
   rlx_omega        = 1.0;
   amg_coarsen_type = 10;
   amg_agg_levels   = 1;
   amg_rlx_type     = 6;
   theta            = 0.25;
   amg_interp_type  = 6;
   amg_Pmax         = 4;
   singular_problem = 0;

   /* Parse command line */
   {
      int arg_index = 0;
      int print_usage = 0;

      while (arg_index < argc)
      {
         if ( strcmp(argv[arg_index], "-n") == 0 )
         {
            arg_index++;
            n = atoi(argv[arg_index++]);
         }
         else if ( strcmp(argv[arg_index], "-a") == 0 )
         {
            arg_index++;
            optionAlpha = atoi(argv[arg_index++]);
         }
         else if ( strcmp(argv[arg_index], "-b") == 0 )
         {
            arg_index++;
            optionBeta = atoi(argv[arg_index++]);
         }
         else if ( strcmp(argv[arg_index], "-maxit") == 0 )
         {
            arg_index++;
            maxit = atoi(argv[arg_index++]);
         }
         else if ( strcmp(argv[arg_index], "-tol") == 0 )
         {
            arg_index++;
            tol = atof(argv[arg_index++]);
         }
         else if ( strcmp(argv[arg_index], "-type") == 0 )
         {
            arg_index++;
            cycle_type = atoi(argv[arg_index++]);
         }
         else if ( strcmp(argv[arg_index], "-rlx") == 0 )
         {
            arg_index++;
            rlx_type = atoi(argv[arg_index++]);
         }
         else if ( strcmp(argv[arg_index], "-rlxn") == 0 )
         {
            arg_index++;
            rlx_sweeps = atoi(argv[arg_index++]);
         }
         else if ( strcmp(argv[arg_index], "-rlxw") == 0 )
         {
            arg_index++;
            rlx_weight = atof(argv[arg_index++]);
         }
         else if ( strcmp(argv[arg_index], "-rlxo") == 0 )
         {
            arg_index++;
            rlx_omega = atof(argv[arg_index++]);
         }
         else if ( strcmp(argv[arg_index], "-ctype") == 0 )
         {
            arg_index++;
            amg_coarsen_type = atoi(argv[arg_index++]);
         }
         else if ( strcmp(argv[arg_index], "-amgrlx") == 0 )
         {
            arg_index++;
            amg_rlx_type = atoi(argv[arg_index++]);
         }
         else if ( strcmp(argv[arg_index], "-agg") == 0 )
         {
            arg_index++;
            amg_agg_levels = atoi(argv[arg_index++]);
         }
         else if ( strcmp(argv[arg_index], "-itype") == 0 )
         {
            arg_index++;
            amg_interp_type = atoi(argv[arg_index++]);
         }
         else if ( strcmp(argv[arg_index], "-pmax") == 0 )
         {
            arg_index++;
            amg_Pmax = atoi(argv[arg_index++]);
         }
         else if ( strcmp(argv[arg_index], "-sing") == 0 )
         {
            arg_index++;
            singular_problem = 1;
         }
         else if ( strcmp(argv[arg_index], "-theta") == 0 )
         {
            arg_index++;
            theta = atof(argv[arg_index++]);
         }

         else if ( strcmp(argv[arg_index], "-help") == 0 )
         {
            print_usage = 1;
            break;
         }
         else
         {
            arg_index++;
         }
      }

      if ((print_usage) && (myid == 0))
      {
         printf("\n");
         printf("Usage: %s [<options>]\n", argv[0]);
         printf("\n");
         printf("  -n <n>              : problem size per processor (default: 10)\n");
         printf("  -a <alpha_opt>      : choice for the curl-curl coefficient (default: 1)\n");
         printf("  -b <beta_opt>       : choice for the mass coefficient (default: 1)\n");
         printf("\n");
         printf("PCG-AMS solver options:                                     \n");
         printf("  -maxit <num>        : maximum number of iterations (100)  \n");
         printf("  -tol <num>          : convergence tolerance (1e-6)        \n");
         printf("  -type <num>         : 3-level cycle type (0-8, 11-14)     \n");
         printf("  -theta <num>        : BoomerAMG threshold (0.25)          \n");
         printf("  -ctype <num>        : BoomerAMG coarsening type           \n");
         printf("  -agg <num>          : Levels of BoomerAMG agg. coarsening \n");
         printf("  -amgrlx <num>       : BoomerAMG relaxation type           \n");
         printf("  -itype <num>        : BoomerAMG interpolation type        \n");
         printf("  -pmax <num>         : BoomerAMG interpolation truncation  \n");
         printf("  -rlx <num>          : relaxation type                     \n");
         printf("  -rlxn <num>         : number of relaxation sweeps         \n");
         printf("  -rlxw <num>         : damping parameter (usually <=1)     \n");
         printf("  -rlxo <num>         : SOR parameter (usually in (0,2))    \n");
         printf("  -sing               : curl-curl only (singular) problem   \n");
         printf("\n");
         printf("\n");
      }

      if (print_usage)
      {
         MPI_Finalize();
         return (0);
      }
   }

   /* Figure out the processor grid (N x N x N).  The local problem size is n^3,
      while pi, pj and pk indicate the position in the processor grid. */
   N  = pow(num_procs,1.0/3.0) + 0.5;
   if (num_procs != N*N*N)
   {
      if (myid == 0) printf("Can't run on %d processors, try %d.\n",
                            num_procs, N*N*N);
      MPI_Finalize();
      exit(1);
   }
   h  = 1.0 / (N*n);
   pk = myid / (N*N);
   pj = myid/N - pk*N;
   pi = myid - pj*N - pk*N*N;
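   /* Worked example (assuming 8 ranks, so N = 2): for myid = 5,
      pk = 5/4 = 1, pj = 5/2 - 1*2 = 0, pi = 5 - 0*2 - 1*4 = 1,
      i.e. rank 5 owns box (pi,pj,pk) = (1,0,1) of the 2x2x2 grid. */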

   /* Start timing */
   time_index = hypre_InitializeTiming("SStruct Setup");
   hypre_BeginTiming(time_index);

   /* 1. Set up the edge and nodal grids.  Note that we do this simultaneously
         to make sure that they have the same extents.  For simplicity we use
         only one part to represent the unit cube. */
   {
      HYPRE_Int ndim = 3;
      HYPRE_Int nparts = 1;

      /* Create empty 3D grid objects */
      HYPRE_SStructGridCreate(MPI_COMM_WORLD, ndim, nparts, &node_grid);
      HYPRE_SStructGridCreate(MPI_COMM_WORLD, ndim, nparts, &edge_grid);

      /* Set the extents of the grid - each processor sets its grid boxes. */
      {
         HYPRE_Int part = 0;
         HYPRE_Int ilower[3] = {1 + pi*n, 1 + pj*n, 1 + pk*n};
         HYPRE_Int iupper[3] = {n + pi*n, n + pj*n, n + pk*n};

         HYPRE_SStructGridSetExtents(node_grid, part, ilower, iupper);
         HYPRE_SStructGridSetExtents(edge_grid, part, ilower, iupper);
      }

      /* Set the variable type and number of variables on each grid. */
      {
         HYPRE_Int i;
         HYPRE_Int nnodevars = 1;
         HYPRE_Int nedgevars = 3;

         HYPRE_SStructVariable nodevars[1] = {HYPRE_SSTRUCT_VARIABLE_NODE};
         HYPRE_SStructVariable edgevars[3] = {HYPRE_SSTRUCT_VARIABLE_XEDGE,
                                              HYPRE_SSTRUCT_VARIABLE_YEDGE,
                                              HYPRE_SSTRUCT_VARIABLE_ZEDGE};
         for (i = 0; i < nparts; i++)
         {
            HYPRE_SStructGridSetVariables(node_grid, i, nnodevars, nodevars);
            HYPRE_SStructGridSetVariables(edge_grid, i, nedgevars, edgevars);
         }
      }

      /* Since there is only one part, there is no need to call the
         SetNeighborPart or SetSharedPart functions, which determine the spatial
         relation between the parts.  See Examples 12, 13 and 14 for
         illustrations of these calls. */

      /* Now the grids are ready to be used */
      HYPRE_SStructGridAssemble(node_grid);
      HYPRE_SStructGridAssemble(edge_grid);
   }

   /* 2. Create the finite element stiffness matrix A and load vector b. */
   {
      HYPRE_Int part = 0; /* this problem has only one part */

      /* Set the ordering of the variables in the finite element problem.  This
         is done by listing the variable offset directions relative to the
         element's center.  See the Reference Manual for more details. */
      {
         HYPRE_Int ordering[48] = { 0,  0, -1, -1,    /* x-edge [0]-[1] */
                                    1, +1,  0, -1,    /* y-edge [1]-[2] */
         /*     [7]------[6]  */    0,  0, +1, -1,    /* x-edge [3]-[2] */
         /*     /|       /|   */    1, -1,  0, -1,    /* y-edge [0]-[3] */
         /*    / |      / |   */    0,  0, -1, +1,    /* x-edge [4]-[5] */
         /*  [4]------[5] |   */    1, +1,  0, +1,    /* y-edge [5]-[6] */
         /*   | [3]----|-[2]  */    0,  0, +1, +1,    /* x-edge [7]-[6] */
         /*   | /      | /    */    1, -1,  0, +1,    /* y-edge [4]-[7] */
         /*   |/       |/     */    2, -1, -1,  0,    /* z-edge [0]-[4] */
         /*  [0]------[1]     */    2, +1, -1,  0,    /* z-edge [1]-[5] */
                                    2, +1, +1,  0,    /* z-edge [2]-[6] */
                                    2, -1, +1,  0 };  /* z-edge [3]-[7] */

         HYPRE_SStructGridSetFEMOrdering(edge_grid, part, ordering);
      }

      /* Set up the Graph - this determines the non-zero structure of the
         matrix. */
      {
         HYPRE_Int part = 0;

         /* Create the graph object */
         HYPRE_SStructGraphCreate(MPI_COMM_WORLD, edge_grid, &A_graph);

         /* See MatrixSetObjectType below */
         HYPRE_SStructGraphSetObjectType(A_graph, HYPRE_PARCSR);

         /* Indicate that this problem uses finite element stiffness matrices and
            load vectors, instead of stencils. */
         HYPRE_SStructGraphSetFEM(A_graph, part);

         /* The edge finite element matrix is full, so there is no need to call the
            HYPRE_SStructGraphSetFEMSparsity() function. */

         /* Assemble the graph */
         HYPRE_SStructGraphAssemble(A_graph);
      }

      /* Set up the SStruct Matrix and right-hand side vector */
      {
         /* Create the matrix object */
         HYPRE_SStructMatrixCreate(MPI_COMM_WORLD, A_graph, &A);
         /* Use a ParCSR storage */
         HYPRE_SStructMatrixSetObjectType(A, HYPRE_PARCSR);
         /* Indicate that the matrix coefficients are ready to be set */
         HYPRE_SStructMatrixInitialize(A);

         /* Create an empty vector object */
         HYPRE_SStructVectorCreate(MPI_COMM_WORLD, edge_grid, &b);
         /* Use a ParCSR storage */
         HYPRE_SStructVectorSetObjectType(b, HYPRE_PARCSR);
         /* Indicate that the vector coefficients are ready to be set */
         HYPRE_SStructVectorInitialize(b);
      }

      /* Set the matrix and vector entries by finite element assembly */
      {
         /* local stiffness matrix and load vector */
         double S[12][12], F[12];

         int i, j, k;
         HYPRE_Int index[3];

         for (i = 1; i <= n; i++)
            for (j = 1; j <= n; j++)
               for (k = 1; k <= n; k++)
               {
                  /* Compute the FEM matrix and r.h.s. for cell (i,j,k) with
                     coefficients evaluated at the cell center. */
                  index[0] = i + pi*n; index[1] = j + pj*n; index[2] = k + pk*n;
                  ComputeFEMND1(S,F,(pi*n+i)*h-h/2,(pj*n+j)*h-h/2,(pk*n+k)*h-h/2,h);

                  /* Eliminate boundary conditions on x = 0 */
                  if (index[0] == 1)
                  {
                     int ii, jj, bc_edges[4] = { 3, 11, 7, 8 };
                     for (ii = 0; ii < 4; ii++)
                     {
                        for (jj = 0; jj < 12; jj++)
                           S[bc_edges[ii]][jj] = S[jj][bc_edges[ii]] = 0.0;
                        S[bc_edges[ii]][bc_edges[ii]] = 1.0;
                        F[bc_edges[ii]] = 0.0;
                     }
                  }
                  /* Eliminate boundary conditions on y = 0 */
                  if (index[1] == 1)
                  {
                     int ii, jj, bc_edges[4] = { 0, 9, 4, 8 };
                     for (ii = 0; ii < 4; ii++)
                     {
                        for (jj = 0; jj < 12; jj++)
                           S[bc_edges[ii]][jj] = S[jj][bc_edges[ii]] = 0.0;
                        S[bc_edges[ii]][bc_edges[ii]] = 1.0;
                        F[bc_edges[ii]] = 0.0;
                     }
                  }
                  /* Eliminate boundary conditions on z = 0 */
                  if (index[2] == 1)
                  {
                     int ii, jj, bc_edges[4] = { 0, 1, 2, 3 };
                     for (ii = 0; ii < 4; ii++)
                     {
                        for (jj = 0; jj < 12; jj++)
                           S[bc_edges[ii]][jj] = S[jj][bc_edges[ii]] = 0.0;
                        S[bc_edges[ii]][bc_edges[ii]] = 1.0;
                        F[bc_edges[ii]] = 0.0;
                     }
                  }
                  /* Eliminate boundary conditions on x = 1 */
                  if (index[0] == N*n)
                  {
                     int ii, jj, bc_edges[4] = { 1, 10, 5, 9 };
                     for (ii = 0; ii < 4; ii++)
                     {
                        for (jj = 0; jj < 12; jj++)
                           S[bc_edges[ii]][jj] = S[jj][bc_edges[ii]] = 0.0;
                        S[bc_edges[ii]][bc_edges[ii]] = 1.0;
                        F[bc_edges[ii]] = 0.0;
                     }
                  }
                  /* Eliminate boundary conditions on y = 1 */
                  if (index[1] == N*n)
                  {
                     int ii, jj, bc_edges[4] = { 2, 10, 6, 11 };
                     for (ii = 0; ii < 4; ii++)
                     {
                        for (jj = 0; jj < 12; jj++)
                           S[bc_edges[ii]][jj] = S[jj][bc_edges[ii]] = 0.0;
                        S[bc_edges[ii]][bc_edges[ii]] = 1.0;
                        F[bc_edges[ii]] = 0.0;
                     }
                  }
                  /* Eliminate boundary conditions on z = 1 */
                  if (index[2] == N*n)
                  {
                     int ii, jj, bc_edges[4] = { 4, 5, 6, 7 };
                     for (ii = 0; ii < 4; ii++)
                     {
                        for (jj = 0; jj < 12; jj++)
                           S[bc_edges[ii]][jj] = S[jj][bc_edges[ii]] = 0.0;
                        S[bc_edges[ii]][bc_edges[ii]] = 1.0;
                        F[bc_edges[ii]] = 0.0;
                     }
                  }

                  /* Assemble the matrix */
                  HYPRE_SStructMatrixAddFEMValues(A, part, index, &S[0][0]);

                  /* Assemble the vector */
                  HYPRE_SStructVectorAddFEMValues(b, part, index, F);
               }
      }

      /* Collective calls finalizing the matrix and vector assembly */
      HYPRE_SStructMatrixAssemble(A);
      HYPRE_SStructVectorAssemble(b);
   }

   /* 3. Create the discrete gradient matrix G, which is needed in AMS. */
   {
      HYPRE_Int part = 0;
      HYPRE_Int stencil_size = 2;

      /* Define the discretization stencil relating the edges and nodes of the
         grid. */
      {
         HYPRE_Int ndim = 3;
         HYPRE_Int entry;
         HYPRE_Int var = 0; /* the node variable */

         /* The discrete gradient stencils connect edge to node variables. */
         HYPRE_Int Gx_offsets[2][3] = {{-1,0,0},{0,0,0}};  /* x-edge [7]-[6] */
         HYPRE_Int Gy_offsets[2][3] = {{0,-1,0},{0,0,0}};  /* y-edge [5]-[6] */
         HYPRE_Int Gz_offsets[2][3] = {{0,0,-1},{0,0,0}};  /* z-edge [2]-[6] */

         HYPRE_SStructStencilCreate(ndim, stencil_size, &G_stencil[0]);
         HYPRE_SStructStencilCreate(ndim, stencil_size, &G_stencil[1]);
         HYPRE_SStructStencilCreate(ndim, stencil_size, &G_stencil[2]);

         for (entry = 0; entry < stencil_size; entry++)
         {
            HYPRE_SStructStencilSetEntry(G_stencil[0], entry, Gx_offsets[entry], var);
            HYPRE_SStructStencilSetEntry(G_stencil[1], entry, Gy_offsets[entry], var);
            HYPRE_SStructStencilSetEntry(G_stencil[2], entry, Gz_offsets[entry], var);
         }
      }

      /* Set up the Graph - this determines the non-zero structure of the
         matrix. */
      {
         HYPRE_Int nvars = 3;
         HYPRE_Int var; /* the edge variables */

         /* Create the discrete gradient graph object */
         HYPRE_SStructGraphCreate(MPI_COMM_WORLD, edge_grid, &G_graph);

         /* See MatrixSetObjectType below */
         HYPRE_SStructGraphSetObjectType(G_graph, HYPRE_PARCSR);

         /* Since the discrete gradient relates edge and nodal variables (it is a
            rectangular matrix), we have to specify the domain (column) grid. */
         HYPRE_SStructGraphSetDomainGrid(G_graph, node_grid);

         /* Tell the graph which stencil to use for each edge variable on each
            part (we only have one part). */
         for (var = 0; var < nvars; var++)
            HYPRE_SStructGraphSetStencil(G_graph, part, var, G_stencil[var]);

         /* Assemble the graph */
         HYPRE_SStructGraphAssemble(G_graph);
      }

      /* Set up the SStruct Matrix */
      {
         /* Create the matrix object */
         HYPRE_SStructMatrixCreate(MPI_COMM_WORLD, G_graph, &G);
         /* Use a ParCSR storage */
         HYPRE_SStructMatrixSetObjectType(G, HYPRE_PARCSR);
         /* Indicate that the matrix coefficients are ready to be set */
         HYPRE_SStructMatrixInitialize(G);
      }

      /* Set the discrete gradient values, assuming a "natural" orientation of
         the edges (i.e. one in agreement with the coordinate directions). */
      {
         int i;
         int nedges = n*(n+1)*(n+1);
         double *values;
         HYPRE_Int stencil_indices[2] = {0,1}; /* the nodes of each edge */

         values = (double*) calloc(2*nedges, sizeof(double));

         /* The edge orientation is fixed: from first to second node */
         for (i = 0; i < nedges; i++)
         {
            values[2*i]   = -1.0;
            values[2*i+1] =  1.0;
         }

         /* Set the values in the discrete gradient x-edges */
         {
            HYPRE_Int var = 0;
            HYPRE_Int ilower[3] = {1 + pi*n, 0 + pj*n, 0 + pk*n};
            HYPRE_Int iupper[3] = {n + pi*n, n + pj*n, n + pk*n};
            HYPRE_SStructMatrixSetBoxValues(G, part, ilower, iupper, var,
                                            stencil_size, stencil_indices,
                                            values);
         }
         /* Set the values in the discrete gradient y-edges */
         {
            HYPRE_Int var = 1;
            HYPRE_Int ilower[3] = {0 + pi*n, 1 + pj*n, 0 + pk*n};
            HYPRE_Int iupper[3] = {n + pi*n, n + pj*n, n + pk*n};
            HYPRE_SStructMatrixSetBoxValues(G, part, ilower, iupper, var,
                                            stencil_size, stencil_indices,
                                            values);
         }
         /* Set the values in the discrete gradient z-edges */
         {
            HYPRE_Int var = 2;
            HYPRE_Int ilower[3] = {0 + pi*n, 0 + pj*n, 1 + pk*n};
            HYPRE_Int iupper[3] = {n + pi*n, n + pj*n, n + pk*n};
            HYPRE_SStructMatrixSetBoxValues(G, part, ilower, iupper, var,
                                            stencil_size, stencil_indices,
                                            values);
         }

         free(values);
      }

      /* Finalize the matrix assembly */
      HYPRE_SStructMatrixAssemble(G);
   }

   /* 4. Create the vectors of nodal coordinates xcoord, ycoord and zcoord,
         which are needed in AMS. */
   {
      int i, j, k;
      HYPRE_Int part = 0;
      HYPRE_Int var = 0; /* the node variable */
      HYPRE_Int index[3];
      double xval, yval, zval;

      /* Create empty vector objects */
      HYPRE_SStructVectorCreate(MPI_COMM_WORLD, node_grid, &xcoord);
      HYPRE_SStructVectorCreate(MPI_COMM_WORLD, node_grid, &ycoord);
      HYPRE_SStructVectorCreate(MPI_COMM_WORLD, node_grid, &zcoord);
      /* Set the object type to ParCSR */
      HYPRE_SStructVectorSetObjectType(xcoord, HYPRE_PARCSR);
      HYPRE_SStructVectorSetObjectType(ycoord, HYPRE_PARCSR);
      HYPRE_SStructVectorSetObjectType(zcoord, HYPRE_PARCSR);
      /* Indicate that the vector coefficients are ready to be set */
      HYPRE_SStructVectorInitialize(xcoord);
      HYPRE_SStructVectorInitialize(ycoord);
      HYPRE_SStructVectorInitialize(zcoord);

      /* Compute and set the coordinates of the nodes */
      for (i = 0; i <= n; i++)
         for (j = 0; j <= n; j++)
            for (k = 0; k <= n; k++)
            {
               index[0] = i + pi*n; index[1] = j + pj*n; index[2] = k + pk*n;

               xval = index[0]*h;
               yval = index[1]*h;
               zval = index[2]*h;

               HYPRE_SStructVectorSetValues(xcoord, part, index, var, &xval);
               HYPRE_SStructVectorSetValues(ycoord, part, index, var, &yval);
               HYPRE_SStructVectorSetValues(zcoord, part, index, var, &zval);
            }

      /* Finalize the vector assembly */
      HYPRE_SStructVectorAssemble(xcoord);
      HYPRE_SStructVectorAssemble(ycoord);
      HYPRE_SStructVectorAssemble(zcoord);
   }

   /* 5. Set up a SStruct Vector for the solution vector x */
   {
      HYPRE_Int part = 0;
      int nvalues = n*(n+1)*(n+1);
      double *values;

      values = (double*) calloc(nvalues, sizeof(double));

      /* Create an empty vector object */
      HYPRE_SStructVectorCreate(MPI_COMM_WORLD, edge_grid, &x);
      /* Set the object type to ParCSR */
      HYPRE_SStructVectorSetObjectType(x, HYPRE_PARCSR);
      /* Indicate that the vector coefficients are ready to be set */
      HYPRE_SStructVectorInitialize(x);

      /* Set the values for the initial guess x-edge */
      {
         HYPRE_Int var = 0;
         HYPRE_Int ilower[3] = {1 + pi*n, 0 + pj*n, 0 + pk*n};
         HYPRE_Int iupper[3] = {n + pi*n, n + pj*n, n + pk*n};
         HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values);
      }
      /* Set the values for the initial guess y-edge */
      {
         HYPRE_Int var = 1;
         HYPRE_Int ilower[3] = {0 + pi*n, 1 + pj*n, 0 + pk*n};
         HYPRE_Int iupper[3] = {n + pi*n, n + pj*n, n + pk*n};
         HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values);
      }
      /* Set the values for the initial guess z-edge */
      {
         HYPRE_Int var = 2;
         HYPRE_Int ilower[3] = {0 + pi*n, 0 + pj*n, 1 + pk*n};
         HYPRE_Int iupper[3] = {n + pi*n, n + pj*n, n + pk*n};
         HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values);
      }

      free(values);

      /* Finalize the vector assembly */
      HYPRE_SStructVectorAssemble(x);
   }

   /* Finalize current timing */
   hypre_EndTiming(time_index);
   hypre_PrintTiming("SStruct phase times", MPI_COMM_WORLD);
   hypre_FinalizeTiming(time_index);
   hypre_ClearTiming();

   /* 6. Set up and call the PCG-AMS solver (Solver options can be found in the
         Reference Manual.) */
   {
      double final_res_norm;
      HYPRE_Int its;

      HYPRE_ParCSRMatrix    par_A;
      HYPRE_ParVector       par_b;
      HYPRE_ParVector       par_x;

      HYPRE_ParCSRMatrix    par_G;
      HYPRE_ParVector       par_xcoord;
      HYPRE_ParVector       par_ycoord;
      HYPRE_ParVector       par_zcoord;

      /* Extract the ParCSR objects needed in the solver */
      HYPRE_SStructMatrixGetObject(A, (void **) &par_A);
      HYPRE_SStructVectorGetObject(b, (void **) &par_b);
      HYPRE_SStructVectorGetObject(x, (void **) &par_x);
      HYPRE_SStructMatrixGetObject(G, (void **) &par_G);
      HYPRE_SStructVectorGetObject(xcoord, (void **) &par_xcoord);
      HYPRE_SStructVectorGetObject(ycoord, (void **) &par_ycoord);
      HYPRE_SStructVectorGetObject(zcoord, (void **) &par_zcoord);

      if (myid == 0)
         printf("Problem size: %lld\n\n",
             hypre_ParCSRMatrixGlobalNumRows((hypre_ParCSRMatrix*)par_A));

      /* Start timing */
      time_index = hypre_InitializeTiming("AMS Setup");
      hypre_BeginTiming(time_index);

      /* Create solver */
      HYPRE_ParCSRPCGCreate(MPI_COMM_WORLD, &solver);

      /* Set some parameters (See Reference Manual for more parameters) */
      HYPRE_PCGSetMaxIter(solver, maxit); /* max iterations */
      HYPRE_PCGSetTol(solver, tol); /* conv. tolerance */
      HYPRE_PCGSetTwoNorm(solver, 0); /* 0: do not use the two norm as the stopping criterion */
      HYPRE_PCGSetPrintLevel(solver, 2); /* print solve info */
      HYPRE_PCGSetLogging(solver, 1); /* needed to get run info later */

      /* Create AMS preconditioner */
      HYPRE_AMSCreate(&precond);

      /* Set AMS parameters */
      HYPRE_AMSSetMaxIter(precond, 1);
      HYPRE_AMSSetTol(precond, 0.0);
      HYPRE_AMSSetCycleType(precond, cycle_type);
      HYPRE_AMSSetPrintLevel(precond, 1);

      /* Set discrete gradient */
      HYPRE_AMSSetDiscreteGradient(precond, par_G);

      /* Set vertex coordinates */
      HYPRE_AMSSetCoordinateVectors(precond,
                                    par_xcoord, par_ycoord, par_zcoord);

      if (singular_problem)
         HYPRE_AMSSetBetaPoissonMatrix(precond, NULL);

      /* Smoothing and AMG options */
      HYPRE_AMSSetSmoothingOptions(precond,
                                   rlx_type, rlx_sweeps,
                                   rlx_weight, rlx_omega);
      HYPRE_AMSSetAlphaAMGOptions(precond,
                                  amg_coarsen_type, amg_agg_levels,
                                  amg_rlx_type, theta, amg_interp_type,
                                  amg_Pmax);
      HYPRE_AMSSetBetaAMGOptions(precond,
                                 amg_coarsen_type, amg_agg_levels,
                                 amg_rlx_type, theta, amg_interp_type,
                                 amg_Pmax);
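      /* Roughly speaking, AMS preconditions the edge problem via two auxiliary
         nodal problems solved with BoomerAMG: the "alpha" options configure the
         AMG solver for the vector (curl-curl) subspace and the "beta" options
         the one for the scalar subspace. */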

      /* Set the PCG preconditioner */
      HYPRE_PCGSetPrecond(solver,
                          (HYPRE_PtrToSolverFcn) HYPRE_AMSSolve,
                          (HYPRE_PtrToSolverFcn) HYPRE_AMSSetup,
                          precond);

      /* Call the setup */
      HYPRE_ParCSRPCGSetup(solver, par_A, par_b, par_x);

      /* Finalize current timing */
      hypre_EndTiming(time_index);
      hypre_PrintTiming("Setup phase times", MPI_COMM_WORLD);
      hypre_FinalizeTiming(time_index);
      hypre_ClearTiming();

      /* Start timing again */
      time_index = hypre_InitializeTiming("AMS Solve");
      hypre_BeginTiming(time_index);

      /* Call the solve */
      HYPRE_ParCSRPCGSolve(solver, par_A, par_b, par_x);

      /* Finalize current timing */
      hypre_EndTiming(time_index);
      hypre_PrintTiming("Solve phase times", MPI_COMM_WORLD);
      hypre_FinalizeTiming(time_index);
      hypre_ClearTiming();

      /* Get some info */
      HYPRE_PCGGetNumIterations(solver, &its);
      HYPRE_PCGGetFinalRelativeResidualNorm(solver, &final_res_norm);

      /* Clean up */
      HYPRE_AMSDestroy(precond);
      HYPRE_ParCSRPCGDestroy(solver);

      /* Gather the solution vector */
      HYPRE_SStructVectorGather(x);

      if (myid == 0)
      {
         printf("\n");
         printf("Iterations = %lld\n", its);
         printf("Final Relative Residual Norm = %g\n", final_res_norm);
         printf("\n");
      }
   }

   /* Free memory */
   HYPRE_SStructGridDestroy(edge_grid);
   HYPRE_SStructGraphDestroy(A_graph);
   HYPRE_SStructMatrixDestroy(A);
   HYPRE_SStructVectorDestroy(b);
   HYPRE_SStructVectorDestroy(x);
   HYPRE_SStructGridDestroy(node_grid);
   HYPRE_SStructGraphDestroy(G_graph);
   HYPRE_SStructStencilDestroy(G_stencil[0]);
   HYPRE_SStructStencilDestroy(G_stencil[1]);
   HYPRE_SStructStencilDestroy(G_stencil[2]);
   HYPRE_SStructMatrixDestroy(G);
   HYPRE_SStructVectorDestroy(xcoord);
   HYPRE_SStructVectorDestroy(ycoord);
   HYPRE_SStructVectorDestroy(zcoord);

   /* Finalize MPI */
   MPI_Finalize();

   return 0;
}
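This listing appears to be derived from hypre's example 15 (electromagnetic diffusion preconditioned with AMS). Since the code requires the number of MPI tasks to be a perfect cube, a typical run (binary name assumed from the build) would be

   mpirun -np 8 ./ex15 -n 10 -maxit 100 -tol 1e-6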
Example #4
int main(int argc,char** argv)
{
   int          taskid, ntasks;
   MPI_Status   status;
   int          ierr,i,itask;
   int          buffsize;
   /* fixed-size gather buffers: assumes ntasks <= 1024;             */
   /* buffsum and recvtime are zero-initialised so that task 0 does  */
   /* not gather uninitialised values below                          */
   double       **sendbuff,*recvbuff,buffsum=0.0,buffsums[1024];
   double       inittime,totaltime,recvtime=0.0,recvtimes[1024];
   
   /*===============================================================*/
   /* MPI initialisation. It is important to put this call at the   */
   /* beginning of the program, after the variable declarations.    */
   MPI_Init(&argc, &argv);

   /*===============================================================*/
   /* Get the number of MPI tasks and the taskid of this task.      */
   MPI_Comm_rank(MPI_COMM_WORLD,&taskid);
   MPI_Comm_size(MPI_COMM_WORLD,&ntasks);

   /*===============================================================*/
   /* Get the buffsize value from the program arguments.            */
   if ( argc < 2 ){
     if ( taskid == 0 ) printf("Usage: %s <buffsize>\n", argv[0]);
     MPI_Finalize();
     return 1;
   }
   buffsize=atoi(argv[1]);

   /*===============================================================*/
   /* Printing out the description of the example.                  */
   if ( taskid == 0 ){
     printf("\n\n\n");
     printf("##########################################################\n\n");
     printf(" Example 4 \n\n");
     printf(" Point-to-point Communication: MPI_Send MPI_Recv \n\n");
     printf(" Vector size: %d\n",buffsize);
     printf(" Number of tasks: %d\n\n",ntasks);
     printf("##########################################################\n\n");
     printf("                --> BEFORE COMMUNICATION <--\n\n");
   }

  
   if ( taskid == 0 ){
     /*=============================================================*/
     /* Memory allocation.                                          */ 
     sendbuff=(double **)malloc(sizeof(double *)*ntasks);
     sendbuff[0]=(double *)malloc(sizeof(double)*ntasks*buffsize);
     for(i=1;i<ntasks;i++)sendbuff[i]=sendbuff[i-1]+buffsize;

     /*=============================================================*/
     /* Vector and/or matrix initialisation.                       */
     srand((unsigned)time( NULL ) + taskid);
     for(itask=0;itask<ntasks;itask++){
       for(i=0;i<buffsize;i++){
         sendbuff[itask][i]=(double)rand()/RAND_MAX;
       }
     }

     /*==============================================================*/
     /* Print out before communication.                              */
     
     for(itask=1;itask<ntasks;itask++){
       buffsum=0.0;
       for(i=0;i<buffsize;i++){
         buffsum=buffsum+sendbuff[itask][i];
       }
       printf("Task %d : Sum of vector sent to %d -> %e \n",
               taskid,itask,buffsum);
       
     }  
     
   }
   else{

     /*=============================================================*/
     /* Memory allocation.                                          */ 
     recvbuff=(double *)malloc(sizeof(double)*buffsize);
     
   }

   /*===============================================================*/
   /* Communication.                                                */

   inittime = MPI_Wtime();

   if ( taskid == 0 ){

     for(itask=1 ; itask<ntasks ; itask++){
                  
       ierr = MPI_Send(sendbuff[itask],
                       buffsize,
                       MPI_DOUBLE,
                       itask,
                       0,
                       MPI_COMM_WORLD);
     } 

   }
   else{
   
     ierr = MPI_Recv(recvbuff,
                     buffsize,
                     MPI_DOUBLE,
                     0,
                     MPI_ANY_TAG,
                     MPI_COMM_WORLD,
                     &status);
     
     recvtime = MPI_Wtime() - inittime; /* elapsed since communication start */
     
     buffsum=0.0;
     for(i=0 ; i<buffsize ; i++){
       buffsum=buffsum+recvbuff[i];
     }
        
   }
   
   MPI_Barrier(MPI_COMM_WORLD);
   
   totaltime = MPI_Wtime() - inittime;

   /*===============================================================*/
   /* Print out after communication.                                */
   
   ierr=MPI_Gather(&recvtime,1,MPI_DOUBLE,
                   recvtimes,1, MPI_DOUBLE,
                   0,MPI_COMM_WORLD);
                   
   ierr=MPI_Gather(&buffsum,1,MPI_DOUBLE,
                   buffsums,1, MPI_DOUBLE,
                   0,MPI_COMM_WORLD);

   if(taskid==0){
     printf("\n");
     printf("##########################################################\n\n");
     printf("                --> AFTER COMMUNICATION <-- \n\n");
     for(itask=1;itask<ntasks;itask++){
       printf("Task %d : Vector received at %f seconds : Sum= %e\n",
               itask,recvtimes[itask],buffsums[itask]);
     }  
     printf("\n");
     printf("##########################################################\n\n");
     printf(" Communication time : %f seconds\n\n",totaltime);  
     printf("##########################################################\n\n");
   }

   /*===============================================================*/
   /* Free the allocated memory.                                    */
   if ( taskid == 0 ){
     free(sendbuff[0]);
     free(sendbuff);
   }
   else{
     free(recvbuff);
   }

   /*===============================================================*/
   /* MPI finalisation.                                             */
   MPI_Finalize();

   return 0;
}
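A build-and-run sketch for this example (the source file name is assumed):

   mpicc example4.c -o example4
   mpirun -np 4 ./example4 1000000

where the single argument is the per-task vector length read into buffsize.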
Example #5
int main(int argc, char *argv[])
{
  FILE *parameterfile = NULL;
  int j, i, ix = 0, isample = 0, op_id = 0;
  char datafilename[206];
  char parameterfilename[206];
  char conf_filename[50];
  char * input_filename = NULL;
  char * filename = NULL;
  double plaquette_energy;
  struct stout_parameters params_smear;
  spinor **s, *s_;

#ifdef _KOJAK_INST
#pragma pomp inst init
#pragma pomp inst begin(main)
#endif

#if (defined SSE || defined SSE2 || defined SSE3)
  signal(SIGILL, &catch_ill_inst);
#endif

  DUM_DERI = 8;
  DUM_MATRIX = DUM_DERI + 5;
#if ((defined BGL && defined XLC) || defined _USE_TSPLITPAR)
  NO_OF_SPINORFIELDS = DUM_MATRIX + 3;
#else
  NO_OF_SPINORFIELDS = DUM_MATRIX + 3;
#endif

  verbose = 0;
  g_use_clover_flag = 0;

#ifdef MPI

#  ifdef OMP
  int mpi_thread_provided;
  MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided);
#  else
  MPI_Init(&argc, &argv);
#  endif

  MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
#else
  g_proc_id = 0;
#endif

  process_args(argc,argv,&input_filename,&filename);
  set_default_filenames(&input_filename, &filename);

  /* Read the input file */
  if( (j = read_input(input_filename)) != 0) {
    fprintf(stderr, "Could not find input file: %s\nAborting...\n", input_filename);
    exit(-1);
  }

#ifdef OMP
  init_openmp();
#endif

  /* this DBW2 stuff is not needed for the inversion ! */
  if (g_dflgcr_flag == 1) {
    even_odd_flag = 0;
  }
  g_rgi_C1 = 0;
  if (Nsave == 0) {
    Nsave = 1;
  }

  if (g_running_phmc) {
    NO_OF_SPINORFIELDS = DUM_MATRIX + 8;
  }

  tmlqcd_mpi_init(argc, argv);

  g_dbw2rand = 0;

  /* start the single- and double-precision random number generator */
  start_ranlux(rlxd_level, random_seed);

  /* we need to make sure that we don't have even_odd_flag = 1 */
  /* if any of the operators doesn't use it                    */
  /* in this way even/odd can still be used by other operators */
  for(j = 0; j < no_operators; j++) if(!operator_list[j].even_odd_flag) even_odd_flag = 0;

#ifndef MPI
  g_dbw2rand = 0;
#endif

#ifdef _GAUGE_COPY
  j = init_gauge_field(VOLUMEPLUSRAND, 1);
#else
  j = init_gauge_field(VOLUMEPLUSRAND, 0);
#endif
  if (j != 0) {
    fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n");
    exit(-1);
  }
  j = init_geometry_indices(VOLUMEPLUSRAND);
  if (j != 0) {
    fprintf(stderr, "Not enough memory for geometry indices! Aborting...\n");
    exit(-1);
  }
  if (no_monomials > 0) {
    if (even_odd_flag) {
      j = init_monomials(VOLUMEPLUSRAND / 2, even_odd_flag);
    }
    else {
      j = init_monomials(VOLUMEPLUSRAND, even_odd_flag);
    }
    if (j != 0) {
      fprintf(stderr, "Not enough memory for monomial pseudo fermion fields! Aborting...\n");
      exit(-1);
    }
  }
  if (even_odd_flag) {
    j = init_spinor_field(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS);
  }
  else {
    j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS);
  }
  if (j != 0) {
    fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
    exit(-1);
  }

  if (g_running_phmc) {
    j = init_chi_spinor_field(VOLUMEPLUSRAND / 2, 20);
    if (j != 0) {
      fprintf(stderr, "Not enough memory for PHMC Chi fields! Aborting...\n");
      exit(-1);
    }
  }

  g_mu = g_mu1;

  if (g_cart_id == 0) {
    /*construct the filenames for the observables and the parameters*/
    strncpy(datafilename, filename, 200);
    strcat(datafilename, ".data");
    strncpy(parameterfilename, filename, 200);
    strcat(parameterfilename, ".para");

    parameterfile = fopen(parameterfilename, "w");
    write_first_messages(parameterfile, "invert", git_hash);
    fclose(parameterfile);
  }

  /* define the geometry */
  geometry();

  /* define the boundary conditions for the fermion fields */
  boundary(g_kappa);

  phmc_invmaxev = 1.;

  init_operators();

  /* list and initialize measurements*/
  if(g_proc_id == 0) {
    printf("\n");
    for(int j = 0; j < no_measurements; j++) {
      printf("# measurement id %d, type = %d\n", j, measurement_list[j].type);
    }
  }
  init_measurements();  

  /* this could be maybe moved to init_operators */
#ifdef _USE_HALFSPINOR
  j = init_dirac_halfspinor();
  if (j != 0) {
    fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
    exit(-1);
  }
  if (g_sloppy_precision_flag == 1) {
    j = init_dirac_halfspinor32();
    if (j != 0)
    {
      fprintf(stderr, "Not enough memory for 32-bit halffield! Aborting...\n");
      exit(-1);
    }
  }
#  if (defined _PERSISTENT)
  if (even_odd_flag)
    init_xchange_halffield();
#  endif
#endif

  for (j = 0; j < Nmeas; j++) {
    sprintf(conf_filename, "%s.%.4d", gauge_input_filename, nstore);
    if (g_cart_id == 0) {
      printf("#\n# Trying to read gauge field from file %s in %s precision.\n",
            conf_filename, (gauge_precision_read_flag == 32 ? "single" : "double"));
      fflush(stdout);
    }
    if( (i = read_gauge_field(conf_filename,g_gauge_field)) !=0) {
      fprintf(stderr, "Error %d while reading gauge field from %s\n Aborting...\n", i, conf_filename);
      exit(-2);
    }


    if (g_cart_id == 0) {
      printf("# Finished reading gauge field.\n");
      fflush(stdout);
    }
#ifdef MPI
    xchange_gauge(g_gauge_field);
#endif

    /*compute the energy of the gauge field*/
    plaquette_energy = measure_plaquette( (const su3**) g_gauge_field);

    if (g_cart_id == 0) {
      printf("# The computed plaquette value is %e.\n", plaquette_energy / (6.*VOLUME*g_nproc));
      fflush(stdout);
    }

    if (use_stout_flag == 1){
      params_smear.rho = stout_rho;
      params_smear.iterations = stout_no_iter;
/*       if (stout_smear((su3_tuple*)(g_gauge_field[0]), &params_smear, (su3_tuple*)(g_gauge_field[0])) != 0) */
/*         exit(1) ; */
      g_update_gauge_copy = 1;
      plaquette_energy = measure_plaquette( (const su3**) g_gauge_field);

      if (g_cart_id == 0) {
        printf("# The plaquette value after stouting is %e\n", plaquette_energy / (6.*VOLUME*g_nproc));
        fflush(stdout);
      }
    }

    /* if any measurements are defined in the input file, do them here */
    measurement * meas;
    for(int imeas = 0; imeas < no_measurements; imeas++){
      meas = &measurement_list[imeas];
      if (g_proc_id == 0) {
        fprintf(stdout, "#\n# Beginning online measurement.\n");
      }
      meas->measurefunc(nstore, imeas, even_odd_flag);
    }

    if (reweighting_flag == 1) {
      reweighting_factor(reweighting_samples, nstore);
    }

    /* Compute minimal eigenvalues, if wanted */
    if (compute_evs != 0) {
      eigenvalues(&no_eigenvalues, 5000, eigenvalue_precision,
                  0, compute_evs, nstore, even_odd_flag);
    }
    if (phmc_compute_evs != 0) {
#ifdef MPI
      MPI_Finalize();
#endif
      return(0);
    }

    /* Compute the mode number or topological susceptibility using spectral projectors, if wanted*/

    if(compute_modenumber != 0 || compute_topsus !=0){
      
      s_ = calloc(no_sources_z2*VOLUMEPLUSRAND+1, sizeof(spinor));
      s  = calloc(no_sources_z2, sizeof(spinor*));
      if(s_ == NULL) {
        printf("Not enough memory in %s: %d\n", __FILE__, __LINE__); exit(42);
      }
      if(s == NULL) {
        printf("Not enough memory in %s: %d\n", __FILE__, __LINE__); exit(42);
      }
      
      
      for(i = 0; i < no_sources_z2; i++) {
#if (defined SSE3 || defined SSE2 || defined SSE)
        s[i] = (spinor*)(((unsigned long int)(s_)+ALIGN_BASE)&~ALIGN_BASE)+i*VOLUMEPLUSRAND;
#else
        s[i] = s_+i*VOLUMEPLUSRAND;
#endif
	
        random_spinor_field_lexic(s[i], reproduce_randomnumber_flag,RN_Z2);
	
/* 	what is this here needed for?? */
/*         spinor *aux_,*aux; */
/* #if ( defined SSE || defined SSE2 || defined SSE3 ) */
/*         aux_=calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); */
/*         aux = (spinor *)(((unsigned long int)(aux_)+ALIGN_BASE)&~ALIGN_BASE); */
/* #else */
/*         aux_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); */
/*         aux = aux_; */
/* #endif */
	
        if(g_proc_id == 0) {
          printf("source %d \n", i);
        }
	
        if(compute_modenumber != 0){
          mode_number(s[i], mstarsq);
        }
	
        if(compute_topsus !=0) {
          top_sus(s[i], mstarsq);
        }
      }
      free(s);
      free(s_);
    }


    /* move to operators as well */
    if (g_dflgcr_flag == 1) {
      /* set up deflation blocks */
      init_blocks(nblocks_t, nblocks_x, nblocks_y, nblocks_z);

      /* this can stay here for now, but later we probably need */
      /* something like init_dfl_solver called somewhere else  */
      /* create set of approximate lowest eigenvectors ("global deflation subspace") */

      /*       g_mu = 0.; */
      /*       boundary(0.125); */
      generate_dfl_subspace(g_N_s, VOLUME, reproduce_randomnumber_flag);
      /*       boundary(g_kappa); */
      /*       g_mu = g_mu1; */

      /* Compute little Dirac operators */
      /*       alt_block_compute_little_D(); */
      if (g_debug_level > 0) {
        check_projectors(reproduce_randomnumber_flag);
        check_local_D(reproduce_randomnumber_flag);
      }
      if (g_debug_level > 1) {
        check_little_D_inversion(reproduce_randomnumber_flag);
      }

    }
    if(SourceInfo.type == 1) {
      index_start = 0;
      index_end = 1;
    }

    g_precWS=NULL;
    if(use_preconditioning == 1){
      /* todo load fftw wisdom */
#if (defined HAVE_FFTW ) && !( defined MPI)
      loadFFTWWisdom(g_spinor_field[0],g_spinor_field[1],T,LX);
#else
      use_preconditioning=0;
#endif
    }

    if (g_cart_id == 0) {
      fprintf(stdout, "#\n"); /*Indicate starting of the operator part*/
    }
    for(op_id = 0; op_id < no_operators; op_id++) {
      boundary(operator_list[op_id].kappa);
      g_kappa = operator_list[op_id].kappa; 
      g_mu = 0.;

      if(use_preconditioning==1 && PRECWSOPERATORSELECT[operator_list[op_id].solver]!=PRECWS_NO ){
        printf("# Using preconditioning with treelevel preconditioning operator: %s \n",
              precWSOpToString(PRECWSOPERATORSELECT[operator_list[op_id].solver]));
        /* initial preconditioning workspace */
        operator_list[op_id].precWS=(spinorPrecWS*)malloc(sizeof(spinorPrecWS));
        spinorPrecWS_Init(operator_list[op_id].precWS,
                  operator_list[op_id].kappa,
                  operator_list[op_id].mu/2./operator_list[op_id].kappa,
                  -(0.5/operator_list[op_id].kappa-4.),
                  PRECWSOPERATORSELECT[operator_list[op_id].solver]);
        g_precWS = operator_list[op_id].precWS;

        if(PRECWSOPERATORSELECT[operator_list[op_id].solver] == PRECWS_D_DAGGER_D) {
          fitPrecParams(op_id);
        }
      }

      for(isample = 0; isample < no_samples; isample++) {
        for (ix = index_start; ix < index_end; ix++) {
          if (g_cart_id == 0) {
            fprintf(stdout, "#\n"); /*Indicate starting of new index*/
          }
          /* we use g_spinor_field[0-7] for sources and props for the moment */
          /* 0-3 in case of 1 flavour  */
          /* 0-7 in case of 2 flavours */
          prepare_source(nstore, isample, ix, op_id, read_source_flag, source_location);
          // randomize the initial guess for eigcg if needed ----- experimental
          if( (operator_list[op_id].solver == INCREIGCG) && (operator_list[op_id].solver_params.eigcg_rand_guess_opt) ){ //randomize the initial guess
              gaussian_volume_source( operator_list[op_id].prop0, operator_list[op_id].prop1,isample,ix,0); //need to check this
          } 
          operator_list[op_id].inverter(op_id, index_start, 1);
        }
      }


      if(use_preconditioning==1 && operator_list[op_id].precWS!=NULL ){
        /* free preconditioning workspace */
        spinorPrecWS_Free(operator_list[op_id].precWS);
        free(operator_list[op_id].precWS);
      }

      if(operator_list[op_id].type == OVERLAP){
        free_Dov_WS();
      }

    }
    nstore += Nsave;
  }

#ifdef OMP
  free_omp_accumulators();
#endif
  free_blocks();
  free_dfl_subspace();
  free_gauge_field();
  free_geometry_indices();
  free_spinor_field();
  free_moment_field();
  free_chi_spinor_field();
  free(filename);
  free(input_filename);
#ifdef MPI
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
#endif
  return(0);
#ifdef _KOJAK_INST
#pragma pomp inst end(main)
#endif
}
Example #6
   /*!
   \brief Create the header of the netcdf file
   Some information has to be filled into the header of each netcdf file.
   */
   void CFile::createHeader(void)
   {
      CContext* context = CContext::getCurrent();
      CContextServer* server = context->server;

      if (!allDomainEmpty)
      {
         StdString filename = getFileOutputName();

         // Determine the splitting format in the file name: firstPart%start_date%middlePart%end_date%lastPart

         std::string strStartDate="%start_date%" ;
         std::string strEndDate="%end_date%" ;

         std::string firstPart ;
         std::string middlePart ;
         std::string lastPart ;
         size_t pos1, pos2 ;
         bool hasStartDate=false ;
         bool hasEndDate=false ;
         bool hasSplit = (!split_freq.isEmpty());
                  
         pos1=filename.find(strStartDate) ;
         if (pos1!=std::string::npos)
         {
           firstPart=filename.substr(0,pos1) ;
           pos1+=strStartDate.size() ;
           hasStartDate=true ;
         }
         else pos1=0 ;

         pos2=filename.find(strEndDate,pos1) ;
         if (pos2!=std::string::npos)
         {
           middlePart=filename.substr(pos1,pos2-pos1) ;           
           pos2+=strEndDate.size() ;
           lastPart=filename.substr(pos2,filename.size()-pos2) ;
           hasEndDate=true ;
         }
         else middlePart=filename.substr(pos1,filename.size()) ;

         if (!hasStartDate && !hasEndDate)
         {
           hasStartDate=true ;
           hasEndDate=true;
           firstPart=middlePart ;
           if (hasSplit) firstPart +="_";
           middlePart="-" ;
         }
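         // Worked example of the parsing above: for
         //   filename = "histmon_%start_date%_%end_date%"
         // we get firstPart = "histmon_", middlePart = "_" and lastPart = "",
         // with both hasStartDate and hasEndDate true. For a plain
         //   filename = "histmon"
         // the fallback branch sets firstPart = "histmon" ("histmon_" if a
         // split frequency is set) and middlePart = "-".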
   
         StdOStringStream oss;

         if (!split_freq.isEmpty())
         {
           CDate split_start ;
           CDate splitEnd ;
           if (!split_start_offset.isEmpty()) split_start=lastSplit + split_start_offset ;
           else split_start=lastSplit ;

           splitEnd = lastSplit + split_freq ;
           if (!split_last_date.isEmpty())
           {
             CDate splitLastDate=CDate::FromString(split_last_date,*CContext::getCurrent()->getCalendar()) ;
             if( splitLastDate < splitEnd)  splitEnd=splitLastDate ;
           }
            
           if (!split_end_offset.isEmpty()) splitEnd = splitEnd + split_end_offset;
           else splitEnd = splitEnd - 1 * Second;

           string splitFormat;
           if (split_freq_format.isEmpty())
           {
             if (split_freq.getValue().second != 0) splitFormat = "%y%mo%d%h%mi%s";
             else if (split_freq.getValue().minute != 0) splitFormat = "%y%mo%d%h%mi";
             else if (split_freq.getValue().hour != 0) splitFormat = "%y%mo%d%h";
             else if (split_freq.getValue().day != 0) splitFormat = "%y%mo%d";
             else if (split_freq.getValue().month != 0) splitFormat = "%y%mo";
             else splitFormat = "%y";
           }
           else splitFormat = split_freq_format;

           oss << firstPart ;
           if (hasStartDate) oss << split_start.getStr(splitFormat) ;
           oss << middlePart ;
           if (hasEndDate) oss << splitEnd.getStr(splitFormat);
           oss << lastPart ;

           StdString keySuffix("CContext_"+CContext::getCurrent()->getId()+"::CFile_"+getFileOutputName()+"::") ; 
           context->registryOut->setKey(keySuffix+"splitStart", lastSplit);
           context->registryOut->setKey(keySuffix+"splitEnd",   splitEnd);
         }
         else oss<<firstPart<<lastPart ;

        bool append = !this->append.isEmpty() && this->append.getValue();

         bool useClassicFormat = !format.isEmpty() && format == format_attr::netcdf4_classic;
         bool useCFConvention = convention.isEmpty() || convention == convention_attr::CF;

         bool multifile = true;
         if (!type.isEmpty())
         {
           if (type == type_attr::one_file) multifile = false;
           else if (type == type_attr::multiple_file) multifile = true;
         }
#ifndef USING_NETCDF_PAR
         if (!multifile)
         {
            info(0) << "!!! Warning -> Using non parallel version of netcdf, switching in multiple_file mode for file : " << filename << " ..." << endl;
            multifile = true;
          }
#endif
         if (multifile)
         {
            int commSize, commRank;
            MPI_Comm_size(fileComm, &commSize);
            MPI_Comm_rank(fileComm, &commRank);

            if (server->intraCommSize > 1)
            {
              oss << "_" ;
              int width=0; int n = commSize-1;
              while (n != 0) { n = n / 10; width++;}
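             // width = number of decimal digits of the highest rank (commSize-1),
             // so every rank suffix is zero-padded to the same length, e.g. "_07".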
              if (!min_digits.isEmpty())
                if (width < min_digits) width = min_digits;
              oss.width(width);
              oss.fill('0');
              oss << right << commRank;
            }
         }
         oss << ".nc";

         bool isCollective = par_access.isEmpty() ||  par_access == par_access_attr::collective;

         if (isOpen) data_out->closeFile();

        data_out = shared_ptr<CDataOutput>(new CNc4DataOutput(this, oss.str(), append, useClassicFormat, useCFConvention,
                                                              fileComm, multifile, isCollective, time_counter_name));
        isOpen = true;

        data_out->writeFile(CFile::get(this));

        // Do not recreate the file structure if opening an existing file
        if (!data_out->IsInAppendMode())
        {
          std::vector<CField*>::iterator it, end = this->enabledFields.end();
          for (it = this->enabledFields.begin(); it != end; it++)
          {
            CField* field = *it;
            this->data_out->writeFieldGrid(field);
          }
          this->data_out->writeTimeDimension();

          for (it = this->enabledFields.begin(); it != end; it++)
          {
            CField* field = *it;
            this->data_out->writeFieldTimeAxis(field);
          }
          
          for (it = this->enabledFields.begin(); it != end; it++)
          {
            CField* field = *it;
            this->data_out->writeField(field);
          }

          vector<CVariable*> listVars = getAllVariables();
          for (vector<CVariable*>::iterator it = listVars.begin(); it != listVars.end(); it++)
            this->data_out->writeAttribute(*it);

          this->data_out->definition_end();
        }
        else
        {
          // check time axis even in append mode
          std::vector<CField*>::iterator it, end = this->enabledFields.end();
          for (it = this->enabledFields.begin(); it != end; it++)
          {
            CField* field = *it;
            this->data_out->writeFieldTimeAxis(field);
          }
        }
      }
   }
Beispiel #7
0
PetscErrorCode CreateMesh(MPI_Comm comm, AppCtx *user, DM *dm)
{
  PetscInt       dim               = user->dim;
  PetscBool      interpolate       = user->interpolate;
  PetscBool      refinementUniform = user->refinementUniform;
  PetscReal      refinementLimit   = user->refinementLimit;
  PetscBool      cellSimplex       = user->cellSimplex;
  const char    *filename          = user->filename;
  const char    *partitioner       = "chaco";
  size_t         len;
  PetscMPIInt    rank;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(user->createMeshEvent,0,0,0,0);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
  ierr = PetscStrlen(filename, &len);CHKERRQ(ierr);
  if (len) {
#if defined(PETSC_HAVE_CGNS)
    int cgid = -1;

    if (!rank) {
      ierr = cg_open(filename, CG_MODE_READ, &cgid);CHKERRQ(ierr);
      if (cgid <= 0) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "cg_open(\"%s\",...) did not return a valid file ID", filename);
    }
    ierr = DMPlexCreateCGNS(comm, cgid, interpolate, dm);CHKERRQ(ierr);
    if (!rank) {ierr = cg_close(cgid);CHKERRQ(ierr);}
#else
    SETERRQ(comm, PETSC_ERR_SUP, "Loading meshes requires CGNS support. Reconfigure using --with-cgns-dir");
#endif
  } else if (cellSimplex) {
    ierr = DMPlexCreateBoxMesh(comm, dim, interpolate, dm);CHKERRQ(ierr);
  } else {
    const PetscInt cells[3] = {2, 2, 2};

    ierr = DMPlexCreateHexBoxMesh(comm, dim, cells, dm);CHKERRQ(ierr);
  }
  {
    DM refinedMesh     = NULL;
    DM distributedMesh = NULL;

    /* Refine mesh using a volume constraint */
    ierr = DMPlexSetRefinementUniform(*dm, PETSC_FALSE);CHKERRQ(ierr);
    ierr = DMPlexSetRefinementLimit(*dm, refinementLimit);CHKERRQ(ierr);
    ierr = DMRefine(*dm, comm, &refinedMesh);CHKERRQ(ierr);
    if (refinedMesh) {
      ierr = DMDestroy(dm);CHKERRQ(ierr);
      *dm  = refinedMesh;
    }
    /* Distribute mesh over processes */
    ierr = DMPlexDistribute(*dm, partitioner, 0, &distributedMesh);CHKERRQ(ierr);
    if (distributedMesh) {
      ierr = DMDestroy(dm);CHKERRQ(ierr);
      *dm  = distributedMesh;
    }
    if (refinementUniform) {
      ierr = DMPlexSetRefinementUniform(*dm, refinementUniform);CHKERRQ(ierr);
      ierr = DMRefine(*dm, comm, &refinedMesh);CHKERRQ(ierr);
      if (refinedMesh) {
        ierr = DMDestroy(dm);CHKERRQ(ierr);
        *dm  = refinedMesh;
      }
    }
  }
  ierr     = PetscObjectSetName((PetscObject) *dm, "Simplicial Mesh");CHKERRQ(ierr);
  ierr     = DMSetFromOptions(*dm);CHKERRQ(ierr);
  ierr     = PetscLogEventEnd(user->createMeshEvent,0,0,0,0);CHKERRQ(ierr);
  user->dm = *dm;
  PetscFunctionReturn(0);
}
Beispiel #8
0
int mcfft3_init(int pad1           /* padding on the first axis */,
	       int nx,   int ny,  int nz   /* input data size */, 
	       int *nx2, int *ny2, int *nz2 /* padded data size */,
               int *n_local, int *o_local /* local size & start */)
/*< initialize >*/
{
  int i, nth=1;
  int cpuid;

  MPI_Comm_rank(MPI_COMM_WORLD, &cpuid);

  fftwf_mpi_init();

  /* axis 1 */
  nk = n1 = kiss_fft_next_fast_size(nx*pad1);
  /* axis 2 */
  n2 = kiss_fft_next_fast_size(ny);
  /* axis 3 */
  n3 = kiss_fft_next_fast_size(nz);

  alloc_local = fftwf_mpi_local_size_2d_transposed(n3, n2*n1, MPI_COMM_WORLD, &local_n0, &local_0_start, &local_n1, &local_1_start);
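  /* FFTW's MPI interface slab-decomposes the slowest axis: this rank owns
     local_n0 slices of the n3 axis starting at local_0_start, and the
     _transposed variant also reports the layout after the global transpose
     (local_n1, local_1_start). */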

  cc = sf_complexalloc(n1*n2*local_n0);

  /* kiss-fft */

#ifdef _OPENMP
#pragma omp parallel
  {nth = omp_get_num_threads();}
#endif

  cfg1  = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg));
  icfg1 = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg));
  cfg2  = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg));
  icfg2 = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg));
  cfg3  = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg));
  icfg3 = (kiss_fft_cfg *) sf_alloc(nth,sizeof(kiss_fft_cfg));

  for (i=0; i < nth; i++) {
    cfg1[i] = kiss_fft_alloc(n1,0,NULL,NULL);
    icfg1[i]= kiss_fft_alloc(n1,1,NULL,NULL);
    cfg2[i] = kiss_fft_alloc(n2,0,NULL,NULL);
    icfg2[i]= kiss_fft_alloc(n2,1,NULL,NULL);
    cfg3[i] = kiss_fft_alloc(n3,0,NULL,NULL);
    icfg3[i]= kiss_fft_alloc(n3,1,NULL,NULL);
  }

  ctrace2= (kiss_fft_cpx **) sf_complexalloc2(n2,nth);
  ctrace3= (kiss_fft_cpx **) sf_complexalloc2(n3,nth);

  //tmp = (kiss_fft_cpx *) sf_complexalloc(alloc_local);
  tmp =    (kiss_fft_cpx *) sf_alloc(alloc_local,sizeof(kiss_fft_cpx));
  tmp2= (sf_complex *) tmp;

  /* fftw for transpose */

  cfg = fftwf_mpi_plan_many_transpose(n3,n2*n1,2,
                              FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK,
                              (float *) tmp,
                              (float *) tmp,
                              MPI_COMM_WORLD,
                              FFTW_MEASURE);

  icfg= fftwf_mpi_plan_many_transpose(n2*n1,n3,2,
                              FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK,
                              (float *) tmp,
                              (float *) tmp,
                              MPI_COMM_WORLD,
                              FFTW_MEASURE);

  if (NULL == cfg || NULL == icfg) sf_error("FFTW failure.");

  *nx2 = n1;
  *ny2 = n2;
  *nz2 = n3;
  *n_local = (int) local_n0;
  *o_local = (int) local_0_start;
	
  wt =  1.0/(n3*n2*n1);
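  /* kiss-fft and the FFTW transposes are unnormalized, so the inverse
     transform is scaled by wt = 1/(n1*n2*n3). */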

  return (nk*n2*n3);
}
Beispiel #9
0
int main(int argc, char **argv)
{
	int rank, size;
	int i, j, k;
	int **table, *buffer;
	int begin_row, end_row, send_count, *recv_counts, *displs;
	char errmsg[200];

	MPI_Init( &argc, &argv );
	Test_Init_No_File();
	
	MPI_Comm_rank( MPI_COMM_WORLD, &rank );
	MPI_Comm_size( MPI_COMM_WORLD, &size );

	/* get buffer space and init table */
	buffer      = (int *) malloc( (size * BLOCKSIZE) * (size * BLOCKSIZE) * sizeof(int) );
	table       = (int **)malloc( (size * BLOCKSIZE) * sizeof(int *) );
	recv_counts = (int *) malloc( size * sizeof(int) );
	displs      = (int *) malloc( size * sizeof(int) );
	if( !buffer || !table || !recv_counts || !displs ) {
		fprintf( stderr, "Out of memory error!\n" );
		MPI_Abort( MPI_COMM_WORLD, EXIT_FAILURE );
	}
	for( i = 0; i < size * BLOCKSIZE; i++ )
		table[i] = &(buffer[i*size*BLOCKSIZE]);

	/* Determine what rows are my responsibility */
	begin_row = rank * BLOCKSIZE;
	end_row   = (rank + 1) * BLOCKSIZE;
	send_count = BLOCKSIZE * size * BLOCKSIZE;
	for( i = 0; i < size; i++ ) {
		recv_counts[i] = BLOCKSIZE * size * BLOCKSIZE;
		displs[i]      = i * BLOCKSIZE * size * BLOCKSIZE;
	}
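	/* Every rank contributes an equal-sized block, so recv_counts is uniform
	   and displs is simply the running offset of each rank's block. */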

	/* Paint my rows my color */
	for( i = begin_row; i < end_row ; i++ )
		for( j = 0; j < size * BLOCKSIZE; j++ )
			table[i][j] = rank + 10;

	/* Gather everybody's result together - sort of like an */
	/* inefficient allgather */
	for (i = 0; i < size; i++)
		MPI_Gatherv(table[begin_row], send_count, MPI_INT,
					table[0], recv_counts, displs, MPI_INT, i,
					MPI_COMM_WORLD);

	/* Everybody should have the same table now. */
	for( i = 0; i < size; i++ )
		for( j = 0; j < BLOCKSIZE; j++ )
			for( k = 0; k < size * BLOCKSIZE; k++ )
				if( table[i*BLOCKSIZE+j][k] != i + 10 ) {
					sprintf(errmsg, "[%d] got %d expected %d for %dth entry in row %d\n",
							rank, table[i*BLOCKSIZE+j][k], i + 10, k, i*BLOCKSIZE + j);
					Test_Message( errmsg );
					Test_Failed( NULL );
				}
	
	Test_Waitforall();
	Test_Global_Summary();

	free( buffer );
	free( table );
	free( recv_counts );
	free( displs );

	MPI_Finalize();
	exit( EXIT_SUCCESS );
}
Beispiel #10
0
int main( int argc , char ** argv )
{
  int comm_rank = 0 ;

#if defined( KOKKOS_ENABLE_MPI )
  MPI_Init( & argc , & argv );
  MPI_Comm comm = MPI_COMM_WORLD ;
  MPI_Comm_rank( comm , & comm_rank );
#else
  MPI_Comm comm = 0 ;
  (void) comm ; // suppress warning
#endif

  int cmdline[ CMD_COUNT ] ;

  for ( int i = 0 ; i < CMD_COUNT ; ++i ) cmdline[i] = 0 ;

  if ( 0 == comm_rank ) {
    for ( int i = 1 ; i < argc ; ++i ) {
      if ( 0 == strcasecmp( argv[i] , "threads" ) ) {
        cmdline[ CMD_USE_THREADS ] = atoi( argv[++i] );
      }
      else if ( 0 == strcasecmp( argv[i] , "openmp" ) ) {
        cmdline[ CMD_USE_OPENMP ] = atoi( argv[++i] );
      }
      else if ( 0 == strcasecmp( argv[i] , "cores" ) ) {
        sscanf( argv[++i] , "%dx%d" ,
                cmdline + CMD_USE_NUMA ,
                cmdline + CMD_USE_CORE_PER_NUMA );
      }
      else if ( 0 == strcasecmp( argv[i] , "cuda" ) ) {
        cmdline[ CMD_USE_CUDA ] = 1 ;
      }
      else if ( 0 == strcasecmp( argv[i] , "cuda-dev" ) ) {
        cmdline[ CMD_USE_CUDA ] = 1 ;
        cmdline[ CMD_USE_CUDA_DEV ] = atoi( argv[++i] ) ;
      }
      else if ( 0 == strcasecmp( argv[i] , "rocm" ) ) {
        cmdline[ CMD_USE_ROCM ] = 1 ;
      }
      else if ( 0 == strcasecmp( argv[i] , "fixture" ) ) {
        sscanf( argv[++i] , "%dx%dx%d" ,
                cmdline + CMD_USE_FIXTURE_X ,
                cmdline + CMD_USE_FIXTURE_Y ,
                cmdline + CMD_USE_FIXTURE_Z );
      }
      else if ( 0 == strcasecmp( argv[i] , "fixture-range" ) ) {
        sscanf( argv[++i] , "%d..%d" ,
                cmdline + CMD_USE_FIXTURE_BEGIN ,
                cmdline + CMD_USE_FIXTURE_END );
      }
      else if ( 0 == strcasecmp( argv[i] , "fixture-quadratic" ) ) {
        cmdline[ CMD_USE_FIXTURE_QUADRATIC ] = 1 ;
      }
      else if ( 0 == strcasecmp( argv[i] , "atomic" ) ) {
        cmdline[ CMD_USE_ATOMIC ] = 1 ;
      }
      else if ( 0 == strcasecmp( argv[i] , "trials" ) ) {
        cmdline[ CMD_USE_TRIALS ] = atoi( argv[++i] ) ;
      }
      else if ( 0 == strcasecmp( argv[i] , "vtune" ) ) {
        cmdline[ CMD_VTUNE ] = 1 ;
      }
      else if ( 0 == strcasecmp( argv[i] , "print" ) ) {
        cmdline[ CMD_PRINT ] = 1 ;
      }
      else if ( 0 == strcasecmp( argv[i] , "echo" ) ) {
        cmdline[ CMD_ECHO ] = 1 ;
      }
      else {
        cmdline[ CMD_ERROR ] = 1 ;

        std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl ;
      }
    }

    if ( cmdline[ CMD_ECHO ] && 0 == comm_rank ) { print_cmdline( std::cout , cmdline ); }
  }

#if defined( KOKKOS_ENABLE_MPI )
  MPI_Bcast( cmdline , CMD_COUNT , MPI_INT , 0 , comm );
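  // Only rank 0 parsed argv; broadcasting cmdline keeps every rank's
  // configuration consistent before any execution space is initialized.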
#endif

  if ( cmdline[ CMD_VTUNE ] ) {
    std::stringstream cmd;
    pid_t my_os_pid=getpid();
    const std::string vtune_loc =
      "/usr/local/intel/vtune_amplifier_xe_2013/bin64/amplxe-cl";
    const std::string output_dir = "./vtune/vtune.";
    const int p_rank = comm_rank;
    cmd << vtune_loc
        << " -collect hotspots -result-dir " << output_dir << p_rank
        << " -target-pid " << my_os_pid << " &";
    if (p_rank == 0)
      std::cout << cmd.str() << std::endl;
    system(cmd.str().c_str());
    system("sleep 10");
  }

  if ( ! cmdline[ CMD_ERROR ] && ! cmdline[ CMD_ECHO ] ) {

    if ( ! cmdline[ CMD_USE_TRIALS ] ) { cmdline[ CMD_USE_TRIALS ] = 1 ; }

    if ( ! cmdline[ CMD_USE_FIXTURE_X ] && ! cmdline[ CMD_USE_FIXTURE_BEGIN ] ) {
      cmdline[ CMD_USE_FIXTURE_X ] = 2 ;
      cmdline[ CMD_USE_FIXTURE_Y ] = 2 ;
      cmdline[ CMD_USE_FIXTURE_Z ] = 2 ;
    }

#if defined( KOKKOS_ENABLE_THREADS )

    if ( cmdline[ CMD_USE_THREADS ] ) {

      if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) {
        Kokkos::Threads::initialize( cmdline[ CMD_USE_THREADS ] ,
                                     cmdline[ CMD_USE_NUMA ] ,
                                     cmdline[ CMD_USE_CORE_PER_NUMA ] );
      }
      else {
        Kokkos::Threads::initialize( cmdline[ CMD_USE_THREADS ] );
      }

      run< Kokkos::Threads , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );

      Kokkos::Threads::finalize();
    }

#endif

#if defined( KOKKOS_ENABLE_OPENMP )

    if ( cmdline[ CMD_USE_OPENMP ] ) {

      if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) {
        Kokkos::OpenMP::initialize( cmdline[ CMD_USE_OPENMP ] ,
                                     cmdline[ CMD_USE_NUMA ] ,
                                     cmdline[ CMD_USE_CORE_PER_NUMA ] );
      }
      else {
        Kokkos::OpenMP::initialize( cmdline[ CMD_USE_OPENMP ] );
      }

      run< Kokkos::OpenMP , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );

      Kokkos::OpenMP::finalize();
    }

#endif

#if defined( KOKKOS_ENABLE_CUDA )
    if ( cmdline[ CMD_USE_CUDA ] ) {
      // Use the CUDA device requested on the command line:

      Kokkos::HostSpace::execution_space::initialize();
      Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( cmdline[ CMD_USE_CUDA_DEV ] ) );

      run< Kokkos::Cuda , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );

      Kokkos::Cuda::finalize();
      Kokkos::HostSpace::execution_space::finalize();
    }

#endif

#if defined( KOKKOS_ENABLE_ROCM )
    if ( cmdline[ CMD_USE_ROCM ] ) {
      // Select the ROCm device:

      Kokkos::HostSpace::execution_space::initialize();
      Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice( cmdline[ CMD_USE_ROCM ] ) );

      run< Kokkos::Experimental::ROCm , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );

      Kokkos::Experimental::ROCm::finalize();
      Kokkos::HostSpace::execution_space::finalize();
    }

#endif

  }

#if defined( KOKKOS_ENABLE_MPI )
  MPI_Finalize();
#endif

  return cmdline[ CMD_ERROR ] ? -1 : 0 ;
}
Beispiel #11
0
int main(int argc, char* argv[])
{
  bool verb;        
  int it,iz,im,ikz,ikx,iky,ix,iy,i,j,snap;     /* index variables */
  int nt,nz,nx,ny, m2, nk, nzx, nz2, nx2, ny2, nzx2, n2, pad1;
  float dt;
  sf_complex c;

  float  *rr;      /* I/O arrays*/
  sf_complex *cwave, *cwavem, *ww;
  sf_complex **wave, *curr;
  float *rcurr, *rcurr_all;

  sf_file Fw,Fr,Fo;    /* I/O files */
  sf_axis at,az,ax,ay;    /* cube axes */

  sf_complex **lt, **rt;
  sf_file left, right, snaps;

  /*MPI related*/
  int cpuid,numprocs;
  int provided;
  int n_local, o_local;
  int ozx2;
  float *sendbuf, *recvbuf;
  int *rcounts, *displs;

  MPI_Init_thread(&argc,&argv,MPI_THREAD_FUNNELED,&provided);
  threads_ok = provided >= MPI_THREAD_FUNNELED;
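  /* MPI_THREAD_FUNNELED suffices here: only the main thread makes MPI calls,
     while OpenMP threads handle the node-local FFT work. */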

  sf_init(argc,argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &cpuid);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);

  if(!sf_getbool("verb",&verb)) verb=true; /* verbosity */

  /* setup I/O files */
  Fw = sf_input ("--input" );
  Fo = sf_output("--output");
  Fr = sf_input ("ref");

  /* Read/Write axes */
  at = sf_iaxa(Fw,1); nt = sf_n(at); dt = sf_d(at); 
  az = sf_iaxa(Fr,1); nz = sf_n(az); 
  ax = sf_iaxa(Fr,2); nx = sf_n(ax); 
  ay = sf_iaxa(Fr,3); ny = sf_n(ay); 

  if (!sf_getint("pad1",&pad1)) pad1=1; /* padding factor on the first axis */

  if (!sf_getint("snap",&snap)) snap=0;
  /* interval for snapshots */
    
  if (cpuid==0) {

    sf_oaxa(Fo,az,1); 
    sf_oaxa(Fo,ax,2);
    sf_oaxa(Fo,ay,3);
    
    sf_settype(Fo,SF_FLOAT);

    if (snap > 0) {
      snaps = sf_output("snaps");
      /* (optional) snapshot file */
	
      sf_oaxa(snaps,az,1); 
      sf_oaxa(snaps,ax,2);
      sf_oaxa(snaps,ay,3);
      sf_oaxa(snaps,at,4);
      sf_settype(snaps,SF_FLOAT);
      sf_putint(snaps,"n4",nt/snap);
      sf_putfloat(snaps,"d4",dt*snap);
      sf_putfloat(snaps,"o4",0.);
    } else {
      snaps = NULL;
    }

  }

  //nk = cfft3_init(pad1,nz,nx,ny,&nz2,&nx2,&ny2);
  //n_local = ny2;
  //o_local = 0;
  nk = mcfft3_init(pad1,nz,nx,ny,&nz2,&nx2,&ny2,&n_local,&o_local);
  sf_warning("Cpuid=%d,n2=%d,n1=%d,n0=%d,local_n0=%d,local_0_start=%d",cpuid,nz2,nx2,ny2,n_local,o_local);

  nzx = nz*nx*ny;
  //nzx2 = nz2*nx2*ny2;
  nzx2 = n_local*nz2*nx2;
  ozx2 = o_local*nz2*nx2;

  /* propagator matrices */
  left = sf_input("left");
  right = sf_input("right");

  if (!sf_histint(left,"n1",&n2) || n2 != nzx) sf_error("Need n1=%d in left",nzx);
  if (!sf_histint(left,"n2",&m2))  sf_error("Need n2=%d in left",m2);
    
  if (!sf_histint(right,"n1",&n2) || n2 != m2) sf_error("Need n1=%d in right",m2);
  if (!sf_histint(right,"n2",&n2) || n2 != nk) sf_error("Need n2=%d in right",nk);
 
  lt = sf_complexalloc2(nzx,m2);
  rt = sf_complexalloc2(m2,nk);

  sf_complexread(lt[0],nzx*m2,left);
  sf_complexread(rt[0],m2*nk,right);

  /* read wavelet & reflectivity */
  ww=sf_complexalloc(nt);  sf_complexread(ww,nt ,Fw);
  rr=sf_floatalloc(nzx); sf_floatread(rr,nzx,Fr);

  curr = sf_complexalloc(nzx2);
  rcurr= sf_floatalloc(nzx2);

  cwave  = sf_complexalloc(nzx2);
  cwavem = sf_complexalloc(nzx2);
  wave = sf_complexalloc2(nzx2,m2);

  //icfft3_allocate(cwavem);

  for (iz=0; iz < nzx2; iz++) {
    curr[iz]=sf_cmplx(0.,0.);
    rcurr[iz]=0.;
  }

  sendbuf = rcurr;
  if (cpuid==0) {
    rcurr_all = sf_floatalloc(nz2*nx2*ny2);
    recvbuf = rcurr_all;
    rcounts = sf_intalloc(numprocs);
    displs  = sf_intalloc(numprocs);
  } else {
    rcurr_all = NULL;
    recvbuf = NULL;
    rcounts = NULL;
    displs = NULL;
  }

  MPI_Gather(&nzx2, 1, MPI_INT, rcounts, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Gather(&ozx2, 1, MPI_INT, displs, 1, MPI_INT, 0, MPI_COMM_WORLD);

  /* MAIN LOOP */
  for (it=0; it<nt; it++) {
    if(verb) sf_warning("it=%d;",it);

    /* matrix multiplication */
    mcfft3(curr,cwave);

    for (im = 0; im < m2; im++) {
      for (iky = 0; iky < n_local; iky++) {
        for (ikx = 0; ikx < nx2; ikx++) {
          for (ikz = 0; ikz < nz2; ikz++) {
            i = ikz + ikx*nz2 + (o_local+iky)*nx2*nz2;
            j = ikz + ikx*nz2 + iky*nx2*nz2;
#ifdef SF_HAS_COMPLEX_H
            cwavem[j] = cwave[j]*rt[i][im];
#else
            cwavem[j] = sf_cmul(cwave[j],rt[i][im]);
#endif
          }
        }
      }
      imcfft3(wave[im],cwavem);
    }

    for (iy = 0; iy < n_local && (iy+o_local)<ny; iy++) {
      for (ix = 0; ix < nx; ix++) {
        for (iz=0; iz < nz; iz++) {
          i = iz + ix*nz + (o_local+iy)*nx*nz;  /* original grid */
          j = iz + ix*nz2+ iy*nx2*nz2; /* padded grid */
#ifdef SF_HAS_COMPLEX_H		
          c = ww[it] * rr[i];
#else
          c = sf_crmul(ww[it],rr[i]);
#endif

          for (im = 0; im < m2; im++) {
#ifdef SF_HAS_COMPLEX_H
            c += lt[im][i]*wave[im][j];
#else
            c += sf_cmul(lt[im][i],wave[im][j]);
#endif
          }
		    
          curr[j] = c;
          rcurr[j]= crealf(c);
        }
      }
    }

    /* output movie */
    if (NULL != snaps && 0 == it%snap) {
      MPI_Gatherv(sendbuf, nzx2, MPI_FLOAT, recvbuf, rcounts, displs, MPI_FLOAT, 0, MPI_COMM_WORLD);

      if (cpuid==0) {
        for (iy = 0; iy < ny; iy++)
          for (ix = 0; ix < nx; ix++)
            sf_floatwrite(rcurr_all+nz2*(ix+nx2*iy),nz,snaps);
      }
    }

  }
  if(verb) sf_warning(".");    
	    	
  /* write wavefield to output */
  MPI_Gatherv(sendbuf, nzx2, MPI_FLOAT, recvbuf, rcounts, displs, MPI_FLOAT, 0, MPI_COMM_WORLD);
  if (cpuid==0) {
    for (iy = 0; iy < ny; iy++)
      for (ix = 0; ix < nx; ix++)
        sf_floatwrite(rcurr_all+nz2*(ix+nx2*iy),nz,Fo);
  }
    
  mcfft3_finalize();

  MPI_Finalize();
  exit (0);
}
Beispiel #12
0
void run( MPI_Comm comm , const int cmd[] )
{
  int comm_rank = 0 ;

#if defined( KOKKOS_ENABLE_MPI )
  MPI_Comm_rank( comm , & comm_rank );
#else
  comm = 0 ;
#endif


  if ( 0 == comm_rank ) {
    if ( cmd[ CMD_USE_THREADS ] ) { std::cout << "THREADS , " << cmd[ CMD_USE_THREADS ] ; }
    else if ( cmd[ CMD_USE_OPENMP ] ) { std::cout << "OPENMP , " << cmd[ CMD_USE_OPENMP ] ; }
    else if ( cmd[ CMD_USE_CUDA ] ) { std::cout << "CUDA" ; }
    else if ( cmd[ CMD_USE_ROCM ] ) { std::cout << "ROCM" ; }

    if ( cmd[ CMD_USE_FIXTURE_QUADRATIC ] ) { std::cout << " , QUADRATIC-ELEMENT" ; }
    else { std::cout << " , LINEAR-ELEMENT" ; }

    if ( cmd[ CMD_USE_ATOMIC ] ) { std::cout << " , USING ATOMICS" ; }
  }

  std::vector< std::pair<std::string,std::string> > headers;


  headers.push_back(std::make_pair("ELEMS","count"));
  headers.push_back(std::make_pair("NODES","count"));
  headers.push_back(std::make_pair("NEWTON","iter"));
  headers.push_back(std::make_pair("CG","iter"));
  headers.push_back(std::make_pair("MAP_RATIO","ratio"));
  headers.push_back(std::make_pair("SET_FILL/NODE","millisec"));
  headers.push_back(std::make_pair("SCAN/NODE","millisec"));
  headers.push_back(std::make_pair("GRAPH_FILL/NODE","millisec"));
  headers.push_back(std::make_pair("SORT/NODE","millisec"));
  headers.push_back(std::make_pair("ELEM_GRAPH_FILL/NODE","millisec"));
  headers.push_back(std::make_pair("MATRIX_CREATE/NODE","millisec"));
  headers.push_back(std::make_pair("MATRIX_FILL/NODE","millisec"));
  headers.push_back(std::make_pair("BOUNDARY/NODE","millisec"));
  headers.push_back(std::make_pair("MAT_VEC/ITER/ROW","millisec"));
  headers.push_back(std::make_pair("CG/ITER/ROW","millisec"));
  headers.push_back(std::make_pair("ERROR","ratio"));

  // find print widths
  size_t min_width = 10;
  std::vector< size_t > widths(headers.size());
  for (size_t i=0, ie=headers.size(); i<ie; ++i)
    widths[i] = std::max(min_width, headers[i].first.size()+1);

  // print column headers
  if ( 0 == comm_rank ) {
    std::cout << std::endl ;
    for (size_t i=0; i<headers.size(); ++i)
      std::cout << std::setw(widths[i]) << headers[i].first << " ,";
    std::cout << "\b\b  " << std::endl;
    for (size_t i=0; i<headers.size(); ++i)
      std::cout << std::setw(widths[i]) << headers[i].second << " ,";
    std::cout << "\b\b  " << std::endl;

    std::cout << std::scientific;
    std::cout.precision(3);
  }

  if ( cmd[ CMD_USE_FIXTURE_BEGIN ] ) {
    for ( int i = cmd[CMD_USE_FIXTURE_BEGIN] ; i < cmd[CMD_USE_FIXTURE_END] * 2 ; i *= 2 ) {
      int nelem[3] ;
      nelem[0] = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
      nelem[1] = 1 + nelem[0] ;
      nelem[2] = 2 * nelem[0] ;
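      // Each step doubles the element budget i; the box is shaped
      // n x (n+1) x 2n so its volume 2n^2(n+1) tracks i.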

      const Kokkos::Example::FENL::Perf perf =
        cmd[ CMD_USE_FIXTURE_QUADRATIC ]
        ? Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemQuadratic >
            ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
        : Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemLinear >
            ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
        ;

      if ( 0 == comm_rank ) print_perf_value( std::cout , widths, perf );
    }
  }
  else {
    int nelem[3] = { cmd[ CMD_USE_FIXTURE_X ] ,
                     cmd[ CMD_USE_FIXTURE_Y ] ,
                     cmd[ CMD_USE_FIXTURE_Z ] };

    const Kokkos::Example::FENL::Perf perf =
      cmd[ CMD_USE_FIXTURE_QUADRATIC ]
      ? Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemQuadratic >
          ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
      : Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemLinear >
          ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
      ;

    if ( 0 == comm_rank ) print_perf_value( std::cout , widths, perf );
  }
}
Beispiel #13
0
int main(int argc,char **argv)
{
  PetscMPIInt      rank;
  PetscInt         M = -10,N = -8;
  PetscErrorCode   ierr;
  PetscBool        flg = PETSC_FALSE;
  DM               da;
  PetscViewer      viewer;
  Vec              local,global;
  PetscScalar      value;
  DMDABoundaryType bx = DMDA_BOUNDARY_NONE,by = DMDA_BOUNDARY_NONE;
  DMDAStencilType  stype = DMDA_STENCIL_BOX;
#if defined(PETSC_HAVE_MATLAB_ENGINE)
  PetscViewer      mviewer;
#endif

  ierr = PetscInitialize(&argc,&argv,(char*)0,help);CHKERRQ(ierr);
  ierr = PetscViewerDrawOpen(PETSC_COMM_WORLD,0,"",300,0,300,300,&viewer);CHKERRQ(ierr);
#if defined(PETSC_HAVE_MATLAB_ENGINE)
  ierr = PetscViewerMatlabOpen(PETSC_COMM_WORLD,"tmp.mat",FILE_MODE_WRITE,&mviewer);CHKERRQ(ierr);
#endif

  ierr = PetscOptionsGetBool(PETSC_NULL,"-star_stencil",&flg,PETSC_NULL);CHKERRQ(ierr);
  if (flg) stype = DMDA_STENCIL_STAR;

  /* Create distributed array and get vectors */
  ierr = DMDACreate2d(PETSC_COMM_WORLD,bx,by,stype,M,N,PETSC_DECIDE,PETSC_DECIDE,1,1,PETSC_NULL,PETSC_NULL,&da);CHKERRQ(ierr);
  ierr = DMCreateGlobalVector(da,&global);CHKERRQ(ierr);
  ierr = DMCreateLocalVector(da,&local);CHKERRQ(ierr);

  value = -3.0;
  ierr = VecSet(global,value);CHKERRQ(ierr);
  ierr = DMGlobalToLocalBegin(da,global,INSERT_VALUES,local);CHKERRQ(ierr);
  ierr = DMGlobalToLocalEnd(da,global,INSERT_VALUES,local);CHKERRQ(ierr);

  ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
  value = rank+1;
  ierr = VecScale(local,value);CHKERRQ(ierr);
  ierr = DMLocalToGlobalBegin(da,local,ADD_VALUES,global);CHKERRQ(ierr);
  ierr = DMLocalToGlobalEnd(da,local,ADD_VALUES,global);CHKERRQ(ierr);

  flg  = PETSC_FALSE;
  ierr = PetscOptionsGetBool(PETSC_NULL, "-view_global", &flg,PETSC_NULL);CHKERRQ(ierr);
  if (flg) { /* view global vector in natural ordering */
    ierr = VecView(global,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);
  }
  ierr = DMView(da,viewer);CHKERRQ(ierr);
  ierr = VecView(global,viewer);CHKERRQ(ierr);
#if defined(PETSC_HAVE_MATLAB_ENGINE)
  ierr = DMView(da,mviewer);CHKERRQ(ierr);
  ierr = VecView(global,mviewer);CHKERRQ(ierr);
#endif

  /* Free memory */
#if defined(PETSC_HAVE_MATLAB_ENGINE)
  ierr = PetscViewerDestroy(&mviewer);CHKERRQ(ierr);
#endif
  ierr = PetscViewerDestroy(&viewer);CHKERRQ(ierr);
  ierr = VecDestroy(&local);CHKERRQ(ierr);
  ierr = VecDestroy(&global);CHKERRQ(ierr);
  ierr = DMDestroy(&da);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return 0;
}
Beispiel #14
0
int HYPRE_LSI_PolySetup(HYPRE_Solver solver, HYPRE_ParCSRMatrix A_csr,
                        HYPRE_ParVector b,   HYPRE_ParVector x )
{
   int            i, j, my_id, startRow, endRow, order;
   int            pos_diag, neg_diag;
   int            rowLeng, *colInd, *row_partition;
   double         *coefs=NULL, rowsum, max_norm, *colVal;
   HYPRE_LSI_Poly *poly_ptr = (HYPRE_LSI_Poly *) solver;
#ifndef HYPRE_SEQUENTIAL
   double         dtemp;
#endif

   /* ---------------------------------------------------------------- */
   /* initialize structure                                             */
   /* ---------------------------------------------------------------- */

   order = poly_ptr->order;
   coefs = (double *) malloc((order+1) * sizeof(double));
   poly_ptr->coefficients = coefs;

   /* ---------------------------------------------------------------- */
   /* compute matrix norm                                              */
   /* ---------------------------------------------------------------- */

   HYPRE_ParCSRMatrixGetRowPartitioning( A_csr, &row_partition );
#ifdef HYPRE_SEQUENTIAL
   my_id = 0;
#else
   MPI_Comm_rank(poly_ptr->comm, &my_id);
#endif

   startRow  = row_partition[my_id];
   endRow    = row_partition[my_id+1] - 1;
   hypre_TFree( row_partition ); 
   poly_ptr->Nrows = endRow - startRow + 1;

   max_norm = 0.0;
   pos_diag = neg_diag = 0;
   for ( i = startRow; i <= endRow; i++ )
   {
      HYPRE_ParCSRMatrixGetRow(A_csr, i, &rowLeng, &colInd, &colVal);
      rowsum = 0.0;
      for (j = 0; j < rowLeng; j++)
      {
         rowsum += habs(colVal[j]);
         if ( colInd[j] == i && colVal[j] > 0.0 ) pos_diag++;
         if ( colInd[j] == i && colVal[j] < 0.0 ) neg_diag++;
      }
      if ( rowsum > max_norm ) max_norm = rowsum;
      HYPRE_ParCSRMatrixRestoreRow(A_csr, i, &rowLeng, &colInd, &colVal);
   }
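   /* max_norm now holds the local infinity-norm estimate of A; pos_diag and
      neg_diag count diagonal signs so that a predominantly negative-definite
      matrix can flip the sign of the norm below. */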
#ifndef HYPRE_SEQUENTIAL
   MPI_Allreduce(&max_norm, &dtemp, 1, MPI_DOUBLE, MPI_MAX, poly_ptr->comm);
   max_norm = dtemp;
#endif
   if ( pos_diag == 0 && neg_diag > 0 ) max_norm = - max_norm;

   /* ---------------------------------------------------------------- */
   /* fill in the coefficient table                                    */
   /* ---------------------------------------------------------------- */

   switch ( order ) 
   {
       case 0: coefs[0] = 1.0;     break;
       case 1: coefs[0] = 5.0;     coefs[1] = -1.0;   break;
       case 2: coefs[0] = 14.0;    coefs[1] = -7.0;   coefs[2] = 1.0; 
               break;
       case 3: coefs[0] = 30.0;    coefs[1] = -27.0;  coefs[2] = 9.0; 
               coefs[3] = -1.0;    break;
       case 4: coefs[0] = 55.0;    coefs[1] = -77.0;   coefs[2] = 44.0;
               coefs[3] = -11.0;   coefs[4] = 1.0;     break;
       case 5: coefs[0] = 91.0;    coefs[1] = -182.0;  coefs[2] = 156.0;
               coefs[3] = -65.0;   coefs[4] = 13.0;    coefs[5] = -1.0;
               break;
       case 6: coefs[0] = 140.0;   coefs[1] = -378.0;  coefs[2] = 450.0;
               coefs[3] = -275.0;  coefs[4] = 90.0;    coefs[5] = -15.0;
               coefs[6] = 1.0;     break;
       case 7: coefs[0] = 204.0;   coefs[1] = -714.0;  coefs[2] = 1122.0; 
               coefs[3] = -935.0;  coefs[4] = 442.0;   coefs[5] = -119.0;
               coefs[6] = 17.0;    coefs[7] = -1.0;    break;
       case 8: coefs[0] = 285.0;   coefs[1] = -1254.0; coefs[2] = 2508.0;
               coefs[3] = -2717.0; coefs[4] = 1729.0;  coefs[5] = -665.0;
               coefs[6] = 152.0;   coefs[7] = -19.0;   coefs[8] = 1.0;
               break;
   }
   for( i = 0; i <= order; i++ )
      coefs[i] *= pow( 4.0 / max_norm, (double) i);
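   /* The tabulated coefficients assume eigenvalues in [0,4]; multiplying the
      i-th coefficient by (4/max_norm)^i evaluates the same polynomial on the
      estimated interval [0, max_norm]. (Orders above 8 are not tabulated.) */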

   return 0;
}
Beispiel #15
0
int main(int argc,char **args)
{
  Mat             A;
  PetscErrorCode  ierr;
  PetscMPIInt     rank,size;
  PetscInt        *ia,*ja;
  MatPartitioning part;
  IS              is,isn,isrows;
  IS              coarseparts,fineparts;
  MPI_Comm        comm;

  ierr = PetscInitialize(&argc,&args,(char*)0,help);if (ierr) return ierr;
  comm = PETSC_COMM_WORLD;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size != 4) SETERRQ(comm,1,"Must run with 4 processors");
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ierr = PetscMalloc1(5,&ia);CHKERRQ(ierr);
  ierr = PetscMalloc1(16,&ja);CHKERRQ(ierr);
  if (!rank) {
    ja[0] = 1; ja[1] = 4; ja[2] = 0; ja[3] = 2; ja[4] = 5; ja[5] = 1; ja[6] = 3; ja[7] = 6;
    ja[8] = 2; ja[9] = 7;
    ia[0] = 0; ia[1] = 2; ia[2] = 5; ia[3] = 8; ia[4] = 10;
  } else if (rank == 1) {
    ja[0] = 0; ja[1] = 5; ja[2] = 8; ja[3] = 1; ja[4] = 4; ja[5] = 6; ja[6] = 9; ja[7] = 2;
    ja[8] = 5; ja[9] = 7; ja[10] = 10; ja[11] = 3; ja[12] = 6; ja[13] = 11;
    ia[0] = 0; ia[1] = 3; ia[2] = 7; ia[3] = 11; ia[4] = 14;
  } else if (rank == 2) {
    ja[0] = 4; ja[1] = 9; ja[2] = 12; ja[3] = 5; ja[4] = 8; ja[5] = 10; ja[6] = 13; ja[7] = 6;
    ja[8] = 9; ja[9] = 11; ja[10] = 14; ja[11] = 7; ja[12] = 10; ja[13] = 15;
    ia[0] = 0; ia[1] = 3; ia[2] = 7; ia[3] = 11; ia[4] = 14;
  } else {
    ja[0] = 8; ja[1] = 13; ja[2] = 9; ja[3] = 12; ja[4] = 14; ja[5] = 10; ja[6] = 13; ja[7] = 15;
    ja[8] = 11; ja[9] = 14;
    ia[0] = 0; ia[1] = 2; ia[2] = 5; ia[3] = 8; ia[4] = 10;
  }
  ierr = MatCreateMPIAdj(comm,4,16,ia,ja,NULL,&A);CHKERRQ(ierr);
  ierr = MatView(A,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);
  /*
   Partition the graph of the matrix
  */
  ierr = MatPartitioningCreate(comm,&part);CHKERRQ(ierr);
  ierr = MatPartitioningSetAdjacency(part,A);CHKERRQ(ierr);
  ierr = MatPartitioningSetType(part,MATPARTITIONINGHIERARCH);CHKERRQ(ierr);
  ierr = MatPartitioningHierarchicalSetNcoarseparts(part,2);CHKERRQ(ierr);
  ierr = MatPartitioningHierarchicalSetNfineparts(part,2);CHKERRQ(ierr);
  ierr = MatPartitioningSetFromOptions(part);CHKERRQ(ierr);
  /* get new processor owner number of each vertex */
  ierr = MatPartitioningApply(part,&is);CHKERRQ(ierr);
  /* coarse parts */
  ierr = MatPartitioningHierarchicalGetCoarseparts(part,&coarseparts);CHKERRQ(ierr);
  ierr = ISView(coarseparts,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);
  /* fine parts */
  ierr = MatPartitioningHierarchicalGetFineparts(part,&fineparts);CHKERRQ(ierr);
  ierr = ISView(fineparts,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);
  /* partitioning */
  ierr = ISView(is,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);
  /* get new global number of each old global number */
  ierr = ISPartitioningToNumbering(is,&isn);CHKERRQ(ierr);
  ierr = ISView(isn,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);
  ierr = ISBuildTwoSided(is,&isrows);CHKERRQ(ierr);
  ierr = ISView(isrows,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);
  ierr = ISDestroy(&is);CHKERRQ(ierr);
  ierr = ISDestroy(&coarseparts);CHKERRQ(ierr);
  ierr = ISDestroy(&fineparts);CHKERRQ(ierr);
  ierr = ISDestroy(&isrows);CHKERRQ(ierr);
  ierr = ISDestroy(&isn);CHKERRQ(ierr);
  ierr = MatPartitioningDestroy(&part);CHKERRQ(ierr);
  /*
    Free work space.  All PETSc objects should be destroyed when they
    are no longer needed.
  */
  ierr = MatDestroy(&A);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return ierr;
}
Beispiel #16
0
#if defined(MAIN_COMP)
int MAIN__()
{
  int argc=1;
  char * name = "c_example";
  char ** argv ;
#else
int main(int argc, char ** argv)
{
#endif
  DMUMPS_STRUC_C id;

  int n = 6;
  int nz = 21;
  int irnL[] = {1,2,3,4,5,6,2,3,4,5,6,3,4,5,6,4,5,6,5,6,6};
  int jcnL[] = {1,1,1,1,1,1,2,2,2,2,2,3,3,3,3,4,4,4,5,5,6};
  int irnS[] = {1,2,3,2,3,3};
  int jcnS[] = {1,1,1,2,2,3};

  int *irn = irnL;
  int *jcn = jcnL;

  double a[21];
  double rhs[6];

  int myid, ierr, numP;
#if defined(MAIN_COMP)
  argv = &name;
#endif
  ierr = MPI_Init(&argc, &argv);
  ierr = MPI_Comm_rank(MPI_COMM_WORLD, &myid);
  ierr = MPI_Comm_size(MPI_COMM_WORLD, &numP);
  
  double nump = numP * 1.0;

  /* Define A and rhs */
  rhs[0]=2.0;rhs[1]=2.0;rhs[2]=2.0;rhs[3]=2.0;rhs[4]=2.0;rhs[5]=2.0;

  a[0]=4169.95/nump;
  a[1]=0.0;
  a[2]=10075.0/nump;
  a[3]=-4030;
  a[4]=0.0;
  a[5]=0.0;
  a[6]=10084.0/nump;
  a[7]=-1612.0/nump;
  a[8]=0.0;
  a[9]=-8.9556;
  a[10]=-1612;
  a[11]=1354080.0/nump;
  a[12]=0.0;
  a[13]=1612.0;
  a[14]=193440.0;
  a[15]=4169.93;
  a[16]=0.0;
  a[17]=10075;
  a[18]=10084;
  a[19]=1612;
  a[20]=1354000;

  if (myid != 0) {
      nz = 6;

      a[0]=4169.95/nump;
      a[1]=0.0;
      a[2]=10075.0/nump;

      a[3]=10084.0/nump;
      a[4]=-1612.0/nump;

      a[5]=1354080.0/nump;

      irn = irnS;
      jcn = jcnS;
  }

#define ICNTL(I) icntl[(I)-1] /* macro s.t. indices match documentation */

  /* Initialize a MUMPS instance. Use MPI_COMM_WORLD */
  id.job=JOB_INIT; id.par=1; id.sym=2; id.comm_fortran=USE_COMM_WORLD;

  /* parallel solver; distributed i/p matrix A */
  id.ICNTL(5)=0; id.ICNTL(18)=3;  
  dmumps_c(&id);

  /* Define the problem on the host */
  /*
  id.n = n; id.nz =nz; id.irn=irn; id.jcn=jcn;
  id.a = a; id.rhs = rhs;
  */

  /* parallel solver; distributed i/p matrix A */

  id.ICNTL(5)=0; id.ICNTL(18)=3;  


  id.n = n; id.nz_loc =nz; id.irn_loc=irn; id.jcn_loc=jcn;
  id.a_loc = a; id.rhs = rhs;

/* No outputs */
  id.ICNTL(1)=-1; id.ICNTL(2)=-1; id.ICNTL(3)=-1; id.ICNTL(4)=0; 

  id.job=1;
  dmumps_c(&id);

  id.job=5;
  dmumps_c(&id);

  if (myid == 0) {
    printf("Solution is : (%8.6e %8.6e %8.6e %8.6e %8.6e %8.6e \n", rhs[0],rhs[1], rhs[2], rhs[3], rhs[4], rhs[5]);
  }

  rhs[0]=2.0;rhs[1]=2.0;rhs[2]=2.0;rhs[3]=1.0;rhs[4]=1.0;rhs[5]=1.0;

  id.job=3;
  dmumps_c(&id);

  /* Terminate instance */
  id.job=JOB_END; 
  dmumps_c(&id); 

  if (myid == 0) {
    printf("Solution is : (%8.6e %8.6e %8.6e %8.6e %8.6e %8.6e \n", rhs[0],rhs[1], rhs[2], rhs[3], rhs[4], rhs[5]);
  }

  ierr = MPI_Finalize();
  return 0;
}
Beispiel #17
0
std::unique_ptr<const bfly::PotentialField<R,d,q>>
transform
( const bfly::Context<R,d,q>& context,
  const Plan<d>& plan,
  const Amplitude<R,d>& amplitude,
  const Phase<R,d>& phase,
  const Box<R,d>& sBox,
  const Box<R,d>& tBox,
  const vector<Source<R,d>>& mySources )
{
#ifdef TIMING
    bfly::ResetTimers();
    bfly::timer.Start();
#endif
    typedef complex<R> C;
    const size_t q_to_d = Pow<q,d>::val;

    // Extract our communicator and its size
    MPI_Comm comm = plan.GetComm();
    int rank, numProcesses;
    MPI_Comm_rank( comm, &rank );
    MPI_Comm_size( comm, &numProcesses ); 

    // Get the problem-specific parameters
    const size_t N = plan.GetN();
    const size_t log2N = Log2( N );
    const array<size_t,d>& myInitialSBoxCoords = 
        plan.GetMyInitialSourceBoxCoords();
    const array<size_t,d>& log2InitialSBoxesPerDim = 
        plan.GetLog2InitialSourceBoxesPerDim();
    array<size_t,d> mySBoxCoords = myInitialSBoxCoords;
    array<size_t,d> log2SBoxesPerDim = log2InitialSBoxesPerDim;
    Box<R,d> mySBox;
    for( size_t j=0; j<d; ++j )
    {
        mySBox.widths[j] = sBox.widths[j] / (1u<<log2SBoxesPerDim[j]);
        mySBox.offsets[j] = sBox.offsets[j] + mySBox.widths[j]*mySBoxCoords[j];
    }

    array<size_t,d> myTBoxCoords, log2TBoxesPerDim;
    myTBoxCoords.fill(0);
    log2TBoxesPerDim.fill(0);
    Box<R,d> myTBox;
    myTBox = tBox;
    const size_t bootstrap = plan.GetBootstrapSkip();

    // Compute the number of source and target boxes that our process is 
    // responsible for initializing weights in
    size_t log2WeightGridSize = 0;
    size_t log2LocalSBoxes = 0;
    size_t log2LocalTBoxes = 0;
    array<size_t,d> log2LocalSBoxesPerDim, log2LocalTBoxesPerDim;
    log2LocalTBoxesPerDim.fill(0);
    for( size_t j=0; j<d; ++j )
    {
        if( log2N-log2SBoxesPerDim[j] >= bootstrap )
            log2LocalSBoxesPerDim[j]= (log2N-log2SBoxesPerDim[j]) - bootstrap;
        else
            log2LocalSBoxesPerDim[j] = 0;
        log2LocalTBoxesPerDim[j] = bootstrap;
        log2LocalSBoxes += log2LocalSBoxesPerDim[j];
        log2LocalTBoxes += log2LocalTBoxesPerDim[j];
        log2WeightGridSize += log2N-log2SBoxesPerDim[j];
    }

    // Initialize the weights using Lagrangian interpolation on the 
    // smooth component of the kernel.
    WeightGridList<R,d,q> weightGridList( 1u<<log2WeightGridSize );
#ifdef TIMING
    bfly::initializeWeightsTimer.Start();
#endif
    bfly::InitializeWeights
    ( context, plan, phase, sBox, tBox, mySBox, 
      log2LocalSBoxes, log2LocalSBoxesPerDim, mySources, weightGridList );
#ifdef TIMING
    bfly::initializeWeightsTimer.Stop();
#endif

    // Now cut the target domain if necessary
    for( size_t j=0; j<d; ++j )
    {
        if( log2LocalSBoxesPerDim[j] == 0 )
        {
            log2LocalTBoxesPerDim[j] -= bootstrap - (log2N-log2SBoxesPerDim[j]);
            log2LocalTBoxes -= bootstrap - (log2N-log2SBoxesPerDim[j]);
        }
    }

    // Start the main recursion loop
    if( bootstrap == log2N/2 )
    {
#ifdef TIMING
        bfly::M2LTimer.Start();
#endif
        bfly::M2L
        ( context, plan, amplitude, phase, sBox, tBox, mySBox, myTBox, 
          log2LocalSBoxes, log2LocalTBoxes,
          log2LocalSBoxesPerDim, log2LocalTBoxesPerDim, weightGridList );
#ifdef TIMING
        bfly::M2LTimer.Stop();
#endif
    }
    for( size_t level=bootstrap+1; level<=log2N; ++level )
    {
        // Compute the width of the nodes at this level
        array<R,d> wA, wB;
        for( size_t j=0; j<d; ++j )
        {
            wA[j] = tBox.widths[j] / (1<<level);
            wB[j] = sBox.widths[j] / (1<<(log2N-level));
        }

        if( log2LocalSBoxes >= d )
        {
            // Refine the target domain and coarsen the source domain
            for( size_t j=0; j<d; ++j )
            {
                --log2LocalSBoxesPerDim[j];
                ++log2LocalTBoxesPerDim[j];
            }
            log2LocalSBoxes -= d;
            log2LocalTBoxes += d;

            // Loop over boxes in target domain. 
            ConstrainedHTreeWalker<d> AWalker( log2LocalTBoxesPerDim );
            WeightGridList<R,d,q> oldWeightGridList( weightGridList );
            for( size_t tIndex=0; 
                 tIndex<(1u<<log2LocalTBoxes); ++tIndex, AWalker.Walk() )
            {
                const array<size_t,d> A = AWalker.State();

                // Compute coordinates and center of this target box
                array<R,d> x0A;
                for( size_t j=0; j<d; ++j )
                    x0A[j] = myTBox.offsets[j] + (A[j]+R(1)/R(2))*wA[j];

                // Loop over the B boxes in source domain
                ConstrainedHTreeWalker<d> BWalker( log2LocalSBoxesPerDim );
                for( size_t sIndex=0; 
                     sIndex<(1u<<log2LocalSBoxes); ++sIndex, BWalker.Walk() )
                {
                    const array<size_t,d> B = BWalker.State();

                    // Compute coordinates and center of this source box
                    array<R,d> p0B;
                    for( size_t j=0; j<d; ++j )
                        p0B[j] = mySBox.offsets[j] + (B[j]+R(1)/R(2))*wB[j];

                    // We are storing the interaction pairs source-major
                    const size_t iIndex = sIndex + (tIndex<<log2LocalSBoxes);

                    // Grab the interaction offset for the parent of target box 
                    // i interacting with the children of source box k
                    const size_t parentIOffset = 
                        ((tIndex>>d)<<(log2LocalSBoxes+d)) + (sIndex<<d);

                    if( level <= log2N/2 )
                    {
#ifdef TIMING
                        bfly::M2MTimer.Start();
#endif
                        bfly::M2M
                        ( context, plan, phase, level, x0A, p0B, wB, 
                          parentIOffset, oldWeightGridList,
                          weightGridList[iIndex] );
#ifdef TIMING
                        bfly::M2MTimer.Stop();
#endif
                    }
                    else
                    {
                        array<R,d> x0Ap;
                        array<size_t,d> globalA;
                        size_t ARelativeToAp = 0;
                        for( size_t j=0; j<d; ++j )
                        {
                            globalA[j] = A[j]+
                                (myTBoxCoords[j]<<log2LocalTBoxesPerDim[j]);
                            x0Ap[j] = tBox.offsets[j] + (globalA[j]|1)*wA[j];
                            ARelativeToAp |= (globalA[j]&1)<<j;
                        }
#ifdef TIMING
                        bfly::L2LTimer.Start();
#endif
                        bfly::L2L
                        ( context, plan, phase, level,
                          ARelativeToAp, x0A, x0Ap, p0B, wA, wB,
                          parentIOffset, oldWeightGridList, 
                          weightGridList[iIndex] );
#ifdef TIMING
                        bfly::L2LTimer.Stop();
#endif
                    }
                }
            }
        }
        else 
        {
            const size_t log2NumMerging = d-log2LocalSBoxes;

            log2LocalSBoxes = 0; 
            for( size_t j=0; j<d; ++j )
                log2LocalSBoxesPerDim[j] = 0;

            // Fully refine target domain and coarsen source domain.
            // We partition the target domain after the SumScatter.
            const vector<size_t>& sDimsToMerge = 
                plan.GetSourceDimsToMerge( level );
            for( size_t i=0; i<log2NumMerging; ++i )
            {
                const size_t j = sDimsToMerge[i];
                if( mySBoxCoords[j] & 1 )
                    mySBox.offsets[j] -= mySBox.widths[j];
                mySBoxCoords[j] /= 2;
                mySBox.widths[j] *= 2;
            }
            for( size_t j=0; j<d; ++j )
            {
                ++log2LocalTBoxesPerDim[j];
                ++log2LocalTBoxes;
            }

            // Compute the coordinates and center of this source box
            array<R,d> p0B;
            for( size_t j=0; j<d; ++j )
                p0B[j] = mySBox.offsets[j] + wB[j]/R(2);

            // Form the partial weights by looping over the boxes in the  
            // target domain.
            ConstrainedHTreeWalker<d> AWalker( log2LocalTBoxesPerDim );
            WeightGridList<R,d,q> partialWeightGridList( 1<<log2LocalTBoxes );
            for( size_t tIndex=0; 
                 tIndex<(1u<<log2LocalTBoxes); ++tIndex, AWalker.Walk() )
            {
                const array<size_t,d> A = AWalker.State();

                // Compute coordinates and center of this target box
                array<R,d> x0A;
                for( size_t j=0; j<d; ++j )
                    x0A[j] = myTBox.offsets[j] + (A[j]+R(1)/R(2))*wA[j];

                // Compute the interaction offset of A's parent interacting 
                // with the remaining local source boxes
                const size_t parentIOffset = ((tIndex>>d)<<(d-log2NumMerging));
                if( level <= log2N/2 )
                {
#ifdef TIMING
                    bfly::M2MTimer.Start();
#endif
                    bfly::M2M
                    ( context, plan, phase, level, x0A, p0B, wB,
                      parentIOffset, weightGridList,
                      partialWeightGridList[tIndex] );
#ifdef TIMING
                    bfly::M2MTimer.Stop();
#endif
                }
                else
                {
                    array<R,d> x0Ap;
                    array<size_t,d> globalA;
                    size_t ARelativeToAp = 0;
                    for( size_t j=0; j<d; ++j )
                    {
                        globalA[j] = A[j] +
                            (myTBoxCoords[j]<<log2LocalTBoxesPerDim[j]);
                        x0Ap[j] = tBox.offsets[j] + (globalA[j]|1)*wA[j];
                        ARelativeToAp |= (globalA[j]&1)<<j;
                    }
#ifdef TIMING
                    bfly::L2LTimer.Start();
#endif
                    bfly::L2L
                    ( context, plan, phase, level,
                      ARelativeToAp, x0A, x0Ap, p0B, wA, wB,
                      parentIOffset, weightGridList, 
                      partialWeightGridList[tIndex] );
#ifdef TIMING
                    bfly::L2LTimer.Stop();
#endif
                }
            }

            // Scatter the summation of the weights
#ifdef TIMING
            bfly::sumScatterTimer.Start();
#endif
            const size_t recvSize = 2*weightGridList.Length()*q_to_d;
            // Currently two types of planned communication are supported, as 
            // they are the only required types for transforming and inverting 
            // the transform:
            //  1) partitions of dimensions 0 -> c
            //  2) partitions of dimensions c -> d-1
            // Both 1 and 2 include partitioning 0 -> d-1, but, in general, 
            // the second category never requires packing.
            const size_t log2SubclusterSize = plan.GetLog2SubclusterSize(level);
            if( log2SubclusterSize == 0 )
            {
                MPI_Comm clusterComm = plan.GetClusterComm( level );
                SumScatter    
                ( partialWeightGridList.Buffer(), weightGridList.Buffer(),
                  recvSize, clusterComm );
            }
            else
            {
                const size_t log2NumSubclusters = 
                    log2NumMerging-log2SubclusterSize;
                const size_t numSubclusters = 1u<<log2NumSubclusters;
                const size_t subclusterSize = 1u<<log2SubclusterSize;

                const size_t numChunksPerProcess = subclusterSize;
                const size_t chunkSize = recvSize / numChunksPerProcess;
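                // Repack so that, after the sum-scatter, each rank in the
                // subcluster receives its numChunksPerProcess chunks
                // contiguously: chunk c for rank p is read from offset
                // (p + c*subclusterSize)*chunkSize of the partial buffer.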
                const R* partialBuffer = partialWeightGridList.Buffer();
                vector<R> sendBuffer( recvSize<<log2NumMerging );
                for( size_t sc=0; sc<numSubclusters; ++sc )
                {
                    R* subclusterSendBuffer = 
                        &sendBuffer[sc*subclusterSize*recvSize];
                    const R* subclusterPartialBuffer = 
                        &partialBuffer[sc*subclusterSize*recvSize];
                    for( size_t p=0; p<subclusterSize; ++p )
                    {
                        R* processSend = &subclusterSendBuffer[p*recvSize];
                        for( size_t c=0; c<numChunksPerProcess; ++c )
                        {
                            memcpy 
                            ( &processSend[c*chunkSize],
                              &subclusterPartialBuffer
                              [(p+c*subclusterSize)*chunkSize],
                              chunkSize*sizeof(R) );
                        }
                    }
                }
                MPI_Comm clusterComm = plan.GetClusterComm( level );
                SumScatter
                ( &sendBuffer[0], weightGridList.Buffer(), 
                  recvSize, clusterComm );
            }
#ifdef TIMING
            bfly::sumScatterTimer.Stop();
#endif

            const vector<size_t>& tDimsToCut = plan.GetTargetDimsToCut( level );
            const vector<bool>& rightSideOfCut=plan.GetRightSideOfCut( level );
            for( size_t i=0; i<log2NumMerging; ++i )
            {
                const size_t j = tDimsToCut[i];
                myTBox.widths[j] /= 2;
                myTBoxCoords[j] *= 2;
                if( rightSideOfCut[i] )
                {
                    myTBoxCoords[j] |= 1;
                    myTBox.offsets[j] += myTBox.widths[j];
                }
                --log2LocalTBoxesPerDim[j];
                --log2LocalTBoxes;
            }
        }
        if( level==log2N/2 )
        {
#ifdef TIMING
            bfly::M2LTimer.Start();
#endif
            bfly::M2L
            ( context, plan, amplitude, phase, sBox, tBox, mySBox, myTBox,
              log2LocalSBoxes, log2LocalTBoxes, 
              log2LocalSBoxesPerDim, log2LocalTBoxesPerDim, weightGridList );
#ifdef TIMING
            bfly::M2LTimer.Stop();
#endif
        }
    }

    // (The remainder of this example, which assembles the final potential
    //  field from weightGridList and returns it, is truncated in the source
    //  listing.)
}
Beispiel #18
0
int main(int argc, char *argv[])
{
	int myid, numprocs;
	double starttime, endtime;	
	MPI_Status s;

	int  namelen;
	char processor_name[MPI_MAX_PROCESSOR_NAME];
    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD,&numprocs);	
    MPI_Comm_rank(MPI_COMM_WORLD,&myid);
    MPI_Get_processor_name(processor_name,&namelen);	
	starttime = MPI_Wtime(); 	
	//F-Fusion, L-Low;
	unsigned char *colorData;

	int length=0;	
	
	unsigned char* colorBodyF,*colorBodyL;

	unsigned char* colorDataF,*colorDataL;

	unsigned char* colorHeadF, *colorHeadL;

	int HeightFusion,WidthFusion,LengthFusion;

	int HeightLow,WidthLow,LengthLow;

//	int length;
	int height,width;

	double *RGBF,*RGBL;      
    double  *colorBodyNew;	
	double RGBNewF[3];
	double RGBNewL[3];
	//  int colorLength, grayLength;
	int colorLength;
	//	double eps = 0.000001;   // controls the required precision
	
	//  RGBF=(double*)malloc(sizeof(double)*3);
	//	RGBL=(double*)malloc(sizeof(double)*3);
	
	
	printf("**********************************\n");

	fprintf(stdout,"Process %d of %d on %s\n",myid,numprocs,processor_name);

  

 if (myid==0)
	{
	
       	unsigned char* colorBody;	
		
	    ReadColorBmp("C:\\XXR_Image\\color.bmp", colorBodyF, colorHeadF,height,width, colorLength);

		ReadColorBmp("C:\\XXR_Image\\tm.bmp", colorBodyL, colorHeadL, height,width,colorLength);

		printf("colorLength=%d\n",colorLength);        

		while ((colorLength%numprocs)!=0)
		{
			numprocs--;
		}
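		// Rank 0 shrinks numprocs until the pixel count divides evenly; note that
		// only rank 0 sees the reduced value, so ranks at or above it would still
		// block in MPI_Recv below.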
		
		length=colorLength/numprocs;
		colorDataF=colorBodyF;
		colorDataL=colorBodyL;

		MPI_Send(&width,1,MPI_INT,0,100,MPI_COMM_WORLD);
        MPI_Recv(&width,1,MPI_INT,0,100,MPI_COMM_WORLD,&s);
		MPI_Send(&length,1,MPI_INT,0,101,MPI_COMM_WORLD);
		MPI_Recv(&length,1,MPI_INT,0,101,MPI_COMM_WORLD,&s);
      
	//	MPI_Send(colorBodyF,length*3,MPI_UNSIGNED_CHAR,0,102,MPI_COMM_WORLD);
	    colorDataF=(unsigned char* )malloc(sizeof(unsigned char)*length*3);
      /*  MPI_Recv(colorDataF,length*3,MPI_UNSIGNED_CHAR,0,102,MPI_COMM_WORLD,&s);  */

        	for (int i=0;i<3*length;i++)
				colorDataF[i]=colorBodyF[i];
       

		for (int i=1;i<=numprocs-1;i++)
		{
			MPI_Send(&width,1,MPI_INT,i,19,MPI_COMM_WORLD);
			MPI_Send(&length,1,MPI_INT,i,20,MPI_COMM_WORLD);
			MPI_Send(colorBodyF+(i*length-width)*3,(length+width)*3,MPI_UNSIGNED_CHAR,i,21,MPI_COMM_WORLD);
		   // MPI_Send(colorBodyL+i*length*3,length*3,MPI_UNSIGNED_CHAR,i,22,MPI_COMM_WORLD);
		}
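		// Each worker's block starts one image row early ((i*length-width)*3) and
		// is one row longer, giving a one-row halo so the clarity stencil can
		// reach across the slice boundary.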
        
	
	}

	else
	{
		MPI_Status s;
        MPI_Recv(&width,1,MPI_INT,0,19,MPI_COMM_WORLD,&s);
		MPI_Recv(&length,1,MPI_INT,0,20,MPI_COMM_WORLD,&s);
		colorDataF=(unsigned char* )malloc(sizeof(unsigned char)*(length+width)*3);
      //  colorDataL=(unsigned char* )malloc(sizeof(unsigned char)*length*3);
		MPI_Recv(colorDataF,(length+width)*3,MPI_UNSIGNED_CHAR,0,21,MPI_COMM_WORLD,&s);
	//	MPI_Recv(colorDataL,length*3,MPI_UNSIGNED_CHAR,0,22,MPI_COMM_WORLD,&s);
	}



    //printf("colorLength=%d\n",colorLength);

    printf("length=%d\n",length);     
	double *clarity;
	
	if (myid==0)
	{
		ComputedClarityP(colorDataF, clarity, length,width);
		
	}
	
    else 
	{
		int lengthother=length+width;
		ComputedClarityP(colorDataF, clarity, lengthother,width);
		
	}


     if (myid==0)
	{
           
		 double *Totalclarity;
		 Totalclarity=(double* )malloc(sizeof(double)*numprocs*3);
		 
		// Totalclarity=clarity;
		   for (int i=0;i<numprocs*3;i++)		
		      Totalclarity[0]=0;
		   for (int k=0;k<3;k++)		
		      Totalclarity[k]=clarity[k];

		 
		 // ComputedClarityP(colorDataF, clarity, length,width);
		 
		 for (int i=1;i<=numprocs-1;i++)		
			 MPI_Recv(Totalclarity+3*i,3,MPI_DOUBLE,i,500,MPI_COMM_WORLD,&s);
		 
		 double *clarityLast;
		 clarityLast=(double* )malloc(sizeof(double)*3);
		 for (int i=0;i<3;i++)
			 clarityLast[i]=0;
		 
		 for (int i=0;i<numprocs*3;)
		 {
			 //for (int k=0;k<3;k++)
				 clarityLast[0]=clarityLast[0]+Totalclarity[i++];
                 clarityLast[1]=clarityLast[1]+Totalclarity[i++];
                 clarityLast[2]=clarityLast[2]+Totalclarity[i++];
		 }
		 
	
		 
		 printf("%f, %f, %f\n",clarityLast[0]/colorLength,clarityLast[1]/colorLength,clarityLast[2]/colorLength);
	//	 printf("%lf, %lf, %lf\n",sqrt(clarityLast[0])/colorLength,sqrt(clarityLast[1])/colorLength,sqrt(clarityLast[2])/colorLength);

	
	}

     else MPI_Send(clarity,3,MPI_DOUBLE,0,500,MPI_COMM_WORLD);    


    

	
	endtime = MPI_Wtime();
    printf("That takes %f seconds:\n",endtime-starttime);
    MPI_Finalize();  

    return 0;
}
Beispiel #19
0
  /*!
  \brief Open an existing NetCDF file in read-only mode
  */
  void CFile::openInReadMode(void)
  {
    CContext* context = CContext::getCurrent();
    CContextServer* server = context->server;

    if (!allDomainEmpty)
    {
      StdString filename = getFileOutputName();
      StdOStringStream oss;
      oss << filename;

      if (!split_freq.isEmpty())
      {
        string splitFormat;
        if (split_freq_format.isEmpty())
        {
          if (split_freq.getValue().second != 0) splitFormat = "%y%mo%d%h%mi%s";
          else if (split_freq.getValue().minute != 0) splitFormat = "%y%mo%d%h%mi";
          else if (split_freq.getValue().hour != 0) splitFormat = "%y%mo%d%h";
          else if (split_freq.getValue().day != 0) splitFormat = "%y%mo%d";
          else if (split_freq.getValue().month != 0) splitFormat = "%y%mo";
          else splitFormat = "%y";
        }
        else splitFormat = split_freq_format;
        oss << "_" << lastSplit.getStr(splitFormat)
        << "-" << (lastSplit + split_freq.getValue() - 1 * Second).getStr(splitFormat);
      }

      bool multifile = true;
      if (!type.isEmpty())
      {
        if (type == type_attr::one_file) multifile = false;
        else if (type == type_attr::multiple_file) multifile = true;
      }
  #ifndef USING_NETCDF_PAR
      if (!multifile)
      {
        info(0) << "!!! Warning -> Using non parallel version of netcdf, switching in multiple_file mode for file : " << filename << " ..." << endl;
        multifile = true;
      }
  #endif
      if (multifile)
      {
        int commSize, commRank;
        MPI_Comm_size(fileComm, &commSize);
        MPI_Comm_rank(fileComm, &commRank);

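        // Append a zero-padded rank suffix wide enough for the largest rank
        // (e.g. "_007" for rank 7 when commSize needs three digits) so the
        // per-rank file names sort lexicographically.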
        if (server->intraCommSize > 1)
        {
          oss << "_";
          int width = 0, n = commSize - 1;
          while (n != 0) { n = n / 10; width++; }
          if (!min_digits.isEmpty() && width < min_digits)
            width = min_digits;
          oss.width(width);
          oss.fill('0');
          oss << right << commRank;
        }
      }
      oss << ".nc";

      bool isCollective = par_access.isEmpty() || par_access == par_access_attr::collective;

      if (isOpen) data_out->closeFile();
      if (time_counter_name.isEmpty()) data_in = shared_ptr<CDataInput>(new CNc4DataInput(oss.str(), fileComm, multifile, isCollective));
      else data_in = shared_ptr<CDataInput>(new CNc4DataInput(oss.str(), fileComm, multifile, isCollective, time_counter_name));
      isOpen = true;
    }
  }
Beispiel #20
0
int main(int argc, char *argv[])
{
    int i, numprocs, rank, size, align_size;
    int skip;
    double latency = 0.0, t_start = 0.0, t_stop = 0.0;
    double timer=0.0;
    double avg_time = 0.0, max_time = 0.0, min_time = 0.0;
    float *sendbuf, *recvbuf;
    int *recvcounts;
    int po_ret;
    size_t bufsize;

    set_header(HEADER);
    set_benchmark_name("osu_scatter");
    enable_accel_support();
    po_ret = process_options(argc, argv);

    if (po_okay == po_ret && none != options.accel) {
        if (init_accel()) {
            fprintf(stderr, "Error initializing device\n");
            exit(EXIT_FAILURE);
        }
    }

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);

    switch (po_ret) {
        case po_bad_usage:
            print_bad_usage_message(rank);
            MPI_Finalize();
            exit(EXIT_FAILURE);
        case po_help_message:
            print_help_message(rank);
            MPI_Finalize();
            exit(EXIT_SUCCESS);
        case po_version_message:
            print_version_message(rank);
            MPI_Finalize();
            exit(EXIT_SUCCESS);
        case po_okay:
            break;
    }

    if(numprocs < 2) {
        if (rank == 0) {
            fprintf(stderr, "This test requires at least two processes\n");
        }

        MPI_Finalize();
        exit(EXIT_FAILURE);
    }

    if (options.max_message_size > options.max_mem_limit) {
        options.max_message_size = options.max_mem_limit;
    }

    if (allocate_buffer((void**)&recvcounts, numprocs*sizeof(int), none)) {
        fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }

    bufsize = sizeof(float)*(options.max_message_size/sizeof(float));
    if (allocate_buffer((void**)&sendbuf, bufsize, options.accel)) {
        fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }
    set_buffer(sendbuf, options.accel, 1, bufsize);

    bufsize = sizeof(float)*((options.max_message_size/numprocs + 1)/sizeof(float));
    if (allocate_buffer((void**)&recvbuf, bufsize,
                options.accel)) {
        fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }
    set_buffer(recvbuf, options.accel, 0, bufsize);

    print_preamble(rank);

    for(size=1; size*sizeof(float)<= options.max_message_size; size *= 2) {

        if(size > LARGE_MESSAGE_SIZE) {
            skip = SKIP_LARGE;
            options.iterations = options.iterations_large;
        } else {
            skip = SKIP;
        }

        int portion=0, remainder=0;
        portion=size/numprocs;
        remainder=size%numprocs;

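        /* Block distribution for MPI_Reduce_scatter: every rank gets
           floor(size/numprocs) elements and the first size%numprocs ranks take
           one extra; when size < numprocs, only the first `size` ranks receive
           a single element each. */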
        for (i=0; i<numprocs; i++){
            recvcounts[i]=0;
            if(size<numprocs){ 
                if(i<size)
                    recvcounts[i]=1;
            }
            else{
                if((remainder!=0) && (i<remainder)){
                    recvcounts[i]+=1;
                }
                recvcounts[i]+=portion;
            }
        }
        MPI_Barrier(MPI_COMM_WORLD);
        
        timer=0.0;
        for(i=0; i < options.iterations + skip ; i++) {
            t_start = MPI_Wtime();

            MPI_Reduce_scatter( sendbuf, recvbuf, recvcounts, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD );
            t_stop=MPI_Wtime();
            if (i >= skip) {
                timer += t_stop - t_start;
            }
            MPI_Barrier(MPI_COMM_WORLD);  
        }
        latency = (double)(timer * 1e6) / options.iterations;

        MPI_Reduce(&latency, &min_time, 1, MPI_DOUBLE, MPI_MIN, 0,
                MPI_COMM_WORLD);
        MPI_Reduce(&latency, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0,
                MPI_COMM_WORLD);
        MPI_Reduce(&latency, &avg_time, 1, MPI_DOUBLE, MPI_SUM, 0,
                MPI_COMM_WORLD);
        avg_time = avg_time/numprocs;

        print_stats(rank, size, avg_time, min_time, max_time);
        MPI_Barrier(MPI_COMM_WORLD);
    }

    free_buffer(recvcounts, none);
    free_buffer(sendbuf, options.accel);
    free_buffer(recvbuf, options.accel);

    MPI_Finalize();

    if (none != options.accel) {
        if (cleanup_accel()) {
            fprintf(stderr, "Error cleaning up device\n");
            exit(EXIT_FAILURE);
        }
    }

    return EXIT_SUCCESS;
}
Beispiel #21
0
int main (int argc, char *argv[])
{
    SYNC        sync_type=FLUSH; 
    int         rank,nprocs;
   
    int         page_size;
    int         po_ret = po_okay;
    WINDOW      win_type=WIN_ALLOCATE;
 
    MPI_CHECK(MPI_Init(&argc, &argv));
    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &nprocs));
    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank));

    if(nprocs != 2) {
        if(rank == 0) {
            fprintf(stderr, "This test requires exactly two processes\n");
        }

        MPI_CHECK(MPI_Finalize());

        return EXIT_FAILURE;
    }

    po_ret = process_options(argc, argv, &win_type, &sync_type, rank);
    switch (po_ret) {
        case po_bad_usage:
            print_help_message(rank);
            MPI_CHECK(MPI_Finalize());
            return EXIT_FAILURE;
        case po_help_message:
            print_help_message(rank);
            MPI_CHECK(MPI_Finalize());
            return EXIT_SUCCESS;
    }

    page_size = getpagesize();
    assert(page_size <= MAX_ALIGNMENT);
    
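    /* Align each statically allocated buffer to a page boundary:
       (addr + page_size - 1) / page_size * page_size rounds addr up to the
       next multiple of page_size. */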
    sbuf =
        (char *) (((unsigned long) sbuf_original + (page_size - 1)) /
                page_size * page_size);
    memset(sbuf, 0, MAX_SIZE);

    rbuf =
        (char *) (((unsigned long) rbuf_original + (page_size - 1)) /
                page_size * page_size);
    memset(rbuf, 0, MAX_SIZE);

    cbuf =
        (char *) (((unsigned long) cbuf_original + (page_size - 1)) /
                page_size * page_size);
    memset(cbuf, 0, MAX_SIZE);

    print_header(rank, win_type, sync_type);

    switch (sync_type){
        case LOCK:
            run_get_acc_with_lock(rank, win_type);
            break;
        case LOCK_ALL:
            run_get_acc_with_lock_all(rank, win_type);
            break;
        case PSCW:
            run_get_acc_with_pscw(rank, win_type);
            break;
        case FLUSH_LOCAL:
            run_get_acc_with_flush_local(rank, win_type);
            break;
        case FENCE: 
            run_get_acc_with_fence(rank, win_type);
            break;
        default: 
            run_get_acc_with_flush(rank, win_type);
            break;
    }

    MPI_CHECK(MPI_Finalize());

    return EXIT_SUCCESS;
}
Beispiel #22
0
int
hypre_CommPkgCreate( hypre_CommInfo   *comm_info,
                     hypre_BoxArray   *send_data_space,
                     hypre_BoxArray   *recv_data_space,
                     int               num_values,
                     MPI_Comm          comm,
                     hypre_CommPkg   **comm_pkg_ptr )
{
   int                   ierr = 0;

   hypre_BoxArrayArray  *send_boxes;
   hypre_BoxArrayArray  *recv_boxes;
   hypre_IndexRef        send_stride;
   hypre_IndexRef        recv_stride;
   int                 **send_processes;
   int                 **recv_processes;
   int                 **send_rboxnums;
   hypre_BoxArrayArray  *send_rboxes;

   hypre_CommPkg        *comm_pkg;
   hypre_CommType       *comm_types;
   hypre_CommType       *comm_type;
   hypre_CommEntryType  *ct_entries;
   int                  *ct_loc_boxnums;
   int                  *ct_rem_boxnums;
   hypre_Box            *ct_loc_boxes;
   hypre_Box            *ct_rem_boxes;
   int                  *p_comm_types;
   int                   num_comms, num_entries, comm_bufsize;

   hypre_BoxArray       *box_array;
   hypre_Box            *box;
   hypre_BoxArray       *rbox_array;
   hypre_Box            *rbox;
   hypre_Box            *data_box;
   int                  *data_offsets;
   int                   data_offset;
                        
   int                   i, j, k, p, m, size;
   int                   num_procs, my_proc;
                        
   /*------------------------------------------------------
    * Extract the communication pattern from comm_info
    *------------------------------------------------------*/

   send_boxes     = hypre_CommInfoSendBoxes(comm_info);
   recv_boxes     = hypre_CommInfoRecvBoxes(comm_info);
   send_stride    = hypre_CommInfoSendStride(comm_info);
   recv_stride    = hypre_CommInfoRecvStride(comm_info);
   send_processes = hypre_CommInfoSendProcesses(comm_info);
   recv_processes = hypre_CommInfoRecvProcesses(comm_info);
   send_rboxnums  = hypre_CommInfoSendRBoxnums(comm_info);
   send_rboxes    = hypre_CommInfoSendRBoxes(comm_info);

   MPI_Comm_size(comm, &num_procs );
   MPI_Comm_rank(comm, &my_proc );

   /*------------------------------------------------------
    * Set up various entries in CommPkg
    *------------------------------------------------------*/

   comm_pkg = hypre_CTAlloc(hypre_CommPkg, 1);

   hypre_CommPkgComm(comm_pkg)       = comm;
   hypre_CommPkgFirstSend(comm_pkg)  = 1;
   hypre_CommPkgFirstRecv(comm_pkg)  = 1;
   hypre_CommPkgNumValues(comm_pkg)  = num_values;
   hypre_CopyIndex(send_stride, hypre_CommPkgSendStride(comm_pkg));
   hypre_CopyIndex(recv_stride, hypre_CommPkgRecvStride(comm_pkg));

   /*------------------------------------------------------
    * Set up send CommType information
    *------------------------------------------------------*/

   p_comm_types = hypre_CTAlloc(int, num_procs);

   /* set send_data_offsets and send_data_space */
   data_offsets = hypre_TAlloc(int, hypre_BoxArraySize(send_data_space));
   data_offset = 0;
   hypre_ForBoxI(i, send_data_space)
      {
         data_offsets[i] = data_offset;
         data_box = hypre_BoxArrayBox(send_data_space, i);
         data_offset += hypre_BoxVolume(data_box) * num_values;
      }
Beispiel #23
0
int main()
{

	MPI_Init(NULL,NULL);

	MPI_Comm comm;
	comm = MPI_COMM_WORLD;

	int i, j, rank;
	int data1[N][N], data2[N][N];
	double start, finish, time;

	MPI_Comm_rank(comm, &rank);
	MPI_Status status;
  MPI_Datatype mcols, mrows;

 /*
  * "cols" and "rows" follow matrix notation here, not C array notation:
  * in this convention the matrix is stored contiguously by column (a C
  * array row corresponds to a matrix column).
  */
  MPI_Type_contiguous(M*N, MPI_INT, &mcols);
  MPI_Type_vector(N, M, N,  MPI_INT, &mrows);

  MPI_Type_commit(&mcols);
  MPI_Type_commit(&mrows);
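  /*
   * mcols spans M*N contiguous ints (M whole "matrix columns"); mrows picks
   * N blocks of M ints at stride N, so starting from &data1[0][3] it covers
   * elements [i][3..3+M-1] for every i -- a band of M adjacent "matrix rows".
   */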

  for(i=0; i<N; ++i)
  {
    for(j=0; j<N; ++j)
    {
      data1[i][j] = j+(i*N);
      data2[i][j] = rank;
    }
  }

 start = MPI_Wtime();
	
	for(i=0; i<1; ++i)
	{
		if(rank == 0)
		{
			MPI_Ssend(&data1[0][3], 1, mrows, 1, 0, comm);
			MPI_Recv(&data1[0][3], 1, mrows, 1, 0, comm, &status);
		}

		if(rank == 1)
		{
			MPI_Recv(&data2[0][3], 1, mrows, 0, 0, comm, &status); 
      printf("I am rank 1\n");
      for(i=0; i<N; ++i)
      {
        for(j=0; j<N; ++j)
        {
          printf("%d ", data2[i][j]);
        }
        printf("\n");
      }
			MPI_Ssend(&data2[0][3], 1, mrows, 0, 0, comm);
		}
	}

	finish = MPI_Wtime();

	time = finish - start;

  printf("\n");
	if(rank == 0)
	{
    for(i=0; i<N; ++i)
    {
      for(j=0; j<N; ++j)
      {
		    printf("%d ", data1[i][j]);
      }
      printf("\n");
    }
		printf("time taken: %f\n", time);
	}

	MPI_Finalize();

	return 0;
}
Beispiel #24
0
int main(int argc, char *argv[])
{
    int rc, i, done, do_put_answer, work_unit_size;
    int my_world_rank, nranks, num_work_units, num_answers, provided;
    int work_prio, work_type, work_handle[PP_HANDLE_SIZE], work_len, answer_rank;
    int *num_handled_by_rank, num_handled_by_me;
    int dbgprintf_flag = 1, use_prio_for_reserve_flag = 0;
  
    int req_types[4];
    int num_types = 2;
    int type_vect[4] = {WORK,ANSWER};
    int num_types_in_req;

    char thread_type[32];
    char *work_unit_buf;

    double temptime, time_for_fake_work;
    double start_job_time, end_put_time, start_work_time, end_work_time;
    double total_work_time, total_loop_time;
    double total_reserve_time, total_get_time;

    do_put_answer      = DEFAULT_DO_PUT_ANSWER;  /* will halt by exhaustion after 5 secs */
    work_unit_size     = DEFAULT_WORK_UNIT_SIZE;
    num_work_units     = DEFAULT_NUM_WORK_UNITS;
    time_for_fake_work = DEFAULT_NSECS_FAKE_WORK;
    total_work_time = 0.0;
    total_loop_time = 0.0;
    total_reserve_time = 0.0;
    total_get_time = 0.0;

    for (i=1; i < argc; i++)
    {        
        // printf("av %s\n",argv[i]);
        if (strcmp(argv[i],"-a") == 0)
            do_put_answer = 1;
        else if (strcmp(argv[i],"-n") == 0)
            num_work_units = atoi(argv[++i]);
        else if (strcmp(argv[i],"-s") == 0)
            work_unit_size = atoi(argv[++i]);
        else if (strcmp(argv[i],"-t") == 0)
            time_for_fake_work = atof(argv[++i]);
        else
        {
            printf("st1: unrecognized cmd-line arg at %d :%s:\n",i,argv[i]);
            exit(-1);
        }
    }

    rc = MPI_Init_thread(NULL,NULL,MPI_THREAD_MULTIPLE,&provided);
    if (rc != MPI_SUCCESS)
    {
        printf("st1: MPI_Init_thread failed with rc=%d\n",rc);
        exit(-1);
    }
    switch (provided)
    {
        case MPI_THREAD_SINGLE: strcpy(thread_type,"MPI_THREAD_SINGLE"); break;
        case MPI_THREAD_FUNNELED: strcpy(thread_type,"MPI_THREAD_FUNNELED"); break;
        case MPI_THREAD_SERIALIZED: strcpy(thread_type,"MPI_THREAD_SERIALIZED"); break;
        case MPI_THREAD_MULTIPLE: strcpy(thread_type,"MPI_THREAD_MULTIPLE"); break;
        default: strcpy(thread_type,"UNKNOWN"); break;
    }
    printf("st1: MPI provides %s\n",thread_type);
    MPI_Comm_size(MPI_COMM_WORLD,&nranks);
    MPI_Comm_rank(MPI_COMM_WORLD,&my_world_rank);

    num_handled_by_me = 0;
    if (my_world_rank == 0)
        num_handled_by_rank = malloc(nranks * sizeof(int));
    else
        num_handled_by_rank = NULL;
  
    work_unit_buf = malloc(work_unit_size);
  
    rc = PP_Init(SRVR_MAX_MALLOC_AMT,num_types,type_vect);
  
    rc = MPI_Barrier( MPI_COMM_WORLD );
    start_job_time = MPI_Wtime();
    end_work_time  = MPI_Wtime();  /* dummy val until set below */
  
    if ( my_world_rank == 0 )  /* if master app, put work */
    {
        num_answers = 0;
        for (i=0; i < num_work_units; i++)
        {
            memset(work_unit_buf,'X',work_unit_size);
            if (work_unit_size >= 18)
                sprintf(work_unit_buf,"workunit %d",i);
            rc = PP_Put( work_unit_buf, work_unit_size, WORK, -1, -1, work_handle );
            // dbgprintf( 1, "put work_unit %d  rc %d\n", i, rc );
        }
        // dbgprintf(1,"st1: all work submitted after %f secs\n",MPI_Wtime()-start_job_time);
        printf("st1: all work submitted after %f secs\n",MPI_Wtime()-start_job_time);
    }
    rc = MPI_Barrier( MPI_COMM_WORLD );
    end_put_time = start_work_time = MPI_Wtime();
  
    done = 0;
    while ( !done )
    {
        if (do_put_answer)
        {
            if (my_world_rank == 0)
            {
                req_types[0] = ANSWER;
                req_types[1] = WORK;
                num_types_in_req = 2;
            }
            else
            {
                req_types[0] = WORK;
                num_types_in_req = 1;
            }
        }
        else
        {
            num_types_in_req = 0;
        }
        // dbgprintf( 1, "st1: reserving work\n" );
        temptime = MPI_Wtime();
        rc = PP_FindAndReserve(num_types_in_req,req_types,&work_len,
                               &work_type,&answer_rank,work_handle);
        // dbgprintf( 1, "st1: after reserve rc %d len %d type %d\n", rc, work_len, work_type );
        if ( rc == PP_EXHAUSTION )
        {
            // dbgprintf( 1, "st1: done by exhaustion\n" );
            printf( "st1: done by exhaustion\n" );
            break;
        }
        else if ( rc == PP_NO_MORE_WORK )
        {
            // dbgprintf( 1, "st1: done by no more work\n" );
            printf( "st1: done by no more work\n" );
            break;
        }
        else if (rc < 0)
        {
            // dbgprintf( 1, "st1: ** reserve failed, rc = %d\n", rc );
            printf( "st1: ** reserve failed, rc = %d\n", rc );
            ADLB_Abort(-1);
        }
        else if (work_type == WORK) 
        {
            total_reserve_time += MPI_Wtime() - temptime;  /* only count for work */
            temptime = MPI_Wtime();
            rc = PP_Get( work_unit_buf, work_handle );
            total_get_time += MPI_Wtime() - temptime;
            if (rc == PP_NO_MORE_WORK)
            {
                // dbgprintf( 1, "st1: no more work on get_reserved\n" );
                printf( "st1: no more work on get_reserved\n" );
                break;
            }
            else   /* got good work */
            {
                /* do dummy/fake work */
                num_handled_by_me++;
                if (time_for_fake_work == 0.0)
                {
                    // dbgprintf(1,"st1: worktime 0.0\n");
                }
                else
                {
                    temptime = MPI_Wtime();
                    while (1)
                    {
                        for (i=0; i < 1000000; i++)
                            ;
                        if (MPI_Wtime()-temptime > time_for_fake_work)
                            break;
                    }
                    // dbgprintf(1,"st1: worktime %f\n",MPI_Wtime()-temptime);
                }
                if (do_put_answer)
                {
                    /* The original passed an undeclared `handle`; the reserved
                       work_handle is the only handle in scope. */
                    rc = PP_Put( NULL, 0, ANSWER, -1, 0, work_handle );
                }
            }
            end_work_time = MPI_Wtime();  /* chgs on each work unit */
        }
        else if ( work_type == ANSWER) 
        {
            num_answers++;
            // dbgprintf(1111,"GENBATCH: GOT ANSWER %d\n",num_answers);
            if (num_answers >= num_work_units)
                PP_Set_problem_done();
        }
        else
        {
            // dbgprintf( 1, "st1: ** unexpected work type %d\n", work_type );
            printf( "st1: ** unexpected work type %d\n", work_type );
            PP_Abort( -1 );
        }
    }
    rc = MPI_Barrier( MPI_COMM_WORLD );
    // total_loop_time can be misleading since we have to wait for exhaustion
    // total_loop_time = MPI_Wtime() - start_work_time;
    // dbgprintf(1,"st1: total loop time %f\n",total_loop_time);
    /****
    total_work_time = end_work_time - start_work_time;
    dbgprintf(1,"st1: num handled by me %d\n",num_handled_by_me);
    dbgprintf(1,"st1: last end_work_time %f\n",end_work_time);
    dbgprintf(1,"st1: total work_time %f ; avg work time %f\n",
            total_work_time,total_work_time/((float)num_handled_by_me));
    dbgprintf(1,"st1: total reserve time %f ; avg reserve time %f\n",
            total_reserve_time,total_reserve_time/((float)num_handled_by_me));
    dbgprintf(1,"st1: total get time %f ; avg get time %f\n",
            total_get_time,total_get_time/((float)num_handled_by_me));
    ****/
    printf("st1: num handled by me %d\n",num_handled_by_me);
    printf("st1: last end_work_time %f\n",end_work_time);
    printf("st1: total work_time %f ; avg work time %f\n",
            total_work_time,total_work_time/((float)num_handled_by_me));
    printf("st1: total reserve time %f ; avg reserve time %f\n",
            total_reserve_time,total_reserve_time/((float)num_handled_by_me));
    printf("st1: total get time %f ; avg get time %f\n",
            total_get_time,total_get_time/((float)num_handled_by_me));
    MPI_Gather(&num_handled_by_me,1,MPI_INT,
               num_handled_by_rank,1,MPI_INT,
               0,MPI_COMM_WORLD);
    if (my_world_rank == 0)
    {
        for (i=0; i < nranks; i++)
            // dbgprintf(1,"st1: num handled by rank %d : total %d  per sec %.0f\n",
            printf("st1: num handled by rank %d : total %d  per sec %.0f\n",
                   i,num_handled_by_rank[i],
                   ((float)num_handled_by_rank[i])/total_work_time);
    }

    PP_Finalize();
    // printf("st1: calling mpi_finalize\n");
    rc = MPI_Finalized(&i);
    if ( ! i)
        MPI_Finalize();
    // printf("st1: past mpi_finalize\n");
  
    return 0;
}
Beispiel #25
0
int main(int argc, char** argv)
{

  int	   Numprocs, MyRank;
  int 	   NoofCols, NoofRows, VectorSize, ScatterSize;
  int	   index, irow, icol, iproc;
  int	   Root = 0, ValidOutput = 1;
  float    **Matrix, *Buffer, *Mybuffer, *Vector,
	   *MyFinalVector, *FinalVector;
  float    *CheckResultVector;
  FILE	   *fp;
  int	   MatrixFileStatus = 1, VectorFileStatus = 1;
  double    start, end, startcomm, endcomm, commtime, totalcomm;


  /* ........MPI Initialisation .......*/

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &MyRank);
  MPI_Comm_size(MPI_COMM_WORLD, &Numprocs);


  if(MyRank == 0)
  {
    //creating matrix
    start = MPI_Wtime();
    VectorSize = 1024*32*4;
    NoofCols = VectorSize;
    NoofRows = VectorSize;

    Matrix = (float **)malloc(NoofRows*sizeof(float *));
    for(irow=0 ;irow<NoofRows; irow++){
      Matrix[irow] = (float *)malloc(NoofCols*sizeof(float));
      for(icol=0; icol<NoofCols; icol++) {
 	       Matrix[irow][icol] = 1;
      }
    }

    Buffer = (float *)malloc(NoofRows*NoofCols*sizeof(float));
    index = 0;
    for(irow=0; irow< NoofRows; irow++){
      for(icol=0; icol< NoofCols; icol++) {
        Buffer[index] = Matrix[irow][icol];
        index++;
      }
    }

    //creating vector

    Vector = (float*)malloc(VectorSize*sizeof(float));
    for(index = 0; index<VectorSize; index++)
         Vector[index]=1;

   }  /* end  of if myrank = 0 */


   MPI_Barrier(MPI_COMM_WORLD);

   MPI_Bcast (&MatrixFileStatus, 1, MPI_INT, Root, MPI_COMM_WORLD);
   if(MatrixFileStatus == 0) {
      if(MyRank == Root)
	 printf("Can't open input file for Matrix ..... \n");
      MPI_Finalize();
      exit(-1);
   }

   MPI_Bcast (&VectorFileStatus, 1, MPI_INT, Root, MPI_COMM_WORLD);
   if(VectorFileStatus == 0) {
      if(MyRank == Root)
	 printf("Can't open input file for Vector ..... \n");
      MPI_Finalize();
      exit(-1);
   }

   MPI_Bcast(&NoofRows, 1, MPI_INT, Root, MPI_COMM_WORLD);

   if(NoofRows < Numprocs) {
      if(MyRank == 0)
         printf("No of Rows should be more than No of Processors ... \n");
      MPI_Finalize();
      exit(0);
   }

   if(NoofRows % Numprocs != 0) {
      if(MyRank == 0)
	 printf("Matrix Can not be Striped Evenly ..... \n");
      MPI_Finalize();
      exit(0);
   }

   MPI_Bcast(&NoofCols, 1, MPI_INT, Root, MPI_COMM_WORLD);
   MPI_Bcast(&VectorSize, 1, MPI_INT, Root, MPI_COMM_WORLD);

   if(VectorSize != NoofCols){
      if(MyRank == 0){
         printf("Invalid input data..... \n");
	 printf("NoofCols should be equal to VectorSize\n");
      }
      MPI_Finalize();
      exit(0);
   }

   if(MyRank != 0)
      Vector = (float *)malloc(VectorSize*sizeof(float));
   startcomm = MPI_Wtime();
   MPI_Bcast(Vector, VectorSize, MPI_FLOAT, Root, MPI_COMM_WORLD);
   commtime = MPI_Wtime() - startcomm;

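   /* Row-striped decomposition: each rank gets ScatterSize = NoofRows/Numprocs
      contiguous rows of the flattened matrix and multiplies them against the
      vector broadcast above. */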
   ScatterSize = NoofRows / Numprocs;
   Mybuffer = (float *)malloc(ScatterSize * NoofCols * sizeof(float));
   startcomm = MPI_Wtime();
   MPI_Scatter( Buffer, ScatterSize * NoofCols, MPI_FLOAT, Mybuffer,
		ScatterSize * NoofCols, MPI_FLOAT, 0, MPI_COMM_WORLD);
   commtime += MPI_Wtime() - startcomm;

   MyFinalVector = (float *)malloc(ScatterSize*sizeof(float));

   for(irow = 0 ; irow < ScatterSize ; irow++) {
       MyFinalVector[irow] = 0;
       index = irow * NoofCols;
       for(icol = 0; icol < NoofCols; icol++)
	   MyFinalVector[irow] += (Mybuffer[index++] * Vector[icol]);
   }

   if( MyRank == 0)
      FinalVector = (float *)malloc(NoofRows*sizeof(float));
   startcomm = MPI_Wtime();
   MPI_Gather( MyFinalVector, ScatterSize, MPI_FLOAT, FinalVector,
	       ScatterSize, MPI_FLOAT, Root, MPI_COMM_WORLD);
   commtime += MPI_Wtime() - startcomm;

   MPI_Reduce(&commtime, &totalcomm, 1, MPI_DOUBLE, MPI_MAX, Root, MPI_COMM_WORLD);

   if( MyRank == 0) {
     end = MPI_Wtime();

     printf("Processor : %d\n",Numprocs);
     printf("Size : %d\n",VectorSize);
     printf("All Time : %f\n",end-start);
     printf("Comm Time : %f\n",totalcomm);
     //printf("Comm Time : %f\n",totalcomm);
     /*
      printf ("\n");
      printf(" --------------------------------------------------- \n");
      printf("Results of Gathering data  %d: \n", MyRank);
      printf("\n");

      for(index = 0; index < NoofRows; index++)
        printf(" FinalVector[%d] = %f \n", index, FinalVector[index]);
      printf(" --------------------------------------------------- \n");
      */
   }

   if(MyRank == 0){
      CheckResultVector = (float *)malloc(NoofRows*sizeof(float));
      for(irow = 0 ; irow < NoofRows ; irow++) {
	  CheckResultVector[irow] = 0;
	  for(icol = 0; icol < NoofCols; icol++){
	      CheckResultVector[irow] += (Matrix[irow][icol]*Vector[icol]);
	  }
	  if(fabs((double)(FinalVector[irow]-CheckResultVector[irow])) > 1.0E-10){
	     printf("Error %d\n",irow);
	     ValidOutput = 0;
	  }
       }
       free(CheckResultVector);
       if(ValidOutput)
	  printf("\n-------Correct Result------\n");
   }

   MPI_Finalize();

   return 0;
}
Beispiel #26
0
int main(int argc, char **argv)
{
    int provided;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided);
    assert(provided==MPI_THREAD_SINGLE);

    int me;
    int nproc;
    MPI_Comm_rank(MPI_COMM_WORLD,&me);
    MPI_Comm_size(MPI_COMM_WORLD,&nproc);

    int status;
    double t0,t1,t2,t3,t4,t5;
    double tt0,tt1,tt2,tt3,tt4;

    int bufSize = ( argc>1 ? atoi(argv[1]) : 1000000 );
    if (me==0) printf("%d: bufSize = %d doubles\n",me,bufSize);

    /* allocate RMA buffers for windows */
    double* m1;
    double* m2;
    status = MPI_Alloc_mem(bufSize * sizeof(double), MPI_INFO_NULL, &m1);
    status = MPI_Alloc_mem(bufSize * sizeof(double), MPI_INFO_NULL, &m2);

    /* register remote pointers */
    MPI_Win w1;
    MPI_Win w2;
    status = MPI_Win_create(m1, bufSize * sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &w1);
    status = MPI_Win_create(m2, bufSize * sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &w2);
    MPI_Barrier(MPI_COMM_WORLD);

    /* allocate RMA buffers */
    double* b1;
    double* b2;
    status = MPI_Alloc_mem(bufSize * sizeof(double), MPI_INFO_NULL, &b1);
    status = MPI_Alloc_mem(bufSize * sizeof(double), MPI_INFO_NULL, &b2);

    /* initialize buffers */
    int i;
    for (i=0;i<bufSize;i++) b1[i]=1.0*me;
    for (i=0;i<bufSize;i++) b2[i]=-1.0;

    status = MPI_Win_fence( MPI_MODE_NOPRECEDE | MPI_MODE_NOSTORE , w1 );
    status = MPI_Win_fence( MPI_MODE_NOPRECEDE | MPI_MODE_NOSTORE , w2);
    status = MPI_Put(b1, bufSize, MPI_DOUBLE, me, 0, bufSize, MPI_DOUBLE, w1);
    status = MPI_Put(b2, bufSize, MPI_DOUBLE, me, 0, bufSize, MPI_DOUBLE, w2);
    status = MPI_Win_fence( MPI_MODE_NOSTORE , w1);
    status = MPI_Win_fence( MPI_MODE_NOSTORE , w2);

    int target;
    int j;
    double dt,bw;
    MPI_Barrier(MPI_COMM_WORLD);
    if (me==0){
        printf("MPI_Get performance test for buffer size = %d doubles\n",bufSize);
        printf("  jump    host   target       get (s)       BW (MB/s)\n");
        printf("===========================================================\n");
        fflush(stdout);
    }
    MPI_Barrier(MPI_COMM_WORLD);
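    /* Ring traversal: on pass j each rank locks and reads from (me+j) % nproc,
       so every rank times a lock/get/unlock epoch against every rank,
       including itself at j = 0. */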
    for (j=0;j<nproc;j++){
        target = (me+j) % nproc;
        MPI_Barrier(MPI_COMM_WORLD);
        t0 = MPI_Wtime();
        status = MPI_Win_lock(MPI_LOCK_EXCLUSIVE, target, MPI_MODE_NOCHECK, w1);
        t1 = MPI_Wtime();
        status = MPI_Get(b2, bufSize, MPI_DOUBLE, target, 0, bufSize, MPI_DOUBLE, w1);
        t2 = MPI_Wtime();
        status = MPI_Win_unlock(target, w1);
        t3 = MPI_Wtime();
        for (i=0;i<bufSize;i++) assert( b2[i]==(1.0*target) );
        dt = t3 - t0;
        bw = (double)bufSize*sizeof(double)*(1e-6)/dt;
        printf("%4d     %4d     %4d       %9.6f     %9.3f\n",j,me,target,dt,bw);
        fflush(stdout);
    }
    MPI_Barrier(MPI_COMM_WORLD);

    status = MPI_Win_free(&w2);
    status = MPI_Win_free(&w1);

    status = MPI_Free_mem(b2);
    status = MPI_Free_mem(b1);

    status = MPI_Free_mem(m2);
    status = MPI_Free_mem(m1);

    MPI_Barrier(MPI_COMM_WORLD);

    if (me==0) printf("%d: MPI_Finalize\n",me);
    MPI_Finalize();

    return(0);
}
Beispiel #27
0
PetscErrorCode MatStashCreate_Private(MPI_Comm comm,PetscInt bs,MatStash *stash)
{
  PetscErrorCode ierr;
  PetscInt       max,*opt,nopt,i;
  PetscBool      flg;

  PetscFunctionBegin;
  /* Require 2 tags,get the second using PetscCommGetNewTag() */
  stash->comm = comm;

  ierr = PetscCommGetNewTag(stash->comm,&stash->tag1);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(stash->comm,&stash->tag2);CHKERRQ(ierr);
  ierr = MPI_Comm_size(stash->comm,&stash->size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(stash->comm,&stash->rank);CHKERRQ(ierr);
  ierr = PetscMalloc1(2*stash->size,&stash->flg_v);CHKERRQ(ierr);
  for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1;


  nopt = stash->size;
  ierr = PetscMalloc1(nopt,&opt);CHKERRQ(ierr);
  ierr = PetscOptionsGetIntArray(NULL,NULL,"-matstash_initial_size",opt,&nopt,&flg);CHKERRQ(ierr);
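  /* -matstash_initial_size accepts a single value (applied on every rank),
     one value per rank, or a shorter list (ranks past the end of the list
     fall back to the default of 0). */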
  if (flg) {
    if (nopt == 1)                max = opt[0];
    else if (nopt == stash->size) max = opt[stash->rank];
    else if (stash->rank < nopt)  max = opt[stash->rank];
    else                          max = 0; /* Use default */
    stash->umax = max;
  } else {
    stash->umax = 0;
  }
  ierr = PetscFree(opt);CHKERRQ(ierr);
  if (bs <= 0) bs = 1;

  stash->bs         = bs;
  stash->nmax       = 0;
  stash->oldnmax    = 0;
  stash->n          = 0;
  stash->reallocs   = -1;
  stash->space_head = 0;
  stash->space      = 0;

  stash->send_waits  = 0;
  stash->recv_waits  = 0;
  stash->send_status = 0;
  stash->nsends      = 0;
  stash->nrecvs      = 0;
  stash->svalues     = 0;
  stash->rvalues     = 0;
  stash->rindices    = 0;
  stash->nprocessed  = 0;
  stash->reproduce   = PETSC_FALSE;
  stash->blocktype   = MPI_DATATYPE_NULL;

  ierr = PetscOptionsGetBool(NULL,NULL,"-matstash_reproduce",&stash->reproduce,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsGetBool(NULL,NULL,"-matstash_bts",&flg,NULL);CHKERRQ(ierr);
  if (flg) {
    stash->ScatterBegin   = MatStashScatterBegin_BTS;
    stash->ScatterGetMesg = MatStashScatterGetMesg_BTS;
    stash->ScatterEnd     = MatStashScatterEnd_BTS;
    stash->ScatterDestroy = MatStashScatterDestroy_BTS;
  } else {
    stash->ScatterBegin   = MatStashScatterBegin_Ref;
    stash->ScatterGetMesg = MatStashScatterGetMesg_Ref;
    stash->ScatterEnd     = MatStashScatterEnd_Ref;
    stash->ScatterDestroy = NULL;
  }
  PetscFunctionReturn(0);
}
Beispiel #28
0
int main(int argc,char **args)
{
  Vec            x, b, u;     /* approx solution, RHS, exact solution */
  Mat            A;           /* linear system matrix */
  KSP            ksp;         /* linear solver context */
  PC             pc;          /* preconditioner context */
  PetscReal      norm;        /* norm of solution error */
  PetscErrorCode ierr;
  PetscInt       i,n = 10,col[3],its,rstart,rend,nlocal;
  PetscScalar    neg_one = -1.0,one = 1.0,value[3];
  PetscBool      TEST_PROCEDURAL=PETSC_FALSE;

  PetscInitialize(&argc,&args,(char*)0,help);
  ierr = PetscOptionsGetInt(NULL,"-n",&n,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsGetBool(NULL,"-procedural",&TEST_PROCEDURAL,NULL);CHKERRQ(ierr);

  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
         Compute the matrix and right-hand-side vector that define
         the linear system, Ax = b.
     - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

  /*
     Create vectors.  Note that we form 1 vector from scratch and
     then duplicate as needed. For this simple case let PETSc decide how
     many elements of the vector are stored on each processor. The second
     argument to VecSetSizes() below causes PETSc to decide.
  */
  ierr = VecCreate(PETSC_COMM_WORLD,&x);CHKERRQ(ierr);
  ierr = VecSetSizes(x,PETSC_DECIDE,n);CHKERRQ(ierr);
  ierr = VecSetFromOptions(x);CHKERRQ(ierr);
  ierr = VecDuplicate(x,&b);CHKERRQ(ierr);
  ierr = VecDuplicate(x,&u);CHKERRQ(ierr);

  /* Identify the starting and ending mesh points on each
     processor for the interior part of the mesh. We let PETSc decide
     above. */

  ierr = VecGetOwnershipRange(x,&rstart,&rend);CHKERRQ(ierr);
  ierr = VecGetLocalSize(x,&nlocal);CHKERRQ(ierr);

  /* Create a tridiagonal matrix. See ../tutorials/ex23.c */
  ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
  ierr = MatSetSizes(A,nlocal,nlocal,n,n);CHKERRQ(ierr);
  ierr = MatSetFromOptions(A);CHKERRQ(ierr);
  ierr = MatSetUp(A);CHKERRQ(ierr);
  /* Assemble matrix */
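  /* The interior rows form the classic tridiagonal 1D Laplacian: row i holds
     (-1, 2, -1) in columns (i-1, i, i+1); the first and last rows are set
     separately with the boundary truncated. */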
  if (!rstart) {
    rstart = 1;
    i      = 0; col[0] = 0; col[1] = 1; value[0] = 2.0; value[1] = -1.0;
    ierr   = MatSetValues(A,1,&i,2,col,value,INSERT_VALUES);CHKERRQ(ierr);
  }
  if (rend == n) {
    rend = n-1;
    i    = n-1; col[0] = n-2; col[1] = n-1; value[0] = -1.0; value[1] = 2.0;
    ierr = MatSetValues(A,1,&i,2,col,value,INSERT_VALUES);CHKERRQ(ierr);
  }

  /* Set entries corresponding to the mesh interior */
  value[0] = -1.0; value[1] = 2.0; value[2] = -1.0;
  for (i=rstart; i<rend; i++) {
    col[0] = i-1; col[1] = i; col[2] = i+1;
    ierr   = MatSetValues(A,1,&i,3,col,value,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  /* Set exact solution; then compute right-hand-side vector. */
  ierr = VecSet(u,one);CHKERRQ(ierr);
  ierr = MatMult(A,u,b);CHKERRQ(ierr);

  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                Create the linear solver and set various options
     - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  ierr = KSPCreate(PETSC_COMM_WORLD,&ksp);CHKERRQ(ierr);
  ierr = KSPSetOperators(ksp,A,A,DIFFERENT_NONZERO_PATTERN);CHKERRQ(ierr);

  /*
     Set linear solver defaults for this problem (optional).
     - By extracting the KSP and PC contexts from the KSP context,
       we can then directly call any KSP and PC routines to set
       various options.
     - The following statements are optional; all of these
       parameters could alternatively be specified at runtime via
       KSPSetFromOptions();
  */
  if (TEST_PROCEDURAL) {
    /* Example of runtime options: '-pc_redundant_number 3 -redundant_ksp_type gmres -redundant_pc_type bjacobi' */
    PetscMPIInt size,rank,subsize;
    Mat         A_redundant;
    KSP         innerksp;
    PC          innerpc;
    MPI_Comm    subcomm;

    ierr = KSPGetPC(ksp,&pc);CHKERRQ(ierr);
    ierr = PCSetType(pc,PCREDUNDANT);CHKERRQ(ierr);
    ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
    if (size < 3) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ, "Num of processes %d must be greater than 2",size);
    ierr = PCRedundantSetNumber(pc,size-2);CHKERRQ(ierr);
    ierr = KSPSetFromOptions(ksp);CHKERRQ(ierr);

    /* Get subcommunicator and redundant matrix */
    ierr = KSPSetUp(ksp);CHKERRQ(ierr);
    ierr = PCRedundantGetKSP(pc,&innerksp);CHKERRQ(ierr);
    ierr = KSPGetPC(innerksp,&innerpc);CHKERRQ(ierr);
    ierr = PCGetOperators(innerpc,NULL,&A_redundant,NULL);CHKERRQ(ierr);
    ierr = PetscObjectGetComm((PetscObject)A_redundant,&subcomm);CHKERRQ(ierr); 
    ierr = MPI_Comm_size(subcomm,&subsize);CHKERRQ(ierr);
    if (subsize==1 && !rank) {
      printf("A_redundant:\n");
      ierr = MatView(A_redundant,PETSC_VIEWER_STDOUT_SELF);CHKERRQ(ierr);
    }
  } else {
    ierr = KSPSetFromOptions(ksp);CHKERRQ(ierr);
  }
  
  /*  Solve linear system */
  ierr = KSPSolve(ksp,b,x);CHKERRQ(ierr);

  /* Check the error */
  ierr = VecAXPY(x,neg_one,u);CHKERRQ(ierr);
  ierr = VecNorm(x,NORM_2,&norm);CHKERRQ(ierr);
  ierr = KSPGetIterationNumber(ksp,&its);CHKERRQ(ierr);
  if (norm > 1.e-14) {
    ierr = PetscPrintf(PETSC_COMM_WORLD,"Norm of error %G, Iterations %D\n",norm,its);CHKERRQ(ierr);
  }

  /* Free work space. */
  ierr = VecDestroy(&x);CHKERRQ(ierr); ierr = VecDestroy(&u);CHKERRQ(ierr);
  ierr = VecDestroy(&b);CHKERRQ(ierr); ierr = MatDestroy(&A);CHKERRQ(ierr);
  ierr = KSPDestroy(&ksp);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return 0;
}
Beispiel #29
0
int main(int argc, char ** argv) {
 
  int    Num_procs;       /* number of ranks                                     */
  int    Num_procsx, 
         Num_procsy;      /* number of ranks in each coord direction             */
  int    Num_groupsx, 
         Num_groupsy;     /* number of blocks in each coord direction            */
  int    my_group;        /* sequence number of shared memory block              */
  int    my_group_IDx,
         my_group_IDy;    /* coordinates of block within block grid              */
  int    group_size;      /* number of ranks in shared memory group              */
  int    group_sizex,
         group_sizey;     /* number of ranks in block in each coord direction    */
  int    my_ID;           /* MPI rank                                            */
  int    my_global_IDx, 
         my_global_IDy;   /* coordinates of rank in overall rank grid            */
  int    my_local_IDx, 
         my_local_IDy;    /* coordinates of rank within shared memory block      */
  int    right_nbr;       /* global rank of right neighboring tile               */
  int    left_nbr;        /* global rank of left neighboring tile                */
  int    top_nbr;         /* global rank of top neighboring tile                 */
  int    bottom_nbr;      /* global rank of bottom neighboring tile              */
  int    local_nbr[4];    /* list of synchronizing local neighbors               */
  int    num_local_nbrs;  /* number of synchronizing local neighbors             */
  int    dummy;
  DTYPE *top_buf_out;     /* communication buffer                                */
  DTYPE *top_buf_in;      /*       "         "                                   */
  DTYPE *bottom_buf_out;  /*       "         "                                   */
  DTYPE *bottom_buf_in;   /*       "         "                                   */
  DTYPE *right_buf_out;   /*       "         "                                   */
  DTYPE *right_buf_in;    /*       "         "                                   */
  DTYPE *left_buf_out;    /*       "         "                                   */
  DTYPE *left_buf_in;     /*       "         "                                   */
  int    root = 0;
  long   n, width, height;/* linear global and block grid dimension              */
  int    width_rank, 
         height_rank;     /* linear local dimension                              */
  int    i, j, ii, jj, kk, it, jt, iter, leftover;  /* dummies                   */
  int    istart_rank, 
         iend_rank;       /* bounds of grid tile assigned to calling rank        */
  int    jstart_rank, 
         jend_rank;       /* bounds of grid tile assigned to calling rank        */
  int    istart, iend;    /* bounds of grid block containing tile                */
  int    jstart, jend;    /* bounds of grid block containing tile                */
  DTYPE  norm,            /* L1 norm of solution                                 */
         local_norm,      /* contribution of calling rank to L1 norm             */
         reference_norm;  /* value to be matched by computed norm                */
  DTYPE  f_active_points; /* interior of grid with respect to stencil            */
  DTYPE  flops;           /* floating point ops per iteration                    */
  int    iterations;      /* number of times to run the algorithm                */
  double local_stencil_time,/* timing parameters                                 */
         stencil_time,
         avgtime; 
  int    stencil_size;    /* number of points in stencil                         */
  DTYPE  * RESTRICT in;   /* input grid values                                   */
  DTYPE  * RESTRICT out;  /* output grid values                                  */
  long   total_length_in; /* total required length to store input array          */
  long   total_length_out;/* total required length to store output array         */
  int    error=0;         /* error flag                                          */
  DTYPE  weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil     */
  MPI_Request request[8]; /* requests for sends & receives in 4 coord directions */
  MPI_Status  status[8];  /* corresponding statuses                              */
  MPI_Win shm_win_in;     /* shared memory window object for IN array            */
  MPI_Win shm_win_out;    /* shared memory window object for OUT array           */
  MPI_Comm shm_comm_prep; /* preparatory shared memory communicator              */
  MPI_Comm shm_comm;      /* Shared Memory Communicator                          */
  int shm_procs;          /* # of rankes in shared domain                        */
  int shm_ID;             /* MPI rank in shared memory domain                    */
  MPI_Aint size_in;       /* size of the IN array in shared memory window        */
  MPI_Aint size_out;      /* size of the OUT array in shared memory window       */
  int size_mul;           /* one for shm_comm root, zero for the other ranks     */
  int disp_unit;          /* ignored                                             */
 
  /*******************************************************************************
  ** Initialize the MPI environment
  ********************************************************************************/
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);
 
  /*******************************************************************************
  ** process, test, and broadcast input parameters    
  ********************************************************************************/
 
  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("MPI+SHM stencil execution on 2D grid\n");

#ifndef STAR
      printf("ERROR: Compact stencil not supported\n");
      error = 1;
      goto ENDOFTESTS;
#endif
    
    if (argc != 4){
      printf("Usage: %s  <#ranks per coherence domain><# iterations> <array dimension> \n", 
             *argv);
      error = 1;
      goto ENDOFTESTS;
    }
 
    group_size = atoi(*++argv);
    if (group_size < 1) {
      printf("ERROR: # ranks per coherence domain must be >= 1 : %d \n",group_size);
      error = 1;
      goto ENDOFTESTS;
    } 
    if (Num_procs%group_size) {
      printf("ERROR: total # %d ranks not divisible by ranks per coherence domain %d\n",
	     Num_procs, group_size);
      error = 1;
      goto ENDOFTESTS;
    } 

    iterations  = atoi(*++argv); 
    if (iterations < 0){
      printf("ERROR: iterations must be >= 0 : %d \n",iterations);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    n  = atol(*++argv);
    long nsquare = n * n;
    if (nsquare < Num_procs){ 
      printf("ERROR: grid size must be at least # ranks: %ld\n", nsquare);
      error = 1;
      goto ENDOFTESTS;
    }
 
    if (RADIUS < 0) {
      printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    if (2*RADIUS +1 > n) {
      printf("ERROR: Stencil radius %d exceeds grid size %ld\n", RADIUS, n);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    ENDOFTESTS:;  
  }
  bail_out(error);

  MPI_Bcast(&n,          1, MPI_LONG, root, MPI_COMM_WORLD);
  MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&group_size, 1, MPI_INT, root, MPI_COMM_WORLD);
 
  /* determine best way to create a 2D grid of ranks (closest to square, for 
     best surface/volume ratio); we do this brute force for now. The 
     decomposition needs to be such that shared memory groups can evenly
     tessellate the rank grid
  */
  for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) {
    if (!(Num_procs%Num_procsx)) {
      Num_procsy = Num_procs/Num_procsx;
      for (group_sizex=(int)(sqrt(group_size+1)); group_sizex>0; group_sizex--) {
        if (!(group_size%group_sizex) && !(Num_procsx%group_sizex)) {
          group_sizey=group_size/group_sizex;
          break;
        }
      }
      if (!(Num_procsy%group_sizey)) break;
    }
  }      


  if (my_ID == root) {
    printf("Number of ranks                 = %d\n", Num_procs);
    printf("Grid size                       = %ld\n", n);
    printf("Radius of stencil               = %d\n", RADIUS);
    printf("Tiles in x/y-direction          = %d/%d\n", Num_procsx, Num_procsy);
    printf("Tiles per shared memory domain  = %d\n", group_size);
    printf("Tiles in x/y-direction in group = %d/%d\n", group_sizex,  group_sizey);
    printf("Type of stencil                 = star\n");
#ifdef LOCAL_BARRIER_SYNCH
    printf("Local synchronization           = barrier\n");
#else
    printf("Local synchronization           = point to point\n");
#endif
#ifdef DOUBLE
    printf("Data type                       = double precision\n");
#else
    printf("Data type                       = single precision\n");
#endif
#if LOOPGEN
    printf("Script used to expand stencil loop body\n");
#else
    printf("Compact representation of stencil loop body\n");
#endif
    printf("Number of iterations            = %d\n", iterations);
  }

  /* Setup for Shared memory regions */

  /* first divide WORLD in groups of size group_size */
  MPI_Comm_split(MPI_COMM_WORLD, my_ID/group_size, my_ID%group_size, &shm_comm_prep);
  /* derive from that an SHM communicator */
  MPI_Comm_split_type(shm_comm_prep, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm);
  MPI_Comm_rank(shm_comm, &shm_ID);
  MPI_Comm_size(shm_comm, &shm_procs);
  /* do sanity check, making sure groups did not shrink in second comm split */
  if (shm_procs != group_size) MPI_Abort(MPI_COMM_WORLD, 666);
  
  Num_groupsx = Num_procsx/group_sizex;
  Num_groupsy = Num_procsy/group_sizey;

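  /* Rank numbering is group-major: my_ID decomposes into a shared-memory
     group index (my_ID/group_size) on the Num_groupsx x Num_groupsy grid of
     groups, plus a local index inside the group_sizex x group_sizey tile;
     global tile coordinates combine the two. */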
  my_group = my_ID/group_size;
  my_group_IDx = my_group%Num_groupsx;
  my_group_IDy = my_group/Num_groupsx;
  my_local_IDx = my_ID%group_sizex;
  my_local_IDy = (my_ID%group_size)/group_sizex;
  my_global_IDx = my_group_IDx*group_sizex+my_local_IDx;
  my_global_IDy = my_group_IDy*group_sizey+my_local_IDy;

  /* set all neighboring ranks to -1 (no communication with those ranks) */
  left_nbr = right_nbr = top_nbr = bottom_nbr = -1;
  /* keep track of local neighbors for local synchronization             */
  num_local_nbrs = 0;

  if (my_local_IDx == group_sizex-1 && my_group_IDx != (Num_groupsx-1)) {
    right_nbr = (my_group+1)*group_size+shm_ID-group_sizex+1;
  }
  if (my_local_IDx != group_sizex-1) {
    local_nbr[num_local_nbrs++] = shm_ID + 1;
  }

  if (my_local_IDx == 0 && my_group_IDx != 0) {
    left_nbr = (my_group-1)*group_size+shm_ID+group_sizex-1;
  }
  if (my_local_IDx != 0) {
    local_nbr[num_local_nbrs++] = shm_ID - 1;
  }

  if (my_local_IDy == group_sizey-1 && my_group_IDy != (Num_groupsy-1)) {
    top_nbr = (my_group+Num_groupsx)*group_size + my_local_IDx;
  }
  if (my_local_IDy != group_sizey-1) {
    local_nbr[num_local_nbrs++] = shm_ID + group_sizex;
  }

  if (my_local_IDy == 0 && my_group_IDy != 0) {
    bottom_nbr = (my_group-Num_groupsx)*group_size + group_sizex*(group_sizey-1)+my_local_IDx;
  }
  if (my_local_IDy != 0) {
    local_nbr[num_local_nbrs++] = shm_ID - group_sizex;
  }

  /* compute amount of space required for input and solution arrays for the block,
     and also compute index sets                                                  */
  
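  /* Balanced block split along x: the first n%Num_groupsx groups get width+1
     columns, the rest get width; the same scheme is applied to y below and to
     the per-rank tiles further down. */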
  width = n/Num_groupsx;
  leftover = n%Num_groupsx;
  if (my_group_IDx<leftover) {
    istart = (width+1) * my_group_IDx; 
    iend = istart + width;
  }
  else {
    istart = (width+1) * leftover + width * (my_group_IDx-leftover);
    iend = istart + width - 1;
  }
  
  width = iend - istart + 1;
  if (width == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);
 
  height = n/Num_groupsy;
  leftover = n%Num_groupsy;
  if (my_group_IDy<leftover) {
    jstart = (height+1) * my_group_IDy; 
    jend = jstart + height;
  }
  else {
    jstart = (height+1) * leftover + height * (my_group_IDy-leftover);
    jend = jstart + height - 1;
  }
  
  height = jend - jstart + 1;
  if (height == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);
 
  if (width < RADIUS || height < RADIUS) {
    printf("ERROR: rank %d has work tile smaller then stencil radius; w=%ld,h=%ld\n",
           my_ID, width, height);
    error = 1;
  }
  bail_out(error);
 
  total_length_in = (width+2*RADIUS)*(height+2*RADIUS)*sizeof(DTYPE);
  total_length_out = width*height*sizeof(DTYPE);

  /* only the root of each SHM domain specifies window of nonzero size */
  size_mul = (shm_ID==0);  
  size_in= total_length_in*size_mul; 
  MPI_Win_allocate_shared(size_in, sizeof(double), MPI_INFO_NULL, shm_comm, 
                          (void *) &in, &shm_win_in);
  MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win_in);
  MPI_Win_shared_query(shm_win_in, MPI_PROC_NULL, &size_in, &disp_unit, (void *)&in);
  if (in == NULL){
    printf("Error allocating space for input array by group %d\n",my_group);
    error = 1;
  }
  bail_out(error);

  size_out= total_length_out*size_mul;
  MPI_Win_allocate_shared(size_out, sizeof(double), MPI_INFO_NULL, shm_comm, 
                          (void *) &out, &shm_win_out);
  MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win_out);
  MPI_Win_shared_query(shm_win_out, MPI_PROC_NULL, &size_out, &disp_unit, (void *)&out);
  if (out == NULL){
    printf("Error allocating space for output array by group %d\n", my_group);
    error = 1;
  }
  bail_out(error);

  /* determine index set assigned to each rank                         */

  width_rank = width/group_sizex;
  leftover = width%group_sizex;
  if (my_local_IDx<leftover) {
    istart_rank = (width_rank+1) * my_local_IDx; 
    iend_rank = istart_rank + width_rank;
  }
  else {
    istart_rank = (width_rank+1) * leftover + width_rank * (my_local_IDx-leftover);
    iend_rank = istart_rank + width_rank - 1;
  }
  istart_rank += istart;
  iend_rank += istart;
  width_rank = iend_rank - istart_rank + 1;   

  height_rank = height/group_sizey;
  leftover = height%group_sizey;
  if (my_local_IDy<leftover) {
    jstart_rank = (height_rank+1) * my_local_IDy; 
    jend_rank = jstart_rank + height_rank;
  }
  else {
    jstart_rank = (height_rank+1) * leftover + height_rank * (my_local_IDy-leftover);
    jend_rank = jstart_rank + height_rank - 1;
  }
  jstart_rank+=jstart;
  jend_rank+=jstart;
  height_rank = jend_rank - jstart_rank + 1;

  if (height_rank*width_rank==0) {
    error = 1;
    printf("Rank %d has no work to do\n", my_ID);
  }
  bail_out(error);

  /* allocate communication buffers for halo values                            */
  top_buf_out = (DTYPE *) malloc(4*sizeof(DTYPE)*RADIUS*width_rank);
  if (!top_buf_out) {
    printf("ERROR: Rank %d could not allocated comm buffers for y-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  top_buf_in     = top_buf_out +   RADIUS*width_rank;
  bottom_buf_out = top_buf_out + 2*RADIUS*width_rank;
  bottom_buf_in  = top_buf_out + 3*RADIUS*width_rank;
 
  right_buf_out = (DTYPE *) malloc(4*sizeof(DTYPE)*RADIUS*height_rank);
  if (!right_buf_out) { 
    printf("ERROR: Rank %d could not allocated comm buffers for x-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  right_buf_in   = right_buf_out +   RADIUS*height_rank;
  left_buf_out   = right_buf_out + 2*RADIUS*height_rank;
  left_buf_in    = right_buf_out + 3*RADIUS*height_rank;

    /* fill the stencil weights to reflect a discrete divergence operator         */
  for (jj=-RADIUS; jj<=RADIUS; jj++) for (ii=-RADIUS; ii<=RADIUS; ii++)
    WEIGHT(ii,jj) = (DTYPE) 0.0;
  stencil_size = 4*RADIUS+1;
  for (ii=1; ii<=RADIUS; ii++) {
    WEIGHT(0, ii) = WEIGHT( ii,0) =  (DTYPE) (1.0/(2.0*ii*RADIUS));
    WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS));
  }
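  /* The antisymmetric star weights (+-1/(2*ii*RADIUS) on each axis) form a
     discrete divergence: applied to the linear field COEFX*i+COEFY*j it adds
     exactly COEFX+COEFY per iteration, which is what the verification below
     checks against. */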

  norm = (DTYPE) 0.0;
  f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS);
  /* initialize the input and output arrays                                     */
  for (j=jstart_rank; j<=jend_rank; j++) for (i=istart_rank; i<=iend_rank; i++) {
    IN(i,j)  = COEFX*i+COEFY*j;
    OUT(i,j) = (DTYPE)0.0;
  }

  /* LOAD/STORE FENCE */
  MPI_Win_sync(shm_win_in);
  MPI_Win_sync(shm_win_out);
  MPI_Barrier(shm_comm); 

  for (iter = 0; iter<=iterations; iter++){

    /* start timer after a warmup iteration */
    if (iter == 1) { 
      MPI_Barrier(MPI_COMM_WORLD);
      local_stencil_time = wtime();
    }

    /* need to fetch ghost point data from neighbors in y-direction                 */
    if (top_nbr != -1) {
      MPI_Irecv(top_buf_in, RADIUS*width_rank, MPI_DTYPE, top_nbr, 101,
                MPI_COMM_WORLD, &(request[1]));
      for (kk=0,j=jend_rank-RADIUS+1; j<=jend_rank; j++) 
      for (i=istart_rank; i<=iend_rank; i++) {
        top_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(top_buf_out, RADIUS*width_rank,MPI_DTYPE, top_nbr, 99, 
                MPI_COMM_WORLD, &(request[0]));
    }

    if (bottom_nbr != -1) {
      MPI_Irecv(bottom_buf_in,RADIUS*width_rank, MPI_DTYPE, bottom_nbr, 99, 
                MPI_COMM_WORLD, &(request[3]));
      for (kk=0,j=jstart_rank; j<=jstart_rank+RADIUS-1; j++) 
      for (i=istart_rank; i<=iend_rank; i++) {
        bottom_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(bottom_buf_out, RADIUS*width_rank, MPI_DTYPE, bottom_nbr, 101,
                MPI_COMM_WORLD, &(request[2]));
    }

    if (top_nbr != -1) {
      MPI_Wait(&(request[0]), &(status[0]));
      MPI_Wait(&(request[1]), &(status[1]));
      for (kk=0,j=jend_rank+1; j<=jend_rank+RADIUS; j++) 
      for (i=istart_rank; i<=iend_rank; i++) {
        IN(i,j) = top_buf_in[kk++];
      }
    }

    if (bottom_nbr != -1) {    
      MPI_Wait(&(request[2]), &(status[2]));
      MPI_Wait(&(request[3]), &(status[3]));
      for (kk=0,j=jstart_rank-RADIUS; j<=jstart_rank-1; j++) 
      for (i=istart_rank; i<=iend_rank; i++) {
        IN(i,j) = bottom_buf_in[kk++];
      }
    }

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_in);

    /* need to fetch ghost point data from neighbors in x-direction                 */
    if (right_nbr != -1) {
      MPI_Irecv(right_buf_in, RADIUS*height_rank, MPI_DTYPE, right_nbr, 1010,
                MPI_COMM_WORLD, &(request[1+4]));
      for (kk=0,j=jstart_rank; j<=jend_rank; j++) 
      for (i=iend_rank-RADIUS+1; i<=iend_rank; i++) {
        right_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(right_buf_out, RADIUS*height_rank, MPI_DTYPE, right_nbr, 990, 
                MPI_COMM_WORLD, &(request[0+4]));
    }

    if (left_nbr != -1) {
      MPI_Irecv(left_buf_in, RADIUS*height_rank, MPI_DTYPE, left_nbr, 990, 
                MPI_COMM_WORLD, &(request[3+4]));
      for (kk=0,j=jstart_rank; j<=jend_rank; j++) 
      for (i=istart_rank; i<=istart_rank+RADIUS-1; i++) {
        left_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(left_buf_out, RADIUS*height_rank, MPI_DTYPE, left_nbr, 1010,
                MPI_COMM_WORLD, &(request[2+4]));
    }

    if (right_nbr != -1) {
      MPI_Wait(&(request[0+4]), &(status[0+4]));
      MPI_Wait(&(request[1+4]), &(status[1+4]));
      for (kk=0,j=jstart_rank; j<=jend_rank; j++) 
      for (i=iend_rank+1; i<=iend_rank+RADIUS; i++) {
        IN(i,j) = right_buf_in[kk++];
      }
    }

    if (left_nbr != -1) {
      MPI_Wait(&(request[2+4]), &(status[2+4]));
      MPI_Wait(&(request[3+4]), &(status[3+4]));
      for (kk=0,j=jstart_rank; j<=jend_rank; j++) 
      for (i=istart_rank-RADIUS; i<=istart_rank-1; i++) {
        IN(i,j) = left_buf_in[kk++];
      }
    }

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_in);

    /* Apply the stencil operator */
    for (j=MAX(jstart_rank,RADIUS); j<=MIN(n-RADIUS-1,jend_rank); j++) {
      for (i=MAX(istart_rank,RADIUS); i<=MIN(n-RADIUS-1,iend_rank); i++) {
        #if LOOPGEN
          #include "loop_body_star.incl"
        #else
          for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj);
          for (ii=-RADIUS; ii<0; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
          for (ii=1; ii<=RADIUS; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
        #endif
      }
    }
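    /* OUT is never reset, so each pass adds COEFX+COEFY to every interior
       point (the constant 1.0 added to IN below cancels in the antisymmetric
       stencil); after iterations+1 passes this gives the reference norm     */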

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_out);

#ifdef LOCAL_BARRIER_SYNCH
    MPI_Barrier(shm_comm); // needed to avoid writing IN while other ranks are reading it
#else
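    /* pairwise zero-byte handshake with each shared-memory neighbor: a
       lighter-weight synchronization than a barrier over the whole node     */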
    for (i=0; i<num_local_nbrs; i++) {
      MPI_Irecv(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm, &(request[i]));
      MPI_Send(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm);
    }
    MPI_Waitall(num_local_nbrs, request, status);
#endif

    /* add constant to solution to force refresh of neighbor data, if any */
    for (j=jstart_rank; j<=jend_rank; j++) 
    for (i=istart_rank; i<=iend_rank; i++) IN(i,j)+= 1.0;

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_in);

#ifdef LOCAL_BARRIER_SYNCH
    MPI_Barrier(shm_comm); // needed to avoid reading IN while other ranks are writing it
#else
    for (i=0; i<num_local_nbrs; i++) {
      MPI_Irecv(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm, &(request[i]));
      MPI_Send(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm);
    }
    MPI_Waitall(num_local_nbrs, request, status);
#endif
 
  } /* end of iterations                                                   */
 
  local_stencil_time = wtime() - local_stencil_time;
  MPI_Reduce(&local_stencil_time, &stencil_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);
  
  /* compute L1 norm in parallel                                                */
  local_norm = (DTYPE) 0.0;
  for (j=MAX(jstart_rank,RADIUS); j<=MIN(n-RADIUS-1,jend_rank); j++) {
    for (i=MAX(istart_rank,RADIUS); i<=MIN(n-RADIUS-1,iend_rank); i++) {
      local_norm += (DTYPE)ABS(OUT(i,j));
    }
  }
 
  MPI_Reduce(&local_norm, &norm, 1, MPI_DTYPE, MPI_SUM, root, MPI_COMM_WORLD);
 
  /*******************************************************************************
  ** Analyze and output results.
  ********************************************************************************/
 
/* verify correctness                                                            */
  if (my_ID == root) {
    norm /= f_active_points;
    if (RADIUS > 0) {
      reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY);
    }
    else {
      reference_norm = (DTYPE) 0.0;
    }
    if (ABS(norm-reference_norm) > EPSILON) {
      printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n",
             norm, reference_norm);
      error = 1;
    }
    else {
      printf("Solution validates\n");
#ifdef VERBOSE
      printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", 
             reference_norm, norm);
#endif
    }
  }
  bail_out(error);
 
  MPI_Win_unlock_all(shm_win_in);
  MPI_Win_unlock_all(shm_win_out);
  MPI_Win_free(&shm_win_in);
  MPI_Win_free(&shm_win_out);

  if (my_ID == root) {
    /* flops/stencil: 2 flops (fma) for each point in the stencil,
       plus one flop for the update of the input array               */
    flops = (DTYPE) (2*stencil_size+1) * f_active_points;
    avgtime = stencil_time/iterations;
    printf("Rate (MFlops/s): "FSTR"  Avg time (s): %lf\n",
           1.0E-06 * flops/avgtime, avgtime);
  }
 
  MPI_Finalize();
  exit(EXIT_SUCCESS);
}
Example #30
static
void
parallelJacobi(struct calculation_arguments const* arguments, struct calculation_results *results, struct options const* options)
{
    int rank, num_procs;
    MPI_Status status;
    
    
    MPI_Init(arguments->argc, arguments->argv);
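    /* note: MPI_Init expects int* and char***, so this assumes the argument
       struct stores argc and argv as pointers                               */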
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
    
    if(rank == 0)
        calculateMatrices(arguments, num_procs);
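    /* note: arguments->start and arguments->moveSize are read by every rank
       below, so they must either be computed on all ranks or be broadcast   */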
    
    int i, j;                                   /* local variables for loops  */
    int m1, m2;                                 /* used as indices for old and new matrices       */
    double star;                                /* average of the four neighboring values (plus forcing term) */
    double residuum;                            /* residuum of current iteration                  */
    double maxresiduum;                         /* maximum residuum value of a process in this iteration */
    
    int const N = arguments->N;
    double const h = arguments->h;
    
    double pih = 0.0;
    double fpisin = 0.0;
    
    int term_iteration = options->term_iteration;
    
    int sendRec_before = (rank - 1 + num_procs) % num_procs; /* add num_procs first: C's % can yield negative values */
    int sendRec_after  = (rank + 1) % num_procs;             /* parenthesized: % binds tighter than +                */
    m1 = 0;
    m2 = 1;
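    /* m1/m2 implement double buffering: one index selects the matrix being
       written, the other the one being read; they are swapped each iteration */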
    
    
    if (options->inf_func == FUNC_FPISIN)
    {
        pih = PI * h;
        fpisin = 0.25 * TWO_PI_SQUARE * h * h;
    }
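    /* precomputed factors of the forcing term f(x,y) = 2*pi^2*sin(pi*x)*sin(pi*y),
       already scaled by h*h/4 so the inner loop only multiplies sine values  */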
    
    while (term_iteration > 0)
    {
        double** Matrix_Out = getMyMatrix(arguments->start[rank], arguments->moveSize[rank], m1, arguments);
        double** Matrix_In  = getMyMatrix(arguments->start[rank], arguments->moveSize[rank], m2, arguments);
        
        maxresiduum = 0;
        
        
        /* even ranks send first and then receive, odd ranks do the reverse,
           so the blocking calls cannot deadlock (assumes num_procs >= 2);
           a full boundary row of N+1 doubles is exchanged, assuming columns
           run from 0 to N, and a single tag is used on both sides so sends
           and receives match                                                 */
        if ((rank % 2) == 0)
        {
            if (rank == 0)
            {
                MPI_Send(Matrix_In[arguments->moveSize[rank] - 1], N + 1, MPI_DOUBLE, sendRec_after, TAG_SEND, MPI_COMM_WORLD);
                MPI_Recv(Matrix_In[arguments->moveSize[rank]], N + 1, MPI_DOUBLE, sendRec_after, TAG_SEND, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            }
            else if (rank == num_procs - 1)
            {
                MPI_Send(Matrix_In[1], N + 1, MPI_DOUBLE, sendRec_before, TAG_SEND, MPI_COMM_WORLD);
                MPI_Recv(Matrix_In[0], N + 1, MPI_DOUBLE, sendRec_before, TAG_SEND, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            }
            else
            {
                MPI_Send(Matrix_In[1], N + 1, MPI_DOUBLE, sendRec_before, TAG_SEND, MPI_COMM_WORLD);
                MPI_Send(Matrix_In[arguments->moveSize[rank] - 1], N + 1, MPI_DOUBLE, sendRec_after, TAG_SEND, MPI_COMM_WORLD);
                MPI_Recv(Matrix_In[0], N + 1, MPI_DOUBLE, sendRec_before, TAG_SEND, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                MPI_Recv(Matrix_In[arguments->moveSize[rank]], N + 1, MPI_DOUBLE, sendRec_after, TAG_SEND, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            }
        }
        else
        {
            if (rank == num_procs - 1)
            {
                MPI_Recv(Matrix_In[0], N + 1, MPI_DOUBLE, sendRec_before, TAG_SEND, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                MPI_Send(Matrix_In[1], N + 1, MPI_DOUBLE, sendRec_before, TAG_SEND, MPI_COMM_WORLD);
            }
            else
            {
                MPI_Recv(Matrix_In[0], N + 1, MPI_DOUBLE, sendRec_before, TAG_SEND, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                MPI_Recv(Matrix_In[arguments->moveSize[rank]], N + 1, MPI_DOUBLE, sendRec_after, TAG_SEND, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                MPI_Send(Matrix_In[1], N + 1, MPI_DOUBLE, sendRec_before, TAG_SEND, MPI_COMM_WORLD);
                MPI_Send(Matrix_In[arguments->moveSize[rank] - 1], N + 1, MPI_DOUBLE, sendRec_after, TAG_SEND, MPI_COMM_WORLD);
            }
        }
        
        
        
        /* over all rows */
        for (i = 1; i < arguments->moveSize[rank]; i++)
        {
            double fpisin_i = 0.0;
            
            if (options->inf_func == FUNC_FPISIN)
            {
                fpisin_i = fpisin * sin(pih * (double)i);
            }
            
            /* over all columns */
            for (j = 1; j < N; j++)
            {
                star = 0.25 * (Matrix_In[i-1][j] + Matrix_In[i][j-1] + Matrix_In[i][j+1] + Matrix_In[i+1][j]);
                
                if (options->inf_func == FUNC_FPISIN)
                {
                    star += fpisin_i * sin(pih * (double)j);
                }
                
                if (options->termination == TERM_PREC || term_iteration == 1)
                {
                    residuum = Matrix_In[i][j] - star;
                    residuum = (residuum < 0) ? -residuum : residuum;
                    maxresiduum = (residuum < maxresiduum) ? maxresiduum : residuum;
                }
                
                Matrix_Out[i][j] = star;
            }
        }
        
        results->stat_iteration++;
        results->stat_precision = maxresiduum;
        
        
        /* exchange m1 and m2 */
        i = m1;
        m1 = m2;
        m2 = i;
        
        
        
        /* check for stopping calculation depending on termination method;
           for precision-based termination the residuum must be reduced
           across all ranks so every process stops in the same iteration */
        if (options->termination == TERM_PREC)
        {
            MPI_Allreduce(MPI_IN_PLACE, &maxresiduum, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
            results->stat_precision = maxresiduum;

            if (maxresiduum < options->term_precision)
            {
                term_iteration = 0;
            }
        }
        else if (options->termination == TERM_ITER)
        {
            term_iteration--;
        }
    }
    
    DisplayMatrixMPI(arguments, results, options, rank, num_procs, arguments->start[rank], (arguments->moveSize[rank] + arguments->start[rank]));
    
    results->m = m2;
}