Example #1
	bool mat_sqrt(const complex_vec_t& matrix, complex_vec_t& result) {
		result.clear();
		for(complex_vec_t::const_iterator i = matrix.begin(); i != matrix.end(); ++ i) {
			result.push_back(sqrt(*i));
		} // for
		return true;
	} // mat_sqrt()
Example #2
	bool mat_mul(complex_t scalar, const complex_vec_t& matrix, complex_vec_t& result) {
		result.clear();
		for(complex_vec_t::const_iterator i = matrix.begin(); i != matrix.end(); ++ i) {
			result.push_back((*i) * scalar);
		} // for
		return true;
	} // mat_mul()
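The two helpers above apply an operation to every element of a flattened matrix. A minimal usage sketch follows; the typedefs are assumptions standing in for the project's own real_t / complex_t / complex_vec_t definitions, and the sketch assumes the mat_sqrt and mat_mul definitions above are compiled alongside it.
	// Sketch only: the typedefs below are assumed stand-ins for the project's types.
	#include <complex>
	#include <vector>
	#include <iostream>

	typedef double real_t;                         // assumed precision
	typedef std::complex<real_t> complex_t;        // assumed complex element type
	typedef std::vector<complex_t> complex_vec_t;  // assumed flattened-matrix type

	bool mat_sqrt(const complex_vec_t& matrix, complex_vec_t& result);                    // defined above
	bool mat_mul(complex_t scalar, const complex_vec_t& matrix, complex_vec_t& result);   // defined above

	int main() {
		complex_vec_t m(8, complex_t(4.0, 0.0));   // a flattened 2 x 2 x 2 matrix
		complex_vec_t roots, scaled;
		if(mat_sqrt(m, roots) && mat_mul(complex_t(0.5, 0.0), roots, scaled)) {
			std::cout << scaled[0] << std::endl;   // prints (1,0)
		} // if
		return 0;
	} // main()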
Example #3
 bool AnalyticFormFactor::mat_sinc(unsigned int x_size, unsigned int y_size, unsigned int z_size,
                   const complex_vec_t& matrix, complex_vec_t& result) {
   result.clear();
   for(std::vector<complex_t>::const_iterator i = matrix.begin(); i != matrix.end(); ++ i) {
     result.push_back(sinc(*i));
   } // for
   return true;
 } // AnalyticFormFactor::mat_sinc()
Example #4
 bool AnalyticFormFactor::mat_fq_inv(unsigned int x_size, unsigned int y_size, unsigned int z_size,
                   const complex_vec_t& matrix, real_t y, complex_vec_t& result) {
   result.clear();
   for(complex_vec_t::const_iterator i = matrix.begin(); i != matrix.end(); ++ i) {
     result.push_back(fq_inv(*i, y));
   } // for
   return true;
 } // AnalyticFormFactor::mat_fq_inv()
Example #5
	/**
	 * computes element-by-element division of two matrices (matrix1 / matrix2) into result
	 */
	bool mat_dot_div(unsigned int nx1, unsigned int ny1, unsigned int nz1,
						const complex_vec_t& matrix1,
						unsigned int nx2, unsigned int ny2, unsigned int nz2,
						const complex_vec_t& matrix2,
						complex_vec_t& result) {
		if(nx1 != nx2 || ny1 != ny2 || nz1 != nz2 || matrix1.size() != matrix2.size()) {
			std::cerr << "error: matrix sizes are not the same for dot division operation"
						<< std::endl;
			return false;
		} // if
		result.clear();
		complex_vec_t::const_iterator i1 = matrix1.begin();
		complex_vec_t::const_iterator i2 = matrix2.begin();
		for(; i1 != matrix1.end(); ++ i1, ++ i2) {
			result.push_back((*i1) / (*i2));
		} // for
		return true;
	} // mat_dot_div()
Example #6
	/**
	 * computes element-by-element product of two matrices into result
	 */
	bool mat_dot_prod(unsigned int x1_size, unsigned int y1_size, unsigned int z1_size,
						const complex_vec_t& matrix1,
						unsigned int x2_size, unsigned int y2_size, unsigned int z2_size,
						const complex_vec_t& matrix2,
						complex_vec_t& result) {
		if(x1_size != x2_size || y1_size != y2_size || z1_size != z2_size
				|| matrix1.size() != matrix2.size()) {
			std::cerr << "error: matrix sizes are not the same for dot product operation" << std::endl;
			return false;
		} // if
		result.clear();
		complex_vec_t::const_iterator i1 = matrix1.begin();
		complex_vec_t::const_iterator i2 = matrix2.begin();
		for(; i1 != matrix1.end(); ++ i1, ++ i2) {
			result.push_back((*i1) * (*i2));
		} // for
		return true;
	} // mat_dot_prod()
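For reference, a small sketch showing how the element-wise binary helpers above might be called (not part of the original source; demo_dot_ops is a hypothetical driver, and complex_t / complex_vec_t are the same assumed typedefs as before).
	// Sketch only: exercises mat_dot_prod() and mat_dot_div() on two flattened
	// 2 x 2 x 1 matrices; the dimensions of both operands must match.
	bool demo_dot_ops() {
		complex_vec_t a(4, complex_t(2.0, 0.0));
		complex_vec_t b(4, complex_t(4.0, 0.0));
		complex_vec_t prod, quot;
		if(!mat_dot_prod(2, 2, 1, a, 2, 2, 1, b, prod)) return false;
		if(!mat_dot_div(2, 2, 1, a, 2, 2, 1, b, quot)) return false;
		// element-wise: 2 * 4 = 8 and 2 / 4 = 0.5
		return prod[0] == complex_t(8.0, 0.0) && quot[0] == complex_t(0.5, 0.0);
	} // demo_dot_ops()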
Example #7
  /**
   * Function to gather partial FF arrays from all processes to construct the final FF.
   * This is a bottleneck for a large number of processes ...
   */
  bool NumericFormFactor::construct_ff(int p_nqx, int p_nqy, int p_nqz,
                      int nqx, int nqy, int nqz,
                      int p_y, int p_z,
                      #ifdef FF_NUM_GPU
                        cucomplex_t* p_ff,
                      #else
                        complex_t* p_ff,
                      #endif
                      complex_vec_t& ff,
                      #ifdef USE_MPI
                        woo::MultiNode& world_comm, std::string comm_key,
                      #endif
                      real_t& mem_time, real_t& comm_time) {
    real_t mem_start = 0, mem_end = 0, comm_start = 0, comm_end = 0;
    woo::BoostChronoTimer memtimer, commtimer;
    mem_time = 0; comm_time = 0;

    #ifdef USE_MPI
      bool master = world_comm.is_master(comm_key);
      int size = world_comm.size(comm_key);
      int rank = world_comm.rank(comm_key);
    #else
      bool master = true;
      int size = 1;
      int rank = 0;
    #endif

    memtimer.start();
  
    int local_qpoints = p_nqx * p_nqy * p_nqz;
    unsigned long int total_qpoints = nqx * nqy * nqz;
  
    // process 0 creates the main ff, and collects computed p_ff from all others (just use gather)
    ff.clear();
    #ifdef FF_NUM_GPU
      cucomplex_t* all_ff = NULL;    // TODO: improve this ...
    #else
      complex_t* all_ff = NULL;
    #endif
    if(master) {
      ff.reserve(total_qpoints);
      ff.assign(total_qpoints, complex_t(0.0, 0.0));
      #ifdef FF_NUM_GPU
        all_ff = new (std::nothrow) cucomplex_t[total_qpoints];
      #else
        all_ff = new (std::nothrow) complex_t[total_qpoints];
      #endif
    } // if
  
    mem_time += memtimer.elapsed_msec();

    int *recv_p_nqy = new (std::nothrow) int[p_y]();
    int *off_p_nqy = new (std::nothrow) int[p_y]();
    if(recv_p_nqy == NULL || off_p_nqy == NULL) {
      std::cerr << "error: failed to allocate memory for nqy bookkeeping arrays" << std::endl;
      return false;
    } // if
    recv_p_nqy[0] = p_nqy;
    off_p_nqy[0] = 0;

    #ifdef FF_NUM_GPU
      cucomplex_t *ff_buffer = new (std::nothrow) cucomplex_t[total_qpoints];
    #else
      complex_t *ff_buffer = new (std::nothrow) complex_t[total_qpoints];
    #endif
    if(ff_buffer == NULL) {
      std::cerr << "error: failed to allocate memory for ff buffer" << std::endl;
      return false;
    } // if

    #ifdef USE_MPI
      // construct stuff for gatherv
      int *recv_counts = new (std::nothrow) int[size]();
      int *displs = new (std::nothrow) int[size]();

      commtimer.start();

      //comm.Allgather(&local_qpoints, 1, MPI::INT, recv_counts, 1, MPI::INT);
      world_comm.allgather(comm_key, &local_qpoints, 1, recv_counts, 1);

      commtimer.stop();
      comm_time += commtimer.elapsed_msec();
      memtimer.start();

      displs[0] = 0;
      for(int i = 1; i < size; ++ i) {
        displs[i] = displs[i - 1] + recv_counts[i - 1];
      } // for
      complex_t *cast_p_ff, *cast_ff;
      #ifdef FF_NUM_GPU
        cast_p_ff = reinterpret_cast<complex_t*>(p_ff);
        cast_ff = reinterpret_cast<complex_t*>(ff_buffer);
      #else
        cast_p_ff = p_ff;
        cast_ff = ff_buffer;
      #endif

      memtimer.stop();
      mem_time += memtimer.elapsed_msec();
  
      commtimer.start();

      world_comm.gatherv(comm_key, cast_p_ff, local_qpoints, cast_ff, recv_counts, displs);
  
      world_comm.gather("ff_num_col_comm", &p_nqy, 1, recv_p_nqy, 1);
    
      commtimer.stop();
      comm_time += commtimer.elapsed_msec();

      for(int i = 1; i < p_y; ++ i) off_p_nqy[i] = off_p_nqy[i - 1] + recv_p_nqy[i - 1];
    #else
      #ifdef FF_NUM_GPU
        memcpy(ff_buffer, p_ff, total_qpoints * sizeof(cucomplex_t));
      #else
        memcpy(ff_buffer, p_ff, total_qpoints * sizeof(complex_t));
      #endif
    #endif // USE_MPI
  
    memtimer.start();

    // move all the data to correct places
    if(rank == 0) {
      unsigned long int ff_index = 0;
      for(int i_nqz = 0; i_nqz < nqz; ++ i_nqz) {
        for(int i_py = 0; i_py < p_y; ++ i_py) {
          unsigned long int ffb_index = nqx * (i_nqz * recv_p_nqy[i_py] +
                              nqz * off_p_nqy[i_py]);
          #ifdef FF_NUM_GPU
            memcpy(&all_ff[ff_index], &ff_buffer[ffb_index],
                nqx * recv_p_nqy[i_py] * sizeof(cucomplex_t));
          #else
            memcpy(&all_ff[ff_index], &ff_buffer[ffb_index],
                nqx * recv_p_nqy[i_py] * sizeof(complex_t));
          #endif
          ff_index += nqx * recv_p_nqy[i_py];
        } // for i_py
      } // for i_nqz
      // put into the final ff buffer
      #ifdef FF_NUM_GPU
        ff.assign(reinterpret_cast<complex_t*>(all_ff),
              reinterpret_cast<complex_t*>(all_ff + total_qpoints));
      #else
        ff.assign(all_ff, all_ff + total_qpoints);
      #endif
    } // if
  
    delete[] ff_buffer;
    #ifdef USE_MPI
      delete[] displs;
      delete[] recv_counts;
    #endif
    delete[] off_p_nqy;
    delete[] recv_p_nqy;
    delete[] all_ff;

    memtimer.stop();
    mem_time += memtimer.elapsed_msec();

    return true;
  } // NumericFormFactor::construct_ff()
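construct_ff() above first allgathers every rank's local_qpoints count, builds displacements by prefix sum, and then gathervs the partial FF buffers onto the master. A standalone sketch of that collective pattern with plain MPI follows; it illustrates the idea rather than the woo::MultiNode API, and it gathers doubles instead of the complex FF values for brevity.
  // Sketch only: the allgather + gatherv pattern used by construct_ff(), written
  // against raw MPI with per-rank doubles standing in for the partial FF buffers.
  #include <mpi.h>
  #include <vector>

  int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank = 0, size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int local_n = 100 + rank;                             // uneven per-rank counts
    std::vector<double> local(local_n, (double) rank);    // this rank's partial data

    // every rank learns everyone's count (the world_comm.allgather step)
    std::vector<int> recv_counts(size), displs(size, 0);
    MPI_Allgather(&local_n, 1, MPI_INT, recv_counts.data(), 1, MPI_INT, MPI_COMM_WORLD);
    for(int i = 1; i < size; ++ i) displs[i] = displs[i - 1] + recv_counts[i - 1];

    // the master receives all partial buffers back to back (the world_comm.gatherv step)
    std::vector<double> all;
    if(rank == 0) all.resize(displs[size - 1] + recv_counts[size - 1]);
    MPI_Gatherv(local.data(), local_n, MPI_DOUBLE,
                all.data(), recv_counts.data(), displs.data(), MPI_DOUBLE,
                0, MPI_COMM_WORLD);

    MPI_Finalize();
    return 0;
  } // main()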
Example #8
  /**
   * main host function
   */
  bool NumericFormFactor::compute(const char* filename, complex_vec_t& ff,
                  vector3_t& rot1, vector3_t& rot2, vector3_t& rot3
                  #ifdef USE_MPI
                    , woo::MultiNode& world_comm, std::string comm_key
                  #endif
                  ) {
    real_t comp_start = 0.0, comp_end = 0.0, comm_start = 0.0, comm_end = 0.0;
    real_t mem_start = 0.0, mem_end = 0.0;
    real_t comp_time = 0.0, comm_time = 0.0, mem_time = 0.0, kernel_time = 0.0, red_time = 0.0;
    real_t total_start = 0.0, total_end = 0.0, total_time = 0.0;

    woo::BoostChronoTimer maintimer, computetimer;
    woo::BoostChronoTimer commtimer, memtimer;

    unsigned int nqx = QGrid::instance().nqx();
    unsigned int nqy = QGrid::instance().nqy();
    unsigned int nqz = QGrid::instance().nqz_extended();

    #ifdef USE_MPI
      bool master = world_comm.is_master(comm_key);
      commtimer.start();
      world_comm.barrier(comm_key);
      commtimer.stop();
      comm_time += commtimer.elapsed_msec();
    #else
      bool master = true;
    #endif

  
    // warning: all procs read the shape file!!!!
    // TODO: improve to parallel IO, or one proc reading and sending to all ...
//    #ifndef __SSE3__
      real_vec_t shape_def;
//    #else
//      #ifdef USE_GPU
//        real_vec_t shape_def;
//      #else
//        real_t* shape_def = NULL;
//      #endif
//    #endif
    // use the new file reader instead ...
    unsigned int num_triangles = read_shapes_file(filename, shape_def);
            // TODO ... <--- sadly all procs read this! IMPROVE!!!
  
    // TODO: temporary ... remove ...
    std::vector<short int> axes(4);      // axes[i] = j
                        // i: x=0 y=1 z=2
                        // j: 0=a 1=b 2=c
    #ifndef AXIS_ROT
      axes[0] = 0; axes[1] = 1; axes[2] = 2;  // default values
    #else
      find_axes_orientation(shape_def, axes);
    #endif

    #ifdef USE_MPI
      int num_procs = world_comm.size(comm_key);
      int rank = world_comm.rank(comm_key);
    #endif

    if(master) {
      std::cout << "-- Numerical form factor computation ..." << std::endl
            << "**        Using input shape file: " << filename << std::endl
            << "**     Number of input triangles: " << num_triangles << std::endl
            << "**  Q-grid resolution (q-points): " << nqx * nqy * nqz << std::endl
                  << "**               NQX x NQY x NQZ: "
            << nqx << " x " << nqy << " x " << nqz << std::endl
            #ifdef USE_MPI
              << "** Number of processes requested: " << num_procs << std::endl
            #endif
            << std::flush;
    } // if
    if(num_triangles < 1) {
      std::cerr << "error: no triangles found in specified definition file" << std::endl;
      return false;
    } // if
  
    #ifdef USE_MPI
      // decompose along y and z directions into blocks
      int p_y = std::floor(sqrt((real_t) num_procs));  // some procs may be idle ...
      int p_z = num_procs / p_y;
    
      int p_nqx = nqx;
      int p_nqy = nqy / p_y + (((rank / p_z) < (int)nqy % p_y) ? 1 : 0);
      int p_nqz = nqz / p_z + (((rank % p_z) < (int)nqz % p_z) ? 1 : 0);

      commtimer.start();

      int idle = 0;
      if(world_comm.rank(comm_key) >= p_y * p_z) idle = 1;
      std::string real_world("ff_num_real_world");
      world_comm.split(real_world, comm_key, idle);

      commtimer.stop();
      comm_time += commtimer.elapsed_msec();
    #else
      int p_y = 1, p_z = 1;
      int p_nqx = nqx;
      int p_nqy = nqy;
      int p_nqz = nqz;
    #endif // USE_MPI
  
    #ifdef FINDBLOCK
      int block_x = 0, block_y = 0, block_z = 0, block_t = 0;
      int block_x_max = 0, block_y_max = 0, block_z_max = 0, block_t_max = 0;
      block_x_max = (nqx < 400) ? nqx : 400;
      block_y_max = (nqy < 400) ? nqy : 400;
      block_z_max = (nqz < 400) ? nqz : 400;
      block_t_max = (num_triangles < 2500) ? num_triangles : 2500;
      block_t = block_t_max;
      for(block_t = block_t_max; block_t > std::min(99, block_t_max - 1); block_t -= 100) {
      for(block_x = block_x_max; block_x > std::min(3, block_x_max - 1); block_x -= 2) {
      for(block_y = block_y_max; block_y > std::min(3, block_y_max - 1); block_y -= 2) {
      for(block_z = block_z_max; block_z > std::min(3, block_z_max - 1); block_z -= 2) {
    #endif
    
    maintimer.start();

    #ifdef USE_MPI
    if(world_comm.rank(comm_key) < p_y * p_z) {    // only the non-idle processors
      bool master = world_comm.is_master(real_world);
      if(master) {
        std::cout << "++  Number of MPI processes used: "
              << world_comm.size(real_world) << std::endl
              << "++                 MPI grid size: 1 x " << p_y << " x " << p_z
              << std::endl << std::flush;
      } // if

      commtimer.start();

      int rank = world_comm.rank(real_world);
      int size = world_comm.size(real_world);

      // create row-wise and column-wise communicators
      int row = rank / p_z, col = rank % p_z;
      world_comm.split("ff_num_row_comm", real_world, row);
      world_comm.split("ff_num_col_comm", real_world, col);

      // perform MPI scan operation to compute y_offset and z_offset

      unsigned int y_offset = 0, z_offset = 0;
      world_comm.scan_sum("ff_num_col_comm", p_nqy, y_offset);
      world_comm.scan_sum("ff_num_row_comm", p_nqz, z_offset);

      commtimer.stop();
      comm_time += commtimer.elapsed_msec();
  
      y_offset -= p_nqy;
      z_offset -= p_nqz;
    #else
      master = true;
      unsigned int y_offset = 0, z_offset = 0;
      int rank = 0;
      int size = 1;
    #endif // USE_MPI

      memtimer.start();

      // FIXME: this is a yucky temporary fix ... fix properly ...
      real_t* qx = new (std::nothrow) real_t[nqx]();
      real_t* qy = new (std::nothrow) real_t[nqy]();
      #ifdef FF_NUM_GPU
        cucomplex_t* qz = new (std::nothrow) cucomplex_t[nqz]();
      #else
        complex_t* qz = new (std::nothrow) complex_t[nqz]();
      #endif
      // create qx, qy and qz using the qgrid instance
      for(unsigned int i = 0; i < nqx; ++ i) {
        qx[i] = QGrid::instance().qx(i);
      } // for
      for(unsigned int i = 0; i < nqy; ++ i) {
        qy[i] = QGrid::instance().qy(i);
      } // for
      for(unsigned int i = 0; i < nqz; ++ i) {
        #ifdef FF_NUM_GPU
          qz[i].x = QGrid::instance().qz_extended(i).real();
          qz[i].y = QGrid::instance().qz_extended(i).imag();
        #else
          qz[i] = QGrid::instance().qz_extended(i);
        #endif
      } // for
      
      #ifdef USE_MPI
        // create p_ff buffers  <----- TODO: IMPROVE for all procs!!!
        real_t *p_qy = NULL;
        p_qy = new (std::nothrow) real_t[p_nqy]();
        if(p_qy == NULL) { return false; }
        memcpy(p_qy, (void*) (qy + y_offset), p_nqy * sizeof(real_t));
        #ifdef FF_NUM_GPU
          cucomplex_t *p_qz = NULL;
          p_qz = new (std::nothrow) cucomplex_t[p_nqz]();
          if(p_qz == NULL) { delete[] p_qy; return false; }
          memcpy(p_qz, (void*) (qz + z_offset), p_nqz * sizeof(cucomplex_t));
        #else // TODO: avoid the following ...
          complex_t *p_qz = NULL;
          p_qz = new (std::nothrow) complex_t[p_nqz]();
          if(p_qz == NULL) { delete[] p_qy; return false; }
          memcpy(p_qz, (void*) (qz + z_offset), p_nqz * sizeof(complex_t));
        #endif  // FF_NUM_GPU
      #else  // no MPI
        real_t *p_qy = qy;
        #ifdef FF_NUM_GPU
          cucomplex_t *p_qz = qz;
        #else
          complex_t *p_qz = qz;
        #endif // FF_NUM_GPU
      #endif  // USE_MPI
  
      memtimer.stop();
      mem_time += memtimer.elapsed_msec();
    
      // compute local

      #ifdef FF_NUM_GPU
        cucomplex_t *p_ff = NULL;
      #else
        complex_t *p_ff = NULL;
      #endif
  
      computetimer.reset();
      computetimer.start();

      unsigned int ret_numtriangles = 0;

      real_t temp_mem_time = 0.0;

      #ifdef FF_NUM_GPU  // use GPU
        #ifdef FF_NUM_GPU_FUSED
          ret_numtriangles = gff_.compute_form_factor_kb_fused(rank, shape_def, axes, p_ff,
                        qx, p_nqx, p_qy, p_nqy, p_qz, p_nqz, 3,
                        rot_,
                        kernel_time, red_time, temp_mem_time
                        #ifdef FINDBLOCK
                          , block_x, block_y, block_z, block_t
                        #endif
                        );
        #else
          ret_numtriangles = gff_.compute_form_factor_db(rank, shape_def, axes, p_ff,
                        qx, p_nqx, p_qy, p_nqy, p_qz, p_nqz,
                        rot_,
                        kernel_time, red_time, temp_mem_time
                        #ifdef FINDBLOCK
                          , block_x, block_y, block_z, block_t
                        #endif
                        );
        #endif
/*      #elif defined USE_MIC  // use MIC
        #ifndef FF_NUM_MIC_KB
          ret_numtriangles = mff_.compute_form_factor_db(rank, shape_def, p_ff,
                        qx, p_nqx, p_qy, p_nqy, p_qz, p_nqz,
                        rot_,
                        kernel_time, red_time, temp_mem_time
                        #ifdef FINDBLOCK
                          , block_x, block_y, block_z, block_t
                        #endif
                        );
        #else
          ret_numtriangles = mff_.compute_form_factor_kb(rank, shape_def,
                        num_triangles,
                        p_ff,
                        qx, p_nqx, p_qy, p_nqy, p_qz, p_nqz, 3,
                        rot_,
                        kernel_time, red_time, temp_mem_time
                        #ifdef FINDBLOCK
                          , block_x, block_y, block_z, block_t
                        #endif
                        );
        #endif */
      #else  // use only CPU
        ret_numtriangles = cff_.compute_form_factor(rank, shape_def,
//                        #ifdef __SSE3__
//                          num_triangles,
//                        #endif
                        p_ff,
                        qx, p_nqx, p_qy, p_nqy, p_qz, p_nqz,
                        rot_,
                        kernel_time, red_time, temp_mem_time
                        #ifdef FINDBLOCK
                          , block_x, block_y, block_z, block_t
                        #endif
                        );
      #endif

      computetimer.stop();
      comp_time += computetimer.elapsed_msec();
      mem_time += (temp_mem_time / 1000);
  
      // gather everything on proc 0
      if(ret_numtriangles > 0) {
        real_t temp_mem_time = 0.0, temp_comm_time = 0.0;
        construct_ff(p_nqx, p_nqy, p_nqz, nqx, nqy, nqz, p_y, p_z, p_ff, ff,
                #ifdef USE_MPI
                  world_comm, real_world,
                #endif
                temp_mem_time, temp_comm_time);
        mem_time += temp_mem_time;
        comm_time += temp_comm_time;
      } // if
  
      /*if(rank == 0) {
        write_slice_to_file(ff, nqx, nqy, nqz, filename, 0, 0);  // x = 0, y = 1, z = 2
                            // only slice along x implemented for now
      } // if*/
  
      #ifdef USE_MPI
        world_comm.barrier(real_world);
      #endif
  
      memtimer.start();
      
      #ifdef FINDBLOCK
        ff.clear();
      #endif
      if(p_ff != NULL) delete[] p_ff;
      #ifdef USE_MPI
        delete[] p_qz;
        delete[] p_qy;
      #endif
      delete[] qz;
      delete[] qy;
      delete[] qx;

      memtimer.stop();
      maintimer.stop();
  
      total_time = maintimer.elapsed_msec();
      mem_time += memtimer.elapsed_msec();
  
      if(master) {
        #ifdef TIME_DETAIL_1
          std::cout
            << "**                FF kernel time: " << kernel_time << " ms." << std::endl
            << "**               FF compute time: " << computetimer.elapsed_msec() << " ms."
            << std::endl
            << "**         FF memory and IO time: " << mem_time * 1000 << " ms." << std::endl
            << "**            Communication time: " << comm_time * 1000 << " ms." << std::endl
            << "**                 Total FF time: " << maintimer.elapsed_msec() << " ms."
            << std::endl << std::flush;
        #endif // TIME_DETAIL_1

        double mflop = 0.0; real_t gflops = 0.0;

        #ifdef USE_GPU
          // flop count for GPU
          //mflop = (double) nqx * nqy * nqz * (42 * num_triangles + 2) / 1000000;
          mflop = (double) nqx * nqy * nqz * (69 * num_triangles + 52) / 1000000;
        #elif defined USE_MIC
          // flop count for MIC
          //mflop = (double) nqx * nqy * nqz * (78 * num_triangles + 18) / 1000000;
          mflop = (double) nqx * nqy * nqz * (111 * num_triangles + 50) / 1000000;
        #elif defined INTEL_SB_AVX
          // flop count for Sandy Bridge with AVX
          // TODO: recount flops ...
          mflop = (double) nqx * nqy * nqz * (85 * num_triangles + 16) / 1000000;
        #else
          // flop count for SSE3 CPU (hopper)
          // TODO: recount flops ...
          mflop = (double) nqx * nqy * nqz * (68 * num_triangles + 20) / 1000000;
        #endif
        //gflops = nidle_num_procs * mflop / kernel_time;
        gflops = mflop / kernel_time;
        std::cout << "**            Kernel performance: " << gflops << " GFLOPS/s" << std::endl;
      } // if
    #ifdef USE_MPI
      world_comm.free("ff_num_row_comm");
      world_comm.free("ff_num_col_comm");
    } // if
    #endif

    #ifdef USE_MPI
      world_comm.barrier(comm_key);
      world_comm.free(real_world);
    #endif

    #ifdef FINDBLOCK
      } // block_t
      } // block_z
      } // block_y
      } // block_x
    #endif

    return true;
  } // NumericFormFactor::compute()
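For clarity, a small host-only sketch of the 1 x p_y x p_z decomposition that compute() applies to the q-grid: p_y is the floor of the square root of the process count, p_z the quotient (any leftover ranks go idle), the first remainder rows and columns absorb the extra q-points, and each rank's offsets are the exclusive prefix sums that the scan_sum calls followed by the subtraction produce. The names and example sizes below are hypothetical.
  // Sketch only: reproduces compute()'s per-rank block sizes and offsets without MPI.
  #include <algorithm>
  #include <cmath>
  #include <cstdio>

  int main() {
    const int num_procs = 6, nqx = 16, nqy = 10, nqz = 7;   // hypothetical sizes
    int p_y = (int) std::floor(std::sqrt((double) num_procs));
    int p_z = num_procs / p_y;                  // ranks >= p_y * p_z stay idle
    for(int rank = 0; rank < p_y * p_z; ++ rank) {
      int row = rank / p_z, col = rank % p_z;   // row indexes y, column indexes z
      // the first (nqy % p_y) rows and (nqz % p_z) columns get one extra point
      int p_nqy = nqy / p_y + ((row < nqy % p_y) ? 1 : 0);
      int p_nqz = nqz / p_z + ((col < nqz % p_z) ? 1 : 0);
      // exclusive prefix sums, equivalent to scan_sum minus the local block size
      int y_offset = row * (nqy / p_y) + std::min(row, nqy % p_y);
      int z_offset = col * (nqz / p_z) + std::min(col, nqz % p_z);
      std::printf("rank %d: block %d x %d x %d at y = %d, z = %d\n",
                  rank, nqx, p_nqy, p_nqz, y_offset, z_offset);
    } // for
    return 0;
  } // main()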
Example #9
	bool mat_exp(const complex_vec_t& matrix, complex_vec_t& result) {
		result.clear();
		for(complex_vec_t::const_iterator i = matrix.begin(); i != matrix.end(); ++ i)
			result.push_back(exp(*i));
		return true;
	} // mat_exp()