void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
                               const int nall, Atom<numtyp,acctyp> &atom, 
                               double *sublo, double *subhi, int *tag, 
                               int **nspecial, int **special, bool &success,
                               int &mn) {
  _nbor_time_avail=true;
  const int nt=inum+host_inum;

  // Calculate number of cells and allocate storage for binning as necessary
  int ncellx, ncelly, ncellz, ncell_3d;
  ncellx = static_cast<int>(ceil(((subhi[0] - sublo[0]) +
                                  2.0*_cell_size)/_cell_size));
  ncelly = static_cast<int>(ceil(((subhi[1] - sublo[1]) +
                                  2.0*_cell_size)/_cell_size));
  ncellz = static_cast<int>(ceil(((subhi[2] - sublo[2]) +
                                  2.0*_cell_size)/_cell_size));
  ncell_3d = ncellx * ncelly * ncellz;
  if (ncell_3d+1>_ncells) {
    dev_cell_counts.clear();
    dev_cell_counts.alloc(ncell_3d+1,dev_nbor);
    if (_gpu_nbor==2) {
      if (_ncells>0) {
        host_cell_counts.clear();
        delete [] cell_iter;
      }
      cell_iter = new int[ncell_3d+1];
      host_cell_counts.alloc(ncell_3d+1,dev_nbor);
    }
    _ncells=ncell_3d+1;
    _cell_bytes=dev_cell_counts.row_bytes();
  }

  const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);

  if (_maxspecial>0) {
    time_nbor.start();
    UCL_H_Vec<int> view_nspecial, view_special, view_tag;
    view_nspecial.view(nspecial[0],nt*3,*dev);
    view_special.view(special[0],nt*_maxspecial,*dev);
    view_tag.view(tag,nall,*dev);
    ucl_copy(dev_nspecial,view_nspecial,nt*3,false);
    ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);
    ucl_copy(atom.dev_tag,view_tag,nall,false);
    time_nbor.stop();
    if (_time_device)
      time_nbor.add_to_total();
    time_transpose.start();
    const int b2x=_block_cell_2d;
    const int b2y=_block_cell_2d;
    const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
    const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
    _shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
    _shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
                             &_maxspecial,&nt);        
    time_transpose.stop();
  }
  
  // If binning on CPU, do this now
  if (_gpu_nbor==2) {
    double stime = MPI_Wtime();
    int *cell_id=atom.host_cell_id.begin();
    int *particle_id=atom.host_particle_id.begin();
    
    // Build cell list on CPU                               
    host_cell_counts.zero();
    double m_cell_size=-_cell_size;
    double dx=subhi[0]-sublo[0]+_cell_size;
    double dy=subhi[1]-sublo[1]+_cell_size;
    double dz=subhi[2]-sublo[2]+_cell_size;

    for (int i=0; i<nall; i++) {
      double px, py, pz;
      px=x[i][0]-sublo[0];
      py=x[i][1]-sublo[1];
      pz=x[i][2]-sublo[2];
      if (px<m_cell_size) px=m_cell_size;
      if (py<m_cell_size) py=m_cell_size;
      if (pz<m_cell_size) pz=m_cell_size;
      if (px>dx) px=dx;            
      if (py>dy) py=dy;            
      if (pz>dz) pz=dz;            
    
      int id=static_cast<int>(px/_cell_size + 1.0) + 
             static_cast<int>(py/_cell_size + 1.0) * ncellx +
             static_cast<int>(pz/_cell_size + 1.0) * ncellx * ncelly;
    
      cell_id[i]=id;
      host_cell_counts[id+1]++;
    }
    
    mn=0;
    for (int i=0; i<_ncells; i++)
      mn=std::max(mn,host_cell_counts[i]);
    mn*=8;
    set_nbor_block_size(mn/2);

    resize_max_neighbors<numtyp,acctyp>(mn,success);
    if (!success)
      return;
    _total_atoms=nt;

    cell_iter[0]=0;
    for (int i=1; i<_ncells; i++) {
      host_cell_counts[i]+=host_cell_counts[i-1];
      cell_iter[i]=host_cell_counts[i];
    }
    time_hybrid1.start();
    ucl_copy(dev_cell_counts,host_cell_counts,true);
    time_hybrid1.stop();
    for (int i=0; i<nall; i++) {
      int celli=cell_id[i];
      int ploc=cell_iter[celli];
      cell_iter[celli]++;
      particle_id[ploc]=i;
    }
    time_hybrid2.start();
    ucl_copy(atom.dev_particle_id,atom.host_particle_id,true);
    time_hybrid2.stop();
    _bin_time+=MPI_Wtime()-stime;
  }        

  time_kernel.start();

  _nbor_pitch=inum;
  _shared->neigh_tex.bind_float(atom.dev_x,4);

  // If binning on GPU, do this now
  if (_gpu_nbor==1) {
    const int neigh_block=_block_cell_id;
    const int GX=(int)ceil((float)nall/neigh_block);
    const numtyp sublo0=static_cast<numtyp>(sublo[0]);
    const numtyp sublo1=static_cast<numtyp>(sublo[1]);
    const numtyp sublo2=static_cast<numtyp>(sublo[2]);
    const numtyp subhi0=static_cast<numtyp>(subhi[0]);
    const numtyp subhi1=static_cast<numtyp>(subhi[1]);
    const numtyp subhi2=static_cast<numtyp>(subhi[2]);
    _shared->k_cell_id.set_size(GX,neigh_block);
    _shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(), 
                           &atom.dev_particle_id.begin(),
    				               &sublo0, &sublo1, &sublo2, &subhi0, &subhi1, 
    				               &subhi2, &cell_size_cast, &ncellx, &ncelly, &nall);

    atom.sort_neighbor(nall);

    /* calculate cell count */
    _shared->k_cell_counts.set_size(GX,neigh_block);
    _shared->k_cell_counts.run(&atom.dev_cell_id.begin(), 
                               &dev_cell_counts.begin(), &nall, &ncell_3d);
  } 
  
  /* build the neighbor list */
  const int cell_block=_block_nbor_build;
  _shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
  _shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
                            &dev_cell_counts.begin(), &dev_nbor.begin(),
                            &dev_host_nbor.begin(), &dev_host_numj.begin(),
                            &_max_nbors,&cell_size_cast,
                            &ncellx, &ncelly, &ncellz, &inum, &nt, &nall,
                            &_threads_per_atom);

  /* Get the maximum number of nbors and realloc if necessary */
  UCL_D_Vec<int> numj;
  numj.view_offset(inum,dev_nbor,inum);
  ucl_copy(host_acc,numj,inum,true);
  if (nt>inum) {
    UCL_H_Vec<int> host_offset;
    host_offset.view_offset(inum,host_acc,nt-inum);
    ucl_copy(host_offset,dev_host_numj,nt-inum,true);
  }
  
  if (_gpu_nbor!=2) {
    host_acc.sync();
    mn=host_acc[0];
    for (int i=1; i<nt; i++)
      mn=std::max(mn,host_acc[i]);
    set_nbor_block_size(mn);

    if (mn>_max_nbors) {  
      resize_max_neighbors<numtyp,acctyp>(mn,success);
      if (!success)
        return;
      time_kernel.stop();
      if (_time_device)
        time_kernel.add_to_total();
      build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag,
                      nspecial, special, success, mn);
      return;
    }
  }
  
  if (_maxspecial>0) {
    const int GX2=static_cast<int>(ceil(static_cast<double>
                                          (nt*_threads_per_atom)/cell_block));
    _shared->k_special.set_size(GX2,cell_block);
    _shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(), 
                           &dev_host_numj.begin(), &atom.dev_tag.begin(), 
                           &dev_nspecial.begin(), &dev_special.begin(), 
                           &inum, &nt, &_max_nbors, &_threads_per_atom);
  }
  time_kernel.stop();

  time_nbor.start();
  if (inum<nt) {
    ucl_copy(host_nbor,dev_host_nbor,true);
    host_nbor.sync();
  }
  time_nbor.stop();
}
// Example #2
// 0
void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
                                  const int nall, 
                                  PairGPUAtom<numtyp,acctyp> &atom, 
                                  double *sublo, double *subhi, int *tag, 
                                  int **nspecial, int **special, bool &success,
                                  int &mn) {
  const int nt=inum+host_inum;
  if (_maxspecial>0) {
    time_nbor.start();
    UCL_H_Vec<int> view_nspecial, view_special, view_tag;
    view_nspecial.view(nspecial[0],nt*3,*dev);
    view_special.view(special[0],nt*_maxspecial,*dev);
    view_tag.view(tag,nall,*dev);
    ucl_copy(dev_nspecial,view_nspecial,nt*3,false);
    ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);
    ucl_copy(atom.dev_tag,view_tag,nall,false);
    time_nbor.stop();
    time_nbor.add_to_total();
    time_kernel.start();
    const int b2x=_block_cell_2d;
    const int b2y=_block_cell_2d;
    const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
    const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
    _shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
    _shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
                             &_maxspecial,&nt);        
  } else
    time_kernel.start();

  _nbor_pitch=inum;
  _shared->neigh_tex.bind_float(atom.dev_x,4);

  int ncellx, ncelly, ncellz, ncell_3d;
  ncellx = static_cast<int>(ceil(((subhi[0] - sublo[0]) +
                                  2.0*_cell_size)/_cell_size));
  ncelly = static_cast<int>(ceil(((subhi[1] - sublo[1]) +
                                  2.0*_cell_size)/_cell_size));
  ncellz = static_cast<int>(ceil(((subhi[2] - sublo[2]) +
                                  2.0*_cell_size)/_cell_size));
  ncell_3d = ncellx * ncelly * ncellz;
  UCL_D_Vec<int> cell_counts;
  cell_counts.alloc(ncell_3d+1,dev_nbor);
  _cell_bytes=cell_counts.row_bytes();

  /* build cell list on GPU */
  const int neigh_block=_block_cell_id;
  const int GX=(int)ceil((float)nall/neigh_block);
  const numtyp sublo0=static_cast<numtyp>(sublo[0]);
  const numtyp sublo1=static_cast<numtyp>(sublo[1]);
  const numtyp sublo2=static_cast<numtyp>(sublo[2]);
  const numtyp subhi0=static_cast<numtyp>(subhi[0]);
  const numtyp subhi1=static_cast<numtyp>(subhi[1]);
  const numtyp subhi2=static_cast<numtyp>(subhi[2]);
  const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
  _shared->k_cell_id.set_size(GX,neigh_block);
  _shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(), 
                         &atom.dev_particle_id.begin(),
  				               &sublo0, &sublo1, &sublo2, &subhi0, &subhi1, 
  				               &subhi2, &cell_size_cast, &ncellx, &ncelly, &nall);

  atom.sort_neighbor(nall);

  /* calculate cell count */
  _shared->k_cell_counts.set_size(GX,neigh_block);
  _shared->k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(), 
                             &nall, &ncell_3d);

  /* build the neighbor list */
  const int cell_block=_block_nbor_build;
  _shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
  _shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
                            &cell_counts.begin(), &dev_nbor.begin(),
                            &dev_host_nbor.begin(), &dev_host_numj.begin(),
                            &_max_nbors,&cell_size_cast,
                            &ncellx, &ncelly, &ncellz, &inum, &nt, &nall);

  /* Get the maximum number of nbors and realloc if necessary */
  UCL_D_Vec<int> numj;
  numj.view_offset(inum,dev_nbor,inum);
  ucl_copy(host_acc,numj,inum,false);
  if (nt>inum) {
    UCL_H_Vec<int> host_offset;
    host_offset.view_offset(inum,host_acc,nt-inum);
    ucl_copy(host_offset,dev_host_numj,nt-inum,false);
  }
  mn=host_acc[0];
  for (int i=1; i<nt; i++)
    mn=std::max(mn,host_acc[i]);

  if (mn>_max_nbors) {  
    mn=static_cast<int>(static_cast<double>(mn)*1.10);
    dev_nbor.clear();
    success=success && (dev_nbor.alloc((mn+1)*_max_atoms,atom.dev_cell_id,
                        UCL_READ_ONLY)==UCL_SUCCESS);
    _gpu_bytes=dev_nbor.row_bytes();
    if (_max_host>0) {
      host_nbor.clear();
      dev_host_nbor.clear();
      success=success && (host_nbor.alloc(mn*_max_host,dev_nbor,
                                          UCL_RW_OPTIMIZED)==UCL_SUCCESS);
      success=success && (dev_host_nbor.alloc(mn*_max_host,
                                        dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS);
      int *ptr=host_nbor.begin();
      for (int i=0; i<_max_host; i++) {
        host_jlist[i]=ptr;
        ptr+=mn;
      }                                                 
      _gpu_bytes+=dev_host_nbor.row_bytes();
    }
    if (_alloc_packed) {
      dev_packed.clear();
      success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
                                           UCL_READ_ONLY)==UCL_SUCCESS);
      _gpu_bytes+=dev_packed.row_bytes();
    }
    if (!success)
      return;
    _max_nbors=mn;
    time_kernel.stop();
    time_kernel.add_to_total();
    build_nbor_list(inum, host_inum, nall, atom, sublo, subhi, tag, nspecial,
                    special, success, mn);
    return;
  }
  
  if (_maxspecial>0) {
    const int GX2=static_cast<int>(ceil(static_cast<double>(nt)/cell_block));
    _shared->k_special.set_size(GX2,cell_block);
    _shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(), 
                           &dev_host_numj.begin(), &atom.dev_tag.begin(), 
                           &dev_nspecial.begin(), &dev_special.begin(), 
                           &inum, &nt, &_max_nbors);
  }
  time_kernel.stop();

  time_nbor.start();
  if (_gpu_host)
    ucl_copy(host_nbor,dev_host_nbor,false);
  time_nbor.stop();
}