void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, const int nall, Atom<numtyp,acctyp> &atom, double *sublo, double *subhi, int *tag, int **nspecial, int **special, bool &success, int &mn) { _nbor_time_avail=true; const int nt=inum+host_inum; // Calculate number of cells and allocate storage for binning as necessary int ncellx, ncelly, ncellz, ncell_3d; ncellx = static_cast<int>(ceil(((subhi[0] - sublo[0]) + 2.0*_cell_size)/_cell_size)); ncelly = static_cast<int>(ceil(((subhi[1] - sublo[1]) + 2.0*_cell_size)/_cell_size)); ncellz = static_cast<int>(ceil(((subhi[2] - sublo[2]) + 2.0*_cell_size)/_cell_size)); ncell_3d = ncellx * ncelly * ncellz; if (ncell_3d+1>_ncells) { dev_cell_counts.clear(); dev_cell_counts.alloc(ncell_3d+1,dev_nbor); if (_gpu_nbor==2) { if (_ncells>0) { host_cell_counts.clear(); delete [] cell_iter; } cell_iter = new int[ncell_3d+1]; host_cell_counts.alloc(ncell_3d+1,dev_nbor); } _ncells=ncell_3d+1; _cell_bytes=dev_cell_counts.row_bytes(); } const numtyp cell_size_cast=static_cast<numtyp>(_cell_size); if (_maxspecial>0) { time_nbor.start(); UCL_H_Vec<int> view_nspecial, view_special, view_tag; view_nspecial.view(nspecial[0],nt*3,*dev); view_special.view(special[0],nt*_maxspecial,*dev); view_tag.view(tag,nall,*dev); ucl_copy(dev_nspecial,view_nspecial,nt*3,false); ucl_copy(dev_special_t,view_special,nt*_maxspecial,false); ucl_copy(atom.dev_tag,view_tag,nall,false); time_nbor.stop(); if (_time_device) time_nbor.add_to_total(); time_transpose.start(); const int b2x=_block_cell_2d; const int b2y=_block_cell_2d; const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x)); const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y)); _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); _shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(), &_maxspecial,&nt); time_transpose.stop(); } // If binning on CPU, do this now if (_gpu_nbor==2) { double stime = MPI_Wtime(); int *cell_id=atom.host_cell_id.begin(); int *particle_id=atom.host_particle_id.begin(); // Build cell list on CPU host_cell_counts.zero(); double m_cell_size=-_cell_size; double dx=subhi[0]-sublo[0]+_cell_size; double dy=subhi[1]-sublo[1]+_cell_size; double dz=subhi[2]-sublo[2]+_cell_size; for (int i=0; i<nall; i++) { double px, py, pz; px=x[i][0]-sublo[0]; py=x[i][1]-sublo[1]; pz=x[i][2]-sublo[2]; if (px<m_cell_size) px=m_cell_size; if (py<m_cell_size) py=m_cell_size; if (pz<m_cell_size) pz=m_cell_size; if (px>dx) px=dx; if (py>dy) py=dy; if (pz>dz) pz=dz; int id=static_cast<int>(px/_cell_size + 1.0) + static_cast<int>(py/_cell_size + 1.0) * ncellx + static_cast<int>(pz/_cell_size + 1.0) * ncellx * ncelly; cell_id[i]=id; host_cell_counts[id+1]++; } mn=0; for (int i=0; i<_ncells; i++) mn=std::max(mn,host_cell_counts[i]); mn*=8; set_nbor_block_size(mn/2); resize_max_neighbors<numtyp,acctyp>(mn,success); if (!success) return; _total_atoms=nt; cell_iter[0]=0; for (int i=1; i<_ncells; i++) { host_cell_counts[i]+=host_cell_counts[i-1]; cell_iter[i]=host_cell_counts[i]; } time_hybrid1.start(); ucl_copy(dev_cell_counts,host_cell_counts,true); time_hybrid1.stop(); for (int i=0; i<nall; i++) { int celli=cell_id[i]; int ploc=cell_iter[celli]; cell_iter[celli]++; particle_id[ploc]=i; } time_hybrid2.start(); ucl_copy(atom.dev_particle_id,atom.host_particle_id,true); time_hybrid2.stop(); _bin_time+=MPI_Wtime()-stime; } time_kernel.start(); _nbor_pitch=inum; _shared->neigh_tex.bind_float(atom.dev_x,4); // If binning on GPU, do this now if (_gpu_nbor==1) { const int neigh_block=_block_cell_id; const int GX=(int)ceil((float)nall/neigh_block); const numtyp sublo0=static_cast<numtyp>(sublo[0]); const numtyp sublo1=static_cast<numtyp>(sublo[1]); const numtyp sublo2=static_cast<numtyp>(sublo[2]); const numtyp subhi0=static_cast<numtyp>(subhi[0]); const numtyp subhi1=static_cast<numtyp>(subhi[1]); const numtyp subhi2=static_cast<numtyp>(subhi[2]); _shared->k_cell_id.set_size(GX,neigh_block); _shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(), &atom.dev_particle_id.begin(), &sublo0, &sublo1, &sublo2, &subhi0, &subhi1, &subhi2, &cell_size_cast, &ncellx, &ncelly, &nall); atom.sort_neighbor(nall); /* calculate cell count */ _shared->k_cell_counts.set_size(GX,neigh_block); _shared->k_cell_counts.run(&atom.dev_cell_id.begin(), &dev_cell_counts.begin(), &nall, &ncell_3d); } /* build the neighbor list */ const int cell_block=_block_nbor_build; _shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1); _shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(), &dev_cell_counts.begin(), &dev_nbor.begin(), &dev_host_nbor.begin(), &dev_host_numj.begin(), &_max_nbors,&cell_size_cast, &ncellx, &ncelly, &ncellz, &inum, &nt, &nall, &_threads_per_atom); /* Get the maximum number of nbors and realloc if necessary */ UCL_D_Vec<int> numj; numj.view_offset(inum,dev_nbor,inum); ucl_copy(host_acc,numj,inum,true); if (nt>inum) { UCL_H_Vec<int> host_offset; host_offset.view_offset(inum,host_acc,nt-inum); ucl_copy(host_offset,dev_host_numj,nt-inum,true); } if (_gpu_nbor!=2) { host_acc.sync(); mn=host_acc[0]; for (int i=1; i<nt; i++) mn=std::max(mn,host_acc[i]); set_nbor_block_size(mn); if (mn>_max_nbors) { resize_max_neighbors<numtyp,acctyp>(mn,success); if (!success) return; time_kernel.stop(); if (_time_device) time_kernel.add_to_total(); build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag, nspecial, special, success, mn); return; } } if (_maxspecial>0) { const int GX2=static_cast<int>(ceil(static_cast<double> (nt*_threads_per_atom)/cell_block)); _shared->k_special.set_size(GX2,cell_block); _shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(), &dev_host_numj.begin(), &atom.dev_tag.begin(), &dev_nspecial.begin(), &dev_special.begin(), &inum, &nt, &_max_nbors, &_threads_per_atom); } time_kernel.stop(); time_nbor.start(); if (inum<nt) { ucl_copy(host_nbor,dev_host_nbor,true); host_nbor.sync(); } time_nbor.stop(); }
void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, const int nall, PairGPUAtom<numtyp,acctyp> &atom, double *sublo, double *subhi, int *tag, int **nspecial, int **special, bool &success, int &mn) { const int nt=inum+host_inum; if (_maxspecial>0) { time_nbor.start(); UCL_H_Vec<int> view_nspecial, view_special, view_tag; view_nspecial.view(nspecial[0],nt*3,*dev); view_special.view(special[0],nt*_maxspecial,*dev); view_tag.view(tag,nall,*dev); ucl_copy(dev_nspecial,view_nspecial,nt*3,false); ucl_copy(dev_special_t,view_special,nt*_maxspecial,false); ucl_copy(atom.dev_tag,view_tag,nall,false); time_nbor.stop(); time_nbor.add_to_total(); time_kernel.start(); const int b2x=_block_cell_2d; const int b2y=_block_cell_2d; const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x)); const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y)); _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); _shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(), &_maxspecial,&nt); } else time_kernel.start(); _nbor_pitch=inum; _shared->neigh_tex.bind_float(atom.dev_x,4); int ncellx, ncelly, ncellz, ncell_3d; ncellx = static_cast<int>(ceil(((subhi[0] - sublo[0]) + 2.0*_cell_size)/_cell_size)); ncelly = static_cast<int>(ceil(((subhi[1] - sublo[1]) + 2.0*_cell_size)/_cell_size)); ncellz = static_cast<int>(ceil(((subhi[2] - sublo[2]) + 2.0*_cell_size)/_cell_size)); ncell_3d = ncellx * ncelly * ncellz; UCL_D_Vec<int> cell_counts; cell_counts.alloc(ncell_3d+1,dev_nbor); _cell_bytes=cell_counts.row_bytes(); /* build cell list on GPU */ const int neigh_block=_block_cell_id; const int GX=(int)ceil((float)nall/neigh_block); const numtyp sublo0=static_cast<numtyp>(sublo[0]); const numtyp sublo1=static_cast<numtyp>(sublo[1]); const numtyp sublo2=static_cast<numtyp>(sublo[2]); const numtyp subhi0=static_cast<numtyp>(subhi[0]); const numtyp subhi1=static_cast<numtyp>(subhi[1]); const numtyp subhi2=static_cast<numtyp>(subhi[2]); const numtyp cell_size_cast=static_cast<numtyp>(_cell_size); _shared->k_cell_id.set_size(GX,neigh_block); _shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(), &atom.dev_particle_id.begin(), &sublo0, &sublo1, &sublo2, &subhi0, &subhi1, &subhi2, &cell_size_cast, &ncellx, &ncelly, &nall); atom.sort_neighbor(nall); /* calculate cell count */ _shared->k_cell_counts.set_size(GX,neigh_block); _shared->k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(), &nall, &ncell_3d); /* build the neighbor list */ const int cell_block=_block_nbor_build; _shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1); _shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(), &cell_counts.begin(), &dev_nbor.begin(), &dev_host_nbor.begin(), &dev_host_numj.begin(), &_max_nbors,&cell_size_cast, &ncellx, &ncelly, &ncellz, &inum, &nt, &nall); /* Get the maximum number of nbors and realloc if necessary */ UCL_D_Vec<int> numj; numj.view_offset(inum,dev_nbor,inum); ucl_copy(host_acc,numj,inum,false); if (nt>inum) { UCL_H_Vec<int> host_offset; host_offset.view_offset(inum,host_acc,nt-inum); ucl_copy(host_offset,dev_host_numj,nt-inum,false); } mn=host_acc[0]; for (int i=1; i<nt; i++) mn=std::max(mn,host_acc[i]); if (mn>_max_nbors) { mn=static_cast<int>(static_cast<double>(mn)*1.10); dev_nbor.clear(); success=success && (dev_nbor.alloc((mn+1)*_max_atoms,atom.dev_cell_id, UCL_READ_ONLY)==UCL_SUCCESS); _gpu_bytes=dev_nbor.row_bytes(); if (_max_host>0) { host_nbor.clear(); dev_host_nbor.clear(); success=success && (host_nbor.alloc(mn*_max_host,dev_nbor, UCL_RW_OPTIMIZED)==UCL_SUCCESS); success=success && (dev_host_nbor.alloc(mn*_max_host, dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS); int *ptr=host_nbor.begin(); for (int i=0; i<_max_host; i++) { host_jlist[i]=ptr; ptr+=mn; } _gpu_bytes+=dev_host_nbor.row_bytes(); } if (_alloc_packed) { dev_packed.clear(); success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); _gpu_bytes+=dev_packed.row_bytes(); } if (!success) return; _max_nbors=mn; time_kernel.stop(); time_kernel.add_to_total(); build_nbor_list(inum, host_inum, nall, atom, sublo, subhi, tag, nspecial, special, success, mn); return; } if (_maxspecial>0) { const int GX2=static_cast<int>(ceil(static_cast<double>(nt)/cell_block)); _shared->k_special.set_size(GX2,cell_block); _shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(), &dev_host_numj.begin(), &atom.dev_tag.begin(), &dev_nspecial.begin(), &dev_special.begin(), &inum, &nt, &_max_nbors); } time_kernel.stop(); time_nbor.start(); if (_gpu_host) ucl_copy(host_nbor,dev_host_nbor,false); time_nbor.stop(); }