sparseStatus_t sparseEngine_d::LoadKernel(sparsePrec_t prec,
	sparseEngine_d::Kernel** ppKernel) {

	// Check if the requested kernel is available, and if not, load it.
	int p = (int)prec;
	if(!multiply[p].get()) {
		std::auto_ptr<Kernel> k(new Kernel);

		std::string filename = kernelPath + "spmxv_" + PrecNames[p] + ".cubin";
		CUresult result = context->LoadModuleFilename(filename, &k->module);
		if(CUDA_SUCCESS != result) return SPARSE_STATUS_KERNEL_NOT_FOUND;

		// Load the five SpMxV kernels for different valuesPerThread counts.
		for(int i(0); i < NumVT; ++i) {
			std::ostringstream oss;
			oss<< "SpMxV_"<< ValuesPerThread[i];
			result = k->module->GetFunction(oss.str(),
				make_int3(BlockSize, 1, 1), &k->func[i]);
			if(CUDA_SUCCESS != result) return SPARSE_STATUS_KERNEL_ERROR;
		}

		// Load the finalize function.
		result = k->module->GetFunction("Finalize",
			make_int3(BlockSize, 1, 1), &k->finalize);
		if(CUDA_SUCCESS != result) return SPARSE_STATUS_KERNEL_ERROR;

		// Cache the texture reference.
		result = cuModuleGetTexRef(&k->xVec_texture, k->module->Handle(),
			"xVec_texture");
		if(CUDA_SUCCESS != result) return SPARSE_STATUS_KERNEL_ERROR;

		result = cuTexRefSetFlags(k->xVec_texture, CU_TRSF_READ_AS_INTEGER);
		if(CUDA_SUCCESS != result) return SPARSE_STATUS_KERNEL_ERROR;

		result = cuTexRefSetFormat(k->xVec_texture, PrecTerms[p].vecFormat,
			PrecTerms[p].vecChannels);
		if(CUDA_SUCCESS != result) return SPARSE_STATUS_KERNEL_ERROR;

		multiply[p] = k;
	}
	*ppKernel = multiply[p].get();
	return SPARSE_STATUS_SUCCESS;
}
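A minimal calling sketch, assuming a double-precision enumerator named SPARSE_PREC_REAL8 (a hypothetical name for illustration; check the actual sparsePrec_t definition):

	sparseEngine_d::Kernel* kernel = 0;
	sparseStatus_t status = engine->LoadKernel(SPARSE_PREC_REAL8, &kernel);
	if(SPARSE_STATUS_SUCCESS != status) {
		// Module load or function lookup failed; kernel is still null.
		return status;
	}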
float interpolate_tricubic_simple(float* tex, float3 coord, uint3 volumeExtent)
{
	// transform the coordinate from [0,extent] to [-0.5, extent-0.5]
	const float3 coord_grid = coord - 0.5;
	float3 indexF = floor(coord_grid);
	const float3 fraction = coord_grid - indexF;
	int3 index = make_int3((int)indexF.x, (int)indexF.y, (int)indexF.z);
	//index = index + 0.5;  //move from [-0.5, extent-0.5] to [0, extent]

	float result = 0.0;
	for (int z = -1; z <= 2; z++)  //range [-1, 2]
	{
		float bsplineZ = bspline(z - fraction.z);
		int w = index.z + z;
		for (int y = -1; y <= 2; y++)
		{
			float bsplineYZ = bspline(y - fraction.y) * bsplineZ;
			int v = index.y + y;
			for (int x = -1; x <= 2; x++)
			{
				float bsplineXYZ = bspline(x - fraction.x) * bsplineYZ;
				int u = index.x + x;
				result += bsplineXYZ * tex3D(tex, u, v, w, volumeExtent);
			}
		}
	}
	return result;
}
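The interpolator above calls a bspline() weight function that is not shown here. A minimal sketch of the standard uniform cubic B-spline basis it presumably implements (an assumption, not code from this source):

	float bspline(float t)
	{
		t = fabsf(t);
		// |t| < 1: (4 - 6t^2 + 3t^3) / 6
		if (t < 1.0f) return 2.0f/3.0f - 0.5f*t*t*(2.0f - t);
		// 1 <= |t| < 2: (2 - |t|)^3 / 6
		if (t < 2.0f) { float a = 2.0f - t; return a*a*a/6.0f; }
		return 0.0f;
	}

For any fraction in [0,1), the four weights produced at offsets {-1, 0, 1, 2} sum to one, so the 64-tap loop reproduces a constant volume exactly.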
/**
* read parameters from file
*/
void HoomdPeriodicExternal::readParameters(std::istream &in)
{
	// Read parameters
	prefactor_.allocate(nAtomType_);
	readDArray<double>(in, "prefactor", prefactor_, nAtomType_);
	read<double>(in, "externalParameter", externalParameter_);
	waveIntVectors_.allocate(3);
	readDArray<IntVector>(in, "waveIntVectors", waveIntVectors_, 3);
	read<double>(in, "interfaceWidth", interfaceWidth_);
	read<int>(in, "periodicity", periodicity_);

	for (int i = 0; i < nAtomType_; ++i) {
		params_[i].order_parameter = Scalar(prefactor_[i]*externalParameter_);
		params_[i].lattice_vector_1 = make_int3(waveIntVectors_[0][0],
			waveIntVectors_[0][1], waveIntVectors_[0][2]);
		params_[i].lattice_vector_2 = make_int3(waveIntVectors_[1][0],
			waveIntVectors_[1][1], waveIntVectors_[1][2]);
		params_[i].lattice_vector_3 = make_int3(waveIntVectors_[2][0],
			waveIntVectors_[2][1], waveIntVectors_[2][2]);
		params_[i].interface_width = Scalar(interfaceWidth_);
		params_[i].periodicity = periodicity_;
	}
}
bool LoadFunctions(const char* name, int numThreads, CuModule* module,
	FunctionPtr functions[6]) {

	// Load the six kernels named <name>_1 through <name>_6.
	for(int bits(1); bits <= 6; ++bits) {
		std::ostringstream oss;
		oss<< name<< "_"<< bits;
		CUresult result = module->GetFunction(oss.str(),
			make_int3(numThreads, 1, 1), &functions[bits - 1]);
		if(CUDA_SUCCESS != result) return false;
	}
	return true;
}
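A hypothetical usage sketch (the kernel base name and thread count are illustrative, not taken from this source):

	FunctionPtr sortFuncs[6];
	if(!LoadFunctions("RadixSort", 128, module, sortFuncs))
		return false;  // one of RadixSort_1 .. RadixSort_6 was missing from the module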
CUresult CreateMaxIndexEngine(const char* cubin,
	std::auto_ptr<MaxIndexEngine>* ppEngine) {

	std::auto_ptr<MaxIndexEngine> e(new MaxIndexEngine);
	CUresult result = AttachCuContext(&e->context);
	if(CUDA_SUCCESS != result) return result;

	if(2 != e->context->Device()->ComputeCapability().first)
		return CUDA_ERROR_INVALID_DEVICE;

	result = e->context->LoadModuleFilename(cubin, &e->module);
	if(CUDA_SUCCESS != result) return result;

	result = e->module->GetFunction("FindMaxIndexLoop", make_int3(256, 1, 1),
		&e->pass1);
	if(CUDA_SUCCESS != result) return result;

	result = e->module->GetFunction("FindMaxIndexReduce", make_int3(256, 1, 1),
		&e->pass2);
	if(CUDA_SUCCESS != result) return result;

	int numSMs = e->context->Device()->Attributes().multiprocessorCount;

	// Launch 6 256-thread blocks per SM.
	e->numBlocks = 6 * numSMs;

	// Allocate an element for each thread block.
	result = e->context->MemAlloc<float>(e->numBlocks, &e->maxMem);
	if(CUDA_SUCCESS != result) return result;

	result = e->context->MemAlloc<uint>(e->numBlocks, &e->indexMem);
	if(CUDA_SUCCESS != result) return result;

	result = e->context->MemAlloc<uint2>(e->numBlocks, &e->rangeMem);
	if(CUDA_SUCCESS != result) return result;

	*ppEngine = e;
	return CUDA_SUCCESS;
}
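A minimal calling sketch (the cubin filename is a placeholder):

	std::auto_ptr<MaxIndexEngine> engine;
	CUresult result = CreateMaxIndexEngine("maxindex.cubin", &engine);
	if(CUDA_SUCCESS != result)
		return result;  // wrong compute capability, missing cubin, or allocation failure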
Distance3i distance_geom3(
	const int3 p, global const int* geometry, global const int3* verts) {
	// Distance3i best = { INT_MAX, (int3)(0, 0, 0) };
	Distance3i best = { INT_MAX, make_int3(0, 0, 0) };
	const int num_tris = geometry[1];
	const Triangle* tris = (Triangle*)(geometry+2);
	for (int j = 0; j < num_tris; ++j) {
		Triangle t = tris[j];
		const int3 tri_verts[3] = {
			(verts[t.s[0]]), (verts[t.s[1]]), (verts[t.s[2]]) };
		int3 set_verts[3];
		const int num_unique = find_unique(tri_verts, set_verts);
		if (num_unique == 3) {
			best = min_pair3i(best, distance_trianglei(p, tri_verts));
		} else {
			// Degenerate triangle
			if (num_unique == 1) {
				// Degenerate to a point
				int3 closest = tri_verts[0];
				float3 closest_d = convert_float3(closest);
				float3 p_d = convert_float3(p);
				int dist = (int)(fast_length(p_d-closest_d)+0.5f);
				best = min_pair3i(best, make_dist3(dist, closest));
			} else {
				// Degenerate to a line
				int3 a = set_verts[0];
				int3 b = set_verts[1];
				Distance3f dist = distance_line3(
					convert_float3(p), convert_float3(a), convert_float3(b));
				Distance3i disti = make_dist3(
					// convert_int_rte(dist.d),
					// convert_int3_rte(dist.p));
					convert_int(dist.d),
					convert_int3(dist.p));
				best = min_pair3i(best, disti);
			}
		}
	}
	return best;
}
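The Distance3i helpers used above are not shown. A plausible sketch, inferred from how they are called (an assumption, not the original utility code): make_dist3 packs a distance/point pair, and min_pair3i keeps the nearer candidate.

	Distance3i make_dist3(int d, int3 p) {
		Distance3i r;
		r.d = d;
		r.p = p;
		return r;
	}

	Distance3i min_pair3i(Distance3i a, Distance3i b) {
		// Keep the candidate with the smaller distance; ties favor a.
		return (b.d < a.d) ? b : a;
	}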
/* Finds the closest distance to a set of geometries */
PointAndLabel3 distance_geoms3(
	const int3 p_, global const int* geometries,
	global const int3* verts, global const int* verts_offsets) {
	const int3 p = p_;
	Distance3i min_dist = make_dist3(INT_MAX, make_int3(0, 0, 0));
	int idx = -1;
	for (int i = 0; i < geometries[0]; ++i) {
		int offset = geometries[i];
		const Distance3i dist = distance_geom3(
			p, geometries+offset, verts+verts_offsets[geometries[offset]]);
		if (dist.d < min_dist.d) {
			min_dist = dist;
			idx = i;
		}
	}
	// idx, idx_offset are correct. Must be the actual point from
	// a higher-level dist function
	int offset = geometries[idx];
	const int label = geometries[offset];
	PointAndLabel3 pl = { min_dist.p, label };
	return pl;
}
BVHSpatialSplit::BVHSpatialSplit(const BVHBuild& builder,
                                 BVHSpatialStorage *storage,
                                 const BVHRange& range,
                                 vector<BVHReference> *references,
                                 float nodeSAH)
: sah(FLT_MAX),
  dim(0),
  pos(0.0f),
  storage_(storage),
  references_(references)
{
	/* initialize bins. */
	float3 origin = range.bounds().min;
	float3 binSize = (range.bounds().max - origin) *
	                 (1.0f / (float)BVHParams::NUM_SPATIAL_BINS);
	float3 invBinSize = 1.0f / binSize;

	for(int dim = 0; dim < 3; dim++) {
		for(int i = 0; i < BVHParams::NUM_SPATIAL_BINS; i++) {
			BVHSpatialBin& bin = storage_->bins[dim][i];

			bin.bounds = BoundBox::empty;
			bin.enter = 0;
			bin.exit = 0;
		}
	}

	/* chop references into bins. */
	for(unsigned int refIdx = range.start(); refIdx < range.end(); refIdx++) {
		const BVHReference& ref = references_->at(refIdx);
		float3 firstBinf = (ref.bounds().min - origin) * invBinSize;
		float3 lastBinf = (ref.bounds().max - origin) * invBinSize;
		int3 firstBin = make_int3((int)firstBinf.x, (int)firstBinf.y, (int)firstBinf.z);
		int3 lastBin = make_int3((int)lastBinf.x, (int)lastBinf.y, (int)lastBinf.z);

		firstBin = clamp(firstBin, 0, BVHParams::NUM_SPATIAL_BINS - 1);
		lastBin = clamp(lastBin, firstBin, BVHParams::NUM_SPATIAL_BINS - 1);

		for(int dim = 0; dim < 3; dim++) {
			BVHReference currRef = ref;

			for(int i = firstBin[dim]; i < lastBin[dim]; i++) {
				BVHReference leftRef, rightRef;

				split_reference(builder, leftRef, rightRef, currRef, dim,
				                origin[dim] + binSize[dim] * (float)(i + 1));
				storage_->bins[dim][i].bounds.grow(leftRef.bounds());
				currRef = rightRef;
			}

			storage_->bins[dim][lastBin[dim]].bounds.grow(currRef.bounds());
			storage_->bins[dim][firstBin[dim]].enter++;
			storage_->bins[dim][lastBin[dim]].exit++;
		}
	}

	/* select best split plane. */
	for(int dim = 0; dim < 3; dim++) {
		/* sweep right to left and determine bounds. */
		BoundBox right_bounds = BoundBox::empty;
		storage_->right_bounds.resize(BVHParams::NUM_SPATIAL_BINS);
		for(int i = BVHParams::NUM_SPATIAL_BINS - 1; i > 0; i--) {
			right_bounds.grow(storage_->bins[dim][i].bounds);
			storage_->right_bounds[i - 1] = right_bounds;
		}

		/* sweep left to right and select lowest SAH. */
		BoundBox left_bounds = BoundBox::empty;
		int leftNum = 0;
		int rightNum = range.size();

		for(int i = 1; i < BVHParams::NUM_SPATIAL_BINS; i++) {
			left_bounds.grow(storage_->bins[dim][i - 1].bounds);
			leftNum += storage_->bins[dim][i - 1].enter;
			rightNum -= storage_->bins[dim][i - 1].exit;

			float sah = nodeSAH +
			            left_bounds.safe_area() * builder.params.primitive_cost(leftNum) +
			            storage_->right_bounds[i - 1].safe_area() * builder.params.primitive_cost(rightNum);

			if(sah < this->sah) {
				this->sah = sah;
				this->dim = dim;
				this->pos = origin[dim] + binSize[dim] * (float)i;
			}
		}
	}
}
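The binning code above clamps int3 bin indices componentwise, with both a scalar and a vector lower bound. A minimal sketch of the two clamp overloads it assumes (not the original utility code):

	int3 clamp(const int3& v, int lo, int hi)
	{
		return make_int3(max(lo, min(v.x, hi)),
		                 max(lo, min(v.y, hi)),
		                 max(lo, min(v.z, hi)));
	}

	int3 clamp(const int3& v, const int3& lo, int hi)
	{
		// Per-component lower bound: used to force lastBin >= firstBin.
		return make_int3(max(lo.x, min(v.x, hi)),
		                 max(lo.y, min(v.y, hi)),
		                 max(lo.z, min(v.z, hi)));
	}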
// Generate one single mesh from several ppm files.
bool SaveMeshFromPXMs(
	std::string sDirName,
	std::string sBBFileHead,
	int3 nVolRes,
	int nGridRes,
	std::vector<std::string> vfilename,
	std::string sMeshFileName)
{
	printf("\n---- [Kangaroo/SaveMeshFromPXMs] Start.\n");

	// 1 ---------------------------------------------------------------------------
	// read all grid sdf and sort them into volumes. vVolume index is global index
	std::vector<SingleVolume> vVolumes = GetFilesNeedSaving(vfilename);

	if(vVolumes.size()<=0)
	{
		printf("[Kangaroo/SaveMeshFromPXMs] Cannot find any files for generating the mesh!\n");
		return false;
	}

	// prepare data structure for the single mesh
	MarchingCUBERst ObjMesh;

	// 2 ---------------------------------------------------------------------------
	// For each global volume we have, gen mesh with it
	int nTotalSaveGridNum = 0;

	for(unsigned int i=0; i!=vVolumes.size(); i++)
	{
		std::cout<<"[Kangaroo/SaveMeshFromPXMs] Merging grids in global bb area ("<<
		           std::to_string(vVolumes[i].GlobalIndex.x)<<","<<
		           std::to_string(vVolumes[i].GlobalIndex.y)<<","<<
		           std::to_string(vVolumes[i].GlobalIndex.z)<<")"<<std::endl;

		int nSingleLoopSaveGridNum = 0;

		// load the corresponding bounding box
		std::string sBBFileName =
		    sDirName + sBBFileHead +
		    std::to_string(vVolumes[i].GlobalIndex.x) + "#" +
		    std::to_string(vVolumes[i].GlobalIndex.y) + "#" +
		    std::to_string(vVolumes[i].GlobalIndex.z);

		if( CheckIfBBfileExist(sBBFileName) )
		{
			// 1, --------------------------------------------------------------------
			// load the bounding box of the sdf.
			// NOTICE that this is the GLOBAL bounding box, not the local one.
			// To load it from disk, we need to use host volume
			roo::BoundingBox BBox = LoadPXMBoundingBox(sBBFileName);

			roo::BoundedVolumeGrid<roo::SDF_t_Smart,roo::TargetHost,roo::Manage> hVol;
			hVol.Init(nVolRes.x, nVolRes.y, nVolRes.z, nGridRes, BBox);

			roo::BoundedVolumeGrid<float, roo::TargetHost, roo::Manage> hVolColor;
			hVolColor.Init(1,1,1, nGridRes, BBox);

			// 2, --------------------------------------------------------------------
			// for each single grid volume live in the global bounding box
			for(unsigned int j=0; j!=vVolumes[i].vLocalIndex.size(); j++)
			{
				int3 LocalIndex = vVolumes[i].vLocalIndex[j];

				int nRealIndex = hVol.ConvertLocalIndexToRealIndex(
				    LocalIndex.x, LocalIndex.y, LocalIndex.z);

				std::string sPXMFile = sDirName + vVolumes[i].vFileName[j];

				// load the grid volume
				if(LoadPXMSingleGrid(sPXMFile, hVol.m_GridVolumes[nRealIndex]) == false)
				{
					std::cerr<<"[Kangaroo/SaveMeshFromPXMs] Error! Failed to load file. exit."<<std::endl;
					return false;
				}
			}

			// 3, --------------------------------------------------------------------
			// for each grid in the whole volume
			for(unsigned int ii=0; ii!=hVol.m_nGridNum_w; ii++)
			{
				for(unsigned int jj=0; jj!=hVol.m_nGridNum_h; jj++)
				{
					for(unsigned int kk=0; kk!=hVol.m_nGridNum_d; kk++)
					{
						if(hVol.CheckIfBasicSDFActive(hVol.ConvertLocalIndexToRealIndex(ii,jj,kk)))
						{
							int3 CurLocalIndex = make_int3(ii,jj,kk);

							GenMeshSingleGrid(hVol, hVolColor, CurLocalIndex,
							                  ObjMesh.verts, ObjMesh.norms,
							                  ObjMesh.faces, ObjMesh.colors);

							nTotalSaveGridNum++;
							nSingleLoopSaveGridNum++;
						}
					}
				}
			}

			// 4, --------------------------------------------------------------------
			// reset grid
			roo::SdfReset(hVol);
			hVol.ResetAllGridVol();
		}
		else
		{
			std::cerr<<"[Kangaroo/SaveMeshFromPXMs] Error! Failed to load bbox "<<
			           sBBFileName<<std::endl;
			return false;
		}

		std::cout<<"[Kangaroo/SaveMeshFromPXMs] Finish merge "<<nSingleLoopSaveGridNum<<
		           " grids."<<std::endl;
	}

	std::cout<<"[Kangaroo/SaveMeshFromPXMs] Finish marching cube for "<<
	           nTotalSaveGridNum<<" Grids.\n";

	// 3 ---------------------------------------------------------------------------
	// Save mesh from memory to hard disk
	aiMesh* mesh = MeshFromListsVector(ObjMesh.verts, ObjMesh.norms,
	                                   ObjMesh.faces, ObjMesh.colors);

	return SaveMeshGridToFileAssimp(sMeshFileName, mesh, "obj");
}
__host__ int3 make_int3( const Vector3i& v ) { return make_int3( v.x, v.y, v.z ); }
long long int ParticleListCPUSorted::pushT(PlasmaData* pdata, FieldData* fields,
	HOMoments* moments)
{
	int tid;
	int nthreads = pdata->num_cores;
	int stride = (nptcls+nthreads-1)/nthreads;

	long long int nSubSteps_proc[nthreads];

	omp_set_num_threads(nthreads);

//	for(int i=0;i<pdata->nx;i++)
//	{
//		realkind temp;
//		temp = fields->intrpE(0.5,0,0,i,0,0,0,FieldData_deriv_f);
//		printf("fields[%i] on cpu = %f\n",i,temp);
//	}

	//printf("nthreads = %i with vector length = %i\n",nthreads,VEC_LENGTH);

	// Start the parallel loop
#pragma omp parallel private(tid,stride) default(shared) num_threads(nthreads)
	{
		nthreads = omp_get_num_threads();
		//printf("nthreads = %i with vector length = %i\n",nthreads,VEC_LENGTH);
		//nthreads = 1;
		stride = (nptcls+nthreads-1)/nthreads;
		tid = omp_get_thread_num();
		//tid = 0;

//		auto cpu = sched_getcpu();
//		std::ostringstream os;
//		os<<"\nThread "<<omp_get_thread_num()<<" on cpu "<<sched_getcpu()<<std::endl;
//		std::cout<<os.str()<<std::flush;

		PlasmaData pdata_local = *pdata;

		// Each thread gets a separate copy of the accumulation arrays
		HOMoments* my_moment = moments+tid;

		// Initialize the moment values
		my_moment->set_vals(0);

		int nSubcycle_max = pdata->nSubcycle_max;

		int ptcl_start,ptcl_end;
		int nptcls_process;
		int nptcls_left;
		int ishrink = 0;
		int nptcl_replacements = 0;
		int nptcl_done;
		//int iptcl_max;

		int iptcl_new_v[VEC_LENGTH];
		int iptcl_v[VEC_LENGTH];
		int iter_array_v[VEC_LENGTH];

		int* iptcl_new = iptcl_new_v;
		int* iptcl = iptcl_v;
		int* iter_array = iter_array_v;

		long long int nSubSteps_done = 0;

		ptcl_start = stride*tid;
		ptcl_end = fmin(stride*(tid+1)-1,nptcls-1);
		nptcls_process = ptcl_end-ptcl_start+1;

		//printf("Thread %i starting at %i to %i with %i ptcls\n",
		//		tid,ptcl_start,ptcl_end,nptcls_process);

		ParticleObjNT<VEC_LENGTH,nSpatial,nVel,iEM> particle(iptcl);

		// Populate the timers
		particle.piccard_timer = piccard_timer+tid;
		particle.accel_timer = accel_timer+tid;
		particle.tally_timer = tally_timer+tid;
		particle.crossing_timer = crossing_timer+tid;
		particle.dtau_est_timer = dtau_est_timer+tid;

		// ParticleObjN<VEC_LENGTH> particle(iptcl);

		typevecN<int,VEC_LENGTH> iter;
		iter = 0;
		for(int i=0;i<VEC_LENGTH;i++)
			iter_array[i] = 0;

		CurrentTallyCPU currents(&my_moment->get_val(0,0,0,ispecies,HOMoments_currentx),
				&my_moment->get_val(0,0,0,ispecies,HOMoments_currenty),
				&my_moment->get_val(0,0,0,ispecies,HOMoments_currentz),
				make_int3(moments->pdata->nx,moments->pdata->ny,moments->pdata->nz),
				moments->pdata->dxdi,moments->pdata->dydi,moments->pdata->dzdi,
				moments->pdata->ndimensions);

		ChargeTally charge(&my_moment->get_val(0,0,0,ispecies,HOMoments_charge),
				make_int3(moments->pdata->nx,moments->pdata->ny,moments->pdata->nz),
				moments->pdata->dxdi,moments->pdata->dydi,moments->pdata->dzdi,
				moments->pdata->ndimensions);

		StressTally stress(&moments->get_val(0,0,0,ispecies,HOMoments_S2xx),
				&moments->get_val(0,0,0,ispecies,HOMoments_S2xy),
				&moments->get_val(0,0,0,ispecies,HOMoments_S2xz),
				&moments->get_val(0,0,0,ispecies,HOMoments_S2yy),
				&moments->get_val(0,0,0,ispecies,HOMoments_S2yz),
				&moments->get_val(0,0,0,ispecies,HOMoments_S2zz),
				moments->pdata->nx,moments->pdata->ny,moments->pdata->nz,
				moments->pdata->ndimensions,moments->pdata->nVelocity);

		for(int i=0;i<VEC_LENGTH;i++)
			iptcl[i] = ptcl_start+i;

		nptcl_done = 0;

		load_store_timer[tid].start();

		particle = *this;

		//for(int i=0;i<VEC_LENGTH;i++)
		//	particle.dt_finished(i) = 0;

		// Each thread loops over its own particles.
		// In order to avoid SIMD divergence we loop until all particles in
		// the thread's work queue have been pushed. Anytime a particle
		// finishes a subcycle it is written back to the main list and a new
		// particle takes its slot.
		while(nptcl_done < nptcls_process)
		{
			nptcls_left = nptcls_process-nptcl_done;
			//printf("nptcls_left = %i, ntpcl_done = %i\n",nptcls_left,nptcl_done);

			if((nptcls_left <= VEC_LENGTH)&&(VEC_LENGTH > 1))
			{
				if(ishrink == 0)
				{
					for(int j=0;j<VEC_LENGTH;j++)
					{
						//printf("iptcl[%i] = %i\n",j,iptcl[0][j]);
						particle.write_back(*this,j);
					}

					int k = 0;
					for(int l=0;l<VEC_LENGTH;l++)
					{
						bool idone = 0;
						//printf("iter2(%i) = %f\n",j,particles2.dt_finished(j));
						if(particle.dt_finished(l) >= pdata->dt)
						{
							idone = 1;
						}
						else if(iter(l) >= pdata->nSubcycle_max)
						{
							idone = 1;
							// printf("warning particle finished before time step was finished dt_left[%i] = %e\n",iptcl[l],pdata->dt-particle.dt_finished(l));
						}
						else if(iptcl[l] > ptcl_end)
							idone = 1;
						else
							idone = 0;

						if(idone)
						{
							nSubSteps_done += iter(l);
							num_subcycles[iptcl[l]] += iter(l);
							iter(l) = 0;
							// Accumulate Charge and S2 moment
						}
						else
						{
							iptcl[k] = iptcl[l];
							iter_array[k] = iter(l);
							k++;
						}
					}

					nptcl_done = nptcls_process - k;
					nptcls_left = k;
					ishrink = 1;
				}

				// Hack to compile all versions of the ParticleObjN template
				shrink_pushT<VEC_LENGTH,nSpatial,nVel,iEM>(pdata,fields,&currents,this,
						&iter_array,&iptcl,&iptcl_new,
						nptcls_left,nptcl_done,nptcls_process,nSubSteps_done);

//				shrink_push<VEC_LENGTH>(pdata,fields,&currents,this,
//						&iter_array,&iptcl,&iptcl_new,
//						nptcls_left,nptcl_done,nptcls_process,nSubSteps_done);
			}
			else
			{
//				for(int j=0;j<VEC_LENGTH;j++)
//					printf("particle %i done = %f, %f, %f, %i, %i, %i, %f, %f, %f\n",
//							iptcl[j],particle.px(j),particle.py(j),particle.pz(j),
//							particle.ix(j),particle.iy(j),particle.iz(j),
//							particle.vx(j),particle.vy(j),particle.vz(j));

				// Here our particle vector size is the same size as our
				// system vector size, and won't change from step to step
				particle.push(pdata,fields,&currents,iter,nSubcycle_max);

				// Replace the particle (or particles) that have finished
				// their subcycle steps
				//int k = 0;
				for(int j=0;j<VEC_LENGTH;j++)
				{
					bool idone = 0;

					if(particle.dt_finished(j) >= pdata->dt)
					{
						idone = 1;
					}
					else if(iter(j) >= pdata->nSubcycle_max)
					{
						idone = 1;
						// printf("warning particle finished before time step was finished dt_left[%i] = %e\n",iptcl[j],pdata->dt-particle.dt_finished(j));
					}

					if(idone)
					{
						// Accumulate Charge and S2 moment
//						printf("particle %i done = %f, %f, %f, %i, %i, %i, %f, %f, %f\n",
//								iptcl[j],particle.px(j),particle.py(j),particle.pz(j),
//								particle.ix(j),particle.iy(j),particle.iz(j),
//								particle.vx(j),particle.vy(j),particle.vz(j));

						// Write results, and get a new particle from the list
						particle.write_back(*this,j);
						num_subcycles[iptcl[j]] += iter(j);

						iptcl[j] = ptcl_start + nptcl_done + VEC_LENGTH;
						nptcl_done++;

						if(nptcls_process-nptcl_done > 0)
						{
							particle.copy_in(*this,j);
						}

						nSubSteps_done += iter(j);
						iter(j) = 0;
						particle.dt_finished(j) = 0.0f;
					}
				} /* for(int j=0;j<VEC_LENGTH;j++) */

				//printf("nptcls_left = %i, ntpcl_done = %i\n",nptcls_left,nptcl_done);
			} /* else */

			nptcl_replacements++;
		} /* while(nptcl_done < nptcls_process) */

		load_store_timer[tid].stop();
		tally_timer2[tid].start();

		// accumulate charge and s2 moment
		for(int i=ptcl_start;i<=ptcl_end;i++)
		{
			charge.tally(px[i],py[i],pz[i],
					ix[i],iy[i],iz[i],
					1.0);

			stress.tally1d1v(px[i], vx[i], ix[i], 1.0f);

			//if(fabs(dt_finished[i] - pdata->dt) > 1.0e-5)
			//	printf("particle %i dt_finished = %e\n",i,dt_finished[i]);

			dt_finished[i] = 0.0f;
		}

		tally_timer2[tid].stop();

		//nSubSteps_proc[0] = nSubSteps_done;
		nSubSteps_proc[tid] = nSubSteps_done;

		// printf("average particles processed per replacement: %f\n",nptcls_process/((double)nptcl_replacements));
	} /* pragma omp parallel */

	for(int i=1;i<nthreads;i++)
		nSubSteps_proc[0] += nSubSteps_proc[i];

	//printf("nsteps avg = %i\n",nSubSteps_proc[0]);

	return nSubSteps_proc[0];
}
void ParticleListCPUSorted::init(ProblemInitializer* initializer, HOMoments* moments)
{
	CurrentTallyCPU currents(&moments->get_val(0,0,0,ispecies,HOMoments_currentx),
			&moments->get_val(0,0,0,ispecies,HOMoments_currenty),
			&moments->get_val(0,0,0,ispecies,HOMoments_currentz),
			make_int3(moments->pdata->nx,moments->pdata->ny,moments->pdata->nz),
			moments->pdata->dxdi,moments->pdata->dydi,moments->pdata->dzdi,
			moments->pdata->ndimensions);

	ChargeTally charge(&moments->get_val(0,0,0,ispecies,HOMoments_charge),
			make_int3(moments->pdata->nx,moments->pdata->ny,moments->pdata->nz),
			moments->pdata->dxdi,moments->pdata->dydi,moments->pdata->dzdi,
			moments->pdata->ndimensions);

	StressTally stress(&moments->get_val(0,0,0,ispecies,HOMoments_S2xx),
			&moments->get_val(0,0,0,ispecies,HOMoments_S2xy),
			&moments->get_val(0,0,0,ispecies,HOMoments_S2xz),
			&moments->get_val(0,0,0,ispecies,HOMoments_S2yy),
			&moments->get_val(0,0,0,ispecies,HOMoments_S2yz),
			&moments->get_val(0,0,0,ispecies,HOMoments_S2zz),
			moments->pdata->nx,moments->pdata->ny,moments->pdata->nz,
			moments->pdata->ndimensions,moments->pdata->nVelocity);

	moments->set_vals(0);

#pragma omp for
	for(int i=0;i<nptcls;i++)
	{
		realkind px,py,pz,vx,vy,vz;
		int ix,iy,iz;

		initializer->init_particle(px,py,pz,ix,iy,iz,vx,vy,vz,ispecies,i);

		dt_finished[i] = 0;

		// Set Position Values, ifloat = 0-2
		this->get_fvalue(i,0) = px;
		this->get_fvalue(i,1) = py;
		this->get_fvalue(i,2) = pz;

		// Set Position Index Values, iint = 0-2
		this->get_ivalue(i,0) = ix;
		this->get_ivalue(i,1) = iy;
		this->get_ivalue(i,2) = iz;

		// Set Velocity Values, ifloat = 3-5
		this->get_fvalue(i,3) = vx;
		this->get_fvalue(i,4) = vy;
		this->get_fvalue(i,5) = vz;
	}

	for(int i=0;i<nptcls;i++)
	{
		currents.tally(px[i],py[i],pz[i],vx[i],vy[i],vz[i],ix[i],iy[i],iz[i],1.0);

		charge.tally(px[i],py[i],pz[i],
				ix[i],iy[i],iz[i],
				1.0);

		stress.tally(px[i],py[i],pz[i],
				vx[i],vy[i],vz[i],
				ix[i],iy[i],iz[i],
				1.0);
	}

	memset(num_subcycles,0,nptcls*sizeof(int));
}