Example #1
sparseStatus_t sparseEngine_d::LoadKernel(sparsePrec_t prec,
	sparseEngine_d::Kernel** ppKernel) {
	
	CUresult result = CUDA_SUCCESS;

	// Check if the requested kernel is available, and if not, load it.
	int p = (int)prec;
	if(!multiply[p].get()) {
		std::auto_ptr<Kernel> k(new Kernel);
		
		std::string filename = kernelPath + "spmxv_" + PrecNames[p] +
			".cubin";
		result = context->LoadModuleFilename(filename, &k->module);
		if(CUDA_SUCCESS != result) return SPARSE_STATUS_KERNEL_NOT_FOUND;

		// Load the five SpMxV kernels for different valuesPerThread counts.
		for(int i(0); i < NumVT; ++i) {
			std::ostringstream oss;
			oss<< "SpMxV_"<< ValuesPerThread[i];
			result = k->module->GetFunction(oss.str(), 
				make_int3(BlockSize, 1, 1), &k->func[i]);
			if(CUDA_SUCCESS != result) return SPARSE_STATUS_KERNEL_ERROR;
		}

		// Load the finalize function.
		result = k->module->GetFunction("Finalize", make_int3(BlockSize, 1, 1), 
			&k->finalize);
		if(CUDA_SUCCESS != result) return SPARSE_STATUS_KERNEL_ERROR;

		// Cache the texture reference
		result = cuModuleGetTexRef(&k->xVec_texture, k->module->Handle(),
			"xVec_texture");
		if(CUDA_SUCCESS != result) return SPARSE_STATUS_KERNEL_ERROR;

		result = cuTexRefSetFlags(k->xVec_texture, CU_TRSF_READ_AS_INTEGER);
		if(CUDA_SUCCESS != result) return SPARSE_STATUS_KERNEL_ERROR;

		result = cuTexRefSetFormat(k->xVec_texture, PrecTerms[p].vecFormat, 
			PrecTerms[p].vecChannels);
		if(CUDA_SUCCESS != result) return SPARSE_STATUS_KERNEL_ERROR;

		multiply[p] = k;
	}

	*ppKernel = multiply[p].get();
	return SPARSE_STATUS_SUCCESS;
}
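A minimal calling sketch for LoadKernel above; the engine instance and the precision constant are assumed names, not taken from the source:

// Hypothetical caller; 'engine' and SPARSE_PREC_REAL4 are assumed names.
sparseEngine_d::Kernel* kernel = 0;
sparseStatus_t status = engine->LoadKernel(SPARSE_PREC_REAL4, &kernel);
if(SPARSE_STATUS_SUCCESS != status)
	return status;
// kernel->func[i] now holds the SpMxV variants; kernel->finalize the reduction.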
Example #2
float interpolate_tricubic_simple(float* tex, float3 coord, uint3 volumeExtent)
{
	// transform the coordinate from [0,extent] to [-0.5, extent-0.5]
	const float3 coord_grid = coord - 0.5;
	float3 indexF = floor(coord_grid);
	const float3 fraction = coord_grid - indexF;
	int3 index = make_int3((int)indexF.x, (int)indexF.y, (int)indexF.z);
	//index = index + 0.5;  //move from [-0.5, extent-0.5] to [0, extent]

	float result = 0.0;
	for (int z=-1; z <= 2; z++)  //range [-1, 2]
	{
		float bsplineZ = bspline(z-fraction.z);
		int w = index.z + z;
		for (int y=-1; y <= 2; y++)
		{
			float bsplineYZ = bspline(y-fraction.y) * bsplineZ;
			int v = index.y + y;
			for (int x=-1; x <= 2; x++)
			{
				float bsplineXYZ = bspline(x-fraction.x) * bsplineYZ;
				int u = index.x + x;
				result += bsplineXYZ * tex3D(tex, u, v, w, volumeExtent);
			}
		}
	}
	return result;
}
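The bspline() weight helper called above is not shown in this example; a minimal sketch, assuming it is the standard uniform cubic B-spline basis (the usual choice for tricubic texture interpolation):

// Cubic B-spline basis weight for a signed distance t; an assumed
// implementation of the bspline() referenced above.
float bspline(float t)
{
	t = fabsf(t);
	if (t < 1.0f)
		return (4.0f + t * t * (3.0f * t - 6.0f)) / 6.0f;  // 2/3 - t^2 + t^3/2
	if (t < 2.0f) {
		const float a = 2.0f - t;
		return a * a * a / 6.0f;                           // (2 - t)^3 / 6
	}
	return 0.0f;
}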
Example #3
   /**
   * Read parameters from file.
   */
   void HoomdPeriodicExternal::readParameters(std::istream &in)
   {
      // Read parameters
      prefactor_.allocate(nAtomType_);
      readDArray<double>(in, "prefactor", prefactor_, nAtomType_);

      read<double>(in, "externalParameter", externalParameter_);

      waveIntVectors_.allocate(3);
      readDArray<IntVector>(in, "waveIntVectors", waveIntVectors_, 3);
      read<double>(in, "interfaceWidth", interfaceWidth_);
      read<int>(in, "periodicity", periodicity_);
   
      for (int i = 0; i < nAtomType_; ++i) {
         params_[i].order_parameter = Scalar(prefactor_[i]*externalParameter_);
         params_[i].lattice_vector_1 = make_int3(waveIntVectors_[0][0], waveIntVectors_[0][1], waveIntVectors_[0][2]);
         params_[i].lattice_vector_2 = make_int3(waveIntVectors_[1][0], waveIntVectors_[1][1], waveIntVectors_[1][2]);
         params_[i].lattice_vector_3 = make_int3(waveIntVectors_[2][0], waveIntVectors_[2][1], waveIntVectors_[2][2]);
         params_[i].interface_width = Scalar(interfaceWidth_);
         params_[i].periodicity = periodicity_;
      }
   }
Example #4
bool LoadFunctions(const char* name, int numThreads, CuModule* module,
	FunctionPtr functions[6]) {

	// Load a kernel variant for each bit count from 1 through 6.
	for(int bits(1); bits <= 6; ++bits) {
		std::ostringstream oss;
		oss<< name<< "_"<< bits;
		CUresult result = module->GetFunction(oss.str(), 
			make_int3(numThreads, 1, 1), &functions[bits - 1]);
		if(CUDA_SUCCESS != result) return false;
	}
	return true;
}
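A usage sketch for LoadFunctions; the module pointer and the "Sort" base name are placeholders, not from the source:

// Hypothetical caller; 'module' and the kernel base name are assumed.
FunctionPtr sortFuncs[6];
if(!LoadFunctions("Sort", 256, module, sortFuncs))
	return false;
// sortFuncs[bits - 1] now holds the variant compiled for 'bits' bits.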
Example #5
CUresult CreateMaxIndexEngine(const char* cubin, 
	std::auto_ptr<MaxIndexEngine>* ppEngine) {

	std::auto_ptr<MaxIndexEngine> e(new MaxIndexEngine);
		
	CUresult result = AttachCuContext(&e->context);
	if(CUDA_SUCCESS != result) return result;

	if(2 != e->context->Device()->ComputeCapability().first)
		return CUDA_ERROR_INVALID_DEVICE;

	result = e->context->LoadModuleFilename(cubin, &e->module);
	if(CUDA_SUCCESS != result) return result;

	result = e->module->GetFunction("FindMaxIndexLoop", make_int3(256, 1, 1),
		&e->pass1);
	if(CUDA_SUCCESS != result) return result;

	result = e->module->GetFunction("FindMaxIndexReduce", make_int3(256, 1, 1),
		&e->pass2);
	if(CUDA_SUCCESS != result) return result;

	int numSMs = e->context->Device()->Attributes().multiprocessorCount;

	// Launch 6 256-thread blocks per SM.
	e->numBlocks = 6 * numSMs;

	// Allocate an element for each thread block.
	result = e->context->MemAlloc<float>(e->numBlocks, &e->maxMem);
	if(CUDA_SUCCESS != result) return result;
	result = e->context->MemAlloc<uint>(e->numBlocks, &e->indexMem);
	if(CUDA_SUCCESS != result) return result;

	result = e->context->MemAlloc<uint2>(e->numBlocks, &e->rangeMem);
	if(CUDA_SUCCESS != result) return result;

	*ppEngine = e;
	return CUDA_SUCCESS;
}
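A usage sketch for the factory above; the cubin filename is a placeholder, not from the source:

// Hypothetical caller; "maxindex.cubin" is an assumed path.
std::auto_ptr<MaxIndexEngine> engine;
CUresult result = CreateMaxIndexEngine("maxindex.cubin", &engine);
if(CUDA_SUCCESS != result) return result;
// engine->pass1/pass2 can now be launched over engine->numBlocks blocks.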
Example #6
Distance3i distance_geom3(
    const int3 p, global const int* geometry, global const int3* verts) {
  // Distance3i best = { INT_MAX, (int3)(0, 0, 0) };
  Distance3i best = { INT_MAX, make_int3(0, 0, 0) };
  const int num_tris = geometry[1];
  const Triangle* tris = (Triangle*)(geometry+2);
  for (int j = 0; j < num_tris; ++j) {
    Triangle t = tris[j];
    const int3 tri_verts[3] =
        { (verts[t.s[0]]),
          (verts[t.s[1]]),
          (verts[t.s[2]]) };
    int3 set_verts[3];
    const int num_unique = find_unique(tri_verts, set_verts);

    if (num_unique == 3) {
      best = min_pair3i(best, distance_trianglei(p, tri_verts));
    } else {
      // Degenerate triangle
      if (num_unique == 1) {
        // Degenerate to a point
        int3 closest = tri_verts[0];
        float3 closest_d = convert_float3(closest);
        float3 p_d = convert_float3(p);
        int dist = (int)(fast_length(p_d-closest_d)+0.5f);
        best = min_pair3i(best, make_dist3(dist, closest));
      } else {
        // Degenerate to a line
        int3 a = set_verts[0];
        int3 b = set_verts[1];
        Distance3f dist = distance_line3(convert_float3(p),
                                        convert_float3(a),
                                        convert_float3(b));
        Distance3i disti = make_dist3(
            // convert_int_rte(dist.d),
            // convert_int3_rte(dist.p));
            convert_int(dist.d),
            convert_int3(dist.p));
        best = min_pair3i(best, disti);
      }
    }
  }
  return best;
}
Example #7
/* Finds the closest distance to a set of geometries */
PointAndLabel3 distance_geoms3(
    const int3 p_, global const int* geometries,
    global const int3* verts, global const int* verts_offsets) {
  const int3 p = p_;
  Distance3i min_dist = make_dist3(INT_MAX, make_int3(0, 0, 0));
  int idx = -1;
  for (int i = 0; i < geometries[0]; ++i) {
    int offset = geometries[i];
    const Distance3i dist = distance_geom3(
        p, geometries+offset, verts+verts_offsets[geometries[offset]]);

    if (dist.d < min_dist.d) {
      min_dist = dist;
      idx = i;
    }
  }
  // idx, idx_offset are correct.  Must be the actual point from
  // a higher-level dist function
  int offset = geometries[idx];
  const int label = geometries[offset];
  PointAndLabel3 pl = { min_dist.p, label };
  return pl;
}
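The packed layout of the geometries buffer is only implied by the indexing in the two functions above; the following summary is inferred from those accesses and is an assumption, not stated in the source:

// Inferred geometries layout (assumption based on the indexing above):
//   geometries[0]             -> number of geometries
//   geometries[i]             -> int offset of geometry i's record
//   geometries[offset]        -> label (also indexes verts_offsets)
//   geometries[offset + 1]    -> triangle count
//   geometries[offset + 2..]  -> packed Triangle records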
Example #8
BVHSpatialSplit::BVHSpatialSplit(const BVHBuild& builder,
                                 BVHSpatialStorage *storage,
                                 const BVHRange& range,
                                 vector<BVHReference> *references,
                                 float nodeSAH)
: sah(FLT_MAX),
  dim(0),
  pos(0.0f),
  storage_(storage),
  references_(references)
{
	/* initialize bins. */
	float3 origin = range.bounds().min;
	float3 binSize = (range.bounds().max - origin) * (1.0f / (float)BVHParams::NUM_SPATIAL_BINS);
	float3 invBinSize = 1.0f / binSize;

	for(int dim = 0; dim < 3; dim++) {
		for(int i = 0; i < BVHParams::NUM_SPATIAL_BINS; i++) {
			BVHSpatialBin& bin = storage_->bins[dim][i];

			bin.bounds = BoundBox::empty;
			bin.enter = 0;
			bin.exit = 0;
		}
	}

	/* chop references into bins. */
	for(unsigned int refIdx = range.start(); refIdx < range.end(); refIdx++) {
		const BVHReference& ref = references_->at(refIdx);
		float3 firstBinf = (ref.bounds().min - origin) * invBinSize;
		float3 lastBinf = (ref.bounds().max - origin) * invBinSize;
		int3 firstBin = make_int3((int)firstBinf.x, (int)firstBinf.y, (int)firstBinf.z);
		int3 lastBin = make_int3((int)lastBinf.x, (int)lastBinf.y, (int)lastBinf.z);

		firstBin = clamp(firstBin, 0, BVHParams::NUM_SPATIAL_BINS - 1);
		lastBin = clamp(lastBin, firstBin, BVHParams::NUM_SPATIAL_BINS - 1);

		for(int dim = 0; dim < 3; dim++) {
			BVHReference currRef = ref;

			for(int i = firstBin[dim]; i < lastBin[dim]; i++) {
				BVHReference leftRef, rightRef;

				split_reference(builder, leftRef, rightRef, currRef, dim, origin[dim] + binSize[dim] * (float)(i + 1));
				storage_->bins[dim][i].bounds.grow(leftRef.bounds());
				currRef = rightRef;
			}

			storage_->bins[dim][lastBin[dim]].bounds.grow(currRef.bounds());
			storage_->bins[dim][firstBin[dim]].enter++;
			storage_->bins[dim][lastBin[dim]].exit++;
		}
	}

	/* select best split plane. */
	for(int dim = 0; dim < 3; dim++) {
		/* sweep right to left and determine bounds. */
		BoundBox right_bounds = BoundBox::empty;

		storage_->right_bounds.resize(BVHParams::NUM_SPATIAL_BINS);
		for(int i = BVHParams::NUM_SPATIAL_BINS - 1; i > 0; i--) {
			right_bounds.grow(storage_->bins[dim][i].bounds);
			storage_->right_bounds[i - 1] = right_bounds;
		}

		/* sweep left to right and select lowest SAH. */
		BoundBox left_bounds = BoundBox::empty;
		int leftNum = 0;
		int rightNum = range.size();

		for(int i = 1; i < BVHParams::NUM_SPATIAL_BINS; i++) {
			left_bounds.grow(storage_->bins[dim][i - 1].bounds);
			leftNum += storage_->bins[dim][i - 1].enter;
			rightNum -= storage_->bins[dim][i - 1].exit;

			float sah = nodeSAH +
				left_bounds.safe_area() * builder.params.primitive_cost(leftNum) +
				storage_->right_bounds[i - 1].safe_area() * builder.params.primitive_cost(rightNum);

			if(sah < this->sah) {
				this->sah = sah;
				this->dim = dim;
				this->pos = origin[dim] + binSize[dim] * (float)i;
			}
		}
	}
}
Example #9
// Generate one single mesh from several ppm files.
bool SaveMeshFromPXMs(
    std::string                sDirName,
    std::string                sBBFileHead,
    int3                       nVolRes,
    int                        nGridRes,
    std::vector<std::string>   vfilename,
    std::string                sMeshFileName)
{
  printf("\n---- [Kangaroo/SaveMeshFromPXMs] Start.\n");

  // 1 ---------------------------------------------------------------------------
  // Read all grid SDFs and sort them into volumes; the vVolumes index is the global index.
  std::vector<SingleVolume>  vVolumes = GetFilesNeedSaving(vfilename);

  if(vVolumes.empty())
  {
    printf("[Kangaroo/SaveMeshFromPXMs] Cannot find any files for generating the mesh!\n");
    return false;
  }

  // prepare data structure for the single mesh
  MarchingCUBERst ObjMesh;

  // 2 ---------------------------------------------------------------------------
  // For each global volume, generate its mesh.
  int nTotalSaveGridNum = 0;

  for(unsigned int i=0; i!=vVolumes.size(); i++)
  {
      std::cout<<"[Kangaroo/SaveMeshFromPXMs] Merging grids in global bb area ("<<
                 std::to_string(vVolumes[i].GlobalIndex.x)<<","<<
                 std::to_string(vVolumes[i].GlobalIndex.y)<<","<<
                 std::to_string(vVolumes[i].GlobalIndex.z)<<")"<< std::endl;

      int nSingleLoopSaveGridNum = 0;

      // load the corresponding bounding box
      std::string sBBFileName =
          sDirName + sBBFileHead +
          std::to_string(vVolumes[i].GlobalIndex.x) + "#" +
          std::to_string(vVolumes[i].GlobalIndex.y) + "#" +
          std::to_string(vVolumes[i].GlobalIndex.z);

      if( CheckIfBBfileExist(sBBFileName) )
      {
        // 1, --------------------------------------------------------------------
        // Load the bounding box of the SDF.
        // NOTICE that this is the GLOBAL bounding box, not the local one.
        // To load it from disk, we need to use a host volume.
        roo::BoundingBox BBox = LoadPXMBoundingBox(sBBFileName);

        roo::BoundedVolumeGrid<roo::SDF_t_Smart,roo::TargetHost,roo::Manage> hVol;
        hVol.Init(nVolRes.x, nVolRes.y, nVolRes.z, nGridRes, BBox);

        roo::BoundedVolumeGrid<float, roo::TargetHost, roo::Manage> hVolColor;
        hVolColor.Init(1,1,1, nGridRes, BBox);

        // 2, --------------------------------------------------------------------
        // for each single grid volume live in the global bounding box
        for(unsigned int j=0; j!=vVolumes[i].vLocalIndex.size(); j++)
        {
          int3 LocalIndex = vVolumes[i].vLocalIndex[j];

          int nRealIndex = hVol.ConvertLocalIndexToRealIndex(
                LocalIndex.x, LocalIndex.y,LocalIndex.z);

          std::string sPXMFile = sDirName + vVolumes[i].vFileName[j];

          // load the grid volume
          if(LoadPXMSingleGrid(sPXMFile, hVol.m_GridVolumes[nRealIndex]) == false )
          {
            std::cerr<<"[Kangaroo/SaveMeshFromPXMs] Error! Failed to load file; exiting."<<std::endl;
            return false;
          }
        }

        // 3, --------------------------------------------------------------------
        // for each grid in the whole volume
        for(unsigned int i=0;i!=hVol.m_nGridNum_w;i++)
        {
          for(unsigned int j=0;j!=hVol.m_nGridNum_h;j++)
          {
            for(unsigned int k=0;k!=hVol.m_nGridNum_d;k++)
            {
              if(hVol.CheckIfBasicSDFActive(hVol.ConvertLocalIndexToRealIndex(i,j,k)))
              {
                int3 CurLocalIndex = make_int3(i,j,k);

                GenMeshSingleGrid(hVol, hVolColor, CurLocalIndex, ObjMesh.verts,
                                  ObjMesh.norms, ObjMesh.faces, ObjMesh.colors);

                nTotalSaveGridNum++;
                nSingleLoopSaveGridNum ++;
              }
            }
          }
        }

        // 4, --------------------------------------------------------------------
        // reset grid
        roo::SdfReset(hVol);
        hVol.ResetAllGridVol();
      }
      else
      {
        std::cerr<<"[Kangaroo/SaveMeshFromPXMs] Error! Failed to load bbox "<<
                   sBBFileName<<std::endl;
        return false;
      }

      std::cout<<"[Kangaroo/SaveMeshFromPXMs] Finished merging "<<nSingleLoopSaveGridNum<<
                 " grids."<<std::endl;
  }

  std::cout<<"[Kangaroo/SaveMeshFromPXMs] Finished marching cubes for "<<
             nTotalSaveGridNum<<" grids.\n";

  // 3 ---------------------------------------------------------------------------
  // Save mesh from memory to hard disk
  aiMesh* mesh = MeshFromListsVector(ObjMesh.verts, ObjMesh.norms,
                                     ObjMesh.faces, ObjMesh.colors);

  return SaveMeshGridToFileAssimp(sMeshFileName, mesh, "obj");
}
Example #10
__host__
int3 make_int3( const Vector3i& v )
{
    return make_int3( v.x, v.y, v.z );
}
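A one-line usage sketch; Vector3i is assumed to expose integer x, y, z members, as the conversion above implies:

// Hypothetical usage of the overload above.
Vector3i v(1, 2, 3);     // assumed constructor
int3 iv = make_int3(v);  // iv = (1, 2, 3)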
Example #11
long long int ParticleListCPUSorted::pushT(PlasmaData* pdata, FieldData* fields, HOMoments* moments)
{

	int tid;
	int nthreads = pdata->num_cores;
	int stride = (nptcls+nthreads-1)/nthreads;

	long long int nSubSteps_proc[nthreads];

	omp_set_num_threads(nthreads);

//	for(int i=0;i<pdata->nx;i++)
//	{
//		realkind temp;
//		temp = fields->intrpE(0.5,0,0,i,0,0,0,FieldData_deriv_f);
//		printf("fields[%i] on cpu = %f\n",i,temp);
//	}

	//printf("particles ")

	//printf("nthreads = %i with vector length = %i\n",nthreads,VEC_LENGTH);


	// Start the parallel loop
#pragma omp parallel private(tid,stride) default(shared) num_threads(nthreads)
	{
		nthreads = omp_get_num_threads();
		//printf("nthreads = %i with vector length = %i\n",nthreads,VEC_LENGTH);
		//nthreads = 1;
		stride = (nptcls+nthreads-1)/nthreads;

		tid = omp_get_thread_num();
		//tid = 0;

//	    auto cpu = sched_getcpu();
//	    std::ostringstream os;
//	        os<<"\nThread "<<omp_get_thread_num()<<" on cpu "<<sched_getcpu()<<std::endl;
//	        std::cout<<os.str()<<std::flush;

		PlasmaData pdata_local = *pdata;

		// Each thread gets a separate copy of the accumulation arrays
		HOMoments* my_moment = moments+tid;

		// Initialize the moment values
		//printf("Initializing moment values\n");
		my_moment->set_vals(0);

		int nSubcycle_max = pdata->nSubcycle_max;

		int ptcl_start,ptcl_end;
		int nptcls_process;
		int nptcls_left;
		int ishrink = 0;
		int nptcl_replacements = 0;

		int nptcl_done;
		//int iptcl_max;
		int iptcl_new_v[VEC_LENGTH];
		int iptcl_v[VEC_LENGTH];
		int iter_array_v[VEC_LENGTH];

		int* iptcl_new = iptcl_new_v;
		int* iptcl = iptcl_v;
		int* iter_array = iter_array_v;

		long long int nSubSteps_done = 0;

		ptcl_start = stride*tid;
		ptcl_end = fmin(stride*(tid+1)-1,nptcls-1);

		nptcls_process = ptcl_end-ptcl_start+1;

		//printf("Thread %i starting at %i to %i with %i ptcls\n",
			//	tid,ptcl_start,ptcl_end,nptcls_process);


		ParticleObjNT<VEC_LENGTH,nSpatial,nVel,iEM> particle(iptcl);

		// Populate the timers
		particle.piccard_timer = piccard_timer+tid;
		particle.accel_timer = accel_timer+tid;
		particle.tally_timer = tally_timer+tid;
		particle.crossing_timer = crossing_timer+tid;
		particle.dtau_est_timer = dtau_est_timer+tid;

//		ParticleObjN<VEC_LENGTH> particle(iptcl);

		typevecN<int,VEC_LENGTH> iter;


		iter = 0;
		for(int i=0;i<VEC_LENGTH;i++)
			iter_array[i] = 0;

		CurrentTallyCPU currents(&my_moment->get_val(0,0,0,ispecies,HOMoments_currentx),
							  &my_moment->get_val(0,0,0,ispecies,HOMoments_currenty),
							  &my_moment->get_val(0,0,0,ispecies,HOMoments_currentz),
							  make_int3(moments->pdata->nx,moments->pdata->ny,moments->pdata->nz),
							  moments->pdata->dxdi,moments->pdata->dydi,moments->pdata->dzdi,
							  moments->pdata->ndimensions);

		ChargeTally charge(&my_moment->get_val(0,0,0,ispecies,HOMoments_charge),
							  make_int3(moments->pdata->nx,moments->pdata->ny,moments->pdata->nz),
							  moments->pdata->dxdi,moments->pdata->dydi,moments->pdata->dzdi,
							  moments->pdata->ndimensions);

		// Tally stress into the thread-local moment copy as well
		// (matches the currents/charge tallies above).
		StressTally stress(&my_moment->get_val(0,0,0,ispecies,HOMoments_S2xx),
				&my_moment->get_val(0,0,0,ispecies,HOMoments_S2xy),
				&my_moment->get_val(0,0,0,ispecies,HOMoments_S2xz),
				&my_moment->get_val(0,0,0,ispecies,HOMoments_S2yy),
				&my_moment->get_val(0,0,0,ispecies,HOMoments_S2yz),
				&my_moment->get_val(0,0,0,ispecies,HOMoments_S2zz),
							  moments->pdata->nx,moments->pdata->ny,moments->pdata->nz,
							  moments->pdata->ndimensions,moments->pdata->nVelocity);

		for(int i=0;i<VEC_LENGTH;i++)
			iptcl[i] = ptcl_start+i;

		nptcl_done = 0;


		load_store_timer[tid].start();
		particle = *this;

		//for(int i=0;i<VEC_LENGTH;i++)
		//	particle.dt_finished(i) = 0;

		// Each thread loops over its own particles.
		// In order to avoid SIMD divergence we loop until
		// all particles in the thread's work queue have been
		// pushed. Anytime a particle finishes a subcycle
		// it is written back to the main list and a new particle
		// takes its slot.
		while(nptcl_done < nptcls_process)
		{
			nptcls_left = nptcls_process-nptcl_done;

			//printf("nptcls_left = %i, ntpcl_done = %i\n",nptcls_left,nptcl_done);

			if((nptcls_left <= VEC_LENGTH)&&(VEC_LENGTH > 1))
			{
				if(ishrink == 0)
				{
					for(int j=0;j<VEC_LENGTH;j++)
					{
						//printf("iptcl[%i] = %i\n",j,iptcl[0][j]);
						particle.write_back(*this,j);
					}

					int k = 0;
					for(int l=0;l<VEC_LENGTH;l++)
					{

						bool idone = 0;

						//printf("iter2(%i) = %f\n",j,particles2.dt_finished(j));
						if(particle.dt_finished(l) >= pdata->dt)
						{
							idone = 1;
						}
						else if(iter(l) >= pdata->nSubcycle_max)
						{
							idone = 1;
//							printf("warning particle finished before time step was finished dt_left[%i] = %e\n",iptcl[l],pdata->dt-particle.dt_finished(l));
						}
						else if(iptcl[l] > ptcl_end)
							idone = 1;
						else
							idone = 0;


						if(idone)
						{
							nSubSteps_done += iter(l);
							num_subcycles[iptcl[l]] += iter(l);
							iter(l) = 0;

							// Accumulate Charge and S2 moment

						}
						else
						{
							iptcl[k] = iptcl[l];
							iter_array[k] = iter(l);


							k++;
						}
					}

					nptcl_done = nptcls_process - k;
					nptcls_left = k;

					ishrink = 1;
				}

// Hack to compile all versions of ParticleObjN template
				shrink_pushT<VEC_LENGTH,nSpatial,nVel,iEM>(pdata,fields,&currents,this,
									&iter_array,&iptcl,&iptcl_new,
									nptcls_left,nptcl_done,nptcls_process,nSubSteps_done);
//				shrink_push<VEC_LENGTH>(pdata,fields,&currents,this,
//									&iter_array,&iptcl,&iptcl_new,
//									nptcls_left,nptcl_done,nptcls_process,nSubSteps_done);


			}
			else
			{
//				for(int j=0;j<VEC_LENGTH;j++)
//					printf("particle %i done = %f, %f, %f, %i, %i, %i, %f, %f, %f\n",
//							iptcl[j],particle.px(j),particle.py(j),particle.pz(j),
//							particle.ix(j),particle.iy(j),particle.iz(j),
//							particle.vx(j),particle.vy(j),particle.vz(j));

				// Here our particle vector size is the same
				// size as our system vector size, and won't
				// change from step to step

				particle.push(pdata,fields,&currents,iter,nSubcycle_max);

				// Replace the particle (or particles) that
				// have finished their subcycle steps
				//int k = 0;
				for(int j=0;j<VEC_LENGTH;j++)
				{
					bool idone = 0;

					if(particle.dt_finished(j) >= pdata->dt)
					{
						idone = 1;
					}
					else if(iter(j) >= pdata->nSubcycle_max)
					{
						idone = 1;

//						printf("warning particle finished before time step was finished dt_left[%i] = %e\n",iptcl[j],pdata->dt-particle.dt_finished(j));
					}

					if(idone)
					{
						// Accumulate Charge and S2 moment

//						printf("particle %i done = %f, %f, %f, %i, %i, %i, %f, %f, %f\n",
//								iptcl[j],particle.px(j),particle.py(j),particle.pz(j),
//								particle.ix(j),particle.iy(j),particle.iz(j),
//								particle.vx(j),particle.vy(j),particle.vz(j));

						// Write results, and get a new particle from the list
						particle.write_back(*this,j);

						num_subcycles[iptcl[j]] += iter(j);

						iptcl[j] = ptcl_start + nptcl_done + VEC_LENGTH;
						nptcl_done++;

						if(nptcls_process-nptcl_done > 0)
						{
							particle.copy_in(*this,j);
						}

						nSubSteps_done += iter(j);

						iter(j) = 0;
						particle.dt_finished(j) = 0.0f;


					}
				} /* for(int j=0;j<nptcls_left;j++) */
				//printf("nptcls_left = %i, ntpcl_done = %i\n",nptcls_left,nptcl_done);

			} /* else */

			nptcl_replacements++;

		} /* while(nptcl_done < nptcls_process) */

		load_store_timer[tid].stop();

		tally_timer2[tid].start();
		// accumulate charge and s2 moment
		for(int i=ptcl_start;i<=ptcl_end;i++)
		{
			charge.tally(px[i],py[i],pz[i],
					ix[i],iy[i],iz[i],
					1.0);

			stress.tally1d1v(px[i],
					vx[i],
					ix[i],
					1.0f);


			//if(fabs(dt_finished[i] - pdata->dt) > 1.0e-5)
			//	printf("particle %i dt_finished = %e\n",i,dt_finished[i]);

			dt_finished[i] = 0.0f;

		}
		tally_timer2[tid].stop();

		//nSubSteps_proc[0] = nSubSteps_done;

		nSubSteps_proc[tid] = nSubSteps_done;

//		printf("average particles processed per replacement: %f\n",nptcls_process/((double)nptcl_replacements));
	} /* pragma omp parallel */

	for(int i=1;i<nthreads;i++)
		nSubSteps_proc[0] += nSubSteps_proc[i];

	//printf("nsteps avg = %i\n",nSubSteps_proc[0]);

	return nSubSteps_proc[0];
}
Example #12
void ParticleListCPUSorted::init(ProblemInitializer* initializer, HOMoments* moments)
{

	CurrentTallyCPU currents(&moments->get_val(0,0,0,ispecies,HOMoments_currentx),
						  &moments->get_val(0,0,0,ispecies,HOMoments_currenty),
						  &moments->get_val(0,0,0,ispecies,HOMoments_currentz),
						  make_int3(moments->pdata->nx,moments->pdata->ny,moments->pdata->nz),
						  moments->pdata->dxdi,moments->pdata->dydi,moments->pdata->dzdi,
						  moments->pdata->ndimensions);

	ChargeTally charge(&moments->get_val(0,0,0,ispecies,HOMoments_charge),
						  make_int3(moments->pdata->nx,moments->pdata->ny,moments->pdata->nz),
						  moments->pdata->dxdi,moments->pdata->dydi,moments->pdata->dzdi,
						  moments->pdata->ndimensions);

	StressTally stress(&moments->get_val(0,0,0,ispecies,HOMoments_S2xx),
			&moments->get_val(0,0,0,ispecies,HOMoments_S2xy),
			&moments->get_val(0,0,0,ispecies,HOMoments_S2xz),
			&moments->get_val(0,0,0,ispecies,HOMoments_S2yy),
			&moments->get_val(0,0,0,ispecies,HOMoments_S2yz),
			&moments->get_val(0,0,0,ispecies,HOMoments_S2zz),
						  moments->pdata->nx,moments->pdata->ny,moments->pdata->nz,
						  moments->pdata->ndimensions,moments->pdata->nVelocity);

	moments->set_vals(0);
#pragma omp for
	for(int i=0;i<nptcls;i++)
	{
		realkind px,py,pz,vx,vy,vz;
		int ix,iy,iz;

		initializer->init_particle(px,py,pz,ix,iy,iz,vx,vy,vz,ispecies,i);

		dt_finished[i] = 0;

		// Set Position Values, ifloat = 0-2
		this->get_fvalue(i,0) = px;
		this->get_fvalue(i,1) = py;
		this->get_fvalue(i,2) = pz;

		// Set Position Index Values, iint = 0-2
		this->get_ivalue(i,0) = ix;
		this->get_ivalue(i,1) = iy;
		this->get_ivalue(i,2) = iz;

		// Set Velocity Values, ifloat = 3-5
		this->get_fvalue(i,3) = vx;
		this->get_fvalue(i,4) = vy;
		this->get_fvalue(i,5) = vz;
	}

	for(int i=0;i<nptcls;i++)
	{
		currents.tally(px[i],py[i],pz[i],vx[i],vy[i],vz[i],ix[i],iy[i],iz[i],1.0);

		charge.tally(px[i],py[i],pz[i],
				ix[i],iy[i],iz[i],
				1.0);

		stress.tally(px[i],py[i],pz[i],
				vx[i],vy[i],vz[i],
				ix[i],iy[i],iz[i],
				1.0);
	}

	memset(num_subcycles,0,nptcls*sizeof(int));

}