Exemplos de nextPow2 em C++ (Cpp)

Exemplo n.º 1

0

Exibir arquivo

Arquivo: seek_test.cpp Projeto: RichardsGameStudio/libtheoraplayer

void init()
{
	mgr=new TheoraVideoManager();
	clip=mgr->createVideoClip(new TheoraMemoryFileDataSource("media/bunny" + resourceExtension), TH_RGB, 4);
	clip->setAutoRestart(1);

	tex_id=createTexture(nextPow2(clip->getWidth()), nextPow2(clip->getHeight()));
}

Exemplo n.º 2

0

Exibir arquivo

Arquivo: zbitmapdesc.cpp Projeto: zsimpson/zbslib

// Resets the descriptor and fills it from the given width, height, depth and
// bits pointer. memW and memH both receive the larger of nextPow2(w) and
// nextPow2(h), i.e. the memory footprint is square.
void ZBitmapDesc::initSP2 ( int _w, int _h, int _d, char *_bits ) {
	clear();

	// Store the caller's values unchanged.
	bits = _bits;
	d = _d;
	w = _w;
	h = _h;

	// The same expression is used for both axes.
	memH = max( nextPow2(w), nextPow2(h) );
	memW = max( nextPow2(w), nextPow2(h) );
}

Exemplo n.º 3

0

Exibir arquivo

Arquivo: CudaNarrowphase.cpp Projeto: amattwithers/aphid

// Sizes the coordinate and contact buffers for the given number of overlapping
// pairs, then runs computeTimeOfImpact and squeezeContacts on the device-side
// pair list of overlappingPairBuf. Returns immediately when there are no pairs.
void CudaNarrowphase::computeContacts(CUDABuffer * overlappingPairBuf, unsigned numOverlappingPairs)
{
	if(numOverlappingPairs == 0) return;

	m_numPairs = numOverlappingPairs;

	// 16 units per pair for the coordinate buffer and 48 per pair for each of
	// the two contact buffers; every request is passed through nextPow2.
	m_coord->create(nextPow2(numOverlappingPairs * 16));
	for(int i = 0; i < 2; ++i)
		m_contact[i]->create(nextPow2(numOverlappingPairs * 48));

	void * pairsOnDevice = overlappingPairBuf->bufferOnDevice();
	computeTimeOfImpact(pairsOnDevice, numOverlappingPairs);
	squeezeContacts(pairsOnDevice, numOverlappingPairs);
}

Exemplo n.º 4

0

Exibir arquivo

Arquivo: av_player.cpp Projeto: tecan/LunaLibs

// Creates the video manager, installs an OpenAL audio interface factory, opens
// "media/bunny.ogg" and allocates the texture the clip is drawn into.
void init()
{
	mgr = new TheoraVideoManager();

	// The audio interface factory is installed before the clip is created.
	iface_factory = new OpenAL_AudioInterfaceFactory();
	mgr->setAudioInterfaceFactory(iface_factory);

	// To preload the file into RAM and stream from there, use this instead:
	//	clip=mgr->createVideoClip(new TheoraMemoryFileDataSource("../media/short.ogg"),TH_RGB);
	clip = mgr->createVideoClip("media/bunny.ogg");
	clip->setAutoRestart(1);

	// Texture dimensions are the clip dimensions passed through nextPow2.
	tex_id = createTexture(
		nextPow2(clip->getWidth()),
		nextPow2(clip->getHeight()));
}

Exemplo n.º 5

0

Exibir arquivo

Arquivo: Kernels.cpp Projeto: edwardsjohnmartin/pgvd

/// Launches "ComputeLocalSplitsKernel" over the internal binary radix tree nodes.
///
/// Obtains (via CLFW::get) a "localSplits" buffer and a "zeroBuffer", each
/// holding nextPow2(size) cl_int entries, clears localSplits by copying the
/// zero buffer into it and then enqueues the kernel over nextPow2(size) work
/// items.
///
/// @param internalBRTNodes  passed to the kernel as argument 1.
/// @param localSplits       receives the buffer the kernel gets as argument 0.
/// @param size              passed to the kernel as argument 2; also determines
///                          the rounded-up global size.
/// @return bitwise OR of every status code produced along the way.
cl_int ComputeLocalSplits_p(cl::Buffer &internalBRTNodes, cl::Buffer &localSplits, cl_int size) {
    startBenchmark("ComputeLocalSplits_p");
    cl_int globalSize = nextPow2(size);
    cl::Kernel &kernel = CLFW::Kernels["ComputeLocalSplitsKernel"];
    cl::CommandQueue &queue = CLFW::DefaultQueue;

    // Default to "not old" so the buffer gets zero-filled (instead of reading
    // an indeterminate flag) should CLFW::get fail before setting it.
    bool isOld = false;
    cl::Buffer zeroBuffer;

    cl_int error  = CLFW::get(localSplits, "localSplits", sizeof(cl_int) * globalSize);
    error |= CLFW::get(zeroBuffer, "zeroBuffer", sizeof(cl_int) * globalSize, isOld);

    //Fill any new zero buffers with zero. Then initialize localSplits with zero.
    if (!isOld) {
        cl_int zero = 0;
        error |= queue.enqueueFillBuffer<cl_int>(zeroBuffer, { zero }, 0, sizeof(cl_int) * globalSize);
    }
    error |= queue.enqueueCopyBuffer(zeroBuffer, localSplits, 0, 0, sizeof(cl_int) * globalSize);

    error |= kernel.setArg(0, localSplits);
    error |= kernel.setArg(1, internalBRTNodes);
    error |= kernel.setArg(2, size);

    // Accumulate instead of overwriting, so that failures reported by the
    // earlier calls are not silently discarded.
    error |= queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(globalSize), cl::NullRange);
    stopBenchmark();
    return error;
}

Exemplo n.º 6

0

Exibir arquivo

Arquivo: CRoutine_Sum.cpp Projeto: fabienbaron/liboi

/// Initializes the parallel sum object to sum n entries from a cl_mem buffer.
/// The internal buffer size is n rounded up to a power of two (at least 128);
/// the kernels are built and any temporary buffer that does not exist yet is
/// created with room for that many cl_float values.
void CRoutine_Sum::Init(int n)
{
	int err = CL_SUCCESS;

	mInputSize = n;
	mBufferSize = n;

	// The NVidia SDK kernel on which this routine is based is designed only for power-of-two
	// sized buffers. Because of this, we'll create internal buffers that round up to the
	// next highest power of two.
	if(!isPow2(mBufferSize))
		mBufferSize = nextPow2(mBufferSize);

	// TODO: Workaround for issue 32
	// https://github.com/bkloppenborg/liboi/issues/32
	if(mBufferSize < 128)
		mBufferSize = 128;

	BuildKernels();

	// Each temporary buffer is created and checked on its own: the status of
	// the first clCreateBuffer is no longer overwritten by the second before
	// it is inspected, and an already existing buffer is never replaced.
	if(mTempBuffer1 == NULL)
	{
		mTempBuffer1 = clCreateBuffer(mContext, CL_MEM_READ_WRITE, mBufferSize * sizeof(cl_float), NULL, &err);
		COpenCL::CheckOCLError("Could not create parallel sum temporary buffer.", err);
	}

	if(mTempBuffer2 == NULL)
	{
		mTempBuffer2 = clCreateBuffer(mContext, CL_MEM_READ_WRITE, mBufferSize * sizeof(cl_float), NULL, &err);
		COpenCL::CheckOCLError("Could not create parallel sum temporary buffer.", err);
	}
}

Exemplo n.º 7

0

Exibir arquivo

Arquivo: Kernels.cpp Projeto: edwardsjohnmartin/pgvd

/// Runs one BitPredicate / StreamScan_p / DoubleCompact pass over `input` for
/// every index in [0, mbits), swapping `input` with a scratch buffer after
/// each pass.
///
/// @param input  buffer to process; on return it refers to whichever buffer
///               held the result of the last pass.
/// @param size   element count; all passes operate on nextPow2(size) elements.
/// @param mbits  number of passes.
/// @return bitwise OR of the status codes of all steps; returns early, before
///         any pass, if the scratch buffers could not be obtained.
cl_int RadixSortBigUnsigned(cl::Buffer &input, cl_int size, cl_int mbits) {
    const size_t globalSize = nextPow2(size);
    cl_int error = 0;

    // NOTE(review): `predicate` is not obtained through CLFW::get here;
    // presumably BitPredicate sets it up - confirm.
    cl::Buffer predicate, address, bigUnsignedTemp, temp;
    error |= CLFW::get(address, "address", sizeof(cl_int)*(globalSize));
    error |= CLFW::get(bigUnsignedTemp, "bigUnsignedTemp", sizeof(BigUnsigned)*globalSize);
    if (error != CL_SUCCESS) return error;

    startBenchmark("RadixSortBigUnsigned");
    unsigned int index = 0;
    while (index < mbits) {
        // 1) predicate, 2) scan of the predicate, 3) compaction into the scratch buffer.
        error |= BitPredicate(input, predicate, index, 0, globalSize);
        error |= StreamScan_p(predicate, address, globalSize);
        error |= DoubleCompact(input, bigUnsignedTemp, predicate, address, globalSize);

        // The scratch buffer becomes the input of the next pass.
        temp = input;
        input = bigUnsignedTemp;
        bigUnsignedTemp = temp;

        ++index;
    }
    stopBenchmark();
    return error;
}

Exemplo n.º 8

0

Exibir arquivo

Arquivo: CRoutine_Sum_NVidia.cpp Projeto: bkloppenborg/liboi

/// Prepares the parallel sum routine for n input entries: records the input
/// size, derives the internal buffer size, builds the kernels and creates
/// whichever of the two temporary buffers does not exist yet.
void CRoutine_Sum_NVidia::Init(int n)
{
	mInputSize = n;

	// The NVidia SDK kernel this routine is based on only handles power-of-two
	// sized buffers, so the internal size is rounded up when necessary.
	mBufferSize = n;
	if(!isPow2(mBufferSize))
		mBufferSize = nextPow2(mBufferSize);

	// TODO: Workaround for issue 32 in which kernel fails to compute sums for N = [33 - 64]
	// https://github.com/bkloppenborg/liboi/issues/32
	if(mBufferSize < 128)
		mBufferSize = 128;

	BuildKernels();

	// Both temporary buffers hold mBufferSize cl_float values.
	int status = CL_SUCCESS;
	const size_t bufferBytes = mBufferSize * sizeof(cl_float);

	if(mTempBuffer1 == NULL)
	{
		mTempBuffer1 = clCreateBuffer(mContext, CL_MEM_READ_WRITE, bufferBytes, NULL, &status);
		CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.");
	}

	if(mTempBuffer2 == NULL)
	{
		mTempBuffer2 = clCreateBuffer(mContext, CL_MEM_READ_WRITE, bufferBytes, NULL, &status);
		CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.");
	}
}

Exemplo n.º 9

0

Exibir arquivo

Arquivo: CRoutine_Sum.cpp Projeto: fabienbaron/liboi

// Computes the launch configuration for the selected reduction kernel.
//   whichKernel < 3 : threads = nextPow2(n) when n < maxThreads, otherwise
//                     maxThreads; blocks = ceil(n / threads).
//   otherwise       : threads = nextPow2((n + 1) / 2) when n < 2 * maxThreads,
//                     otherwise maxThreads; blocks = ceil(n / (2 * threads)).
// For kernel 6 the block count is additionally limited to maxBlocks.
void CRoutine_Sum::getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
{
    if (whichKernel < 3)
    {
        if (n < maxThreads)
            threads = nextPow2(n);
        else
            threads = maxThreads;

        blocks = (n + threads - 1) / threads;
    }
    else
    {
        if (n < maxThreads*2)
            threads = nextPow2((n + 1)/ 2);
        else
            threads = maxThreads;

        blocks = (n + (threads * 2 - 1)) / (threads * 2);
    }

    if (whichKernel == 6)
    {
        blocks = min(maxBlocks, blocks);
    }
}

Exemplo n.º 10

0

Exibir arquivo

Arquivo: multiple_videos.cpp Projeto: RichardsGameStudio/libtheoraplayer

void init()
{
	printf("---\nUSAGE: press buttons 1,2,3 or 4 to change the number of worker threads\n---\n");

	std::string files[] = {"media/bunny" + resourceExtension,
		                   "media/konqi" + resourceExtension,
		                   "media/room" + resourceExtension,
		                   "media/titan" + resourceExtension};
	mgr=new TheoraVideoManager(4);
	mgr->setDefaultNumPrecachedFrames(16);
	for (int i=0;i<4;i++)
	{
		clips[i]=mgr->createVideoClip(new TheoraMemoryFileDataSource(files[i]), outputMode);

		clips[i]->setAutoRestart(1);
		textures[i]=createTexture(nextPow2(clips[i]->getWidth()),nextPow2(clips[i]->getHeight()), textureFormat);
	}
}

Exemplo n.º 11

0

Exibir arquivo

Arquivo: zbitmapdesc.cpp Projeto: zsimpson/zbslib

// Resets the descriptor and fills it from the given width, height, depth and
// bits pointer. Only the memory width goes through nextPow2; the memory height
// equals the given height.
void ZBitmapDesc::initP2W( int _w, int _h, int _d, char *_bits ) {
	clear();

	// Store the caller's values unchanged.
	bits = _bits;
	d = _d;
	w = _w;
	h = _h;

	memH = h;
	memW = nextPow2(w);
}

Exemplo n.º 12

0

Exibir arquivo

Arquivo: Kernels.cpp Projeto: edwardsjohnmartin/pgvd

/// Copies `points` into the "pointsBuffer" buffer obtained from CLFW::get.
///
/// The buffer is requested with room for nextPow2(points.size()) elements of
/// type intn; only the first points.size() elements are written, using a
/// blocking write (CL_TRUE).
///
/// @param points        host-side points to upload.
/// @param pointsBuffer  receives the buffer obtained from CLFW::get.
/// @return bitwise OR of the status codes of CLFW::get and enqueueWriteBuffer.
cl_int UploadPoints(const vector<intn> &points, cl::Buffer &pointsBuffer) {
    startBenchmark("Uploading points");
    cl_int error = 0;
    cl_int roundSize = nextPow2(points.size());
    error |= CLFW::get(pointsBuffer, "pointsBuffer", sizeof(intn)*roundSize);
    // Size the transfer by the vector's own element type so the byte count
    // always matches points.data() and the allocation above, instead of
    // assuming that intn has the same size as cl_int2.
    error |= CLFW::DefaultQueue.enqueueWriteBuffer(pointsBuffer, CL_TRUE, 0, sizeof(intn) * points.size(), points.data());
    stopBenchmark();
    return error;
}

Exemplo n.º 13

0

Exibir arquivo

Arquivo: CRoutine_Sum_NVidia.cpp Projeto: bkloppenborg/liboi

// Computes the launch configuration for the reduction kernel:
// threads = nextPow2((n + 1) / 2) when n < 2 * maxThreads, otherwise
// maxThreads; blocks = ceil(n / (2 * threads)). For kernel 6 the block count
// is additionally limited to maxBlocks.
void CRoutine_Sum_NVidia::getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
{
	if (n < maxThreads*2)
		threads = nextPow2((n + 1)/ 2);
	else
		threads = maxThreads;

	blocks = (n + (threads * 2 - 1)) / (threads * 2);

	if (whichKernel == 6)
	{
		blocks = min(maxBlocks, blocks);
	}
}

Exemplo n.º 14

0

Exibir arquivo

Arquivo: kernelevaluation.cpp Projeto: OrcusCZ/SVMbenchmark

/**
 * Evaluate a row of the gram matrix
 * @param d_xtraindata device pointer to the training set
 * @param d_dottraindata device pointer to the array containing the dot product of the row with itself
 * @param d_kernelrow device pointer that will store the array extracted from d_xtraindata.
 * @param d_kerneldot device pointer that will store the result of the kernel evaluation
 * @param d_kdata device pointer to the matrix that stores the cached values
 * @param gid index that points to the point in d_xtraindata to be  calculated
 * @param cacheid index that points to the location in cache that will keep the results
 * @param ntraining number of training samples in the training set
 * @param nfeatures number of features in the training samples
 * @param beta value of the parameter of the RBF kernel
 * @param a if using polynomial or sigmoid kernel the value of a x_i x_j
 * @param b if using polynomial or sigmoid kernel the value of b
 * @param d if using polynomial kernel
 * @param kernelcode code that indicates the kernel type to run
 */
void kerneleval ( 	float* d_xtraindata,
					float* d_dottraindata,
					float* d_kernelrow,
					float* d_kerneldot,
					float* d_kdata,
					int gid,
					int cacheid,
					int ntraining,
					int nfeatures,
					float beta,
					float a,
					float b,
					float d,
					int kernelcode)
{

	int numThreads = (nfeatures < MAXTHREADS*2) ? nextPow2((nfeatures + 1)/ 2) : MAXTHREADS;
	int numBlocks = (nfeatures + (numThreads * 2 - 1)) / (numThreads * 2);
	int numBlocksRed = min(MAXBLOCKS, numBlocks);

	dim3 dimBlockKernelRow(numThreads, 1, 1);
	dim3 dimGridKernelRow(numBlocksRed, 1, 1);

	int smemSize = 0;
	bool isNtrainingPow2=isPow2(nfeatures);


	if(isNtrainingPow2)
	{
		switch (numThreads)
		{
			case 512:
				ExtractKernelRow <512,true><<< dimGridKernelRow, dimBlockKernelRow, smemSize >>>(d_xtraindata,d_kernelrow,	gid,ntraining,nfeatures); break;
			case 256:
				ExtractKernelRow <256,true><<< dimGridKernelRow, dimBlockKernelRow, smemSize >>>(d_xtraindata,d_kernelrow,	gid,ntraining,nfeatures); break;
			case 128:
				ExtractKernelRow <128,true><<< dimGridKernelRow, dimBlockKernelRow, smemSize >>>(d_xtraindata,d_kernelrow,	gid,ntraining,nfeatures); break;
			case 64:
				ExtractKernelRow <64,true><<< dimGridKernelRow, dimBlockKernelRow, smemSize >>>(d_xtraindata,d_kernelrow,	gid,ntraining,nfeatures); break;
			case 32:
				ExtractKernelRow <32,true><<< dimGridKernelRow, dimBlockKernelRow, smemSize >>>(d_xtraindata,d_kernelrow,	gid,ntraining,nfeatures); break;
			case 16:
				ExtractKernelRow <16,true><<< dimGridKernelRow, dimBlockKernelRow, smemSize >>>(d_xtraindata,d_kernelrow,	gid,ntraining,nfeatures); break;
			case  8:
				ExtractKernelRow <8,true><<< dimGridKernelRow, dimBlockKernelRow, smemSize >>>(d_xtraindata,d_kernelrow,	gid,ntraining,nfeatures); break;
			case  4:
				ExtractKernelRow <4,true><<< dimGridKernelRow, dimBlockKernelRow, smemSize >>>(d_xtraindata,d_kernelrow,	gid,ntraining,nfeatures); break;
			case  2:
				ExtractKernelRow <2,true><<< dimGridKernelRow, dimBlockKernelRow, smemSize >>>(d_xtraindata,d_kernelrow,	gid,ntraining,nfeatures); break;
			case  1:
				ExtractKernelRow <1,true><<< dimGridKernelRow, dimBlockKernelRow, smemSize >>>(d_xtraindata,d_kernelrow,	gid,ntraining,nfeatures); break;
		}
	}
	else
	{
		switch (numThreads)

Exemplo n.º 15

0

Exibir arquivo

Arquivo: reduction.cpp Projeto: drolfe00/CUDAVerificationkernels

////////////////////////////////////////////////////////////////////////////////
// Determines how many blocks and threads per block a reduction kernel is
// launched with.
//   whichKernel < 3 : threads = min(maxThreads, nextPow2(n)), one block per
//                     `threads` elements.
//   whichKernel >= 3: threads = min(maxThreads, nextPow2((n + 1) / 2)), one
//                     block per 2 * `threads` elements.
// The result is compared against the limits of the current CUDA device; when
// the block count exceeds maxGridSize[0] it is halved and the thread count
// doubled. For kernel 6 the block count is finally limited to maxBlocks.
////////////////////////////////////////////////////////////////////////////////
void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
{
    // Query the properties of the current device so the configuration can be
    // checked against its block/grid limits.
    int device;
    cudaDeviceProp prop;
    checkCudaErrors(cudaGetDevice(&device));
    checkCudaErrors(cudaGetDeviceProperties(&prop, device));

    if (whichKernel < 3)
    {
        if (n < maxThreads)
            threads = nextPow2(n);
        else
            threads = maxThreads;

        blocks = (n + threads - 1) / threads;
    }
    else
    {
        if (n < maxThreads*2)
            threads = nextPow2((n + 1)/ 2);
        else
            threads = maxThreads;

        blocks = (n + (threads * 2 - 1)) / (threads * 2);
    }

    // Only a warning: the configuration is left untouched in this case.
    if ((float)threads*blocks > (float)prop.maxGridSize[0] * prop.maxThreadsPerBlock)
    {
        printf("n is too large, please choose a smaller number!\n");
    }

    // Too many blocks for the grid: trade blocks for threads.
    if (blocks > prop.maxGridSize[0])
    {
        printf("Grid size <%d> exceeds the device capability <%d>, set block size as %d (original %d)\n",
               blocks, prop.maxGridSize[0], threads*2, threads);

        blocks /= 2;
        threads *= 2;
    }

    if (whichKernel == 6)
    {
        blocks = MIN(maxBlocks, blocks);
    }
}

Exemplo n.º 16

0

Exibir arquivo

Arquivo: Kernels.cpp Projeto: edwardsjohnmartin/pgvd

/// Host-side loop that fills `result` with nextPow2(size) entries: entry gid is
/// produced by xyz2z from points[gid] for gid < size, and by initBlkBU(..., 0)
/// for the remaining entries.
///
/// @param size    number of input points.
/// @param bits    forwarded to xyz2z.
/// @param points  input points; entries [0, size) are read.
/// @param result  output array; entries [0, nextPow2(size)) are written.
/// @return always 0.
cl_int PointsToMorton_s(cl_int size, cl_int bits, cl_int2* points, BigUnsigned* result) {
    startBenchmark("PointsToMorton_s");

    int paddedCount = nextPow2(size);
    int gid = 0;
    while (gid < paddedCount) {
        if (gid >= size) {
            // Past the real input: the entry is initialised with 0.
            initBlkBU(&result[gid], 0);
        }
        else {
            xyz2z(&result[gid], points[gid], bits);
        }
        ++gid;
    }

    stopBenchmark();
    return 0;
}

Exemplo n.º 17

0

Exibir arquivo

Arquivo: Kernels.cpp Projeto: edwardsjohnmartin/pgvd

/// Launches "PointsToMortonKernel" over nextPow2(size) work items.
///
/// @param points   passed to the kernel as argument 1.
/// @param zpoints  receives the "zpoints" buffer (nextPow2(size) BigUnsigned
///                 entries) obtained from CLFW::get; kernel argument 0.
/// @param size     kernel argument 2.
/// @param bits     kernel argument 3.
/// @return bitwise OR of the status codes of all steps.
cl_int PointsToMorton_p(cl::Buffer &points, cl::Buffer &zpoints, cl_int size, cl_int bits) {
    size_t paddedCount = nextPow2(size);

    cl_int status = 0;
    status |= CLFW::get(zpoints, "zpoints", paddedCount * sizeof(BigUnsigned));

    cl::Kernel kernel = CLFW::Kernels["PointsToMortonKernel"];
    status |= kernel.setArg(0, zpoints);
    status |= kernel.setArg(1, points);
    status |= kernel.setArg(2, size);
    status |= kernel.setArg(3, bits);

    // Only the enqueue itself is inside the benchmark bracket.
    startBenchmark("PointsToMorton_p");
    status |= CLFW::DefaultQueue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(paddedCount), cl::NullRange);
    stopBenchmark();

    return status;
}

Exemplo n.º 18

0

Exibir arquivo

Arquivo: seek_test.cpp Projeto: RichardsGameStudio/libtheoraplayer

void draw()
{
	glBindTexture(GL_TEXTURE_2D,tex_id);

	if (!needsSeek)
	{
		TheoraVideoFrame* f=clip->getNextFrame();
		if (f)
		{
			glTexSubImage2D(GL_TEXTURE_2D,0,0,0,clip->getWidth(),f->getHeight(),GL_RGB,GL_UNSIGNED_BYTE,f->getBuffer());
			needsSeek = 1;
			if (f->getFrameNumber() != cFrame)
				nWrongSeeks++;
			cFrame++;
			if (cFrame >= clip->getNumFrames()) cFrame = 0;
			printf("Displayed frame %d\n", f->getFrameNumber());
			clip->popFrame();
		}
	}


	float w=clip->getWidth(),h=clip->getHeight();
	float tw=nextPow2(w),th=nextPow2(h);

	glEnable(GL_TEXTURE_2D);
	if (shader_on) enable_shader();
	drawTexturedQuad(tex_id,0,0,800,600,w/tw,h/th);
	if (shader_on) disable_shader();

	glDisable(GL_TEXTURE_2D);
	drawColoredQuad(0,570,800,30,0,0,0,1);
	drawWiredQuad(0,570,800,30,1,1,1,1);

	float x=clip->getTimePosition()/clip->getDuration();
	drawColoredQuad(3,573,794*x,24,1,1,1,1);
}

Exemplo n.º 19

0

Exibir arquivo

Arquivo: lauralogger.cpp Projeto: Kruczkowski/LibLaura

/**
 * @brief Constructs a logger and sets up its six log file names.
 * @param pathToDir
 *      Destination directory for the logs
 * @param flag
 *      Types of logs
 * @param frameSize
 *      Size of an audio frame; BUFF_SIZE is set to nextPow2(frameSize)
 * @param samplingRate
 *      Sampling frequency
 */
LauraLogger::LauraLogger(std::string pathToDir, unsigned int flag, 
                         unsigned int frameSize, unsigned int samplingRate){
    // One file name per log kind, in index order.
    static const char* const kFileNames[6] = {
        "TDOA", "ITD", "ILD", "CORRELATION", "SPECTRUM", "STREAM"
    };

    this->pathToDir = pathToDir;
    this->flags = flag;
    this->SAMPLING_RATE = samplingRate;
    this->BUFF_SIZE = nextPow2(frameSize);
    this->firstRunning = true;

    this->fileNames = new std::string[6];
    for (int i = 0; i < 6; ++i) {
        fileNames[i] = kFileNames[i];
    }
}

Exemplo n.º 20

0

Exibir arquivo

Arquivo: Kernels.cpp Projeto: edwardsjohnmartin/pgvd

/// Converts the binary radix tree in `internalBRTNodes` into an octree and
/// reads the result back into `octree_vec`.
///
/// Steps: compute the local splits, scan them, read the required octree size
/// from entry (size - 2) of the scanned splits, obtain an "octree" buffer with
/// nextPow2(octreeSize) OctNode entries, run InitOctree and "BRT2OctreeKernel",
/// and finally copy octreeSize nodes back to the host.
///
/// @param internalBRTNodes  passed to the helpers and to the kernel (argument 0).
/// @param octree_vec        resized to the octree size and filled with the
///                          nodes read back from the octree buffer.
/// @param size              passed to the helpers and to the kernel (argument 4);
///                          all launches use nextPow2(size) work items.
/// @return bitwise OR of every status code produced along the way. When the
///         octree size cannot be determined the function returns early and
///         leaves octree_vec untouched.
cl_int BinaryRadixToOctree_p(cl::Buffer &internalBRTNodes, vector<OctNode> &octree_vec, cl_int size) {
    startBenchmark("BinaryRadixToOctree_p");
    int globalSize = nextPow2(size);
    cl::Kernel &kernel = CLFW::Kernels["BRT2OctreeKernel"];
    cl::CommandQueue &queue = CLFW::DefaultQueue;

    cl::Buffer localSplits, scannedSplits, octree;
    cl_int error = CLFW::get(scannedSplits, "scannedSplits", sizeof(cl_int) * globalSize);

    error |= ComputeLocalSplits_p(internalBRTNodes, localSplits, size);
    error |= StreamScan_p(localSplits, scannedSplits, globalSize);

    //Read in the required octree size
    cl_int octreeSize = 0;
    error |= CLFW::DefaultQueue.enqueueReadBuffer(scannedSplits, CL_TRUE, sizeof(int)*(size - 2), sizeof(int), &octreeSize);

    // Without a valid octree size nothing below can be sized correctly, so
    // stop here rather than allocating and resizing from a meaningless value.
    if (error != CL_SUCCESS) {
        stopBenchmark();
        return error;
    }
    cl_int roundOctreeSize = nextPow2(octreeSize);

    //Create an octree buffer.
    error |= CLFW::get(octree, "octree", sizeof(OctNode) * roundOctreeSize);

    //use the scanned splits & brt to create octree.
    // The status of InitOctree is accumulated like every other step; it used
    // to be dropped.
    error |= InitOctree(internalBRTNodes, octree, localSplits, scannedSplits, size, octreeSize);

    error |= kernel.setArg(0, internalBRTNodes);
    error |= kernel.setArg(1, octree);
    error |= kernel.setArg(2, localSplits);
    error |= kernel.setArg(3, scannedSplits);
    error |= kernel.setArg(4, size);

    error |= queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(globalSize), cl::NullRange);

    octree_vec.resize(octreeSize);
    error |= queue.enqueueReadBuffer(octree, CL_TRUE, 0, sizeof(OctNode)*octreeSize, octree_vec.data());
    stopBenchmark();
    return error;
}

Exemplo n.º 21

0

Exibir arquivo

Arquivo: Kernels.cpp Projeto: edwardsjohnmartin/pgvd

/// Launches "BuildBinaryRadixTreeKernel" over nextPow2(size) work items.
///
/// @param zpoints           passed to the kernel as argument 1.
/// @param internalBRTNodes  receives the "internalBRTNodes" buffer
///                          (nextPow2(size) BrtNode entries) obtained from
///                          CLFW::get; kernel argument 0.
/// @param size              kernel argument 3.
/// @param mbits             kernel argument 2.
/// @return bitwise OR of the status codes of all steps.
cl_int BuildBinaryRadixTree_p(cl::Buffer &zpoints, cl::Buffer &internalBRTNodes, cl_int size, cl_int mbits) {
    startBenchmark("BuildBinaryRadixTree_p");

    cl::Kernel &brtKernel = CLFW::Kernels["BuildBinaryRadixTreeKernel"];
    cl::CommandQueue &commands = CLFW::DefaultQueue;
    cl_int paddedCount = nextPow2(size);

    cl_int status = 0;
    status |= CLFW::get(internalBRTNodes, "internalBRTNodes", sizeof(BrtNode)* (paddedCount));

    status |= brtKernel.setArg(0, internalBRTNodes);
    status |= brtKernel.setArg(1, zpoints);
    status |= brtKernel.setArg(2, mbits);
    status |= brtKernel.setArg(3, size);

    status |= commands.enqueueNDRangeKernel(brtKernel, cl::NullRange, cl::NDRange(paddedCount), cl::NullRange);

    stopBenchmark();
    return status;
}

Exemplo n.º 22

0

Exibir arquivo

Arquivo: Kernels.cpp Projeto: edwardsjohnmartin/pgvd

/// Launches "BRT2OctreeKernel_init" over nextPow2(octreeSize) work items.
///
/// The buffers and `size` are bound to kernel arguments 0..4 in parameter
/// order; `octreeSize` only determines the global work size.
///
/// @return bitwise OR of the status codes of setArg and the enqueue.
cl_int InitOctree(cl::Buffer &internalBRTNodes, cl::Buffer &octree, cl::Buffer &localSplits, cl::Buffer &scannedSplits, cl_int size, cl_int octreeSize) {
    startBenchmark("InitOctree");

    cl_int paddedCount = nextPow2(octreeSize);
    cl::Kernel &initKernel = CLFW::Kernels["BRT2OctreeKernel_init"];
    cl::CommandQueue &commands = CLFW::DefaultQueue;

    cl_int status = 0;
    status |= initKernel.setArg(0, internalBRTNodes);
    status |= initKernel.setArg(1, octree);
    status |= initKernel.setArg(2, localSplits);
    status |= initKernel.setArg(3, scannedSplits);
    status |= initKernel.setArg(4, size);

    status |= commands.enqueueNDRangeKernel(initKernel, cl::NullRange, cl::NDRange(paddedCount), cl::NullRange);

    stopBenchmark();
    return status;
}

Exemplo n.º 23

0

Exibir arquivo

Arquivo: RecastTileBuilder.cpp Projeto: ModTheGalaxy/mtgserver

// Derives the tile/poly bit budget from the builder's bounds and settings,
// remembers lastTileBounds and then builds the tile mesh at (x, y).
//
// x, y            forwarded to buildTileMesh.
// lastTileBounds  stored in this->lastTileBounds.
// dataSize        forwarded to buildTileMesh.
// Returns whatever buildTileMesh returns.
unsigned char* RecastTileBuilder::build(float x, float y, const AABB& lastTileBounds, int& dataSize) {
	// Bounds as min/max triples for rcCalcGridSize.
	float bmin[3];
	float bmax[3];

	bmin[0] = bounds.getXMin();
	bmax[0] = bounds.getXMax();

	bmin[1] = bounds.getYMin();
	bmax[1] = bounds.getYMax();

	bmin[2] = bounds.getZMin();
	bmax[2] = bounds.getZMax();

	// Grid size in cells, then the number of tiles along each axis (rounded up).
	int gw = 0, gh = 0;
	rcCalcGridSize(bmin, bmax, settings.m_cellSize, &gw, &gh);
	const int ts = (int) settings.m_tileSize;
	const int tw = (gw + ts - 1) / ts;
	const int th = (gh + ts - 1) / ts;

	// Max tiles and max polys affect how the tile IDs are calculated.
	// There are 22 bits available for identifying a tile and a polygon.
	int tileBits = rcMin((int) ilog2(nextPow2(tw * th)), 14);
	int polyBits = 22 - tileBits;
	m_maxTiles = 1<<tileBits;
	m_maxPolysPerTile = 1<<polyBits;

	// NOTE(review): `params` is filled in but not used anywhere in this
	// function, and `status` is never assigned or read - confirm whether a
	// navmesh init call is missing here.
	dtNavMeshParams params;
	params.orig[0] = bounds.getXMin();
	params.orig[1] = bounds.getYMin();
	params.orig[2] = bounds.getZMin();

	//rcVcopy(params.orig, m_geom->getNavMeshBoundsMin());
	params.tileWidth = settings.m_tileSize * settings.m_cellSize;
	params.tileHeight = settings.m_tileSize * settings.m_cellSize;
	params.maxTiles = m_maxTiles;
	params.maxPolys = m_maxPolysPerTile;

	dtStatus status;

	this->lastTileBounds = lastTileBounds;
	return buildTileMesh(x, y, dataSize);
}

Exemplo n.º 24

0

Exibir arquivo

Arquivo: Kernels.cpp Projeto: edwardsjohnmartin/pgvd

/// Runs UniquePredicate, StreamScan_p and SingleCompact over `input`, replaces
/// `input` with the compacted result buffer and updates `size` from the last
/// entry of the scanned address buffer.
///
/// @param input  buffer to process; replaced by the "result" buffer.
/// @param size   in: element count (all steps use nextPow2(size) elements);
///               out: the value read from the last address entry.
/// @return bitwise OR of the status codes of all steps.
cl_int UniqueSorted(cl::Buffer &input, cl_int &size) {
    startBenchmark("UniqueSorted");
    int paddedCount = nextPow2(size);

    cl::Buffer predicate, address, intermediate, result;
    cl_int status = CLFW::get(predicate, "predicate", sizeof(cl_int)*(paddedCount));
    status |= CLFW::get(address, "address", sizeof(cl_int)*(paddedCount));
    status |= CLFW::get(result, "result", sizeof(BigUnsigned) * paddedCount);

    // predicate -> scan -> compact
    status |= UniquePredicate(input, predicate, paddedCount);
    status |= StreamScan_p(predicate, address, paddedCount);
    status |= SingleCompact(input, result, predicate, address, paddedCount);

    input = result;

    // Blocking read of the final cl_int of the address buffer into `size`.
    const size_t lastEntryOffset = sizeof(cl_int)*paddedCount - (sizeof(cl_int));
    status |= CLFW::DefaultQueue.enqueueReadBuffer(address, CL_TRUE, lastEntryOffset, sizeof(cl_int), &size);

    stopBenchmark();
    return status;
}

Exemplo n.º 25

0

Exibir arquivo

Arquivo: noiseprocessor.cpp Projeto: sunwj/inviwo

// Fills the color layer of `img` with the sum of several random-noise images
// of doubling resolution. Each output value is mapped with (v + 1) / 2 and
// clamped to [0, 1].
void NoiseProcessor::perlinNoise(Image *img) {
    // Reference size: nextPow2 of the larger of the two output dimensions.
    auto size = nextPow2(std::max(size_.get().x, size_.get().y));
    // `levels` owns the noise images; `samplers` reads from them.
    std::vector<std::unique_ptr<Image>> levels;
    std::vector<TemplateImageSampler<float,float>> samplers;
    // First level has size 2^levels_.x; at most (levels_.y - levels_.x + 1) levels are created.
    auto currentSize = std::pow(2, levels_.get().x);
    auto iterations = levels_.get().y - levels_.get().x + 1;
    float currentPersistance = 1;
    // Create levels until the level size exceeds `size` or the iteration budget is used up.
    while (currentSize <= size && iterations--) {
        size2_t imgsize{static_cast<size_t>(currentSize)};
        auto img1 = util::make_unique<Image>(imgsize, DataFLOAT32::get());
        // Noise range for this level is [-currentPersistance, currentPersistance].
        randomNoise(img1.get(), -currentPersistance, currentPersistance);
        samplers.push_back(TemplateImageSampler<float,float>(img1.get()));
        levels.push_back(std::move(img1));
        // Next level: twice the size, range scaled by persistence_.
        currentSize *= 2;
        currentPersistance *= persistence_.get();
    }

    auto data = static_cast<float *>(
        img->getColorLayer()->getEditableRepresentation<LayerRAM>()->getData());
    // Reciprocal of `size`; turns pixel indices into sampling coordinates.
    float repri = 1.0 / size;
    // size_t index = 0;
    util::IndexMapper2D index(size_.get());
#pragma omp parallel for
    for (long long y = 0; y < size_.get().y; y++) {
        for (long long x = 0; x < size_.get().x; x++) {
            float v = 0;
            float X = x * repri;
            float Y = y * repri;
            // Sum the samples of every level at this position.
            for (auto &sampler : samplers) {
                v += sampler.sample(X, Y);
            }
            v = (v + 1.0f) / 2.0f;
            // Rows are written in reverse order (row y goes to size_.y - 1 - y).
            data[index(x, size_.get().y - 1 - y)] = glm::clamp(v, 0.0f, 1.0f);
        }
    }
}

Exemplo n.º 26

0

Exibir arquivo

Arquivo: Sample_TileMesh.cpp Projeto: ArtStealer/recastnavigation

// Draws the settings panel of the tile-mesh sample: the common settings, the
// two toggles, the tile size slider, the derived tile/poly limits, the
// Save/Load buttons and the last build time. Also recomputes m_maxTiles and
// m_maxPolysPerTile from the current geometry and tile size.
void Sample_TileMesh::handleSettings()
{
	Sample::handleCommonSettings();

	// Each check box flips its flag when imguiCheck reports true.
	if (imguiCheck("Keep Itermediate Results", m_keepInterResults))
		m_keepInterResults = !m_keepInterResults;

	if (imguiCheck("Build All Tiles", m_buildAll))
		m_buildAll = !m_buildAll;
	
	imguiLabel("Tiling");
	imguiSlider("TileSize", &m_tileSize, 16.0f, 1024.0f, 16.0f);
	
	if (m_geom)
	{
		char text[64];
		int gw = 0, gh = 0;
		const float* bmin = m_geom->getNavMeshBoundsMin();
		const float* bmax = m_geom->getNavMeshBoundsMax();
		rcCalcGridSize(bmin, bmax, m_cellSize, &gw, &gh);
		// Number of tiles along each axis, rounded up.
		const int ts = (int)m_tileSize;
		const int tw = (gw + ts-1) / ts;
		const int th = (gh + ts-1) / ts;
		snprintf(text, 64, "Tiles  %d x %d", tw, th);
		imguiValue(text);

		// Max tiles and max polys affect how the tile IDs are caculated.
		// There are 22 bits available for identifying a tile and a polygon.
		int tileBits = rcMin((int)ilog2(nextPow2(tw*th)), 14);
		// NOTE(review): tileBits was already limited to 14 by rcMin on the line above.
		if (tileBits > 14) tileBits = 14;
		int polyBits = 22 - tileBits;
		m_maxTiles = 1 << tileBits;
		m_maxPolysPerTile = 1 << polyBits;
		snprintf(text, 64, "Max Tiles  %d", m_maxTiles);
		imguiValue(text);
		snprintf(text, 64, "Max Polys  %d", m_maxPolysPerTile);
		imguiValue(text);
	}
	else
	{
		// No geometry: both limits are reset to 0.
		m_maxTiles = 0;
		m_maxPolysPerTile = 0;
	}
	
	imguiSeparator();
	
	imguiIndent();
	imguiIndent();
	
	if (imguiButton("Save"))
	{
		Sample::saveAll("all_tiles_navmesh.bin", m_navMesh);
	}

	if (imguiButton("Load"))
	{
		// Replace the current navmesh with the one loaded from disk and
		// re-initialise the query object on it.
		// NOTE(review): the result of loadAll is passed to init unchecked - confirm it cannot be null.
		dtFreeNavMesh(m_navMesh);
		m_navMesh = Sample::loadAll("all_tiles_navmesh.bin");
		m_navQuery->init(m_navMesh, 2048);
	}

	imguiUnindent();
	imguiUnindent();
	
	char msg[64];
	snprintf(msg, 64, "Build Time: %.1fms", m_totalBuildTimeMs);
	imguiLabel(msg);
	
	imguiSeparator();
	
	imguiSeparator();
	
}

Exemplo n.º 27

0

Exibir arquivo

Arquivo: fc_texture.cpp Projeto: gavindi/Stephen-Fraser-FC-Engine

// Internal function - Called to load up a texture map - file is known to exist
//
// Converts `loadbm` into a form that can be downloaded and downloads it into
// `tex` (a new texture is created when `tex` is NULL). Non-power-of-two bitmaps
// are first copied onto a larger, zero-padded canvas; bitmaps larger than
// maxtexwidth/maxtexheight are swizzled and scaled down. Every temporary bitmap
// created here is deleted again before returning. Returns the texture.
Texture *textureFromBitmap(bitmap *loadbm, Texture *tex)
{	bitmap *swizzleBm, *scaleBm, *resizeCanvasSrc;
	bool mustSwizzle = false;
	bool mustScale = false;

	// Step 1: Check for a need to swizzle (if the video card doesn't support this texture mode)
	uintf dataType = loadbm->flags & (bitmap_DataTypeMask | bitmap_DataInfoMask);
	// ### This code is not yet complete, must leave this block with 'swizzleBM' pointing to swizzled bitmap data

	// If Video card only handles 'Power-of-2' texture dimensions
	// (the capability check below is commented out, so this block always runs)
	bool resizeCanvas=false;
	// if (GLESWarnings) //!(videoFeatures & videodriver_nonP2Tex))
	{	uintf newCanvasWidth = loadbm->width;
		uintf newCanvasHeight = loadbm->height;

		if (!isPow2(loadbm->width))
		{	resizeCanvas=true;
			newCanvasWidth=nextPow2(loadbm->width);
		}
		if (!isPow2(loadbm->height))
		{	resizeCanvas=true;
			newCanvasHeight=nextPow2(loadbm->height);
		}
		if (resizeCanvas)
		{	// Keep the caller's bitmap in resizeCanvasSrc and continue with a new,
			// larger canvas; resizeCanvasSrc is only assigned on this path.
			resizeCanvasSrc = loadbm;
			loadbm = newbitmap("resizeCanvasP2Tex",newCanvasWidth,newCanvasHeight,bitmap_ARGB32);
			uintf x,y;
			// NOTE(review): source pixels are read as uint32, which assumes the
			// source bitmap is 32 bits per pixel - confirm against callers.
			uint32 *src32 = (uint32 *)resizeCanvasSrc->pixel;
			uint32 *dst32 = (uint32 *)loadbm->pixel;
			// Copy each source row and zero the remainder of the row ...
			for (y=0; y<resizeCanvasSrc->height; y++)
			{	uint32 *src = &src32[y*(resizeCanvasSrc->width)];
				uint32 *dst = &dst32[y*newCanvasWidth];
				for (x=0; x<resizeCanvasSrc->width; x++)
					*dst++ = *src++;
				for (;x<newCanvasWidth; x++)
					*dst++ = 0;
			}
			// ... then zero the rows below the source image.
			for (;y<newCanvasHeight; y++)
			{	uint32 *dst = &dst32[y*newCanvasWidth];
				for (x=0; x<newCanvasWidth; x++)
					*dst++=0;
			}
		}

/*		// Work out X scale
		uintf size = 1;
		while (size<=maxtexwidth)
		{	if (newx<=size) break;
			size <<=1;
		}
		if (size>maxtexwidth) size = maxtexwidth;
		newx = size;

		// Work out Y scale
		size = 1;
		while (size<=maxtexheight)
		{	if (newy<=size) break;
			size <<=1;
		}
		if (size>maxtexheight) size = maxtexheight;
		newy = size;
*/
	}

	// Step 2: Check if we need to resize - this may change swizzle mode
	// Target size is the (possibly padded) bitmap size limited to the maximum texture size.
	uintf newx = loadbm->width;
	uintf newy = loadbm->height;
	if (newx>maxtexwidth)
		newx = maxtexwidth;
	if (newy>maxtexheight)
		newy = maxtexheight;

	// Scaling always goes together with a swizzle to 32-bit RGB.
	if (newx!=loadbm->width || newy!=loadbm->height)
	{	dataType = bitmap_DataTypeRGB | bitmap_RGB_32bit;
		mustSwizzle = true;
		mustScale = true;
	}

	if (mustSwizzle)
	{	// Carry the alpha flags of the source over into the requested data type.
		dataType |= loadbm->flags & bitmap_AlphaMask;
		swizzleBm = SwizzleBitmap(loadbm, dataType);
	}	else
		swizzleBm = loadbm;

	if (mustScale)
	{	// Bitmap needs to be resized before hardware will accept it
		scaleBm = scalebitmap(swizzleBm,newx,newy);
	}	else
		scaleBm = swizzleBm;

	// If we don't have a texture provided, create a new one
	if (!tex)
		tex = newTexture(NULL, 0, 0);
	downloadbitmaptex(tex, scaleBm, 0);
	estimatedtexmemused += tex->texmemused;
	// Delete only the temporaries created above; the caller's bitmap is never deleted.
	if (mustScale)
		deleteBitmap(scaleBm);
	if (mustSwizzle)
		deleteBitmap(swizzleBm);
	if (resizeCanvas)
	{	// Drop the padded canvas and record which fraction of the texture the
		// original image occupies.
		deleteBitmap(loadbm);
		loadbm = resizeCanvasSrc;
		tex->flags |= texture_canvasSize;
		tex->UVscale.x = (float)loadbm->width / (float)tex->width;
		tex->UVscale.y = (float)loadbm->height/ (float)tex->height;
	}
	return tex;
}

Exemplo n.º 28

0

Exibir arquivo

Arquivo: WCellRecast.cpp Projeto: Bootz/WCell

// Builds a tiled Detour navigation mesh for the given input geometry.
//
// The function allocates and initializes the navmesh, starts up to eight
// QuadrantTiler worker threads plus one TileAdder thread, waits for all of
// them and returns the mesh.
//
// geom          input geometry; must be non-null and have a mesh.
// wcellContext  build context used for logging and timing; a copy is handed
//               to every tiler.
// numCores      min(2 * numCores, 8) tiler threads are started.
//
// Returns the navmesh, or 0 when the geometry is missing, the navmesh cannot
// be allocated, or dtNavMesh::init fails (CleanupAfterBuild is called and an
// error is logged in each of these cases).
dtNavMesh* buildMesh(InputGeom* geom, WCellBuildContext* wcellContext, int numCores)
{
	dtNavMesh* mesh = 0;

	if (!geom || !geom->getMesh())
	{
		CleanupAfterBuild();
		wcellContext->log(RC_LOG_ERROR, "buildTiledNavigation: No vertices and triangles.");
		return 0;
	}
	
	mesh = dtAllocNavMesh();
	if (!mesh)
	{
		CleanupAfterBuild();
		wcellContext->log(RC_LOG_ERROR, "buildTiledNavigation: Could not allocate navmesh.");
		return 0;
	}

	// setup some default parameters
	rcConfig cfg;
	memset(&cfg, 0, sizeof(rcConfig));
	const float agentHeight = 2.1f;				// most character toons are about this tall
	const float agentRadius = 0.6f;				// most character toons are about this big around
	const float agentClimb = 1.0f;				// character toons can step up this far. Seems ridiculously high ...
	const float tileSize = 1600.0f/3.0f/16.0f;	// The size of one chunk

	cfg.cs = 0.1f;										// cell size is a sort of resolution -> the bigger the faster
	cfg.ch = 0.05f;										// cell height -> distance from mesh to ground, if too low, recast will not build essential parts of the mesh for some reason
	cfg.walkableSlopeAngle = 50.0f;						// max climbable slope, bigger values won't make much of a change
	cfg.walkableHeight = (int)ceilf(agentHeight/cfg.ch);// minimum space to ceiling
	cfg.walkableClimb = (int)floorf(agentClimb/cfg.ch); // how high the agent can climb in one step
	cfg.walkableRadius = (int)ceilf(agentRadius/cfg.cs);// minimum distance to objects
	cfg.tileSize = (int)(tileSize/cfg.cs + 0.5f);
	cfg.maxEdgeLen = cfg.tileSize/2;
	cfg.borderSize = cfg.walkableRadius + 3;
	cfg.width = cfg.tileSize + cfg.borderSize*2;
	cfg.height = cfg.tileSize + cfg.borderSize*2;	
	cfg.maxSimplificationError = 1.3f;
	cfg.minRegionArea = (int)rcSqr(8);		// Note: area = size*size
	cfg.mergeRegionArea = (int)rcSqr(20);	// Note: area = size*size
	cfg.maxVertsPerPoly = 3;
	cfg.detailSampleDist = cfg.cs * 9;
	cfg.detailSampleMaxError = cfg.ch * 1.0f;

	// default calculations - for some reason not included in basic recast
	const float* bmin = geom->getMeshBoundsMin();
	const float* bmax = geom->getMeshBoundsMax();
	
	// Grid size in cells, then the number of tiles along each axis (rounded up).
	int gw = 0, gh = 0;
	rcCalcGridSize(bmin, bmax, cfg.cs, &gw, &gh);
	const int ts = cfg.tileSize;
	const int tw = (gw + ts-1) / ts;
	const int th = (gh + ts-1) / ts;

	// Max tiles and max polys affect how the tile IDs are calculated.
	// There are 22 bits available for identifying a tile and a polygon.
	// rcMin already limits tileBits to 14, so no further clamp is needed.
	int tileBits = rcMin((int)ilog2(nextPow2(tw*th)), 14);
	int polyBits = 22 - tileBits;
	int maxTiles = 1 << tileBits;
	int maxPolysPerTile = 1 << polyBits;

	dtNavMeshParams params;
	rcVcopy(params.orig, geom->getMeshBoundsMin());
	params.tileWidth = cfg.tileSize * cfg.cs;
	params.tileHeight = cfg.tileSize * cfg.cs;
	params.maxTiles = maxTiles;
	params.maxPolys = maxPolysPerTile;
	
	dtStatus status;
	
	status = mesh->init(&params);
	if (dtStatusFailed(status))
	{
		// Release the navmesh allocated above; it used to leak on this path.
		dtFreeNavMesh(mesh);
		CleanupAfterBuild();
		wcellContext->log(RC_LOG_ERROR, "buildTiledNavigation: Could not init navmesh.");
		return 0;
	}
	
	// start building
	wcellContext->startTimer(RC_TIMER_TEMP);
	
	TileAdder Adder;

	dispatcher.Reset();
	dispatcher.maxHeight = th;
	dispatcher.maxWidth = tw;

	const int maxTilerThreads = 8;
	int numThreads = 0;
	numThreads = std::min(2*numCores, maxTilerThreads);

	// Every worker thread runs on a reference to its tiler (boost::ref), so
	// both the tiler and the thread object must outlive the loop iteration
	// that creates them. They are therefore heap-allocated here and released
	// only after the thread has been joined. Previously both were loop-local
	// objects, which left the running thread with a dangling reference and
	// stored the address of a destroyed boost::thread.
	QuadrantTiler* tilers[maxTilerThreads];
	boost::thread* threads[maxTilerThreads];
	for(int i = 0; i < numThreads; ++i)
	{
		tilers[i] = new QuadrantTiler;
		tilers[i]->geom = geom;
		tilers[i]->cfg = cfg;
		tilers[i]->ctx = *wcellContext;
		threads[i] = new boost::thread(boost::ref(*tilers[i]));
	}
	
	Adder.mesh = mesh;
	Adder.numThreads = numThreads;
	boost::thread AdderThread(boost::ref(Adder));
	
	AdderThread.join();

	// Wait for the tilers before releasing what they reference.
	// NOTE(review): presumably the adder only finishes once every tiler is
	// done, so these joins should return promptly - confirm.
	for(int i = 0; i < numThreads; ++i)
	{
		threads[i]->join();
		delete threads[i];
		delete tilers[i];
	}

	wcellContext->stopTimer(RC_TIMER_TEMP);

	return mesh;
}

Exemplo n.º 29

0

Exibir arquivo

Arquivo: Font.cpp Projeto: xconstruct/game-engine

/**
 * Builds the OpenGL texture and the display list used to draw one character.
 *
 * The glyph for `ch` is loaded from `face`, converted to a bitmap, copied into
 * a buffer padded to power-of-two dimensions (two bytes per pixel), and
 * uploaded to the texture m_textures[ch]. A textured quad followed by a
 * translation by the glyph's horizontal advance is then compiled into the
 * display list m_displayLists + ch.
 *
 * @param face FreeType face the glyph is taken from.
 * @param ch   Character to build. It is also used directly as the index into
 *             m_textures and as the offset added to m_displayLists.
 *             NOTE(review): `char` may be signed, so values above 127 would
 *             produce a negative index here - confirm callers only pass
 *             non-negative characters.
 * @throws std::runtime_error if loading, extracting, or converting the glyph
 *         fails.
 */
void Font::makeDisplayList(FT_Face face, char ch) {
    // Releases the glyph on every exit path. It holds a reference because
    // FT_Glyph_To_Bitmap replaces the handle stored in the variable.
    struct GlyphReleaser {
        explicit GlyphReleaser(FT_Glyph &g) : glyph(g) {}
        ~GlyphReleaser() { FT_Done_Glyph(glyph); }
        FT_Glyph &glyph;
    };

    // Load Glyph for character
    if (FT_Load_Glyph(face, FT_Get_Char_Index(face, ch), FT_LOAD_DEFAULT))
        throw std::runtime_error("FT_Load_Glyph failed");

    // Move Glyph into object
    FT_Glyph glyph;
    if (FT_Get_Glyph(face->glyph, &glyph))
        throw std::runtime_error("FT_Get_Glyph failed");
    GlyphReleaser releaseGlyph(glyph);

    // Convert Glyph to Bitmap. On failure the glyph is left unconverted, so it
    // must not be treated as an FT_BitmapGlyph.
    if (FT_Glyph_To_Bitmap(&glyph, ft_render_mode_normal, 0, 1))
        throw std::runtime_error("FT_Glyph_To_Bitmap failed");
    FT_BitmapGlyph bitmap_glyph = (FT_BitmapGlyph)glyph;
    FT_Bitmap &bitmap = bitmap_glyph->bitmap;

    // Resize to OpenGL power of 2 and two channels (luminosity and alpha)
    int width = nextPow2(bitmap.width);
    int height = nextPow2(bitmap.rows);
    GLubyte *expandedData = new GLubyte[2 * width * height];
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            // Both channels receive the glyph coverage value; texels outside
            // the glyph bitmap are padding and are set to 0.
            // NOTE(review): source rows are addressed with bitmap.width, i.e.
            // this assumes the bitmap rows are tightly packed - confirm.
            int pos = 2 * (x + y*width);
            expandedData[pos] = expandedData[pos+1] =
                                    (x >= bitmap.width || y >= bitmap.rows) ?
                                    0 : bitmap.buffer[x + bitmap.width * y];
        }
    }

    // Create OpenGL texture
    glBindTexture(GL_TEXTURE_2D, m_textures[ch]);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, width, height, 0,
                 GL_LUMINANCE_ALPHA, GL_UNSIGNED_BYTE, expandedData);

    // The pixel data was handed to glTexImage2D above; the staging buffer is
    // no longer needed.
    delete[] expandedData;

    // Now we create the Display List
    glNewList(m_displayLists+ch, GL_COMPILE);
    glBindTexture(GL_TEXTURE_2D, m_textures[ch]);
    glPushMatrix();

    // Center character correctly
    glTranslatef(bitmap_glyph->left, 0, 0);
    glTranslatef(0, bitmap_glyph->top - bitmap.rows, 0);

    // Calculate real size versus padding space
    float x = float(bitmap.width) / float(width);
    float y = float(bitmap.rows) / float(height);

    // Draw the quad; texture coordinates stop at (x, y) so the padding added
    // for the power-of-two size is not drawn.
    glBegin(GL_QUADS);
    glTexCoord2d(0, 0);
    glVertex2f(0, bitmap.rows);
    glTexCoord2d(0, y);
    glVertex2f(0, 0);
    glTexCoord2d(x, y);
    glVertex2f(bitmap.width, 0);
    glTexCoord2d(x, 0);
    glVertex2f(bitmap.width, bitmap.rows);
    glEnd();
    glPopMatrix();

    // Move right by the glyph advance (26.6 fixed point, hence the shift by 6)
    // so the next character is drawn after this one.
    glTranslatef(face->glyph->advance.x >> 6, 0, 0);

    // Increment the raster position as if it were a bitmap font
    // glBitmap(0, 0, 0, 0, face->glyph->advance.x >> 6, 0, NULL);

    glEndList();
}

Exemplo n.º 30

0

Exibir arquivo

Arquivo: SimpleContactSolver.cpp Projeto: kkaushalp/aphid

void SimpleContactSolver::solveContacts(unsigned numContacts,
										CUDABuffer * contactBuf,
										CUDABuffer * pairBuf,
										void * objectData)
{
#if DISABLE_COLLISION_RESOLUTION
	return;
#endif
    if(numContacts < 1) return; 
    
	m_numContacts = numContacts;
	const unsigned indBufLength = iRound1024(numContacts * 2);
	
	m_sortedInd[0]->create(indBufLength * 8);	
	m_sortedInd[1]->create(indBufLength * 8);
	
	void * bodyContactHash = m_sortedInd[0]->bufferOnDevice();
	void * pairs = pairBuf->bufferOnDevice();
	
	simpleContactSolverWriteContactIndex((KeyValuePair *)bodyContactHash, (uint *)pairs, numContacts * 2, indBufLength);
	
	void * tmp = m_sortedInd[1]->bufferOnDevice();
	RadixSort((KeyValuePair *)bodyContactHash, (KeyValuePair *)tmp, indBufLength, 30);
	
	m_splitPair->create(numContacts * 8);
	void * splits = m_splitPair->bufferOnDevice();
	
	const unsigned splitBufLength = numContacts * 2;
	simpleContactSolverComputeSplitBufLoc((uint2 *)splits, 
	                        (uint2 *)pairs, 
	                        (KeyValuePair *)bodyContactHash, 
	                        splitBufLength);
	
	m_bodyCount->create(splitBufLength * 4);
	void * bodyCount = m_bodyCount->bufferOnDevice();
	simpleContactSolverCountBody((uint *)bodyCount, 
	                        (KeyValuePair *)bodyContactHash, 
	                        splitBufLength);
							
	int mxcount = 0;
	max<int>(mxcount, (int *)bodyCount, splitBufLength);
// if(mxcount>9) std::cout<<" max count per contact "<<mxcount; 
	int numiterations = mxcount + 3;
	
	m_splitInverseMass->create(splitBufLength * 4);
	void * splitMass = m_splitInverseMass->bufferOnDevice();
	
	CudaNarrowphase::CombinedObjectBuffer * objectBuf = (CudaNarrowphase::CombinedObjectBuffer *)objectData;
	void * pos = objectBuf->m_pos->bufferOnDevice();
	void * vel = objectBuf->m_vel->bufferOnDevice();
	void * mass = objectBuf->m_mass->bufferOnDevice();
	void * ind = objectBuf->m_ind->bufferOnDevice();
	void * perObjPointStart = objectBuf->m_pointCacheLoc->bufferOnDevice();
	void * perObjectIndexStart = objectBuf->m_indexCacheLoc->bufferOnDevice();
	
	simpleContactSolverComputeSplitInverseMass((float *)splitMass,
	                        (uint2 *)splits,
	                        (uint2 *)pairs,
	                        (float *)mass,
	                        (uint4 *)ind,
	                        (uint * )perObjPointStart,
	                        (uint * )perObjectIndexStart,
                            (uint *)bodyCount,
                            splitBufLength);
	
	m_constraint->create(numContacts * 64);
	void * constraint = m_constraint->bufferOnDevice();
	
	void * contacts = contactBuf->bufferOnDevice();
	
	simpleContactSolverSetContactConstraint((ContactConstraint *)constraint,
	    (uint2 *)splits,
	    (uint2 *)pairs,
	    (float3 *)pos,
	    (float3 *)vel,
	    (uint4 *)ind,
        (uint * )perObjPointStart,
        (uint * )perObjectIndexStart,
        (float *)splitMass,
	    (ContactData *)contacts,
        numContacts * 2);
    CudaBase::CheckCudaError("jacobi solver set constraint");
	
	m_deltaLinearVelocity->create(nextPow2(splitBufLength * 12));
	m_deltaAngularVelocity->create(nextPow2(splitBufLength * 12));
	
	void * deltaLinVel = m_deltaLinearVelocity->bufferOnDevice();
	void * deltaAngVel = m_deltaAngularVelocity->bufferOnDevice();
	simpleContactSolverClearDeltaVelocity((float3 *)deltaLinVel, 
	                            (float3 *)deltaAngVel, 
	                            splitBufLength);
	
	/*
	const unsigned scanBufLength = iRound1024(numContacts * 2);
	m_bodyCount->create(scanBufLength * 4);
	m_scanBodyCount[0]->create(scanBufLength * 4);
	m_scanBodyCount[1]->create(scanBufLength * 4);
	
	
	void * scanResult = m_scanBodyCount[0]->bufferOnDevice();
	void * scanIntermediate = m_scanBodyCount[1]->bufferOnDevice();
	scanExclusive((uint *)scanResult, (uint *)bodyCount, (uint *)scanIntermediate, scanBufLength / 1024, 1024);
	
	const unsigned numSplitBodies = ScanUtil::getScanResult(m_bodyCount, m_scanBodyCount[0], scanBufLength);
	*/
	
	int i;
	for(i=0; i< numiterations; i++) {
// compute impulse and velocity changes per contact
        simpleContactSolverSolveContactWoJ((ContactConstraint *)constraint,
	                    (float3 *)deltaLinVel,
	                    (float3 *)deltaAngVel,
	                    (uint2 *)pairs,
	                    (uint2 *)splits,
	                    (float *)splitMass,
	                    (ContactData *)contacts,
	                    (float3 *)pos,
	                    (float3 *)vel,
	                    (uint4 *)ind,
	                    (uint * )perObjPointStart,
	                    (uint * )perObjectIndexStart,
	                    numContacts * 2);
        CudaBase::CheckCudaError("jacobi solver solve impulse");
    
	    simpleContactSolverAverageVelocities((float3 *)deltaLinVel,
                        (float3 *)deltaAngVel,
                        (uint *)bodyCount,
                        (KeyValuePair *)bodyContactHash, 
                        splitBufLength);
        CudaBase::CheckCudaError("jacobi solver average velocity");
	}
	
// 2 tet per contact, 4 pnt per tet, key is pnt index, value is tet index in split
	const unsigned pntHashBufLength = iRound1024(numContacts * 2 * 4);
    // std::cout<<"\n pntHashBufLength"<<pntHashBufLength
    // <<" numContact"<<numContacts;
	m_pntTetHash[0]->create(pntHashBufLength * 8);
	m_pntTetHash[1]->create(pntHashBufLength * 8);
	
	void * pntTetHash = m_pntTetHash[0]->bufferOnDevice();
	
	simpleContactSolverWritePointTetHash((KeyValuePair *)pntTetHash,
	                (uint2 *)pairs,
	                (uint2 *)splits,
	                (uint *)bodyCount,
	                (uint4 *)ind,
	                (uint * )perObjPointStart,
	                (uint * )perObjectIndexStart,
	                numContacts * 2,
	                pntHashBufLength);
    CudaBase::CheckCudaError(CudaBase::Synchronize(),
                             "jacobi solver point-tetra hash");
    
	void * intermediate = m_pntTetHash[1]->bufferOnDevice();
	RadixSort((KeyValuePair *)pntTetHash, (KeyValuePair *)intermediate, pntHashBufLength, 24);

#if 0
    svlg.writeHash(m_pntTetHash[1], numContacts * 2, 
                   "pnttet_hash", CudaDbgLog::FAlways);
#endif
    
	simpleContactSolverUpdateVelocity((float3 *)vel,
	                (float3 *)deltaLinVel,
	                (float3 *)deltaAngVel,
	                (KeyValuePair *)pntTetHash,
                    (uint2 *)pairs,
                    (uint2 *)splits,
                    (ContactConstraint *)constraint,
                    (ContactData *)contacts,
                    (float3 *)pos,
                    (uint4 *)ind,
                    (uint * )perObjPointStart,
                    (uint * )perObjectIndexStart,
                    numContacts * 2 * 4);
    CudaBase::CheckCudaError(CudaBase::Synchronize(),
        "jacobi solver update velocity");
}