/// Creates a persistent contact manifold for the object pair (b0, b1), appends it to
/// m_manifoldsPtr and returns it.
///
/// Storage is taken from m_persistentManifoldPoolAllocator while it reports free slots.
/// When the pool is exhausted the manifold is allocated with btAlignedAlloc, unless
/// CD_DISABLE_CONTACTPOOL_DYNAMIC_ALLOCATION is set, in which case the function asserts
/// and returns 0. gNumManifold is incremented on every call, including that failure path.
btPersistentManifold* btCollisionDispatcher::getNewManifold(void* b0, void* b1)
{
    gNumManifold++;

    btCollisionObject* body0 = static_cast<btCollisionObject*>(b0);
    btCollisionObject* body1 = static_cast<btCollisionObject*>(b1);

    // The relative contact breaking threshold is optional and on by default; it can be
    // switched off through setDispatcherFlags for improved performance.
    btScalar breakingThreshold = gContactBreakingThreshold;
    if (m_dispatcherFlags & btCollisionDispatcher::CD_USE_RELATIVE_CONTACT_BREAKING_THRESHOLD)
    {
        breakingThreshold = btMin(body0->getCollisionShape()->getContactBreakingThreshold(gContactBreakingThreshold),
                                  body1->getCollisionShape()->getContactBreakingThreshold(gContactBreakingThreshold));
    }

    btScalar processingThreshold = btMin(body0->getContactProcessingThreshold(), body1->getContactProcessingThreshold());

    void* storage = 0;
    if (m_persistentManifoldPoolAllocator->getFreeCount())
    {
        storage = m_persistentManifoldPoolAllocator->allocate(sizeof(btPersistentManifold));
    }
    else if (m_dispatcherFlags & CD_DISABLE_CONTACTPOOL_DYNAMIC_ALLOCATION)
    {
        // Pool overflow while a contiguous contact pool is required: assert.
        // Increase m_defaultMaxPersistentManifoldPoolSize in the
        // btDefaultCollisionConstructionInfo/btDefaultCollisionConfiguration to avoid this.
        btAssert(0);
        return 0;
    }
    else
    {
        // Pool overflow: by default fall back to a dynamic allocation.
        storage = btAlignedAlloc(sizeof(btPersistentManifold), 16);
    }

    btPersistentManifold* created = new (storage) btPersistentManifold(body0, body1, 0, breakingThreshold, processingThreshold);
    // Remember the slot this manifold occupies in m_manifoldsPtr.
    created->m_index1a = m_manifoldsPtr.size();
    m_manifoldsPtr.push_back(created);

    return created;
}
Exemple #2
0
void PhysicsEngine::stepSimulation() {
    CProfileManager::Reset();
    BT_PROFILE("stepSimulation");
    // NOTE: the grand order of operations is:
    // (1) pull incoming changes
    // (2) step simulation
    // (3) synchronize outgoing motion states
    // (4) send outgoing packets

    const float MAX_TIMESTEP = (float)PHYSICS_ENGINE_MAX_NUM_SUBSTEPS * PHYSICS_ENGINE_FIXED_SUBSTEP;
    float dt = 1.0e-6f * (float)(_clock.getTimeMicroseconds());
    _clock.reset();
    float timeStep = btMin(dt, MAX_TIMESTEP);

    if (_myAvatarController) {
        BT_PROFILE("avatarController");
        // TODO: move this stuff outside and in front of stepSimulation, because
        // the updateShapeIfNecessary() call needs info from MyAvatar and should
        // be done on the main thread during the pre-simulation stuff
        if (_myAvatarController->needsRemoval()) {
            _myAvatarController->setDynamicsWorld(nullptr);

            // We must remove any existing contacts for the avatar so that any new contacts will have
            // valid data.  MyAvatar's RigidBody is the ONLY one in the simulation that does not yet
            // have a MotionState so we pass nullptr to removeContacts().
            removeContacts(nullptr);
        }
        _myAvatarController->updateShapeIfNecessary();
        if (_myAvatarController->needsAddition()) {
            _myAvatarController->setDynamicsWorld(_dynamicsWorld);
        }
        _myAvatarController->preSimulation();
    }

    auto onSubStep = [this]() {
        updateContactMap();
    };

    int numSubsteps = _dynamicsWorld->stepSimulationWithSubstepCallback(timeStep, PHYSICS_ENGINE_MAX_NUM_SUBSTEPS,
                                                                        PHYSICS_ENGINE_FIXED_SUBSTEP, onSubStep);
    if (numSubsteps > 0) {
        BT_PROFILE("postSimulation");
        _numSubsteps += (uint32_t)numSubsteps;
        ObjectMotionState::setWorldSimulationStep(_numSubsteps);

        if (_myAvatarController) {
            _myAvatarController->postSimulation();
        }

        _hasOutgoingChanges = true;
    }
}
/// Copies a run of pixels, starting at startPixelIndex, from the image held in m_data
/// into the caller's buffers.
///
/// @param pixelsRGBA              destination for 4-byte RGBA pixels (may be null); the source
///                                colour buffer is read as 3 bytes per pixel and alpha is set to 255
/// @param rgbaBufferSizeInPixels  maximum number of pixels to copy in this call
/// @param depthBuffer             destination for depth values (may be null)
/// @param segmentationMaskBuffer  destination for segmentation values (may be null)
/// @param startPixelIndex         index of the first source pixel to copy
/// @param widthPtr, heightPtr     receive the image dimensions when non-null
/// @param numPixelsCopied         receives the number of pixels copied when non-null
///
/// NOTE(review): depthBufferSizeInPixels and segmentationMaskSizeInPixels are not consulted;
/// every non-null output buffer is written for the same pixel count, which is bounded only by
/// rgbaBufferSizeInPixels. Callers presumably size all buffers alike — confirm.
void TinyRendererVisualShapeConverter::copyCameraImageData(unsigned char* pixelsRGBA, int rgbaBufferSizeInPixels,
                                                            float* depthBuffer, int depthBufferSizeInPixels,
                                                            int* segmentationMaskBuffer, int segmentationMaskSizeInPixels,
                                                            int startPixelIndex, int* widthPtr, int* heightPtr, int* numPixelsCopied)
{
    const int w = m_data->m_rgbColorBuffer.get_width();
    const int h = m_data->m_rgbColorBuffer.get_height();

    if (numPixelsCopied)
        *numPixelsCopied = 0;

    if (widthPtr)
        *widthPtr = w;

    if (heightPtr)
        *heightPtr = h;

    // A negative start index would read before the beginning of the source buffers.
    if (startPixelIndex < 0)
        return;

    const int numTotalPixels = w*h;
    const int numRemainingPixels = numTotalPixels - startPixelIndex;
    const int numBytesPerPixel = 4;//RGBA
    const int numRequestedPixels = btMin(rgbaBufferSizeInPixels,numRemainingPixels);

    // Nothing left to copy (start index at/after the end, or a non-positive buffer size).
    // Return with *numPixelsCopied == 0 instead of reporting a negative count.
    if (numRequestedPixels <= 0)
        return;

    for (int i=0;i<numRequestedPixels;i++)
    {
        const int srcPixel = i+startPixelIndex;

        if (depthBuffer)
        {
            depthBuffer[i] = m_data->m_depthBuffer[srcPixel];
        }
        if (segmentationMaskBuffer)
        {
            segmentationMaskBuffer[i] = m_data->m_segmentationMaskBuffer[srcPixel];
        }

        if (pixelsRGBA)
        {
            // Expand RGB (3 bytes per pixel) to RGBA with an opaque alpha channel.
            pixelsRGBA[i*numBytesPerPixel] =   m_data->m_rgbColorBuffer.buffer()[srcPixel*3+0];
            pixelsRGBA[i*numBytesPerPixel+1] = m_data->m_rgbColorBuffer.buffer()[srcPixel*3+1];
            pixelsRGBA[i*numBytesPerPixel+2] = m_data->m_rgbColorBuffer.buffer()[srcPixel*3+2];
            pixelsRGBA[i*numBytesPerPixel+3] = 255;
        }
    }

    if (numPixelsCopied)
        *numPixelsCopied = numRequestedPixels;
}
/// Copies a run of RGBA pixels of the rendered window image into pixelsRGBA.
///
/// When startPixelIndex is 0 the scene is rendered and the screen pixels are captured into
/// m_data->m_rgbaPixelBuffer; calls with a non-zero startPixelIndex read from that cached capture.
///
/// @param pixelsRGBA              destination for 4-byte RGBA pixels (may be null, in which case
///                                nothing is written but the pixel count is still reported)
/// @param rgbaBufferSizeInPixels  maximum number of pixels to copy in this call
/// @param depthBuffer             not written by this implementation
/// @param depthBufferSizeInPixels not consulted by this implementation
/// @param startPixelIndex         index of the first pixel to copy
/// @param widthPtr, heightPtr     receive the current window size times the retina scale when non-null
/// @param numPixelsCopied         receives the number of pixels covered by this call when non-null
void OpenGLGuiHelper::copyCameraImageData(unsigned char* pixelsRGBA, int rgbaBufferSizeInPixels, float* depthBuffer, int depthBufferSizeInPixels, int startPixelIndex, int* widthPtr, int* heightPtr, int* numPixelsCopied)
{
    int w = m_data->m_glApp->m_window->getWidth()*m_data->m_glApp->m_window->getRetinaScale();
    int h = m_data->m_glApp->m_window->getHeight()*m_data->m_glApp->m_window->getRetinaScale();

    if (widthPtr)
        *widthPtr = w;
    if (heightPtr)
        *heightPtr = h;
    if (numPixelsCopied)
        *numPixelsCopied = 0;

    // A negative start index would read before the beginning of the cached image.
    if (startPixelIndex < 0)
        return;

    int numTotalPixels = w*h;
    int numRemainingPixels = numTotalPixels - startPixelIndex;
    int numBytesPerPixel = 4;//RGBA
    int numRequestedPixels  = btMin(rgbaBufferSizeInPixels,numRemainingPixels);

    // Nothing left to copy: return with *numPixelsCopied == 0 instead of a negative count.
    if (numRequestedPixels <= 0)
        return;

    if (startPixelIndex==0)
    {
        //quick test: render the scene
        getRenderInterface()->renderScene();
        //copy the image into our local cache
        m_data->m_rgbaPixelBuffer.resize(w*h*numBytesPerPixel);
        m_data->m_depthBuffer.resize(w*h);
        m_data->m_glApp->getScreenPixels(&(m_data->m_rgbaPixelBuffer[0]),m_data->m_rgbaPixelBuffer.size());
    }

    // The cache is only refreshed for chunk 0 while w/h are re-read on every call, so a later
    // chunk must be limited to what the cache actually holds (e.g. after a window resize).
    int numCachedPixels = (int)m_data->m_rgbaPixelBuffer.size() / numBytesPerPixel;
    numRequestedPixels = btMin(numRequestedPixels, numCachedPixels - startPixelIndex);
    if (numRequestedPixels <= 0)
        return;

    if (pixelsRGBA)
    {
        const int firstByte = startPixelIndex*numBytesPerPixel;
        const int numBytes = numRequestedPixels*numBytesPerPixel;
        for (int i=0; i<numBytes; i++)
        {
            pixelsRGBA[i] = m_data->m_rgbaPixelBuffer[i+firstByte];
        }
    }

    if (numPixelsCopied)
        *numPixelsCopied = numRequestedPixels;
}
/// Returns the ShapeCache stored in the shape's user pointer, creating it on first use.
///
/// A new cache builds the shape hull (using the shape's margin), is recorded in m_shapecaches
/// and attached to the shape via setUserPointer. Its edge list gets one entry per distinct
/// vertex pair of the hull triangles; each entry stores the normals of the faces sharing it.
GL_ShapeDrawer::ShapeCache*		GL_ShapeDrawer::cache(btConvexShape* shape)
{
	ShapeCache*		cached=(ShapeCache*)shape->getUserPointer();
	if(cached)
	{
		return(cached);
	}

	cached=new(btAlignedAlloc(sizeof(ShapeCache),16)) ShapeCache(shape);
	cached->m_shapehull.buildHull(shape->getMargin());
	m_shapecaches.push_back(cached);
	shape->setUserPointer(cached);

	/* Build edges	*/
	const int			indexCount=cached->m_shapehull.numIndices();
	const int			vertexCount=cached->m_shapehull.numVertices();
	const unsigned int*	indices=cached->m_shapehull.getIndexPointer();
	const btVector3*	vertices=cached->m_shapehull.getVertexPointer();

	// Lookup table addressed by (min vertex, max vertex), so both directions of an edge share a slot.
	btAlignedObjectArray<ShapeCache::Edge*>	edgeLookup;
	// Reserve up front: the table stores pointers into m_edges, which must not reallocate.
	cached->m_edges.reserve(indexCount);
	edgeLookup.resize(vertexCount*vertexCount,0);

	for(int first=0;first<indexCount;first+=3)
	{
		const unsigned int* tri=indices+first;
		const btVector3		faceNormal=btCross(vertices[tri[1]]-vertices[tri[0]],vertices[tri[2]]-vertices[tri[0]]).normalized();

		// Visit the triangle's edges as corner pairs (2,0), (0,1), (1,2).
		int from=2;
		for(int to=0;to<3;++to)
		{
			const unsigned int	a=tri[from];
			const unsigned int	b=tri[to];
			ShapeCache::Edge*&	slot=edgeLookup[btMin(a,b)*vertexCount+btMax(a,b)];
			if(slot)
			{
				// Edge seen before: this triangle is its second face.
				slot->n[1]=faceNormal;
			}
			else
			{
				// First sighting: until a second face shows up, the opposite normal is the flipped one.
				cached->m_edges.push_back(ShapeCache::Edge());
				slot=&cached->m_edges[cached->m_edges.size()-1];
				slot->n[0]=faceNormal;
				slot->n[1]=-faceNormal;
				slot->v[0]=a;
				slot->v[1]=b;
			}
			from=to;
		}
	}
	return(cached);
}
Exemple #6
0
void PhysicsEngine::stepSimulation() {
    CProfileManager::Reset();
    BT_PROFILE("stepSimulation");
    // NOTE: the grand order of operations is:
    // (1) pull incoming changes
    // (2) step simulation
    // (3) synchronize outgoing motion states
    // (4) send outgoing packets

    const float MAX_TIMESTEP = (float)PHYSICS_ENGINE_MAX_NUM_SUBSTEPS * PHYSICS_ENGINE_FIXED_SUBSTEP;
    float dt = 1.0e-6f * (float)(_clock.getTimeMicroseconds());
    _clock.reset();
    float timeStep = btMin(dt, MAX_TIMESTEP);

    // TODO: move character->preSimulation() into relayIncomingChanges
    if (_characterController) {
        if (_characterController->needsRemoval()) {
            _characterController->setDynamicsWorld(nullptr);
        }
        _characterController->updateShapeIfNecessary();
        if (_characterController->needsAddition()) {
            _characterController->setDynamicsWorld(_dynamicsWorld);
        }
        _characterController->preSimulation(timeStep);
    }

    int numSubsteps = _dynamicsWorld->stepSimulation(timeStep, PHYSICS_ENGINE_MAX_NUM_SUBSTEPS, PHYSICS_ENGINE_FIXED_SUBSTEP);
    if (numSubsteps > 0) {
        BT_PROFILE("postSimulation");
        _numSubsteps += (uint32_t)numSubsteps;
        ObjectMotionState::setWorldSimulationStep(_numSubsteps);

        if (_characterController) {
            _characterController->postSimulation();
        }
        updateContactMap();
        _hasOutgoingChanges = true;
    }
}
void btGpuDemo3dOCLWrap::runKernelWithWorkgroupSize(int kernelId, int globalSize)
{
	if(globalSize <= 0)
	{
		return;
	}
	cl_kernel kernelFunc = m_kernels[kernelId].m_kernel;
	cl_int ciErrNum = clSetKernelArg(kernelFunc, 0, sizeof(int), (void*)&globalSize);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	int workgroupSize = m_kernels[kernelId].m_workgroupSize;
	if(workgroupSize <= 0)
	{ // let OpenCL library calculate workgroup size
		size_t globalWorkSize[2];
		globalWorkSize[0] = globalSize;
		globalWorkSize[1] = 1;
		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, kernelFunc, 1, NULL, globalWorkSize, NULL, 0,0,0 );
	}
	else
	{
		size_t localWorkSize[2], globalWorkSize[2];
		workgroupSize = btMin(workgroupSize, globalSize);
		int num_t = globalSize / workgroupSize;
		int num_g = num_t * workgroupSize;
		if(num_g < globalSize)
		{
			num_t++;
		}
		localWorkSize[0]  = workgroupSize;
		globalWorkSize[0] = num_t * workgroupSize;
		localWorkSize[1] = 1;
		globalWorkSize[1] = 1;
		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, kernelFunc, 1, NULL, globalWorkSize, localWorkSize, 0,0,0 );
	}
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	ciErrNum = clFlush(m_cqCommandQue);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
Exemple #8
0
// Returns the water surface height at horizontal position (pos_x, pos_z) for the water
// wrapper with index wrapperNum.
//
// Returns 0 when no wrapper exists for that index, and the wrapper's flat waterLevel when
// its wavesHeight is zero. Otherwise the result is waterLevel plus a wave offset: the
// product of two time-animated noise samples ("distant" waves), mixed with a directional
// sine wave when the wrapper has a shore-distance array and the position lies inside it.
btScalar duWater::getWaterLevel(btScalar pos_x,
                                btScalar pos_z,
                                int wrapperNum)
{
    // get water wrapper by index
    duWater::WaterWrapper *wrapper = wrapperByInd(wrapperNum);
    if (!wrapper)
        return btScalar(0.0);

    if (!wrapper->wavesHeight)
        return wrapper->waterLevel;

    float time = m_time;
    duWaterDynInfo* dyn = wrapper->dynamicsInfo;

    ////////// DISTANT WAVES //////////
    // first component
    float coords_a[2];
    coords_a[0] = dyn->dst_noise_scale0 *
                      (pos_x + dyn->dst_noise_freq0 * time);
    coords_a[1] = dyn->dst_noise_scale0 *
                      (pos_z + dyn->dst_noise_freq0 * time);
    float noise_a = snoise(coords_a);

    // second component: axes swapped and scrolling the opposite way
    float coords_b[2];
    coords_b[0] = dyn->dst_noise_scale1 *
                      (pos_z - dyn->dst_noise_freq1 * time);
    coords_b[1] = dyn->dst_noise_scale1 *
                      (pos_x - dyn->dst_noise_freq1 * time);
    float noise_b = snoise(coords_b);

    float distant = wrapper->wavesHeight * noise_a * noise_b;

    // Without a shore-distance array only the distant waves contribute.
    if (!wrapper->wDistArray)
        return wrapper->waterLevel + distant;

    ////////// SHORE WAVES //////////
    // normalized coordinates inside the shore map
    double u = (pos_x - wrapper->shoreMapCenterX) / wrapper->shoreMapSizeX;
    double v = (wrapper->shoreMapCenterZ + pos_z) / wrapper->shoreMapSizeZ;
    u += 0.5f;
    v += 0.5f;

    // if position is out of boundings, consider that shore dist = 1
    if (u > 1.f || u < 0.f || v > 1.f || v < 0.f)
        return wrapper->waterLevel + distant;

    // get coordinates in pixels
    int map_size = wrapper->shoreMapTexSize;
    u *= map_size - .5f;
    v *= map_size - .5f;

    double whole_u;
    double whole_v;
    float frac_u = modf(u, &whole_u);
    float frac_v = modf(v, &whole_v);

    int col = static_cast<int>(whole_u);
    int row = static_cast<int>(whole_v);

    // Neighbouring column/row, clamped to the last one.
    int last = map_size - 1;
    int next_col = btMin(col + 1, last);
    int next_row = btMin(row + 1, last);

    btScalar *shore_map = wrapper->wDistArray;

    float d00 = shore_map[row * map_size + col];
    float d10 = shore_map[row * map_size + next_col];
    float d01 = shore_map[next_row * map_size + col];
    float d11 = shore_map[next_row * map_size + next_col];

    // interpolate along the bottom and top edges, then between them
    float bottom = d00 * (1.f - frac_u) + d10 * frac_u;
    float top = d01 * (1.f - frac_u) + d11 * frac_u;

    float shore_dist = bottom * (1.f - frac_v) + top * frac_v;

    float shore_waves_length = wrapper->wavesLength / float(wrapper->maxShoreDist) / M_PI;

    float waves_coords[2] = {dyn->dir_noise_scale *
                         (pos_x + dyn->dir_noise_freq * time),
                             dyn->dir_noise_scale *
                         (pos_z + dyn->dir_noise_freq * time)};

    float dist_fact = sqrt(shore_dist);

    float shore_dir_waves = wrapper->wavesHeight
        * fmax(shore_dist, dyn->dir_min_shore_fac)
        * sinf((dist_fact / shore_waves_length + dyn->dir_freq * time))
        * fmax(snoise(waves_coords), dyn->dir_min_noise_fac);

    // mix two types of waves basing on distance to the shore
    float mix_rate = btMax(dist_fact, dyn->dst_min_fac);
    float wave_height = shore_dir_waves * (1 - mix_rate) + distant * mix_rate;

    return wrapper->waterLevel + wave_height;
}
// Main function 
// *********************************************************************
int main(int argc, char **argv)
{
	void *srcA, *srcB, *dst;        // Host buffers for OpenCL test
    cl_context cxGPUContext;       // OpenCL context
    cl_command_queue cqCommandQue;  // OpenCL command que
    cl_device_id* cdDevices;        // OpenCL device list    
    cl_program cpProgram;           // OpenCL program
    cl_kernel ckKernel;             // OpenCL kernel
    cl_mem cmMemObjs[3];            // OpenCL memory buffer objects:  3 for device
    size_t szGlobalWorkSize[1];     // 1D var for Total # of work items
    size_t szLocalWorkSize[1];		// 1D var for # of work items in the work group	
    size_t szParmDataBytes;			// Byte size of context information
    cl_int ciErr1, ciErr2;			// Error code var
    int iTestN = 100000 * 8;		// Size of Vectors to process

	int actualGlobalSize = iTestN>>3;
	
    // set Global and Local work size dimensions
    szGlobalWorkSize[0] = iTestN >> 3;  // do 8 computations per work item
    szLocalWorkSize[0]= iTestN>>3;
	
	
    // Allocate and initialize host arrays
    srcA = (void *)malloc (sizeof(cl_float) * iTestN);
    srcB = (void *)malloc (sizeof(cl_float) * iTestN);
    dst = (void *)malloc (sizeof(cl_float) * iTestN);

	int i;

	// Initialize arrays with some values
	for (i=0;i<iTestN;i++)
	{
		((cl_float*)srcA)[i] = cl_float(i);
		((cl_float*)srcB)[i] = 2;
		((cl_float*)dst)[i]=-1;
	}


	 cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);

    if (0 < numPlatforms) 
    {
        cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
        
        for (unsigned i = 0; i < numPlatforms; ++i) 
        {
            char pbuf[100];
            status = clGetPlatformInfo(platforms[i],
                                       CL_PLATFORM_VENDOR,
                                       sizeof(pbuf),
                                       pbuf,
                                       NULL);

            platform = platforms[i];
            if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) 
            {
                break;
            }
        }
        delete[] platforms;
    }

	cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    // Create OpenCL context & context
    cxGPUContext = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU
	
    // Query all devices available to the context
    ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
    cdDevices = (cl_device_id*)malloc(szParmDataBytes);
    ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
	if (cdDevices)
	{
		printDevInfo(cdDevices[0]);
	}

    // Create a command queue for first device the context reported
    cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr2);
    ciErr1 |= ciErr2; 

    // Allocate the OpenCL source and result buffer memory objects on the device GMEM
    cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcA, &ciErr2);
    ciErr1 |= ciErr2;
    cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcB, &ciErr2);
    ciErr1 |= ciErr2;
    cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float8) * szGlobalWorkSize[0], NULL, &ciErr2);
    ciErr1 |= ciErr2;

///create kernels from binary
	int numDevices = 1;
	::size_t* lengths = (::size_t*) malloc(numDevices * sizeof(::size_t));
	const unsigned char** images = (const unsigned char**) malloc(numDevices * sizeof(const void*));

	for (i = 0; i < numDevices; ++i) {
		images[i] = 0;
		lengths[i] = 0;
	}

	
	// Read the OpenCL kernel in from source file
	const char* cSourceFile = "VectorAddKernels.cl";
	
    printf("loadProgSource (%s)...\n", cSourceFile); 
    const char* cPathAndName = cSourceFile;
#ifdef LOAD_FROM_FILE
	size_t szKernelLength;
    const char* cSourceCL = loadProgSource(cPathAndName, "", &szKernelLength);
#else
	const char* cSourceCL = stringifiedSourceCL;
	size_t szKernelLength = strlen(stringifiedSourceCL);
#endif //LOAD_FROM_FILE


	
    // Create the program
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
    printf("clCreateProgramWithSource...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        exit(0);
    }
	
    // Build the program with 'mad' Optimization option
#ifdef MAC
	char* flags = "-cl-mad-enable -DMAC -DGUID_ARG";
#else
	const char* flags = "-DGUID_ARG=";
#endif
    ciErr1 = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
    printf("clBuildProgram...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        exit(0);
    }
	
    // Create the kernel
    ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1);
    printf("clCreateKernel (VectorAdd)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		exit(0);
    }
	
	
	cl_int ciErrNum;
	
	ciErrNum = clGetKernelWorkGroupInfo(ckKernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
	if (ciErrNum != CL_SUCCESS)
	{
		printf("cannot get workgroup size\n");
		exit(0);
	}

	

   
    // Set the Argument values
    ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]);
    ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]);
    ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]);

	
	
	int workgroupSize = wgSize;
	if(workgroupSize <= 0)
	{ // let OpenCL library calculate workgroup size
		size_t globalWorkSize[2];
		globalWorkSize[0] = actualGlobalSize;
		globalWorkSize[1] = 1;
	
		// Copy input data from host to GPU and launch kernel 
		ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalWorkSize, NULL, 0,0,0 );

	}
	else
	{
		size_t localWorkSize[2], globalWorkSize[2];
		workgroupSize = btMin(workgroupSize, actualGlobalSize);
		int num_t = actualGlobalSize / workgroupSize;
		int num_g = num_t * workgroupSize;
		if(num_g < actualGlobalSize)
		{
			num_t++;
			//this can cause problems -> processing outside of the buffer
			//make sure to check kernel
		}

		size_t globalThreads[] = {num_t * workgroupSize};
		size_t localThreads[] = {workgroupSize};


		localWorkSize[0]  = workgroupSize;
		globalWorkSize[0] = num_t * workgroupSize;
		localWorkSize[1] = 1;
		globalWorkSize[1] = 1;

		// Copy input data from host to GPU and launch kernel 
		ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL);

	}
	
	if (ciErrNum != CL_SUCCESS)
	{
		printf("cannot clEnqueueNDRangeKernel\n");
		exit(0);
	}
	
	clFinish(cqCommandQue);
    // Read back results and check accumulated errors
    ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL);

    // Release kernel, program, and memory objects
	// NOTE:  Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
    free(cdDevices);
	clReleaseKernel(ckKernel);  
    clReleaseProgram(cpProgram);
    clReleaseCommandQueue(cqCommandQue);
    clReleaseContext(cxGPUContext);


    // print the results
    int iErrorCount = 0;
    for (i = 0; i < iTestN; i++) 
    {
		if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i])
			iErrorCount++;
    }
	
	if (iErrorCount)
	{
		printf("MiniCL validation FAILED\n");
	} else
	{
		printf("MiniCL validation SUCCESSFULL\n");
	}
    // Free host memory, close log and return success
	for (i = 0; i < 3; i++)
    {
        clReleaseMemObject(cmMemObjs[i]);
    }

    free(srcA); 
    free(srcB);
    free (dst);
	printf("Press ENTER to quit\n");
	getchar();
}
Exemple #10
0
// Stores the steering target: the input is clamped to [-1, 1] and scaled by steeringClamp.
void Vehicle::SetTargetSteering(float steering)
{
	// Apply the lower bound first, then the upper bound.
	const float atLeastMinusOne = btMax(-1.f, steering);
	const float clamped = btMin(atLeastMinusOne, 1.f);
	targetSteering = clamped * steeringClamp;
}
Exemple #11
0
// Stores the magnitude of damping, limited to the range [0, 1], as the angular damping.
void ObjectMotionState::setAngularDamping(float damping) {
    const float magnitude = fabsf(damping);
    const float cappedAtOne = btMin(magnitude, 1.0f);
    _angularDamping = btMax(cappedAtOne, 0.0f);
}
Exemple #12
0
// Stores the magnitude of damping, limited to the range [0, 1], as the linear damping.
void ObjectMotionState::setLinearDamping(float damping) {
    const float magnitude = fabsf(damping);
    const float cappedAtOne = btMin(magnitude, 1.0f);
    _linearDamping = btMax(cappedAtOne, 0.0f);
}
Exemple #13
0
// Stores the magnitude of restitution, limited to the range [0, 1].
void ObjectMotionState::setRestitution(float restitution) {
    const float magnitude = fabsf(restitution);
    const float cappedAtOne = btMin(magnitude, 1.0f);
    _restitution = btMax(cappedAtOne, 0.0f);
}
/// Merges ("glues") bodies that are in contact with a fracture body into one new
/// btFractureBody with a compound shape.
///
/// Steps, as implemented below:
///  1. assign an island tag to every collision object and size a union-find accordingly;
///  2. unite the tags of each non-static/kinematic pair whose manifold has at least one
///     contact point with distance <= 0;
///  3. sort the islands; for every island that holds a CUSTOM_FRACTURE_TYPE object and more
///     than one body with non-zero inverse mass, build a compound from those bodies, create a
///     new btFractureBody from it and re-apply the old bodies' linear momentum as impulses;
///  4. after all islands are processed, remove the replaced bodies from the world.
void btFractureDynamicsWorld::glueCallback()
{

	int numManifolds = getDispatcher()->getNumManifolds();

	///first give every collision object an island tag; the islands themselves are formed
	///from the contact manifolds further below

	btUnionFind unionFind;

	int index = 0;
	{

		int i;
		for (i=0;i<getCollisionObjectArray().size(); i++)
		{
			btCollisionObject*   collisionObject= getCollisionObjectArray()[i];
		//	btRigidBody* body = btRigidBody::upcast(collisionObject);
			//Adding filtering here
			//With the optimization only non-static/kinematic objects get consecutive tags
			//(static/kinematic ones get -1); without it the tag is the array index.
#ifdef STATIC_SIMULATION_ISLAND_OPTIMIZATION
			if (!collisionObject->isStaticOrKinematicObject())
			{
				collisionObject->setIslandTag(index++);
			} else
			{
				collisionObject->setIslandTag(-1);
			}
#else
			collisionObject->setIslandTag(i);
			index=i+1;
#endif
		}
	}

	//one union-find element per tag handed out above
	unionFind.reset(index);

	int numElem = unionFind.getNumElements();

	//unite the tags of every pair of non-static/kinematic objects that touch or penetrate
	for (int i=0;i<numManifolds;i++)
	{
		btPersistentManifold* manifold = getDispatcher()->getManifoldByIndexInternal(i);
		if (!manifold->getNumContacts())
			continue;

		//smallest contact distance of this manifold
		btScalar minDist = 1e30f;
		for (int v=0;v<manifold->getNumContacts();v++)
		{
			minDist = btMin(minDist,manifold->getContactPoint(v).getDistance());
		}
		//skip pairs whose closest contact is still separated
		if (minDist>0.)
			continue;
		
		btCollisionObject* colObj0 = (btCollisionObject*)manifold->getBody0();
		btCollisionObject* colObj1 = (btCollisionObject*)manifold->getBody1();
		int tag0 = (colObj0)->getIslandTag();
		int tag1 = (colObj1)->getIslandTag();
		//btRigidBody* body0 = btRigidBody::upcast(colObj0);
		//btRigidBody* body1 = btRigidBody::upcast(colObj1);


		if (!colObj0->isStaticOrKinematicObject() && !colObj1->isStaticOrKinematicObject())
		{
			unionFind.unite(tag0, tag1);
		}
	}




	numElem = unionFind.getNumElements();



	//write the island representative back into each non-static/kinematic object's tag;
	//'index' counts only those objects
	//NOTE(review): this matches the tags assigned above only in the
	//STATIC_SIMULATION_ISLAND_OPTIMIZATION branch; in the other branch the tags were array
	//indices - confirm that configuration is never built.
	index=0;
	for (int ai=0;ai<getCollisionObjectArray().size();ai++)
	{
		btCollisionObject* collisionObject= getCollisionObjectArray()[ai];
		if (!collisionObject->isStaticOrKinematicObject())
		{
			int tag = unionFind.find(index);

			collisionObject->setIslandTag( tag);

			//Set the correct object offset in Collision Object Array
			//(m_sz is read back below as an index into the collision object array)
			//NOTE(review): tested with #if here but with #ifdef above - the two differ when
			//the macro is defined as empty or 0; confirm.
#if STATIC_SIMULATION_ISLAND_OPTIMIZATION
			unionFind.getElement(index).m_sz = ai;
#endif //STATIC_SIMULATION_ISLAND_OPTIMIZATION

			index++;
		}
	}
	unionFind.sortIslands();



	int endIslandIndex=1;
	int startIslandIndex;

	//bodies replaced by a glued compound; removed from the world after the island loop
	btAlignedObjectArray<btCollisionObject*> removedObjects;

	///iterate over all islands
	for ( startIslandIndex=0;startIslandIndex<numElem;startIslandIndex = endIslandIndex)
	{
		//[startIslandIndex, endIslandIndex) is the run of elements sharing this island id
		int islandId = unionFind.getElement(startIslandIndex).m_id;
		for (endIslandIndex = startIslandIndex+1;(endIslandIndex<numElem) && (unionFind.getElement(endIslandIndex).m_id == islandId);endIslandIndex++)
		{
		}

		//collision-object index of a fracture object in this island (the last one found), or -1
		int fractureObjectIndex = -1;

		//number of rigid bodies with non-zero inverse mass in this island
		int numObjects=0;

		int idx;
		for (idx=startIslandIndex;idx<endIslandIndex;idx++)
		{
			int i = unionFind.getElement(idx).m_sz;
			btCollisionObject* colObj0 = getCollisionObjectArray()[i];
			if (colObj0->getInternalType()& CUSTOM_FRACTURE_TYPE)
			{
				fractureObjectIndex = i;
			}
			btRigidBody* otherObject = btRigidBody::upcast(colObj0);
			if (!otherObject || !otherObject->getInvMass())
				continue;
			numObjects++;
		}

		///Then for each island that contains at least two objects and one fracture object
		if (fractureObjectIndex>=0 && numObjects>1)
		{

			btFractureBody* fracObj = (btFractureBody*)getCollisionObjectArray()[fractureObjectIndex];

			///glueing objects means creating a new compound and removing the old objects
			///delay the removal of old objects to avoid array indexing problems
			removedObjects.push_back(fracObj);
			m_fractureBodies.remove(fracObj);

			//per-child masses of the new compound, in the order the children are added
			btAlignedObjectArray<btScalar> massArray;

			//linear momentum and world-space centre of mass of every body being merged
			btAlignedObjectArray<btVector3> oldImpulses;
			btAlignedObjectArray<btVector3> oldCenterOfMassesWS;

			//velocity divided by inverse mass, i.e. velocity times mass
			oldImpulses.push_back(fracObj->getLinearVelocity()/1./fracObj->getInvMass());
			oldCenterOfMassesWS.push_back(fracObj->getCenterOfMassPosition());

			btScalar totalMass = 0.f;


			//NOTE(review): 'compound' is not deleted in this function; whether
			//shiftTransformDistributeMass takes ownership is not visible here - confirm.
			btCompoundShape* compound = new btCompoundShape();
			//start with the fracture body's own shape(s), kept in its local frame
			if (fracObj->getCollisionShape()->isCompound())
			{
				btTransform tr;
				tr.setIdentity();
				btCompoundShape* oldCompound = (btCompoundShape*)fracObj->getCollisionShape();
				for (int c=0;c<oldCompound->getNumChildShapes();c++)
				{
					compound->addChildShape(oldCompound->getChildTransform(c),oldCompound->getChildShape(c));
					massArray.push_back(fracObj->m_masses[c]);
					totalMass+=fracObj->m_masses[c];
				}

			} else
			{
				btTransform tr;
				tr.setIdentity();
				compound->addChildShape(tr,fracObj->getCollisionShape());
				massArray.push_back(fracObj->m_masses[0]);
				totalMass+=fracObj->m_masses[0];
			}

			//add every other dynamic body of the island, expressed in the fracture body's frame
			for (idx=startIslandIndex;idx<endIslandIndex;idx++)
			{

				int i = unionFind.getElement(idx).m_sz;

				if (i==fractureObjectIndex)
					continue;

				btCollisionObject* otherCollider = getCollisionObjectArray()[i];

				btRigidBody* otherObject = btRigidBody::upcast(otherCollider);
				//don't glue/merge with static objects right now, otherwise everything gets stuck to the ground
				///todo: expose this as a callback
				if (!otherObject || !otherObject->getInvMass())
					continue;


				oldImpulses.push_back(otherObject->getLinearVelocity()*(1.f/otherObject->getInvMass()));
				oldCenterOfMassesWS.push_back(otherObject->getCenterOfMassPosition());

				removedObjects.push_back(otherObject);
				//NOTE(review): otherObject is cast to btFractureBody* without a type check - confirm
				//that remove() is harmless for bodies that are not in m_fractureBodies.
				m_fractureBodies.remove((btFractureBody*)otherObject);

				btScalar curMass = 1.f/otherObject->getInvMass();


				if (otherObject->getCollisionShape()->isCompound())
				{
					btTransform tr;
					btCompoundShape* oldCompound = (btCompoundShape*)otherObject->getCollisionShape();
					for (int c=0;c<oldCompound->getNumChildShapes();c++)
					{
						tr = fracObj->getWorldTransform().inverseTimes(otherObject->getWorldTransform()*oldCompound->getChildTransform(c));
						compound->addChildShape(tr,oldCompound->getChildShape(c));
						//the body's mass is split evenly over its child shapes
						massArray.push_back(curMass/(btScalar)oldCompound->getNumChildShapes());

					}
				} else
				{
					btTransform tr;
					tr = fracObj->getWorldTransform().inverseTimes(otherObject->getWorldTransform());
					compound->addChildShape(tr,otherObject->getCollisionShape());
					massArray.push_back(curMass);
				}
				totalMass+=curMass;
			}



			//'shift' is filled in by shiftTransformDistributeMass and applied to the new body's
			//world transform below
			btTransform shift;
			shift.setIdentity();
			btCompoundShape* newCompound = btFractureBody::shiftTransformDistributeMass(compound,totalMass,shift);
			int numChildren = newCompound->getNumChildShapes();
			btAssert(numChildren == massArray.size());

			btVector3 localInertia;
			newCompound->calculateLocalInertia(totalMass,localInertia);
			btFractureBody* newBody = new btFractureBody(totalMass,0,newCompound,localInertia, &massArray[0], numChildren,this);
			newBody->recomputeConnectivity(this);
			newBody->setWorldTransform(fracObj->getWorldTransform()*shift);

			//now the linear/angular velocity is still zero, apply the impulses

			for (int i=0;i<oldImpulses.size();i++)
			{
				//each old momentum is applied at the old centre of mass, relative to the new body
				btVector3 rel_pos = oldCenterOfMassesWS[i]-newBody->getCenterOfMassPosition();
				const btVector3& imp = oldImpulses[i];
				newBody->applyImpulse(imp, rel_pos);
			}

			addRigidBody(newBody);


		}


	}

	//remove the objects from the world at the very end, 
	//otherwise the island tags would not match the world collision object array indices anymore
	//NOTE(review): the removed bodies are not deleted here - confirm who owns them.
	while (removedObjects.size())
	{
		btCollisionObject* otherCollider = removedObjects[removedObjects.size()-1];
		removedObjects.pop_back();

		btRigidBody* otherObject = btRigidBody::upcast(otherCollider);
		if (!otherObject || !otherObject->getInvMass())
			continue;
		removeRigidBody(otherObject);
	}

}
void PhysicsClientExample::stepSimulation(float deltaTime)
{
	if (m_options == eCLIENTEXAMPLE_SERVER)
	{
		for (int i = 0; i < 100; i++)
		{
			m_physicsServer.processClientCommands();
		}
	}

	if (m_prevSelectedBody != m_selectedBody)
	{
		createButtons();
		m_prevSelectedBody = m_selectedBody;
	}

	//while (!b3CanSubmitCommand(m_physicsClientHandle))
	{
		b3SharedMemoryStatusHandle status = b3ProcessServerStatus(m_physicsClientHandle);
		bool hasStatus = (status != 0);
		if (hasStatus)
		{
			int statusType = b3GetStatusType(status);
			if (statusType == CMD_ACTUAL_STATE_UPDATE_COMPLETED)
			{
				//b3Printf("bla\n");
			}
			if (statusType == CMD_CAMERA_IMAGE_COMPLETED)
			{
				//	static int counter=0;
				//	char msg[1024];
				//	sprintf(msg,"Camera image %d OK\n",counter++);
				b3CameraImageData imageData;
				b3GetCameraImageData(m_physicsClientHandle, &imageData);
				if (m_canvas)
				{
					//compute depth image range
					float minDepthValue = 1e20f;
					float maxDepthValue = -1e20f;

					for (int i = 0; i < camVisualizerWidth; i++)
					{
						for (int j = 0; j < camVisualizerHeight; j++)
						{
							int xIndex = int(float(i) * (float(imageData.m_pixelWidth) / float(camVisualizerWidth)));
							int yIndex = int(float(j) * (float(imageData.m_pixelHeight) / float(camVisualizerHeight)));
							btClamp(xIndex, 0, imageData.m_pixelWidth);
							btClamp(yIndex, 0, imageData.m_pixelHeight);

							if (m_canvasDepthIndex >= 0)
							{
								int depthPixelIndex = (xIndex + yIndex * imageData.m_pixelWidth);
								float depthValue = imageData.m_depthValues[depthPixelIndex];
								//todo: rescale the depthValue to [0..255]
								if (depthValue > -1e20)
								{
									maxDepthValue = btMax(maxDepthValue, depthValue);
									minDepthValue = btMin(minDepthValue, depthValue);
								}
							}
						}
					}

					for (int i = 0; i < camVisualizerWidth; i++)
					{
						for (int j = 0; j < camVisualizerHeight; j++)
						{
							int xIndex = int(float(i) * (float(imageData.m_pixelWidth) / float(camVisualizerWidth)));
							int yIndex = int(float(j) * (float(imageData.m_pixelHeight) / float(camVisualizerHeight)));
							btClamp(yIndex, 0, imageData.m_pixelHeight);
							btClamp(xIndex, 0, imageData.m_pixelWidth);
							int bytesPerPixel = 4;  //RGBA

							if (m_canvasRGBIndex >= 0)
							{
								int rgbPixelIndex = (xIndex + yIndex * imageData.m_pixelWidth) * bytesPerPixel;
								m_canvas->setPixel(m_canvasRGBIndex, i, j,
												   imageData.m_rgbColorData[rgbPixelIndex],
												   imageData.m_rgbColorData[rgbPixelIndex + 1],
												   imageData.m_rgbColorData[rgbPixelIndex + 2],
												   255);  //alpha set to 255
							}

							if (m_canvasDepthIndex >= 0)
							{
								int depthPixelIndex = (xIndex + yIndex * imageData.m_pixelWidth);
								float depthValue = imageData.m_depthValues[depthPixelIndex];
								//todo: rescale the depthValue to [0..255]
								if (depthValue > -1e20)
								{
									int rgb = 0;

									if (maxDepthValue != minDepthValue)
									{
										rgb = (depthValue - minDepthValue) * (255. / (btFabs(maxDepthValue - minDepthValue)));
										if (rgb < 0 || rgb > 255)
										{
											//printf("rgb=%d\n",rgb);
										}
									}
									m_canvas->setPixel(m_canvasDepthIndex, i, j,
													   rgb,
													   rgb,
													   255, 255);  //alpha set to 255
								}
								else
								{
									m_canvas->setPixel(m_canvasDepthIndex, i, j,
													   0,
													   0,
													   0, 255);  //alpha set to 255
								}
							}
							if (m_canvasSegMaskIndex >= 0 && (0 != imageData.m_segmentationMaskValues))
							{
								int segmentationMaskPixelIndex = (xIndex + yIndex * imageData.m_pixelWidth);
								int segmentationMask = imageData.m_segmentationMaskValues[segmentationMaskPixelIndex];
								btVector4 palette[4] = {btVector4(32, 255, 32, 255),
														btVector4(32, 32, 255, 255),
														btVector4(255, 255, 32, 255),
														btVector4(32, 255, 255, 255)};
								if (segmentationMask >= 0)
								{
									int obIndex = segmentationMask & ((1 << 24) - 1);
									int linkIndex = (segmentationMask >> 24) - 1;

									btVector4 rgb = palette[(obIndex + linkIndex) & 3];
									m_canvas->setPixel(m_canvasSegMaskIndex, i, j,
													   rgb.x(),
													   rgb.y(),
													   rgb.z(), 255);  //alpha set to 255
								}
								else
								{
									m_canvas->setPixel(m_canvasSegMaskIndex, i, j,
													   0,
													   0,
													   0, 255);  //alpha set to 255
								}
							}
						}
					}
					if (m_canvasRGBIndex >= 0)
						m_canvas->refreshImageData(m_canvasRGBIndex);
					if (m_canvasDepthIndex >= 0)
						m_canvas->refreshImageData(m_canvasDepthIndex);
					if (m_canvasSegMaskIndex >= 0)
						m_canvas->refreshImageData(m_canvasSegMaskIndex);
				}
Exemple #16
0
// Runs the polytope-expansion loop and reports the resulting penetration data.
//
// Each iteration pops the top face from the m_faceEntries heap, evaluates the support
// points of both convex shapes along that face's direction, tightens an upper bound on
// the squared distance and, unless the bound is already within tolerance of the face's
// squared distance, expands the polyhedron with the new support point.
//
// wWitnessOnA / wWitnessOnB are output parameters: they are filled in by the last
// processed face through CalcClosestPointOnA / CalcClosestPointOnB.
// Returns the length of the last direction vector `v` that was selected.
//
// NOTE(review): `v` is only assigned inside the `!m_deleted` branch; if the first popped
// face is flagged deleted and the loop then terminates, `v.length()` is evaluated on a
// vector this function never wrote - confirm callers guarantee a live first entry.
btScalar Epa::calcPenDepth( btPoint3& wWitnessOnA, btPoint3& wWitnessOnB )
{
	btVector3 v;

	btScalar upperBoundSqrd = SIMD_INFINITY;
	btScalar vSqrd = 0;
#ifdef _DEBUG
	// Debug-only copy of the previous iteration's vSqrd, used by the monotonicity assert below.
	btScalar prevVSqrd;
#endif
	btScalar delta;

	bool isCloseEnough = false;

	EpaFace* pEpaFace = NULL;

	int nbIterations = 0;
	//int nbMaxIterations = 1000;

	do
	{
		// Take the heap's top entry; pop_heap moves it to the back so pop_back can drop it.
		// Presumably CompareEpaFaceEntries keeps the face with the smallest m_vSqrd on top
		// (the loop condition below compares m_faceEntries[ 0 ]->m_vSqrd against the bound).
		pEpaFace = m_faceEntries.front();
		std::pop_heap( m_faceEntries.begin(), m_faceEntries.end(), CompareEpaFaceEntries );
		m_faceEntries.pop_back();

		// Faces flagged as deleted are simply discarded.
		if ( !pEpaFace->m_deleted )
		{
#ifdef _DEBUG
			prevVSqrd = vSqrd;
#endif

			vSqrd = pEpaFace->m_vSqrd;

			// Choose the search direction: the plane normal when the plane distance is
			// non-negative, otherwise the face's stored vector m_v.
			if ( pEpaFace->m_planeDistance >= 0 )
			{
				v = pEpaFace->m_planeNormal;
			}
			else
			{
				v = pEpaFace->m_v;
			}

#ifdef _DEBUG
			//assert_msg( vSqrd <= upperBoundSqrd, "A triangle was falsely rejected!" );
			EPA_DEBUG_ASSERT( ( vSqrd >= prevVSqrd ) ,"vSqrd decreased!" );
#endif //_DEBUG
			EPA_DEBUG_ASSERT( ( v.length2() > 0 ) ,"Zero vector not allowed!" );

			// Direction (and its negation) multiplied by each transform's basis before the
			// local support queries - presumably this maps the direction into each shape's
			// local frame; confirm against the btVector3 * btMatrix3x3 operator.
			btVector3 seperatingAxisInA =  v * m_transformA.getBasis();
			btVector3 seperatingAxisInB = -v * m_transformB.getBasis();

			btVector3 p = m_pConvexShapeA->localGetSupportingVertex( seperatingAxisInA );
			btVector3 q = m_pConvexShapeB->localGetSupportingVertex( seperatingAxisInB );

			// Bring both support points back through their transforms.
			btPoint3 pWorld = m_transformA( p );
			btPoint3 qWorld = m_transformB( q );

			// w is the difference of the two support points; delta is its projection on v.
			btPoint3 w = pWorld - qWorld;
			delta = v.dot( w );

			// Keep the tightest upper bound
			upperBoundSqrd = btMin( upperBoundSqrd, delta * delta / vSqrd );
			//assert_msg( vSqrd <= upperBoundSqrd, "A triangle was falsely rejected!" );

			// Converged once the bound is within a relative tolerance of 1e-4 of vSqrd.
			isCloseEnough = ( upperBoundSqrd <= ( 1 + 1e-4f ) * vSqrd );

			if ( !isCloseEnough )
			{
				std::list< EpaFace* > newFaces;
				bool expandOk = m_polyhedron.Expand( w, pWorld, qWorld, pEpaFace, newFaces );

				if ( expandOk )
				{
					EPA_DEBUG_ASSERT( !newFaces.empty() ,"EPA polyhedron not expanding ?" );

					// NOTE(review): `check` and `areEqual` are never read in this function.
					bool check    = true;
					bool areEqual = false;

					// Drain the list of faces created by the expansion, queueing the
					// ones that pass the filter below.
					while ( !newFaces.empty() )
					{
						EpaFace* pNewFace = newFaces.front();
						EPA_DEBUG_ASSERT( !pNewFace->m_deleted ,"New face is deleted!" );

						if ( !pNewFace->m_deleted )
						{
							EPA_DEBUG_ASSERT( ( pNewFace->m_vSqrd > 0 ) ,"Face containing the origin!" );
							EPA_DEBUG_ASSERT( !pNewFace->IsAffinelyDependent() ,"Face is affinely dependent!" );

//#ifdef EPA_POLYHEDRON_USE_PLANES
////							if ( pNewFace->m_planeDistance >= 0 )
////							{
//								// assert( false && "Face's plane distance greater than 0!" );
//#ifdef _DEBUG
////								m_polyhedron._dbgSaveToFile( "epa_beforeFix.dbg" );
//#endif
//								//pNewFace->FixOrder();
//#ifdef _DEBUG
//								//m_polyhedron._dbgSaveToFile( "epa_afterFix.dbg" );
//#endif
////							}
//#endif
//
//#ifdef EPA_POLYHEDRON_USE_PLANES
//							//assert( ( pNewFace->m_planeDistance < 0 ) && "Face's plane distance equal or greater than 0!" );
//#endif

							// Only queue faces whose closest point is internal and whose
							// squared distance lies in [vSqrd, upperBoundSqrd].
							if ( pNewFace->IsClosestPointInternal() && ( vSqrd <= pNewFace->m_vSqrd ) && ( pNewFace->m_vSqrd <= upperBoundSqrd ) )
							{
								m_faceEntries.push_back( pNewFace );
								std::push_heap( m_faceEntries.begin(), m_faceEntries.end(), CompareEpaFaceEntries );
							}
						}

						newFaces.pop_front();
					}
				}
				else
				{
					// Expansion failed: report the result from the current face right away.
					pEpaFace->CalcClosestPointOnA( wWitnessOnA );
					pEpaFace->CalcClosestPointOnB( wWitnessOnB );

#ifdef _DEBUG
					//m_polyhedron._dbgSaveToFile( "epa_end.dbg" );
#endif

					return v.length();
				}
			}
		}

		++nbIterations;
	}
	// Continue while the polyhedron is below its face budget, the result has not converged,
	// the heap still has entries, and the top entry is not beyond the current upper bound.
	while ( ( m_polyhedron.GetNbFaces() < EPA_MAX_FACE_ENTRIES ) &&/*( nbIterations < nbMaxIterations ) &&*/
			!isCloseEnough && ( m_faceEntries.size() > 0 ) && ( m_faceEntries[ 0 ]->m_vSqrd <= upperBoundSqrd ) );

#ifdef _DEBUG
	//m_polyhedron._dbgSaveToFile( "epa_end.dbg" );
#endif

	EPA_DEBUG_ASSERT( pEpaFace ,"Invalid epa face!" );

	// Witness points come from the last face that was popped from the heap.
	pEpaFace->CalcClosestPointOnA( wWitnessOnA );
	pEpaFace->CalcClosestPointOnB( wWitnessOnB );

	return v.length();
}
Exemple #17
0
// Per-step broadphase update: incrementally optimizes both trees, migrates the proxies of
// the current stage from the dynamic set (m_sets[0]) to the fixed set (m_sets[1]), runs
// the tree-vs-tree overlap queries, lazily prunes stale pairs from the pair cache and
// finally refreshes the update statistics.
void btDbvtBroadphase::collide(btDispatcher* dispatcher)
{
	SPC(m_profiling.m_total);

	// --- incremental optimization -------------------------------------------------
	m_sets[0].optimizeIncremental(1 + (m_sets[0].m_leaves * m_dupdates) / 100);
	if (m_fixedleft)
	{
		const int fixedPasses = 1 + (m_sets[1].m_leaves * m_fupdates) / 100;
		m_sets[1].optimizeIncremental(fixedPasses);
		m_fixedleft = btMax<int>(0, m_fixedleft - fixedPasses);
	}

	// --- dynamic -> fixed set -----------------------------------------------------
	// Advance to the next stage and move every proxy queued on it into the fixed tree.
	m_stageCurrent = (m_stageCurrent + 1) % STAGECOUNT;
	btDbvtProxy* proxy = m_stageRoots[m_stageCurrent];
	if (proxy)
	{
#if DBVT_BP_ACCURATESLEEPING
		btDbvtTreeCollider collider(this);
#endif
		while (proxy)
		{
			// Remember the successor first: relinking below rewrites the proxy's links.
			btDbvtProxy* const successor = proxy->links[1];
			listremove(proxy, m_stageRoots[proxy->stage]);
			listappend(proxy, m_stageRoots[STAGECOUNT]);
#if DBVT_BP_ACCURATESLEEPING
			m_paircache->removeOverlappingPairsContainingProxy(proxy, dispatcher);
			collider.proxy = proxy;
			btDbvt::collideTV(m_sets[0].m_root, proxy->aabb, collider);
			btDbvt::collideTV(m_sets[1].m_root, proxy->aabb, collider);
#endif
			m_sets[0].remove(proxy->leaf);
			ATTRIBUTE_ALIGNED16(btDbvtVolume)
			proxyVolume = btDbvtVolume::FromMM(proxy->m_aabbMin, proxy->m_aabbMax);
			proxy->leaf = m_sets[1].insert(proxyVolume, proxy);
			proxy->stage = STAGECOUNT;
			proxy = successor;
		}
		m_fixedleft = m_sets[1].m_leaves;
		m_needcleanup = true;
	}

	// --- collide dynamics ---------------------------------------------------------
	{
		btDbvtTreeCollider collider(this);
		if (m_deferedcollide)
		{
			// dynamic tree against fixed tree
			SPC(m_profiling.m_fdcollide);
			m_sets[0].collideTTpersistentStack(m_sets[0].m_root, m_sets[1].m_root, collider);
		}
		if (m_deferedcollide)
		{
			// dynamic tree against itself
			SPC(m_profiling.m_ddcollide);
			m_sets[0].collideTTpersistentStack(m_sets[0].m_root, m_sets[0].m_root, collider);
		}
	}

	// --- clean up -----------------------------------------------------------------
	// Visit a bounded window of cached pairs (starting at m_cid) and drop those whose
	// leaf volumes no longer intersect.
	if (m_needcleanup)
	{
		SPC(m_profiling.m_cleanup);
		btBroadphasePairArray& pairs = m_paircache->getOverlappingPairArray();
		if (pairs.size() > 0)
		{
			int budget = btMin(pairs.size(), btMax<int>(m_newpairs, (pairs.size() * m_cupdates) / 100));
			int visited = 0;
			while (visited < budget)
			{
				btBroadphasePair& pair = pairs[(m_cid + visited) % pairs.size()];
				btDbvtProxy* proxyA = (btDbvtProxy*)pair.m_pProxy0;
				btDbvtProxy* proxyB = (btDbvtProxy*)pair.m_pProxy1;
				if (Intersect(proxyA->leaf->volume, proxyB->leaf->volume))
				{
					// Still overlapping: keep the pair and move on to the next slot.
					++visited;
					continue;
				}
#if DBVT_BP_SORTPAIRS
				if (proxyA->m_uniqueId > proxyB->m_uniqueId)
					btSwap(proxyA, proxyB);
#endif
				m_paircache->removeOverlappingPair(proxyA, proxyB, dispatcher);
				// The same slot is inspected again; only the remaining budget shrinks.
				--budget;
			}
			m_cid = (pairs.size() > 0) ? (m_cid + budget) % pairs.size() : 0;
		}
	}

	// --- bookkeeping --------------------------------------------------------------
	++m_pid;
	m_newpairs = 1;
	m_needcleanup = false;
	m_updates_ratio = (m_updates_call > 0) ? m_updates_done / (btScalar)m_updates_call : btScalar(0);
	// Halve both counters so the ratio decays towards recent activity.
	m_updates_done /= 2;
	m_updates_call /= 2;
}
	/**
	 * Resolves overlaps between neighbouring particles of equal radius and then applies a
	 * friction correction. predictedPositions is updated in place, pair by pair, so later
	 * pairs see the corrections made by earlier ones.
	 *
	 * @param positions              particle positions at the start of the step (read only).
	 * @param predictedPositions     predicted positions; corrected in place.
	 * @param posLocks               per-particle lock flag; a locked particle gets inverse mass 0.
	 * @param temp                   not used by this (sequential) variant.
	 * @param pointsCount            number of particles.
	 * @param neighbours             flattened neighbour table, maxNeighboursPerPoint slots per particle.
	 * @param pointNeighboursCount   number of valid neighbour slots for each particle.
	 * @param maxNeighboursPerPoint  row stride of the neighbour table.
	 * @param Ks_prime               stiffness factor applied to the overlap correction.
	 * @param radius                 particle radius (two particles touch at 2 * radius).
	 * @param mass                   particle mass; its reciprocal is the unlocked inverse mass.
	 * @param staticFric             static friction coefficient.
	 * @param kineticFric            kinetic friction coefficient.
	 */
	void EXPORT_API ProjectInternalCollisionConstraintsWithFriction(btVector3* positions, btVector3* predictedPositions, bool* posLocks, btVector3* temp, int pointsCount, int* neighbours, int* pointNeighboursCount, int maxNeighboursPerPoint, float Ks_prime, float radius, float mass, float staticFric, float kineticFric)
	{
		const btScalar massInv = 1.0f / mass;

		const float radiusSum = radius + radius;
		const float radiusSumSq = radiusSum * radiusSum;

		for (int idA = 0; idA < pointsCount; idA++)
		{
			for (int nId = 0; nId < pointNeighboursCount[idA]; nId++)
			{
				const int idB = neighbours[idA * maxNeighboursPerPoint + nId];

				const btVector3 dir = predictedPositions[idA] - predictedPositions[idB];
				const float distanceSq = dir.length2();

				// Skip self pairs, separated pairs and (near-)coincident pairs that
				// have no usable contact direction.
				if (idA == idB || distanceSq > radiusSumSq || distanceSq <= FLT_EPSILON)
					continue;

				const btScalar w1 = posLocks[idA] ? 0.0f : massInv;
				const btScalar w2 = posLocks[idB] ? 0.0f : massInv;
				const float invMassSum = w1 + w2;

				// Both particles are immovable: nothing can be corrected, and dividing
				// by a zero inverse-mass sum would yield inf * 0 = NaN positions.
				if (invMassSum == 0.0f)
					continue;

				const float distance = btSqrt(distanceSq);
				const btVector3 nf = dir / distance;  // contact normal, pointing from B to A

				// Push the pair apart along the normal, weighted by inverse mass.
				const btVector3 dP = (1.0f / invMassSum) * (distance - radiusSum) * nf * Ks_prime;

				predictedPositions[idA] -= dP * w1;
				predictedPositions[idB] += dP * w2;

				// Friction: tangential part of the relative displacement over this step.
				const btVector3 dpf = (predictedPositions[idA] - positions[idA]) - (predictedPositions[idB] - positions[idB]);
				const btVector3 dpt = dpf - btDot(dpf, nf) * nf;

				const float d = radiusSum - distance;  // penetration depth before the correction
				const float ldpt = dpt.length();
				if (ldpt < FLT_EPSILON)
					continue;

				// Static friction cancels the whole tangential motion; otherwise the
				// kinetic term removes at most a fraction of it.
				const btVector3 delta = (ldpt < staticFric * d)
					? dpt / invMassSum
					: dpt * btMin(kineticFric * d / ldpt, 1.0f) / invMassSum;

				predictedPositions[idA] -= delta * w1;
				predictedPositions[idB] += delta * w2;
			}
		}
	}
Exemple #19
0
// Pushes the object's restitution, friction and damping values onto the physics body.
void ObjectMotionState::updateBodyMaterialProperties() {
    _body->setRestitution(getObjectRestitution());
    _body->setFriction(getObjectFriction());

    // Damping values are capped at 1.0 first, then made non-negative via fabsf.
    const float linearDamping = fabsf(btMin(getObjectLinearDamping(), 1.0f));
    const float angularDamping = fabsf(btMin(getObjectAngularDamping(), 1.0f));
    _body->setDamping(linearDamping, angularDamping);
}
Exemple #20
0
    /**
     * Resolves collisions between a set of spheres (body A) and the triangles of a
     * deformable mesh (body B), then applies friction. Both predPosA and predPosB are
     * corrected in place.
     *
     * @param tree                broadphase structure queried with IntersectSphere for candidate triangles.
     * @param posA                sphere centres at the start of the step (read only).
     * @param predPosA            predicted sphere centres; corrected in place.
     * @param radius              sphere radius.
     * @param pointsCount         number of spheres.
     * @param KsA                 stiffness factor applied to corrections of body A.
     * @param posB                mesh vertices at the start of the step (read only).
     * @param predPosB            predicted mesh vertices; corrected in place.
     * @param triangles           triangle table indexed by the ids returned from the broadphase.
     * @param KsB                 stiffness factor applied to corrections of body B.
     * @param twoSidedCollisions  when false, contacts on the back side of a triangle are ignored.
     * @param staticFric          static friction coefficient.
     * @param kineticFric         kinetic friction coefficient.
     * @return number of contacts for which a position correction was applied.
     *
     * NOTE(review): collisionTriIds holds MAX_COLCOUNT entries; this assumes
     * IntersectSphere never reports more than that - confirm against its implementation.
     */
    int EXPORT_API ProjectSoftbodyCollisionConstraintsWithFriction(IBroadphase* tree, btVector3* posA, btVector3* predPosA, float radius, int pointsCount, float KsA, btVector3* posB, btVector3* predPosB, Triangle* triangles, float KsB, bool twoSidedCollisions, float staticFric, float kineticFric)
    {
        int collisionTriIds[MAX_COLCOUNT];
        int narrowPhaseColCount = 0;
        const btScalar radiusSq = radius * radius;
        const btScalar invMass = 1.0f;

        for (int k = 0; k < pointsCount; k++)
        {
            const int broadphaseColCount = tree->IntersectSphere(predPosA[k], radius, collisionTriIds);

            for (int i = 0; i < broadphaseColCount; i++)
            {
                const Triangle tri = triangles[collisionTriIds[i]];

                // Snapshot the triangle corners: predPosB is modified further down.
                const btVector3 pA = predPosB[tri.pointAid];
                const btVector3 pB = predPosB[tri.pointBid];
                const btVector3 pC = predPosB[tri.pointCid];

                btVector3 bar;
                Barycentric2(pA, pB, pC, predPosA[k], bar);

                const btVector3 contactPoint = pA * bar.x() + pB * bar.y() + pC * bar.z();

                btVector3 triNormal;
                calculateNormal(pA, pB, pC, triNormal);

                btVector3 normal = predPosA[k] - contactPoint;

                const float distSq = normal.length2();
                if (distSq > radiusSq)
                    continue;

                const float dist = btSqrt(distSq);
                if (dist < 0.00001f)
                    continue;

                normal /= dist;

                // One-sided mode: ignore contacts approaching from behind the triangle.
                if (!twoSidedCollisions && btDot(normal, triNormal) < 0)
                    continue;

                const btScalar C = radius - dist;  // penetration depth
                const btScalar s = invMass + invMass * bar.x() * bar.x() + invMass * bar.y() * bar.y() + invMass * bar.z() * bar.z();
                if (s == 0.0f)
                    return narrowPhaseColCount;

                const btVector3 responseVec = normal * C / s;

                // Push the sphere out and the triangle corners the opposite way,
                // distributing the triangle share by barycentric weight.
                predPosA[k] += responseVec * KsA;

                predPosB[tri.pointAid] -= responseVec * bar.x() * KsB;
                predPosB[tri.pointBid] -= responseVec * bar.y() * KsB;
                predPosB[tri.pointCid] -= responseVec * bar.z() * KsB;

                // The contact has been resolved, so count it here: the friction
                // early-out below must not hide it from the returned total.
                narrowPhaseColCount++;

                // Friction: tangential part of the sphere's displacement relative to
                // the displacement of the contact point on the triangle.
                const btVector3 contactPointPred = predPosB[tri.pointAid] * bar.x() + predPosB[tri.pointBid] * bar.y() + predPosB[tri.pointCid] * bar.z();
                const btVector3 contactPointInitial = posB[tri.pointAid] * bar.x() + posB[tri.pointBid] * bar.y() + posB[tri.pointCid] * bar.z();

                const btVector3 dpf = (predPosA[k] - posA[k]) - (contactPointPred - contactPointInitial);
                const btVector3 dpt = dpf - btDot(dpf, normal) * normal;

                const float ldpt = dpt.length();
                if (ldpt < 0.00001f)
                    continue;

                // Static friction cancels the whole tangential motion; otherwise the
                // kinetic term removes at most a fraction of it.
                const btVector3 delta = (ldpt < staticFric * C)
                    ? dpt / s
                    : dpt * btMin(kineticFric * C / ldpt, 1.0f) / s;

                predPosA[k] -= delta * invMass * KsA;
                predPosB[tri.pointAid] += delta * bar.x() * invMass * KsB;
                predPosB[tri.pointBid] += delta * bar.y() * invMass * KsB;
                predPosB[tri.pointCid] += delta * bar.z() * invMass * KsB;
            }
        }

        return narrowPhaseColCount;
    }
Exemple #21
0
// Stores the magnitude of the requested friction, limited to the range [0, MAX_FRICTION].
void ObjectMotionState::setFriction(float friction) {
    const float magnitude = fabsf(friction);
    const float capped = btMin(magnitude, MAX_FRICTION);
    _friction = btMax(capped, 0.0f);
}
Exemple #22
0
    /**
     * Resolves collisions between a set of spheres (body A) and the triangles of a moving
     * mesh (body B), then applies friction. Only the spheres are corrected; the mesh
     * vertices are read but never written.
     *
     * @param tree                broadphase structure queried with IntersectSphere for candidate triangles.
     * @param posA                sphere centres at the start of the step (read only).
     * @param predPosA            predicted sphere centres; corrected in place.
     * @param radius              sphere radius.
     * @param pointsCount         number of spheres.
     * @param posB                mesh vertices at the start of the step (read only).
     * @param predPosB            predicted mesh vertices (read only).
     * @param triangles           triangle table indexed by the ids returned from the broadphase.
     * @param twoSidedCollisions  when false, contacts on the back side of a triangle are ignored.
     * @param staticFric          static friction coefficient.
     * @param kineticFric         kinetic friction coefficient.
     * @return number of contacts for which a position correction was applied.
     *
     * NOTE(review): collisionTriIds holds MAX_COLCOUNT entries; this assumes
     * IntersectSphere never reports more than that - confirm against its implementation.
     */
    int EXPORT_API ProcessSpheresCollisionsVsMovingMesh(IBroadphase* tree, btVector3* posA, btVector3* predPosA, float radius, int pointsCount, btVector3* posB, btVector3* predPosB, Triangle* triangles, bool twoSidedCollisions, float staticFric, float kineticFric)
    {
        int collisionTriIds[MAX_COLCOUNT];
        int narrowPhaseColCount = 0;
        const btScalar radiusSq = radius * radius;

        for (int k = 0; k < pointsCount; k++)
        {
            const int broadphaseColCount = tree->IntersectSphere(predPosA[k], radius, collisionTriIds);

            for (int i = 0; i < broadphaseColCount; i++)
            {
                const Triangle tri = triangles[collisionTriIds[i]];

                const btVector3 pA = predPosB[tri.pointAid];
                const btVector3 pB = predPosB[tri.pointBid];
                const btVector3 pC = predPosB[tri.pointCid];

                btVector3 bar;
                Barycentric2(pA, pB, pC, predPosA[k], bar);

                const btVector3 contactPoint = pA * bar.x() + pB * bar.y() + pC * bar.z();

                btVector3 triNormal;
                calculateNormal(pA, pB, pC, triNormal);

                btVector3 normal = predPosA[k] - contactPoint;

                const float distSq = normal.length2();
                if (distSq > radiusSq)
                    continue;

                const float dist = btSqrt(distSq);
                if (dist < 0.00001f)
                    continue;

                normal /= dist;

                // One-sided mode: ignore contacts approaching from behind the triangle.
                if (!twoSidedCollisions && btDot(normal, triNormal) < 0)
                    continue;

                const btScalar C = radius - dist;  // penetration depth

                // Push the sphere fully out of the triangle along the contact normal.
                const btVector3 responseVec = normal * C;
                predPosA[k] += responseVec;

                // The contact has been resolved, so count it here: the friction
                // early-out below must not hide it from the returned total.
                narrowPhaseColCount++;

                // Friction: tangential part of the sphere's displacement relative to
                // the displacement of the contact point on the mesh. The mesh is not
                // modified here, so the predicted contact point equals contactPoint.
                const btVector3 contactPointInitial = posB[tri.pointAid] * bar.x() + posB[tri.pointBid] * bar.y() + posB[tri.pointCid] * bar.z();

                const btVector3 dpf = (predPosA[k] - posA[k]) - (contactPoint - contactPointInitial);
                const btVector3 dpt = dpf - btDot(dpf, normal) * normal;

                const float ldpt = dpt.length();
                if (ldpt < 0.000001f)
                    continue;

                // Static friction cancels the whole tangential motion; otherwise the
                // kinetic term removes at most a fraction of it.
                if (ldpt < staticFric * C)
                    predPosA[k] -= dpt;
                else
                    predPosA[k] -= dpt * btMin(kineticFric * C / ldpt, 1.0f);
            }
        }

        return narrowPhaseColCount;
    }
	/**
	 * OpenMP variant of the internal particle collision + friction projection.
	 *
	 * The first worksharing loop only reads predictedPositions and accumulates each
	 * particle's total correction into temp[idA]; the second loop adds those
	 * corrections to predictedPositions. Every pair is therefore evaluated against the
	 * same, uncorrected set of predicted positions.
	 *
	 * @param positions              particle positions at the start of the step (read only).
	 * @param predictedPositions     predicted positions; updated by the second loop.
	 * @param posLocks               per-particle lock flag; a locked particle gets inverse mass 0.
	 * @param temp                   scratch buffer with at least pointsCount entries; overwritten.
	 * @param pointsCount            number of particles.
	 * @param neighbours             flattened neighbour table, maxNeighboursPerPoint slots per particle.
	 * @param pointNeighboursCount   number of valid neighbour slots for each particle.
	 * @param maxNeighboursPerPoint  row stride of the neighbour table.
	 * @param Ks_prime               stiffness factor applied to the overlap correction.
	 * @param radius                 particle radius (two particles touch at 2 * radius).
	 * @param mass                   particle mass; its reciprocal is the unlocked inverse mass.
	 * @param staticFric             static friction coefficient.
	 * @param kineticFric            kinetic friction coefficient.
	 */
	void EXPORT_API ProjectInternalCollisionConstraintsWithFrictionMT(btVector3* positions, btVector3* predictedPositions, bool* posLocks, btVector3* temp, int pointsCount, int* neighbours, int* pointNeighboursCount, int maxNeighboursPerPoint, float Ks_prime, float radius, float mass, float staticFric, float kineticFric)
	{
		const btScalar massInv = 1.0f / mass;

		const float radiusSum = radius + radius;
		// Small slack so that pairs which are just touching are still processed.
		const float radiusSumSq = radiusSum * radiusSum + 0.0001f;

		#pragma omp parallel
		{

			#pragma omp for
			for (int idA = 0; idA < pointsCount; idA++)
			{
				btVector3 deltaP(0, 0, 0);
				const btVector3 posA = predictedPositions[idA];
				for (int nId = 0; nId < pointNeighboursCount[idA]; nId++)
				{
					const int idB = neighbours[idA * maxNeighboursPerPoint + nId];
					const btVector3 posB = predictedPositions[idB];
					const btVector3 dir = posA - posB;
					const float distanceSq = dir.length2();

					// Skip self pairs, separated pairs and (near-)coincident pairs
					// that have no usable contact direction.
					if (idA == idB || distanceSq > radiusSumSq || distanceSq <= FLT_EPSILON)
						continue;

					const btScalar w1 = posLocks[idA] ? 0.0f : massInv;
					const btScalar w2 = posLocks[idB] ? 0.0f : massInv;
					const float invMassSum = w1 + w2;

					// Both particles are immovable: nothing can be corrected, and
					// dividing by a zero inverse-mass sum would yield inf * 0 = NaN.
					if (invMassSum == 0.0f)
						continue;

					const float distance = btSqrt(distanceSq);
					const btVector3 nf = dir / distance;  // contact normal, from B to A

					// Overlap correction, averaged over each particle's neighbour count.
					const float scale = (distance - radiusSum) / invMassSum * Ks_prime;
					const btVector3 dP = scale * nf;
					const btVector3 dPA = -dP * w1 / (btScalar)pointNeighboursCount[idA];
					// NOTE(review): assumes idB has at least one neighbour (e.g. a symmetric
					// neighbour table); a zero count here would divide by zero - confirm.
					const btVector3 dPB = dP * w2 / (btScalar)pointNeighboursCount[idB];

					deltaP += dPA;

					// Friction: tangential part of the relative displacement of the two
					// particles over this step, including the corrections just computed.
					const btVector3 dpRel = (posA + dPA - positions[idA]) - (posB + dPB - positions[idB]);
					const btVector3 dpt = dpRel - btDot(dpRel, nf) * nf;

					const float d = radiusSum - distance;  // penetration depth
					const float ldpt = dpt.length();
					if (ldpt < FLT_EPSILON)
						continue;

					// Static friction cancels the whole tangential motion; otherwise
					// the kinetic term removes at most a fraction of it.
					if (ldpt < staticFric * d)
						deltaP -= dpt * w1 / invMassSum;
					else
						deltaP -= dpt * btMin(kineticFric * d / ldpt, 1.0f) * w1 / invMassSum;
				}

				temp[idA] = deltaP;
			}

			// The worksharing loop above ends with an implicit barrier (no nowait
			// clause), so every temp entry is written before it is consumed here.
			#pragma omp for
			for (int i = 0; i < pointsCount; i++)
			{
				predictedPositions[i] += temp[i];
			}

		}
	}
// One-time setup of the 3D grid broadphase: stores the grid parameters derived from the
// world AABB and grid resolution, allocates the host-side work buffers and builds the
// free list of large-proxy handles.
//
// worldAabbMin / worldAabbMax  world bounds covered by the grid
// gridSizeX / Y / Z            number of cells along each axis
// maxSmallProxies              not referenced in this function
// maxLargeProxies              capacity of the large-proxy handle pool (may be 0)
// maxPairsPerBody              pair-buffer slots reserved per small body
// maxBodiesPerCell             stored in m_params.m_maxBodiesPerCell
// cellFactorAABB               stored in m_cellFactorAABB
void btGpu3DGridBroadphase::_initialize(	const btVector3& worldAabbMin,const btVector3& worldAabbMax, 
										int gridSizeX, int gridSizeY, int gridSizeZ, 
										int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
										int maxBodiesPerCell,
										btScalar cellFactorAABB)
{
	// set various parameters
	m_ownsPairCache = true;
	m_params.m_gridSizeX = gridSizeX;
	m_params.m_gridSizeY = gridSizeY;
	m_params.m_gridSizeZ = gridSizeZ;
	m_params.m_numCells = m_params.m_gridSizeX * m_params.m_gridSizeY * m_params.m_gridSizeZ;
	btVector3 w_org = worldAabbMin;
	m_params.m_worldOriginX = w_org.getX();
	m_params.m_worldOriginY = w_org.getY();
	m_params.m_worldOriginZ = w_org.getZ();
	btVector3 w_size = worldAabbMax - worldAabbMin;
	m_params.m_cellSizeX = w_size.getX() / m_params.m_gridSizeX;
	m_params.m_cellSizeY = w_size.getY() / m_params.m_gridSizeY;
	m_params.m_cellSizeZ = w_size.getZ() / m_params.m_gridSizeZ;
	// m_maxRadius is half of the smallest cell edge.
	m_maxRadius = btMin(btMin(m_params.m_cellSizeX, m_params.m_cellSizeY), m_params.m_cellSizeZ);
	m_maxRadius *= btScalar(0.5f);
	m_params.m_numBodies = m_numBodies;
	m_params.m_maxBodiesPerCell = maxBodiesPerCell;

	m_numLargeHandles = 0;						
	m_maxLargeHandles = maxLargeProxies;

	m_maxPairsPerBody = maxPairsPerBody;

	m_cellFactorAABB = cellFactorAABB;

	m_LastLargeHandleIndex = -1;

	assert(!m_bInitialized);
	// allocate host storage
	m_hBodiesHash = new unsigned int[m_maxHandles * 2];
	memset(m_hBodiesHash, 0x00, m_maxHandles*2*sizeof(unsigned int));

	m_hCellStart = new unsigned int[m_params.m_numCells];
	memset(m_hCellStart, 0x00, m_params.m_numCells * sizeof(unsigned int));

	m_hPairBuffStartCurr = new unsigned int[m_maxHandles * 2 + 2];
	// --------------- for now, init with m_maxPairsPerBody for each body
	// Even entries hold each body's start offset into the pair buffer (spaced
	// m_maxPairsPerBody apart); odd entries are reset to 0.
	m_hPairBuffStartCurr[0] = 0;
	m_hPairBuffStartCurr[1] = 0;
	for(int i = 1; i <= m_maxHandles; i++) 
	{
		m_hPairBuffStartCurr[i * 2] = m_hPairBuffStartCurr[(i-1) * 2] + m_maxPairsPerBody;
		m_hPairBuffStartCurr[i * 2 + 1] = 0;
	}
	//----------------
	unsigned int numAABB = m_maxHandles + m_maxLargeHandles;
	m_hAABB = new bt3DGrid3F1U[numAABB * 2]; // AABB Min & Max

	m_hPairBuff = new unsigned int[m_maxHandles * m_maxPairsPerBody];
	memset(m_hPairBuff, 0x00, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int)); // needed?

	m_hPairScan = new unsigned int[m_maxHandles + 1];

	m_hPairOut = new unsigned int[m_maxHandles * m_maxPairsPerBody];

	// large proxies

	// allocate handles buffer and put all handles on free list
	m_pLargeHandlesRawPtr = btAlignedAlloc(sizeof(btSimpleBroadphaseProxy) * m_maxLargeHandles, 16);
	m_pLargeHandles = new(m_pLargeHandlesRawPtr) btSimpleBroadphaseProxy[m_maxLargeHandles];
	m_firstFreeLargeHandle = 0;
	{
		for (int i = m_firstFreeLargeHandle; i < m_maxLargeHandles; i++)
		{
			m_pLargeHandles[i].SetNextFree(i + 1);
			// Large-handle ids start after the range used by the small handles.
			m_pLargeHandles[i].m_uniqueId = m_maxHandles+2+i;
		}
		// Terminate the free list on the last handle. With an empty pool there is no
		// last handle, and indexing [m_maxLargeHandles - 1] would write out of bounds.
		if (m_maxLargeHandles > 0)
		{
			m_pLargeHandles[m_maxLargeHandles - 1].SetNextFree(0);
		}
	}

	// debug data
	m_numPairsAdded = 0;
	m_numOverflows = 0;

	m_bInitialized = true;
}
void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits /* = 32 */)
{
	
	int originalSize = keyValuesInOut.size();
	int workingSize = originalSize;
	
			
	int dataAlignment = DATA_ALIGNMENT;

#ifdef DEBUG_RADIXSORT2
    btAlignedObjectArray<btSortData>   test2;
    keyValuesInOut.copyToHost(test2);
    printf("numElem = %d\n",test2.size());
    for (int i=0;i<test2.size();i++)
    {
        printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
        printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
    }
#endif //DEBUG_RADIXSORT2
    
	btOpenCLArray<btSortData>* src = 0;

	if (workingSize%dataAlignment)
	{
		workingSize += dataAlignment-(workingSize%dataAlignment);
		m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
		m_workBuffer4->resize(workingSize);
		btSortData fillValue;
		fillValue.m_key = 0xffffffff;
		fillValue.m_value = 0xffffffff;

#define USE_BTFILL
#ifdef USE_BTFILL
		m_fill->execute((btOpenCLArray<btInt2>&)*m_workBuffer4,(btInt2&)fillValue,workingSize-originalSize,originalSize);
#else
		//fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
		
		for (int i=originalSize; i<workingSize;i++)
		{
			m_workBuffer4->copyFromHostPointer(&fillValue,1,i);
		}
#endif//USE_BTFILL

		src = m_workBuffer4;
	} else
	{
		src = &keyValuesInOut;
		m_workBuffer4->resize(0);
	}
		
	btAssert( workingSize%DATA_ALIGNMENT == 0 );
	int minCap = NUM_BUCKET*NUM_WGS;


	int n = workingSize;

	m_workBuffer1->resize(minCap);
	m_workBuffer3->resize(workingSize);
	

//	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
	btAssert( BITS_PER_PASS == 4 );
	btAssert( WG_SIZE == 64 );
	btAssert( (sortBits&0x3) == 0 );

	
	
	btOpenCLArray<btSortData>* dst = m_workBuffer3;

	btOpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
	btOpenCLArray<unsigned int>* destHisto = m_workBuffer2;


	int nWGs = NUM_WGS;
	btConstData cdata;

	{
        int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
     	int nBlocks = (n+blockSize-1)/(blockSize);
		cdata.m_n = n;
		cdata.m_nWGs = NUM_WGS;
		cdata.m_startBit = 0;
		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
		if( nBlocks < NUM_WGS )
		{
			cdata.m_nBlocksPerWG = 1;
			nWGs = nBlocks;
		}
	}

	int count=0;
	for(int ib=0; ib<sortBits; ib+=4)
	{
#ifdef DEBUG_RADIXSORT2
        keyValuesInOut.copyToHost(test2);
        printf("numElem = %d\n",test2.size());
        for (int i=0;i<test2.size();i++)
        {
            if (test2[i].m_key != test2[i].m_value)
            {
                printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
                printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
            }
        }
#endif //DEBUG_RADIXSORT2
        
		cdata.m_startBit = ib;
		
		{
			btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) };
			btLauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel);

			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst(  cdata );
			
			int num = NUM_WGS*WG_SIZE;
			launcher.launch1D( num, WG_SIZE );
		}

        
        
#ifdef DEBUG_RADIXSORT
		btAlignedObjectArray<unsigned int> testHist;
		srcHisto->copyToHost(testHist);
		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
		for (int i=0;i<testHist.size();i++)
		{
			if (testHist[i]!=0)
				printf("testHist[%d]=%d\n",i,testHist[i]);
		}
#endif //DEBUG_RADIXSORT
	
	

//fast prefix scan is not working properly on Mac OSX yet
#ifdef _WIN32
	bool fastScan=true;
#else
	bool fastScan=false;
#endif

		if (fastScan)
		{//	prefix scan group histogram
			btBufferInfoCL bInfo[] = { btBufferInfoCL( srcHisto->getBufferCL() ) };
			btLauncherCL launcher( m_commandQueue, m_prefixScanKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst(  cdata );
			launcher.launch1D( 128, 128 );
			destHisto = srcHisto;
		}else
		{
			//unsigned int sum; //for debugging
            m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum);
		}


#ifdef DEBUG_RADIXSORT
		destHisto->copyToHost(testHist);
		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
		for (int i=0;i<testHist.size();i++)
		{
			if (testHist[i]!=0)
				printf("testHist[%d]=%d\n",i,testHist[i]);
		}
        
        for (int i=0;i<testHist.size();i+=NUM_WGS)
		{
				printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]);
		}

#endif //DEBUG_RADIXSORT

#define USE_GPU
#ifdef USE_GPU
        
		{//	local sort and distribute
			btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )};
			btLauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst(  cdata );
			launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
            
		}
#else
        {
#define NUM_TABLES 16
//#define SEQUENTIAL
#ifdef SEQUENTIAL
            int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
            int tables[NUM_TABLES];
            int startBit = ib;
            
            destHisto->copyToHost(testHist);
            btAlignedObjectArray<btSortData> srcHost;
            btAlignedObjectArray<btSortData> dstHost;
            dstHost.resize(src->size());
            
            src->copyToHost(srcHost);
            
            for (int i=0;i<NUM_TABLES;i++)
            {
                tables[i] = testHist[i*NUM_WGS];
            }
            
            //	distribute
            for(int i=0; i<n; i++)
            {
                int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
                
                dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
                counter2[tableIdx] ++;
            }
            
            
#else
          
            int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
            
            int tables[NUM_TABLES];
             btAlignedObjectArray<btSortData> dstHostOK;
            dstHostOK.resize(src->size());

            destHisto->copyToHost(testHist);
            btAlignedObjectArray<btSortData> srcHost;
            src->copyToHost(srcHost);
        
            int blockSize = 256;
            int nBlocksPerWG = cdata.m_nBlocksPerWG;
            int startBit = ib;

            {
                for (int i=0;i<NUM_TABLES;i++)
                {
                    tables[i] = testHist[i*NUM_WGS];
                }
                
                //	distribute
                for(int i=0; i<n; i++)
                {
                    int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
                    
                    dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
                    counter2[tableIdx] ++;
                }

            
            }
            
            
            btAlignedObjectArray<btSortData> dstHost;
            dstHost.resize(src->size());
            
            
            int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
            
            
            
            for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++)
            {
              int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};

              int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;
                
              for(int iblock=0; iblock<btMin(cdata.m_nBlocksPerWG, nBlocks); iblock++)
              {
                for (int lIdx = 0;lIdx < 64;lIdx++)
                {
                    int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
                    
                    //	MY_HISTOGRAM( localKeys.x ) ++ is much more expensive than an atomic add, as it requires a read and a write, while atomics can just add on AMD
                    //	Using registers didn't perform well. It seems that using localKeys for addressing requires a lot of ALU ops
                    //	AMD: AtomInc performs better, while NV prefers ++
                    for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)
                    {
                        if( addr+j < n )
                        {
                          //  printf ("addr+j=%d\n", addr+j);
                            
                            int i = addr+j;
                            
                            int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
                            
                            int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx];
                            
                            btSortData ok = dstHostOK[destIndex];
                                                    
                            if (ok.m_key != srcHost[i].m_key)
                            {
                                printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key );
                                printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value );
                            }
                            if (ok.m_value != srcHost[i].m_value)
                            {
                                
                               printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value );
                                printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key );

                            }
                   
                            dstHost[destIndex] = srcHost[i];
                            counter[tableIdx] ++;
                            
                        }
                    }
                }
              }
            }
            
         
#endif //SEQUENTIAL
            
            dst->copyFromHost(dstHost);
        }
#endif//USE_GPU
        
        
        
#ifdef DEBUG_RADIXSORT
		destHisto->copyToHost(testHist);
		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
		for (int i=0;i<testHist.size();i++)
		{
			if (testHist[i]!=0)
				printf("testHist[%d]=%d\n",i,testHist[i]);
		}
#endif //DEBUG_RADIXSORT
		btSwap(src, dst );
		btSwap(srcHisto,destHisto);

#ifdef DEBUG_RADIXSORT2
        keyValuesInOut.copyToHost(test2);
        printf("numElem = %d\n",test2.size());
        for (int i=0;i<test2.size();i++)
        {
            if (test2[i].m_key != test2[i].m_value)
            {
                printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
                printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
            }
        }
#endif //DEBUG_RADIXSORT2
        
        count++;
                
        
	}