void btParticlesDynamicsWorld::runComputeCellIdKernel()
{
    cl_int ciErrNum;
#if 0
	if(m_useCpuControls[SIMSTAGE_COMPUTE_CELL_ID]->m_active)
	{	// CPU version
		unsigned int memSize = sizeof(btVector3) * m_numParticles;
		ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, memSize, &(m_hPos[0]), 0, NULL, NULL);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
		for(int index = 0; index < m_numParticles; index++)
		{
			btVector3 pos = m_hPos[index];
			btInt4 gridPos = cpu_getGridPos(pos, &m_simParams);
			unsigned int hash = cpu_getPosHash(gridPos, &m_simParams);
			m_hPosHash[index].x = hash;
			m_hPosHash[index].y = index;
		}
		memSize = sizeof(btInt2) * m_numParticles;
		ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
	}
	else
#endif
	{
		BT_PROFILE("ComputeCellId");
		runKernelWithWorkgroupSize(PARTICLES_KERNEL_COMPUTE_CELL_ID, m_numParticles);
		ciErrNum = clFinish(m_cqCommandQue);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
	}
/*
	// check
	int memSize = sizeof(btInt2) * m_hashSize;
    ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);
    oclCHECKERROR(ciErrNum, CL_SUCCESS);

	memSize = sizeof(float) * 4 * m_numParticles;
    ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, memSize, &(m_hPos[0]), 0, NULL, NULL);
    oclCHECKERROR(ciErrNum, CL_SUCCESS);
*/

	{
		BT_PROFILE("Copy VBO");
		// Explicit Copy (until OpenGL interop will work)
		// map the PBO to copy data from the CL buffer via host
		glBindBufferARB(GL_ARRAY_BUFFER, m_vbo);    
		// map the buffer object into client's memory
		void* ptr = glMapBufferARB(GL_ARRAY_BUFFER, GL_WRITE_ONLY_ARB);
		ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, sizeof(float) * 4 * m_numParticles, ptr, 0, NULL, NULL);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
		glUnmapBufferARB(GL_ARRAY_BUFFER); 
		glBindBufferARB(GL_ARRAY_BUFFER,0);
	}
}
void btParticlesDynamicsWorld::runCollideParticlesKernel()
{
	cl_int ciErrNum;
	if(m_useCpuControls[SIMSTAGE_COLLIDE_PARTICLES]->m_active)
	{	// CPU version
		int memSize = sizeof(btVector3) * m_numParticles;
		{
			BT_PROFILE("Copy from GPU");
			ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dSortedPos, CL_TRUE, 0, memSize, &(m_hSortedPos[0]), 0, NULL, NULL);
			oclCHECKERROR(ciErrNum, CL_SUCCESS);
			ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dSortedVel, CL_TRUE, 0, memSize, &(m_hSortedVel[0]), 0, NULL, NULL);
			oclCHECKERROR(ciErrNum, CL_SUCCESS);
			memSize = sizeof(btInt2) * m_numParticles;
			ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);
			memSize = m_numGridCells * sizeof(int);
			ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dCellStart, CL_TRUE, 0, memSize, &(m_hCellStart[0]), 0, NULL, NULL);
			oclCHECKERROR(ciErrNum, CL_SUCCESS);
		}
		for(int index = 0; index < m_numParticles; index++)
		{
			btVector3 posA = m_hSortedPos[index];
			btVector3 velA = m_hSortedVel[index];
			btVector3 force = btVector3(0, 0, 0);
			float particleRad = m_simParams.m_particleRad;
			float collisionDamping = m_simParams.m_collisionDamping;
			float spring = m_simParams.m_spring;
			float shear = m_simParams.m_shear;
			float attraction = m_simParams.m_attraction;
			int unsortedIndex = m_hPosHash[index].y;
			//Get address in grid
			btInt4 gridPosA = cpu_getGridPos(posA, &m_simParams);
			//Accumulate surrounding cells
			btInt4 gridPosB; 
			for(int z = -1; z <= 1; z++)
			{
				gridPosB.z = gridPosA.z + z;
				for(int y = -1; y <= 1; y++)
				{
					gridPosB.y = gridPosA.y + y;
					for(int x = -1; x <= 1; x++)
					{
						gridPosB.x = gridPosA.x + x;
						//Get start particle index for this cell
						unsigned int hashB = cpu_getPosHash(gridPosB, &m_simParams);
						int startI = m_hCellStart[hashB];
						//Skip empty cell
						if(startI < 0)
						{
							continue;
						}
						//Iterate over particles in this cell
						int endI = startI + 8;
						for(int j = startI; j < endI; j++)
						{
							unsigned int hashC = m_hPosHash[j].x;
							if(hashC != hashB)
							{
								break;
							}
							if(j == index)
							{
								continue;
							}
							btVector3 posB = m_hSortedPos[j];
							btVector3 velB = m_hSortedVel[j];
							//Collide two spheres
							force += cpu_collideTwoParticles(	posA, posB, velA, velB, particleRad, particleRad, 
																spring, collisionDamping, shear, attraction);
						}
					}
				}
			}     
			//Write new velocity back to original unsorted location
			m_hVel[unsortedIndex] = velA + force;
		}	
		memSize = sizeof(btVector3) * m_numParticles;
		ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dVel, CL_TRUE, 0, memSize, &(m_hVel[0]), 0, NULL, NULL);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
	}
	else
	{
		runKernelWithWorkgroupSize(PARTICLES_KERNEL_COLLIDE_PARTICLES, m_numParticles);
		cl_int ciErrNum = clFinish(m_cqCommandQue);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
	}
}
void btParticlesDynamicsWorld::runCollideParticlesKernel()
{
	btAlignedObjectArray<int>	pairs;

	float particleRad = m_simParams.m_particleRad;
	float collideDist2 = (particleRad + particleRad)*(particleRad + particleRad);
	cl_int ciErrNum;
	if(m_useCpuControls[SIMSTAGE_COLLIDE_PARTICLES]->m_active)
	{	// CPU version
		int memSize = sizeof(btVector3) * m_numParticles;
		{
			BT_PROFILE("Copy from GPU");
			ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dSortedPos, CL_TRUE, 0, memSize, &(m_hSortedPos[0]), 0, NULL, NULL);
			oclCHECKERROR(ciErrNum, CL_SUCCESS);
			ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dSortedVel, CL_TRUE, 0, memSize, &(m_hSortedVel[0]), 0, NULL, NULL);
			oclCHECKERROR(ciErrNum, CL_SUCCESS);
			memSize = sizeof(btInt2) * m_numParticles;
			ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);
			memSize = m_numGridCells * sizeof(int);
			ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dCellStart, CL_TRUE, 0, memSize, &(m_hCellStart[0]), 0, NULL, NULL);
			oclCHECKERROR(ciErrNum, CL_SUCCESS);
		}

		for(int index = 0; index < m_numParticles; index++)
		{
			btVector3 posA = m_hSortedPos[index];
			btVector3 velA = m_hSortedVel[index];
			btVector3 force = btVector3(0, 0, 0);
			float particleRad = m_simParams.m_particleRad;
			float collisionDamping = m_simParams.m_collisionDamping;
			float spring = m_simParams.m_spring;
			float shear = m_simParams.m_shear;
			float attraction = m_simParams.m_attraction;
			int unsortedIndex = m_hPosHash[index].y;
			//Get address in grid
			btInt4 gridPosA = cpu_getGridPos(posA, &m_simParams);
			//Accumulate surrounding cells
			btInt4 gridPosB; 
			for(int z = -1; z <= 1; z++)
			{
				gridPosB.z = gridPosA.z + z;
				for(int y = -1; y <= 1; y++)
				{
					gridPosB.y = gridPosA.y + y;
					for(int x = -1; x <= 1; x++)
					{
						gridPosB.x = gridPosA.x + x;
						//Get start particle index for this cell
						unsigned int hashB = cpu_getPosHash(gridPosB, &m_simParams);
						int startI = m_hCellStart[hashB];
						//Skip empty cell
						if(startI < 0)
						{
							continue;
						}
						//Iterate over particles in this cell
						int endI = startI + 32;
						if(endI > m_numParticles) 
							endI = m_numParticles;

						for(int j = startI; j < endI; j++)
						{
							unsigned int hashC = m_hPosHash[j].x;
							if(hashC != hashB)
							{
								break;
							}
							if(j == index)
							{
								continue;
							}

							btPair pair;
							pair.v0[0] = index;
							pair.v0[1] = j;
							pairs.push_back(pair.value);

//							printf("index=%d, j=%d\n",index,j);
//							printf("(index=%d, j=%d) ",index,j);
							btVector3 posB = m_hSortedPos[j];
							btVector3 velB = m_hSortedVel[j];
							//Collide two spheres
							force += cpu_collideTwoParticles(	posA, posB, velA, velB, particleRad, particleRad, 
																spring, collisionDamping, shear, attraction);
						}
					}
				}
			}     
			//Write new velocity back to original unsorted location
			m_hVel[unsortedIndex] = velA + force;
		}	

//#define BRUTE_FORCE_CHECK 1
#ifdef BRUTE_FORCE_CHECK
		for(int index = 0; index < m_numParticles; index++)
		{
			btVector3 posA = m_hSortedPos[index];
			btVector3 velA = m_hSortedVel[index];
			btVector3 force = btVector3(0, 0, 0);
			int unsortedIndex = m_hPosHash[index].y;
			
			float collisionDamping = m_simParams.m_collisionDamping;
			float spring = m_simParams.m_spring;
			float shear = m_simParams.m_shear;
			float attraction = m_simParams.m_attraction;
			for(int j = 0 ; j < m_numParticles; j++)
			{
				if (index!=j)
				{
					btVector3 posB = m_hSortedPos[j];
					btVector3 velB = m_hSortedVel[j];


					btVector3  relPos = posB - posA; relPos[3] = 0.f;
					float        dist2 = (relPos[0] * relPos[0] + relPos[1] * relPos[1] + relPos[2] * relPos[2]);
					

					
					if(dist2 < collideDist2)
					{
										//Collide two spheres
						//				force += cpu_collideTwoParticles(	posA, posB, velA, velB, particleRad, particleRad, 
						//													spring, collisionDamping, shear, attraction);

						btPair pair;
						pair.v0[0] = index;
						pair.v0[1] = j;
						if (pairs.findLinearSearch(pair.value)==pairs.size())
						{
							printf("not found index=%d, j=%d\n",index,j);
						} 

										
					}
				}
			}
			//Write new velocity back to original unsorted location
			//m_hVel[unsortedIndex] = velA + force;
		}
#endif //BRUTE_FORCE_CHECK

		memSize = sizeof(btVector3) * m_numParticles;
		ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dVel, CL_TRUE, 0, memSize, &(m_hVel[0]), 0, NULL, NULL);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
	}
	else
	{
		runKernelWithWorkgroupSize(PARTICLES_KERNEL_COLLIDE_PARTICLES, m_numParticles);
		cl_int ciErrNum = clFinish(m_cqCommandQue);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
	}
}