/// Shades one fragment with ambient, diffuse and specular terms, attenuated
/// by a lookup into the shadow buffer.
/// @param bar   barycentric weights used to interpolate the per-vertex varyings.
/// @param color receives the shaded color: the model's diffuse texel tinted by
///              m_colorRGBA, with the first three channels lit and clamped to 255.
/// @return always false.
virtual bool fragment(Vec3f bar, TGAColor &color)
{
	//B3_PROFILE("fragment");

	// Fragment position in the light's view, mapped through the viewport matrix.
	Vec4f lightPos = m_viewportMat*(varying_tri_light_view*bar);
	// The depth is taken before the homogeneous divide.
	float depth = lightPos[2];
	lightPos = lightPos/lightPos[3];

	// Clamp the lookup coordinates to the shadow buffer extents.
	float clampedX = b3Max(float(0.0), b3Min(float(m_width-1), lightPos[0]));
	float clampedY = b3Max(float(0.0), b3Min(float(m_height-1), lightPos[1]));
	int shadowIndex = int(clampedX) + int(clampedY)*m_width; // index in the shadowbuffer array

	// The 0.05 bias is a magic coefficient to avoid z-fighting.
	const bool passesDepthTest = m_shadowBuffer->at(shadowIndex)<-depth+0.05;
	float shadow = 0.8+0.2*passesDepthTest;

	Vec3f normal = (varying_nrm*bar).normalize();
	Vec2f texCoord = varying_uv*bar;

	Vec3f reflected = (normal * (normal * m_light_dir_local * 2.f) - m_light_dir_local).normalize();
	float specular = pow(b3Max(reflected.z, 0.f), m_model->specular(texCoord));
	float diffuse = b3Max(0.f, normal * m_light_dir_local);

	// Start from the texture color and tint every channel (including the fourth).
	color = m_model->diffuse(texCoord);
	for (int c = 0; c < 4; ++c)
	{
		color[c] *= m_colorRGBA[c];
	}

	// Light the first three channels and clamp them to 255.
	for (int c = 0; c < 3; ++c)
	{
		const int lit = int(m_ambient_coefficient*color[c] + shadow*(m_diffuse_coefficient*diffuse+m_specular_coefficient*specular)*color[c]*m_light_color[c]);
		color[c] = b3Min(lit, 255);
	}

	return false;
}
/// Query callback: records a (min, max) ordered pair made of the reported proxy
/// and the proxy currently being queried, growing the pair buffer when it is full.
/// @param proxyId id of the proxy reported by the query.
/// @return always true, so the query keeps looking for overlapping pairs.
bool b3BroadPhase::QueryCallback(i32 proxyId)
{
	// The proxy can't overlap with itself: record nothing in that case.
	if (proxyId != m_queryProxyId)
	{
		if (m_pairBufferCount == m_pairBufferCapacity)
		{
			// Buffer is full: double the capacity and move the stored pairs over.
			m_pairBufferCapacity *= 2;
			b3Pair* const grown = (b3Pair*)::b3Alloc(m_pairBufferCapacity * sizeof(b3Pair));
			::memcpy(grown, m_pairBuffer, m_pairBufferCount * sizeof(b3Pair));
			::b3Free(m_pairBuffer);
			m_pairBuffer = grown;
		}

		// Append the pair with the smaller id stored first.
		b3Pair& slot = m_pairBuffer[m_pairBufferCount];
		slot.proxy1 = b3Min(proxyId, m_queryProxyId);
		slot.proxy2 = b3Max(proxyId, m_queryProxyId);
		++m_pairBufferCount;
	}

	return true;
}
/// Report callback: stores the pair formed by the reported proxy and the proxy
/// being queried (smaller id first), doubling the pair array when it is full.
/// @param proxyId id of the reported proxy.
/// @return always true, so the caller keeps looking for overlapping pairs.
bool b3BroadPhase::Report(u32 proxyId)
{
	// A proxy never forms a pair with itself.
	const bool isSelf = (proxyId == m_queryProxyId);
	if (isSelf)
	{
		return true;
	}

	// Grow the storage before writing when there is no free slot left.
	const bool isFull = (m_pairCount == m_pairCapacity);
	if (isFull)
	{
		m_pairCapacity *= 2;
		b3Pair* const previous = m_pairs;
		m_pairs = (b3Pair*)b3Alloc(m_pairCapacity * sizeof(b3Pair));
		memcpy(m_pairs, previous, m_pairCount * sizeof(b3Pair));
		b3Free(previous);
	}

	// Write the new pair into the first free slot.
	b3Pair* const entry = m_pairs + m_pairCount;
	entry->proxy1 = b3Min(proxyId, m_queryProxyId);
	entry->proxy2 = b3Max(proxyId, m_queryProxyId);
	++m_pairCount;

	return true;
}
/// Shades one fragment with fixed ambient/diffuse/specular weights.
/// @param bar   barycentric weights used to interpolate the per-vertex varyings.
/// @param color receives the diffuse texel scaled by the light intensity, then
///              tinted by m_colorRGBA (four channels) and m_light_color (three channels).
/// @return always false.
virtual bool fragment(Vec3f bar, TGAColor &color)
{
	// Fixed lighting weights.
	const float kAmbient = 0.6;
	const float kDiffuse = 0.35;
	const float kSpecular = 0.05;

	Vec3f normal = (varying_nrm*bar).normalize();
	Vec2f texCoord = varying_uv*bar;

	Vec3f reflected = (normal * (normal * m_light_dir_local * 2.f) - m_light_dir_local).normalize();
	float specular = pow(b3Max(reflected.z, 0.f), m_model->specular(texCoord));
	float diffuse = b3Max(0.f, normal * m_light_dir_local);

	// The diffuse + specular contribution is capped so the intensity never exceeds 1.
	float intensity = kAmbient + b3Min(diffuse * kDiffuse + specular * kSpecular, 1.0f - kAmbient);

	color = m_model->diffuse(texCoord) * intensity;

	//warning: bgra color is swapped to rgba to upload texture
	for (int c = 0; c < 4; ++c)
	{
		color.bgra[c] *= m_colorRGBA[c];
	}
	// The light color only affects the first three channels.
	for (int c = 0; c < 3; ++c)
	{
		color.bgra[c] *= m_light_color[c];
	}

	return false;
}
/// Shades one fragment using a normal fetched from the model's normal map and
/// transformed by a per-fragment basis built from the triangle edges and UV deltas.
/// @param bar   barycentric weights used to interpolate the per-vertex varyings.
/// @param color receives the diffuse texel scaled by the clamped lighting term.
/// @return always false.
virtual bool fragment(Vec3f bar, TGAColor &color)
{
	Vec3f interpNormal = (varying_nrm*bar).normalize();
	Vec2f texCoord = varying_uv*bar;

	// Rows: the two triangle edges (from vertex 0) and the interpolated normal.
	mat<3,3,float> edgeBasis;
	edgeBasis[0] = ndc_tri.col(1) - ndc_tri.col(0);
	edgeBasis[1] = ndc_tri.col(2) - ndc_tri.col(0);
	edgeBasis[2] = interpNormal;

	mat<3,3,float> edgeBasisInv = edgeBasis.invert();

	// Solve for the two basis vectors from the per-edge differences of each UV component.
	Vec3f basisU = edgeBasisInv * Vec3f(varying_uv[0][1] - varying_uv[0][0], varying_uv[0][2] - varying_uv[0][0], 0);
	Vec3f basisV = edgeBasisInv * Vec3f(varying_uv[1][1] - varying_uv[1][0], varying_uv[1][2] - varying_uv[1][0], 0);

	mat<3,3,float> frame;
	frame.set_col(0, basisU.normalize());
	frame.set_col(1, basisV.normalize());
	frame.set_col(2, interpNormal);

	Vec3f shadingNormal = (frame*m_model->normal(texCoord)).normalize();

	// Lambert term lifted by 0.3 and clamped to [0, 1].
	// Unbiased alternative: b3Max(0.f, shadingNormal*m_light_dir_local)
	float lighting = b3Min(b3Max(0.f, shadingNormal*m_light_dir_local+0.3f),1.f);

	color = m_model->diffuse(texCoord)*lighting;
	return false;
}
static inline void b3SolveContact(b3ContactConstraint4& cs, const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA, const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, float maxRambdaDt[4], float minRambdaDt[4]) { b3Vector3 dLinVelA; dLinVelA.setZero(); b3Vector3 dAngVelA; dAngVelA.setZero(); b3Vector3 dLinVelB; dLinVelB.setZero(); b3Vector3 dAngVelB; dAngVelB.setZero(); for(int ic=0; ic<4; ic++) { // dont necessary because this makes change to 0 if( cs.m_jacCoeffInv[ic] == 0.f ) continue; { b3Vector3 angular0, angular1, linear; b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA; b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB; b3SetLinearAndAngular( (const b3Vector3 &)-cs.m_linear, (const b3Vector3 &)r0, (const b3Vector3 &)r1, linear, angular0, angular1 ); float rambdaDt = b3CalcRelVel((const b3Vector3 &)cs.m_linear,(const b3Vector3 &) -cs.m_linear, angular0, angular1, linVelA, angVelA, linVelB, angVelB ) + cs.m_b[ic]; rambdaDt *= cs.m_jacCoeffInv[ic]; { float prevSum = cs.m_appliedRambdaDt[ic]; float updated = prevSum; updated += rambdaDt; updated = b3Max( updated, minRambdaDt[ic] ); updated = b3Min( updated, maxRambdaDt[ic] ); rambdaDt = updated - prevSum; cs.m_appliedRambdaDt[ic] = updated; } b3Vector3 linImp0 = invMassA*linear*rambdaDt; b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt; b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt; b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt; #ifdef _WIN32 b3Assert(_finite(linImp0.getX())); b3Assert(_finite(linImp1.getX())); #endif { linVelA += linImp0; angVelA += angImp0; linVelB += linImp1; angVelB += angImp1; } } } }
/// Returns the time in us since the last call to reset or since /// the Clock was created. unsigned long int b3Clock::getTimeMicroseconds() { #ifdef B3_USE_WINDOWS_TIMERS LARGE_INTEGER currentTime; QueryPerformanceCounter(¤tTime); LONGLONG elapsedTime = currentTime.QuadPart - m_data->mStartTime.QuadPart; // Compute the number of millisecond ticks elapsed. unsigned long msecTicks = (unsigned long)(1000 * elapsedTime / m_data->mClockFrequency.QuadPart); // Check for unexpected leaps in the Win32 performance counter. // (This is caused by unexpected data across the PCI to ISA // bridge, aka south bridge. See Microsoft KB274323.) unsigned long elapsedTicks = GetTickCount() - m_data->mStartTick; signed long msecOff = (signed long)(msecTicks - elapsedTicks); if (msecOff < -100 || msecOff > 100) { // Adjust the starting time forwards. LONGLONG msecAdjustment = b3Min(msecOff * m_data->mClockFrequency.QuadPart / 1000, elapsedTime - m_data->mPrevElapsedTime); m_data->mStartTime.QuadPart += msecAdjustment; elapsedTime -= msecAdjustment; } // Store the current elapsed time for adjustments next time. m_data->mPrevElapsedTime = elapsedTime; // Convert to microseconds. unsigned long usecTicks = (unsigned long)(1000000 * elapsedTime / m_data->mClockFrequency.QuadPart); return usecTicks; #else #ifdef __CELLOS_LV2__ uint64_t freq=sys_time_get_timebase_frequency(); double dFreq=((double) freq)/ 1000000.0; typedef uint64_t ClockSize; ClockSize newTime; //__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory"); SYS_TIMEBASE_GET( newTime ); return (unsigned long int)((double(newTime-m_data->mStartTime)) / dFreq); #else struct timeval currentTime; gettimeofday(¤tTime, 0); return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000000 + (currentTime.tv_usec - m_data->mStartTime.tv_usec); #endif//__CELLOS_LV2__ #endif }
bool radixSortTest() { TEST_INIT; int maxSize = 1024*256; b3AlignedObjectArray<b3SortData> buf0Host; buf0Host.resize(maxSize); b3AlignedObjectArray<b3SortData> buf1Host; buf1Host.resize(maxSize ); b3OpenCLArray<b3SortData> buf2CL(g_context,g_queue,maxSize); b3RadixSort32CL* sort = new b3RadixSort32CL(g_context,g_device,g_queue,maxSize); int dx = maxSize/NUM_TESTS; for(int iter=0; iter<NUM_TESTS; iter++) { int size = b3Min( 128+dx*iter, maxSize-512 ); size = NEXTMULTIPLEOF( size, 512 );//not necessary buf0Host.resize(size); for(int i=0; i<size; i++) { b3SortData v; v.m_key = getRandom(0,0xff); v.m_value = i; buf0Host[i] = v; } buf2CL.copyFromHost( buf0Host); sort->executeHost( buf0Host); sort->execute(buf2CL); buf2CL.copyToHost(buf1Host); for(int i=0; i<size; i++) { TEST_ASSERT( buf0Host[i].m_value == buf1Host[i].m_value && buf0Host[i].m_key == buf1Host[i].m_key ); } } delete sort; TEST_REPORT( "radixSort" ); return g_testFailed; }
void solveContact3(b3GpuConstraint4* cs, b3Vector3* posAPtr, b3Vector3* linVelA, b3Vector3* angVelA, float invMassA, const b3Matrix3x3& invInertiaA, b3Vector3* posBPtr, b3Vector3* linVelB, b3Vector3* angVelB, float invMassB, const b3Matrix3x3& invInertiaB, b3Vector3* dLinVelA, b3Vector3* dAngVelA, b3Vector3* dLinVelB, b3Vector3* dAngVelB) { float minRambdaDt = 0; float maxRambdaDt = FLT_MAX; for(int ic=0; ic<4; ic++) { if( cs->m_jacCoeffInv[ic] == 0.f ) continue; b3Vector3 angular0, angular1, linear; b3Vector3 r0 = cs->m_worldPos[ic] - *posAPtr; b3Vector3 r1 = cs->m_worldPos[ic] - *posBPtr; setLinearAndAngular( cs->m_linear, r0, r1, linear, angular0, angular1 ); float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, *linVelA+*dLinVelA, *angVelA+*dAngVelA, *linVelB+*dLinVelB, *angVelB+*dAngVelB ) + cs->m_b[ic]; rambdaDt *= cs->m_jacCoeffInv[ic]; { float prevSum = cs->m_appliedRambdaDt[ic]; float updated = prevSum; updated += rambdaDt; updated = b3Max( updated, minRambdaDt ); updated = b3Min( updated, maxRambdaDt ); rambdaDt = updated - prevSum; cs->m_appliedRambdaDt[ic] = updated; } b3Vector3 linImp0 = invMassA*linear*rambdaDt; b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt; b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt; b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt; if (invMassA) { *dLinVelA += linImp0; *dAngVelA += angImp0; } if (invMassB) { *dLinVelB += linImp1; *dAngVelB += angImp1; } } }
void GLInstancingRenderer::drawPoints(const float* positions, const float color[4], int numPoints, int pointStrideInBytes, float pointDrawSize) { glActiveTexture(GL_TEXTURE0); glBindTexture(GL_TEXTURE_2D,0); b3Assert(glGetError() ==GL_NO_ERROR); glUseProgram(linesShader); glUniformMatrix4fv(lines_ProjectionMatrix, 1, false, &m_data->m_projectionMatrix[0]); glUniformMatrix4fv(lines_ModelViewMatrix, 1, false, &m_data->m_viewMatrix[0]); glUniform4f(lines_colour,color[0],color[1],color[2],color[3]); glPointSize(pointDrawSize); glBindVertexArray(lineVertexArrayObject); glBindBuffer(GL_ARRAY_BUFFER, lineVertexBufferObject); int maxPointsInBatch = MAX_POINTS_IN_BATCH; int remainingPoints = numPoints; int offsetNumPoints= 0; while (1) { int curPointsInBatch = b3Min(maxPointsInBatch, remainingPoints); if (curPointsInBatch) { glBufferSubData(GL_ARRAY_BUFFER, 0, curPointsInBatch*pointStrideInBytes, positions + offsetNumPoints*(pointStrideInBytes / sizeof(float))); glEnableVertexAttribArray(0); int numFloats = 3;// pointStrideInBytes / sizeof(float); glVertexAttribPointer(0, numFloats, GL_FLOAT, GL_FALSE, pointStrideInBytes, 0); glDrawArrays(GL_POINTS, 0, curPointsInBatch); remainingPoints -= curPointsInBatch; offsetNumPoints += curPointsInBatch; } else { break; } } glBindVertexArray(0); glPointSize(1); glUseProgram(0); }
void prefixScanTest() { TEST_INIT; int maxSize = 1024*256; b3AlignedObjectArray<unsigned int> buf0Host; b3AlignedObjectArray<unsigned int> buf1Host; b3OpenCLArray<unsigned int> buf2CL(g_context,g_queue,maxSize); b3OpenCLArray<unsigned int> buf3CL(g_context,g_queue,maxSize); b3PrefixScanCL* scan = new b3PrefixScanCL(g_context,g_device,g_queue,maxSize); int dx = maxSize/NUM_TESTS; for(int iter=0; iter<NUM_TESTS; iter++) { int size = b3Min( 128+dx*iter, maxSize ); buf0Host.resize(size); buf1Host.resize(size); for(int i=0; i<size; i++) buf0Host[i] = 1; buf2CL.copyFromHost( buf0Host); unsigned int sumHost, sumGPU; scan->executeHost(buf0Host, buf1Host, size, &sumHost ); scan->execute( buf2CL, buf3CL, size, &sumGPU ); buf3CL.copyToHost(buf0Host); TEST_ASSERT( sumHost == sumGPU ); for(int i=0; i<size; i++) TEST_ASSERT( buf1Host[i] == buf0Host[i] ); } delete scan; TEST_REPORT( "scanTest" ); }
inline void fillIntTest() { TEST_INIT; b3FillCL* fillCL = new b3FillCL(g_context,g_device,g_queue); int maxSize=1024*256; b3OpenCLArray<int> intBuffer(g_context,g_queue,maxSize); intBuffer.resize(maxSize); #define NUM_TESTS 7 int dx = maxSize/NUM_TESTS; for (int iter=0;iter<NUM_TESTS;iter++) { int size = b3Min( 11+dx*iter, maxSize ); int value = 2; int offset=0; fillCL->execute(intBuffer,value,size,offset); b3AlignedObjectArray<int> hostBuf2; hostBuf2.resize(size); fillCL->executeHost(hostBuf2,value,size,offset); b3AlignedObjectArray<int> hostBuf; intBuffer.copyToHost(hostBuf); for(int i=0; i<size; i++) { TEST_ASSERT( hostBuf[i] == hostBuf2[i] ); TEST_ASSERT( hostBuf[i] == hostBuf2[i] ); } } delete fillCL; TEST_REPORT( "fillIntTest" ); }
static __inline void solveFriction(b3GpuConstraint4& cs, const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA, const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, float maxRambdaDt[4], float minRambdaDt[4]) { if( cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[0] == 0 ) return; const b3Vector3& center = (const b3Vector3&)cs.m_center; b3Vector3 n = -(const b3Vector3&)cs.m_linear; b3Vector3 tangent[2]; #if 1 b3PlaneSpace1 (n, tangent[0],tangent[1]); #else b3Vector3 r = cs.m_worldPos[0]-center; tangent[0] = cross3( n, r ); tangent[1] = cross3( tangent[0], n ); tangent[0] = normalize3( tangent[0] ); tangent[1] = normalize3( tangent[1] ); #endif b3Vector3 angular0, angular1, linear; b3Vector3 r0 = center - posA; b3Vector3 r1 = center - posB; for(int i=0; i<2; i++) { setLinearAndAngular( tangent[i], r0, r1, linear, angular0, angular1 ); float rambdaDt = calcRelVel(linear, -linear, angular0, angular1, linVelA, angVelA, linVelB, angVelB ); rambdaDt *= cs.m_fJacCoeffInv[i]; { float prevSum = cs.m_fAppliedRambdaDt[i]; float updated = prevSum; updated += rambdaDt; updated = b3Max( updated, minRambdaDt[i] ); updated = b3Min( updated, maxRambdaDt[i] ); rambdaDt = updated - prevSum; cs.m_fAppliedRambdaDt[i] = updated; } b3Vector3 linImp0 = invMassA*linear*rambdaDt; b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt; b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt; b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt; #ifdef _WIN32 b3Assert(_finite(linImp0.getX())); b3Assert(_finite(linImp1.getX())); #endif linVelA += linImp0; angVelA += angImp0; linVelB += linImp1; angVelB += angImp1; } { // angular damping for point constraint b3Vector3 ab = ( posB - posA ).normalized(); b3Vector3 ac = ( center - posA ).normalized(); if( b3Dot( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f)) { float angNA = b3Dot( n, angVelA ); float angNB = b3Dot( n, angVelB ); angVelA 
-= (angNA*0.1f)*n; angVelB -= (angNB*0.1f)*n; } } }
void b3DynamicBvhBroadphase::collide(b3Dispatcher* dispatcher) { /*printf("---------------------------------------------------------\n"); printf("m_sets[0].m_leaves=%d\n",m_sets[0].m_leaves); printf("m_sets[1].m_leaves=%d\n",m_sets[1].m_leaves); printf("numPairs = %d\n",getOverlappingPairCache()->getNumOverlappingPairs()); { int i; for (i=0;i<getOverlappingPairCache()->getNumOverlappingPairs();i++) { printf("pair[%d]=(%d,%d),",i,getOverlappingPairCache()->getOverlappingPairArray()[i].m_pProxy0->getUid(), getOverlappingPairCache()->getOverlappingPairArray()[i].m_pProxy1->getUid()); } printf("\n"); } */ b3SPC(m_profiling.m_total); /* optimize */ m_sets[0].optimizeIncremental(1+(m_sets[0].m_leaves*m_dupdates)/100); if(m_fixedleft) { const int count=1+(m_sets[1].m_leaves*m_fupdates)/100; m_sets[1].optimizeIncremental(1+(m_sets[1].m_leaves*m_fupdates)/100); m_fixedleft=b3Max<int>(0,m_fixedleft-count); } /* dynamic -> fixed set */ m_stageCurrent=(m_stageCurrent+1)%STAGECOUNT; b3DbvtProxy* current=m_stageRoots[m_stageCurrent]; if(current) { b3DbvtTreeCollider collider(this); do { b3DbvtProxy* next=current->links[1]; b3ListRemove(current,m_stageRoots[current->stage]); b3ListAppend(current,m_stageRoots[STAGECOUNT]); #if B3_DBVT_BP_ACCURATESLEEPING m_paircache->removeOverlappingPairsContainingProxy(current,dispatcher); collider.proxy=current; b3DynamicBvh::collideTV(m_sets[0].m_root,current->aabb,collider); b3DynamicBvh::collideTV(m_sets[1].m_root,current->aabb,collider); #endif m_sets[0].remove(current->leaf); B3_ATTRIBUTE_ALIGNED16(b3DbvtVolume) curAabb=b3DbvtVolume::FromMM(current->m_aabbMin,current->m_aabbMax); current->leaf = m_sets[1].insert(curAabb,current); current->stage = STAGECOUNT; current = next; } while(current); m_fixedleft=m_sets[1].m_leaves; m_needcleanup=true; } /* collide dynamics */ { b3DbvtTreeCollider collider(this); if(m_deferedcollide) { b3SPC(m_profiling.m_fdcollide); m_sets[0].collideTTpersistentStack(m_sets[0].m_root,m_sets[1].m_root,collider); } 
if(m_deferedcollide) { b3SPC(m_profiling.m_ddcollide); m_sets[0].collideTTpersistentStack(m_sets[0].m_root,m_sets[0].m_root,collider); } } /* clean up */ if(m_needcleanup) { b3SPC(m_profiling.m_cleanup); b3BroadphasePairArray& pairs=m_paircache->getOverlappingPairArray(); if(pairs.size()>0) { int ni=b3Min(pairs.size(),b3Max<int>(m_newpairs,(pairs.size()*m_cupdates)/100)); for(int i=0;i<ni;++i) { b3BroadphasePair& p=pairs[(m_cid+i)%pairs.size()]; b3DbvtProxy* pa=&m_proxies[p.x]; b3DbvtProxy* pb=&m_proxies[p.y]; if(!b3Intersect(pa->leaf->volume,pb->leaf->volume)) { #if B3_DBVT_BP_SORTPAIRS if(pa->m_uniqueId>pb->m_uniqueId) b3Swap(pa,pb); #endif m_paircache->removeOverlappingPair(pa->getUid(),pb->getUid(),dispatcher); --ni;--i; } } if(pairs.size()>0) m_cid=(m_cid+ni)%pairs.size(); else m_cid=0; } } ++m_pid; m_newpairs=1; m_needcleanup=false; if(m_updates_call>0) { m_updates_ratio=m_updates_done/(b3Scalar)m_updates_call; } else { m_updates_ratio=0; } m_updates_done/=2; m_updates_call/=2; }
void b3Island::Solve(const b3Vec3& gravityDir) { r32 h = dt; b3Vec3 gravityForce = B3_GRAVITY_ACC * gravityDir; // Integrate velocities. for (u32 i = 0; i < bodyCount; ++i) { b3Body* b = bodies[i]; b3Vec3 v = b->m_linearVelocity; b3Vec3 w = b->m_angularVelocity; b3Vec3 x = b->m_worldCenter; b3Quaternion q = b->m_orientation; if (b->m_type == e_dynamicBody) { // Use semi-implitic Euler. b3Vec3 force = b->m_gravityScale * gravityForce + b->m_force; v += (h * b->m_invMass) * force; w += h * (b->m_invWorldInertia * b->m_torque); // References: Box2D. // Apply damping. // ODE: dv/dt + c * v = 0 // Solution: v(t) = v0 * exp(-c * t) // Time step: v(t + dt) = v0 * exp(-c * (t + dt)) = v0 * exp(-c * t) * exp(-c * dt) = v * exp(-c * dt) // v2 = exp(-c * dt) * v1 // Pade approximation: // v2 = v1 * 1 / (1 + c * dt) v *= B3_ONE / (B3_ONE + h * r32(0.1)); w *= B3_ONE / (B3_ONE + h * r32(0.1)); } velocities[i].v = v; velocities[i].w = w; positions[i].x = x; positions[i].q = q; } b3JointSolverDef jointSolverDef; jointSolverDef.dt = h; jointSolverDef.joints = joints; jointSolverDef.count = jointCount; jointSolverDef.positions = positions; jointSolverDef.velocities = velocities; b3JointSolver jointSolver(&jointSolverDef); jointSolver.InitializeVelocityConstraints(); b3ContactSolverDef contactSolverDef; contactSolverDef.dt = h; contactSolverDef.contacts = contacts; contactSolverDef.count = contactCount; contactSolverDef.positions = positions; contactSolverDef.velocities = velocities; contactSolverDef.allocator = allocator; b3ContactSolver contactSolver(&contactSolverDef); contactSolver.InitializeVelocityConstraints(); jointSolver.WarmStart(); contactSolver.WarmStart(); // Solve velocity constraints. 
for (u32 i = 0; i < velocityIterations; ++i) { jointSolver.SolveVelocityConstraints(); contactSolver.SolveVelocityConstraints(); } contactSolver.StoreImpulses(); for (u32 i = 0; i < bodyCount; ++i) { b3Body* b = bodies[i]; if (b->m_type == e_staticBody) { continue; } b3Vec3 x = positions[i].x; b3Quaternion q1 = positions[i].q; b3Vec3 v = velocities[i].v; b3Vec3 w = velocities[i].w; x += h * v; b3Quaternion q2 = Integrate(q1, w, h); positions[i].x = x; positions[i].q = q2; velocities[i].v = v; velocities[i].w = w; } for (u32 i = 0; i < bodyCount; ++i) { b3Body* b = bodies[i]; if (b->m_type == e_staticBody) { continue; } b->m_worldCenter = positions[i].x; b->m_orientation = positions[i].q; b->m_linearVelocity = velocities[i].v; b->m_angularVelocity = velocities[i].w; } if (allowSleep) { r32 minSleepTime = B3_MAX_FLOAT; for (u32 i = 0; i < bodyCount; ++i) { b3Body* b = bodies[i]; if (b->m_type == e_staticBody) { continue; } // Compute the linear and angular speed of the body. const r32 sqrLinVel = b3LenSq(b->m_linearVelocity); const r32 sqrAngVel = b3LenSq(b->m_angularVelocity); if (sqrLinVel > B3_SLEEP_LINEAR_TOL || sqrAngVel > B3_SLEEP_ANGULAR_TOL) { minSleepTime = B3_ZERO; b->m_sleepTime = B3_ZERO; } else { b->m_sleepTime += h; minSleepTime = b3Min(minSleepTime, b->m_sleepTime); } } // Put the island to sleep so long as the minimum found sleep time // is below the threshold. if (minSleepTime >= B3_TIME_TO_SLEEP) { for (u32 i = 0; i < bodyCount; ++i) { bodies[i]->SetAwake(false); } } } }
void boundSearchTest( ) { TEST_INIT; int maxSize = 1024*256; int bucketSize = 256; b3OpenCLArray<b3SortData> srcCL(g_context,g_queue,maxSize); b3OpenCLArray<unsigned int> upperCL(g_context,g_queue,maxSize); b3OpenCLArray<unsigned int> lowerCL(g_context,g_queue,maxSize); b3AlignedObjectArray<b3SortData> srcHost; b3AlignedObjectArray<unsigned int> upperHost; b3AlignedObjectArray<unsigned int> lowerHost; b3AlignedObjectArray<unsigned int> upperHostCompare; b3AlignedObjectArray<unsigned int> lowerHostCompare; b3BoundSearchCL* search = new b3BoundSearchCL(g_context,g_device,g_queue, maxSize); int dx = maxSize/NUM_TESTS; for(int iter=0; iter<NUM_TESTS; iter++) { int size = b3Min( 128+dx*iter, maxSize ); upperHost.resize(bucketSize); lowerHost.resize(bucketSize); upperHostCompare.resize(bucketSize); lowerHostCompare.resize(bucketSize); srcHost.resize(size); for(int i=0; i<size; i++) { b3SortData v; // v.m_key = i<2? 0 : 5; v.m_key = getRandom(0,bucketSize); v.m_value = i; srcHost.at(i) = v; } srcHost.quickSort(b3SortDataCompare()); srcCL.copyFromHost(srcHost); { for(int i=0; i<bucketSize; i++) { lowerHost[i] = -1; lowerHostCompare[i] = -1; upperHost[i] = -1; upperHostCompare[i] = -1; } upperCL.copyFromHost(upperHost); lowerCL.copyFromHost(lowerHost); } search->execute(srcCL,size,upperCL,bucketSize,b3BoundSearchCL::BOUND_UPPER); search->execute(srcCL,size,lowerCL,bucketSize,b3BoundSearchCL::BOUND_LOWER); search->executeHost(srcHost,size,upperHostCompare,bucketSize,b3BoundSearchCL::BOUND_UPPER); search->executeHost(srcHost,size,lowerHostCompare,bucketSize,b3BoundSearchCL::BOUND_LOWER); lowerCL.copyToHost(lowerHost); upperCL.copyToHost(upperHost); for(int i=0; i<bucketSize; i++) { TEST_ASSERT(upperHostCompare[i] == upperHost[i]); TEST_ASSERT(lowerHostCompare[i] == lowerHost[i]); } /* for(int i=1; i<bucketSize; i++) { int lhi_1 = lowerHost[i-1]; int lhi = lowerHost[i]; for(int j=lhi_1; j<lhi; j++) //for(int j=lowerHost[i-1]; j<lowerHost[i]; j++) { TEST_ASSERT( 
srcHost[j].m_key < i ); } } for(int i=0; i<bucketSize; i++) { int jMin = (i==0)?0:upperHost[i-1]; for(int j=jMin; j<upperHost[i]; j++) { TEST_ASSERT( srcHost[j].m_key <= i ); } } */ for(int i=0; i<bucketSize; i++) { int lhi = lowerHost[i]; int uhi = upperHost[i]; for(int j=lhi; j<uhi; j++) { if ( srcHost[j].m_key != i ) { printf("error %d != %d\n",srcHost[j].m_key,i); } TEST_ASSERT( srcHost[j].m_key == i ); } } } delete search; TEST_REPORT( "boundSearchTest" ); }
void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */) { int originalSize = keyValuesInOut.size(); int workingSize = originalSize; int dataAlignment = DATA_ALIGNMENT; #ifdef DEBUG_RADIXSORT2 b3AlignedObjectArray<b3SortData> test2; keyValuesInOut.copyToHost(test2); printf("numElem = %d\n",test2.size()); for (int i=0;i<test2.size();i++) { printf("test2[%d].m_key=%d\n",i,test2[i].m_key); printf("test2[%d].m_value=%d\n",i,test2[i].m_value); } #endif //DEBUG_RADIXSORT2 b3OpenCLArray<b3SortData>* src = 0; if (workingSize%dataAlignment) { workingSize += dataAlignment-(workingSize%dataAlignment); m_workBuffer4->copyFromOpenCLArray(keyValuesInOut); m_workBuffer4->resize(workingSize); b3SortData fillValue; fillValue.m_key = 0xffffffff; fillValue.m_value = 0xffffffff; #define USE_BTFILL #ifdef USE_BTFILL m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4,(b3Int2&)fillValue,workingSize-originalSize,originalSize); #else //fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side) for (int i=originalSize; i<workingSize;i++) { m_workBuffer4->copyFromHostPointer(&fillValue,1,i); } #endif//USE_BTFILL src = m_workBuffer4; } else { src = &keyValuesInOut; m_workBuffer4->resize(0); } b3Assert( workingSize%DATA_ALIGNMENT == 0 ); int minCap = NUM_BUCKET*NUM_WGS; int n = workingSize; m_workBuffer1->resize(minCap); m_workBuffer3->resize(workingSize); // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); b3Assert( BITS_PER_PASS == 4 ); b3Assert( WG_SIZE == 64 ); b3Assert( (sortBits&0x3) == 0 ); b3OpenCLArray<b3SortData>* dst = m_workBuffer3; b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1; b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2; int nWGs = NUM_WGS; b3ConstData cdata; { int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256 int nBlocks = (n+blockSize-1)/(blockSize); cdata.m_n = n; cdata.m_nWGs = NUM_WGS; cdata.m_startBit = 0; cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs; if( nBlocks < NUM_WGS ) { 
cdata.m_nBlocksPerWG = 1; nWGs = nBlocks; } } int count=0; for(int ib=0; ib<sortBits; ib+=4) { #ifdef DEBUG_RADIXSORT2 keyValuesInOut.copyToHost(test2); printf("numElem = %d\n",test2.size()); for (int i=0;i<test2.size();i++) { if (test2[i].m_key != test2[i].m_value) { printf("test2[%d].m_key=%d\n",i,test2[i].m_key); printf("test2[%d].m_value=%d\n",i,test2[i].m_value); } } #endif //DEBUG_RADIXSORT2 cdata.m_startBit = ib; if (src->size()) { b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) }; b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); launcher.setConst( cdata ); int num = NUM_WGS*WG_SIZE; launcher.launch1D( num, WG_SIZE ); } #ifdef DEBUG_RADIXSORT b3AlignedObjectArray<unsigned int> testHist; srcHisto->copyToHost(testHist); printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); for (int i=0;i<testHist.size();i++) { if (testHist[i]!=0) printf("testHist[%d]=%d\n",i,testHist[i]); } #endif //DEBUG_RADIXSORT //fast prefix scan is not working properly on Mac OSX yet #ifdef _WIN32 bool fastScan=!m_deviceCPU;//only use fast scan on GPU #else bool fastScan=false; #endif if (fastScan) {// prefix scan group histogram b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) }; b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); launcher.setConst( cdata ); launcher.launch1D( 128, 128 ); destHisto = srcHisto; }else { //unsigned int sum; //for debugging m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); } #ifdef DEBUG_RADIXSORT destHisto->copyToHost(testHist); printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); for (int i=0;i<testHist.size();i++) { if (testHist[i]!=0) printf("testHist[%d]=%d\n",i,testHist[i]); } for (int i=0;i<testHist.size();i+=NUM_WGS) { 
printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]); } #endif //DEBUG_RADIXSORT #define USE_GPU #ifdef USE_GPU if (src->size()) {// local sort and distribute b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )}; b3LauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); launcher.setConst( cdata ); launcher.launch1D( nWGs*WG_SIZE, WG_SIZE ); } #else { #define NUM_TABLES 16 //#define SEQUENTIAL #ifdef SEQUENTIAL int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; int tables[NUM_TABLES]; int startBit = ib; destHisto->copyToHost(testHist); b3AlignedObjectArray<b3SortData> srcHost; b3AlignedObjectArray<b3SortData> dstHost; dstHost.resize(src->size()); src->copyToHost(srcHost); for (int i=0;i<NUM_TABLES;i++) { tables[i] = testHist[i*NUM_WGS]; } // distribute for(int i=0; i<n; i++) { int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; counter2[tableIdx] ++; } #else int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; int tables[NUM_TABLES]; b3AlignedObjectArray<b3SortData> dstHostOK; dstHostOK.resize(src->size()); destHisto->copyToHost(testHist); b3AlignedObjectArray<b3SortData> srcHost; src->copyToHost(srcHost); int blockSize = 256; int nBlocksPerWG = cdata.m_nBlocksPerWG; int startBit = ib; { for (int i=0;i<NUM_TABLES;i++) { tables[i] = testHist[i*NUM_WGS]; } // distribute for(int i=0; i<n; i++) { int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; counter2[tableIdx] ++; } } b3AlignedObjectArray<b3SortData> dstHost; dstHost.resize(src->size()); int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++) { int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; int nBlocks = 
(n)/blockSize - nBlocksPerWG*wgIdx; for(int iblock=0; iblock<b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++) { for (int lIdx = 0;lIdx < 64;lIdx++) { int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops // AMD: AtomInc performs better while NV prefers ++ for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++) { if( addr+j < n ) { // printf ("addr+j=%d\n", addr+j); int i = addr+j; int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx]; b3SortData ok = dstHostOK[destIndex]; if (ok.m_key != srcHost[i].m_key) { printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key ); printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value ); } if (ok.m_value != srcHost[i].m_value) { printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value ); printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key ); } dstHost[destIndex] = srcHost[i]; counter[tableIdx] ++; } } } } } #endif //SEQUENTIAL dst->copyFromHost(dstHost); } #endif//USE_GPU #ifdef DEBUG_RADIXSORT destHisto->copyToHost(testHist); printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); for (int i=0;i<testHist.size();i++) { if (testHist[i]!=0) printf("testHist[%d]=%d\n",i,testHist[i]); } #endif //DEBUG_RADIXSORT b3Swap(src, dst ); b3Swap(srcHisto,destHisto); #ifdef DEBUG_RADIXSORT2 keyValuesInOut.copyToHost(test2); printf("numElem = %d\n",test2.size()); for (int i=0;i<test2.size();i++) { if (test2[i].m_key != test2[i].m_value) { printf("test2[%d].m_key=%d\n",i,test2[i].m_key); printf("test2[%d].m_value=%d\n",i,test2[i].m_value); } } #endif //DEBUG_RADIXSORT2 count++; }
void b3Island::Solve(const b3Vec3& gravity, float32 dt, u32 velocityIterations, u32 positionIterations, u32 flags) { float32 h = dt; // 1. Integrate velocities for (u32 i = 0; i < m_bodyCount; ++i) { b3Body* b = m_bodies[i]; b3Vec3 v = b->m_linearVelocity; b3Vec3 w = b->m_angularVelocity; b3Vec3 x = b->m_sweep.worldCenter; b3Quat q = b->m_sweep.orientation; // Remember the positions for CCD b->m_sweep.worldCenter0 = b->m_sweep.worldCenter; b->m_sweep.orientation0 = b->m_sweep.orientation; if (b->m_type == e_dynamicBody) { // Integrate forces v += h * (b->m_gravityScale * gravity + b->m_invMass * b->m_force); // Clear forces b->m_force.SetZero(); // Integrate torques // Superposition Principle // w2 - w1 = dw1 + dw2 // w2 - w1 = h * I^1 * bt + h * I^1 * -gt // w2 = w1 + dw1 + dw2 // Explicit Euler on current inertia and applied torque // w2 = w1 + h * I1^1 * bt1 b3Vec3 dw1 = h * b->m_worldInvI * b->m_torque; // Implicit Euler on next inertia and angular velocity // w2 = w1 - h * I2^1 * cross(w2, I2 * w2) // w2 - w1 = -I2^1 * h * cross(w2, I2 * w2) // I2 * (w2 - w1) = -h * cross(w2, I2 * w2) // I2 * (w2 - w1) + h * cross(w2, I2 * w2) = 0 // Toss out I2 from f using local I2 (constant) and local w1 // to remove its time dependency. b3Vec3 w2 = b3SolveGyro(q, b->m_I, w, h); b3Vec3 dw2 = w2 - w; w += dw1 + dw2; // Clear torques b->m_torque.SetZero(); // Apply local damping. 
// ODE: dv/dt + c * v = 0 // Solution: v(t) = v0 * exp(-c * t) // Step: v(t + dt) = v0 * exp(-c * (t + dt)) = v0 * exp(-c * t) * exp(-c * dt) = v * exp(-c * dt) // v2 = exp(-c * dt) * v1 // Padé approximation: // 1 / (1 + c * dt) v *= 1.0f / (1.0f + h * b->m_linearDamping); w *= 1.0f / (1.0f + h * b->m_angularDamping); } m_velocities[i].v = v; m_velocities[i].w = w; m_positions[i].x = x; m_positions[i].q = q; m_invInertias[i] = b->m_worldInvI; } b3JointSolverDef jointSolverDef; jointSolverDef.joints = m_joints; jointSolverDef.count = m_jointCount; jointSolverDef.positions = m_positions; jointSolverDef.velocities = m_velocities; jointSolverDef.invInertias = m_invInertias; jointSolverDef.dt = h; b3JointSolver jointSolver(&jointSolverDef); b3ContactSolverDef contactSolverDef; contactSolverDef.allocator = m_allocator; contactSolverDef.contacts = m_contacts; contactSolverDef.count = m_contactCount; contactSolverDef.positions = m_positions; contactSolverDef.velocities = m_velocities; contactSolverDef.invInertias = m_invInertias; contactSolverDef.dt = h; b3ContactSolver contactSolver(&contactSolverDef); // 2. Initialize constraints { B3_PROFILE("Initialize Constraints"); contactSolver.InitializeConstraints(); if (flags & e_warmStartBit) { contactSolver.WarmStart(); } jointSolver.InitializeConstraints(); if (flags & e_warmStartBit) { jointSolver.WarmStart(); } } // 3. Solve velocity constraints { B3_PROFILE("Solve Velocity Constraints"); for (u32 i = 0; i < velocityIterations; ++i) { jointSolver.SolveVelocityConstraints(); contactSolver.SolveVelocityConstraints(); } if (flags & e_warmStartBit) { contactSolver.StoreImpulses(); } } // 4. Integrate positions for (u32 i = 0; i < m_bodyCount; ++i) { b3Body* b = m_bodies[i]; b3Vec3 x = m_positions[i].x; b3Quat q = m_positions[i].q; b3Vec3 v = m_velocities[i].v; b3Vec3 w = m_velocities[i].w; b3Mat33 invI = m_invInertias[i]; // Prevent numerical instability due to large velocity changes. 
b3Vec3 translation = h * v; if (b3Dot(translation, translation) > B3_MAX_TRANSLATION_SQUARED) { float32 ratio = B3_MAX_TRANSLATION / b3Length(translation); v *= ratio; } b3Vec3 rotation = h * w; if (b3Dot(rotation, rotation) > B3_MAX_ROTATION_SQUARED) { float32 ratio = B3_MAX_ROTATION / b3Length(rotation); w *= ratio; } // Integrate x += h * v; q = b3Integrate(q, w, h); invI = b3RotateToFrame(b->m_invI, q); m_positions[i].x = x; m_positions[i].q = q; m_velocities[i].v = v; m_velocities[i].w = w; m_invInertias[i] = invI; } // 5. Solve position constraints { B3_PROFILE("Solve Position Constraints"); bool positionsSolved = false; for (u32 i = 0; i < positionIterations; ++i) { bool contactsSolved = contactSolver.SolvePositionConstraints(); bool jointsSolved = jointSolver.SolvePositionConstraints(); if (contactsSolved && jointsSolved) { // Early out if the position errors are small. positionsSolved = true; break; } } } // 6. Copy state buffers back to the bodies for (u32 i = 0; i < m_bodyCount; ++i) { b3Body* b = m_bodies[i]; b->m_sweep.worldCenter = m_positions[i].x; b->m_sweep.orientation = m_positions[i].q; b->m_sweep.orientation.Normalize(); b->m_linearVelocity = m_velocities[i].v; b->m_angularVelocity = m_velocities[i].w; b->m_worldInvI = m_invInertias[i]; b->SynchronizeTransform(); } // 7. Put bodies under unconsiderable motion to sleep if (flags & e_sleepBit) { float32 minSleepTime = B3_MAX_FLOAT; for (u32 i = 0; i < m_bodyCount; ++i) { b3Body* b = m_bodies[i]; if (b->m_type == e_staticBody) { continue; } // Compute the linear and angular speed of the body. 
float32 sqrLinVel = b3Dot(b->m_linearVelocity, b->m_linearVelocity); float32 sqrAngVel = b3Dot(b->m_angularVelocity, b->m_angularVelocity); if (sqrLinVel > B3_SLEEP_LINEAR_TOL || sqrAngVel > B3_SLEEP_ANGULAR_TOL) { minSleepTime = 0.0f; b->m_sleepTime = 0.0f; } else { b->m_sleepTime += h; minSleepTime = b3Min(minSleepTime, b->m_sleepTime); } } // Put the island to sleep so long as the minimum found sleep time // is below the threshold. if (minSleepTime >= B3_TIME_TO_SLEEP) { for (u32 i = 0; i < m_bodyCount; ++i) { m_bodies[i]->SetAwake(false); } } } }