/*
============
SSE3_Dot

Multiplies v1 and v2 component-wise and returns the low lane of the result
after two horizontal-add steps.

haddps( dst, src ) is defined outside this block. Assuming it emits the SSE3
HADDPS instruction, the two steps leave the sum of all four products in every
lane, so the value returned is the 4-component dot product of v1 and v2.

NOTE(review): movaps and mulps with a memory operand require a 16-byte
aligned address; confirm every idVec4 passed here is aligned that way.
============
*/
float SSE3_Dot( const idVec4 &v1, const idVec4 &v2 ) {
	float d;

	__asm {
		mov			esi, v1					// esi = address of the vector v1 refers to
		mov			edi, v2					// edi = address of the vector v2 refers to
		movaps		xmm0, [esi]				// xmm0 = the four floats of v1
		mulps		xmm0, [edi]				// xmm0 = v1 * v2, lane by lane
		haddps(		_xmm0, _xmm0 )			// first horizontal add: pairwise sums of the products
		haddps(		_xmm0, _xmm0 )			// second horizontal add: sum of the pairwise sums
		movss		d, xmm0					// store the low lane into d
	}
	return d;
}
/*
 * Side-by-side check of a software horizontal add against the SSE3 intrinsic.
 *
 * Feeds the same two vectors to haddps() (defined outside this block) and to
 * _mm_hadd_ps(), then prints the four lanes of each result on separate lines
 * so they can be compared by eye.  For these inputs _mm_hadd_ps() produces
 * 3 7 30 70 (x0+x1, x2+x3, y0+y1, y2+y3).
 *
 * Returns 0.
 */
int main(void)
{
	/* Overlay that gives per-lane float access to a __m128 value. */
	typedef union {
		__m128 m128;
		float flt[4];
	} m128f;

	__m128 x = { 1.0f, 2.0f, 3.0f, 4.0f };
	__m128 y = { 10.0f, 20.0f, 30.0f, 40.0f };
	m128f s, h;

	s.m128 = haddps(x, y);		/* software implementation under test */
	h.m128 = _mm_hadd_ps(x, y);	/* hardware reference */

	printf("Software hadd: %f %f %f %f\n", s.flt[0], s.flt[1], s.flt[2], s.flt[3]);
	printf("Hardware hadd: %f %f %f %f\n", h.flt[0], h.flt[1], h.flt[2], h.flt[3]);

	/* main returns int, so report success explicitly instead of a bare `return;`. */
	return 0;
}
/*
============
idSIMD_SSE3::TransformVerts

Computes a weighted, joint-transformed position for each of numVerts vertices
and writes its three components to the start of the corresponding idDrawVert
(verts[i].xyz in the reference C++ path).

  verts      - output vertices; only the xyz member is written
  numVerts   - number of vertices to produce; only numVerts == 0 is
               special-cased by the assembly path
  joints     - base address of the joint matrices, addressed by byte offset
  weights    - one idVec4 per weight, consumed sequentially
  index      - pairs of ints, one pair per weight:
               [0] byte offset of the joint matrix relative to joints
               [1] non-zero on the last weight of a vertex, zero otherwise
  numWeights - not referenced by either code path

The "#if 1" branch is the hand-written SSE3 version; the "#else" branch is the
plain C++ equivalent.

haddps( dst, src ) is defined outside this block; the lane comments below
assume it emits the SSE3 HADDPS instruction.

NOTE(review): the movaps/mulps memory operands require 16-byte aligned
addresses for weights and for each joint matrix row - confirm with callers.
NOTE(review): the stores use fixed offsets +0 and +8 from the start of each
vertex, which presumably relies on DRAWVERT_XYZ_OFFSET being 0 - confirm.
============
*/
void VPCALL idSIMD_SSE3::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
#if 1

	// the assembly below hard-codes these sizes and offsets
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	// byte offset of xyz inside idDrawVert, taken from a null-based member address
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
	assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );

	__asm
	{
		mov			eax, numVerts
		test		eax, eax
		jz			done								// nothing to do for zero vertices
		imul		eax, DRAWVERT_SIZE					// eax = total size of the vertex array in bytes

		mov			ecx, verts
		mov			edx, index
		mov			esi, weights
		mov			edi, joints

		add			ecx, eax							// ecx = one past the last vertex
		neg			eax									// eax = negative byte offset that counts up towards zero

	loopVert:
		mov			ebx, [edx]							// ebx = byte offset of the joint matrix for the first weight
		movaps		xmm2, [esi]							// xmm2 = first weight vector
		add			edx, 8								// advance to the next index pair
		movaps		xmm0, xmm2
		add			esi, JOINTWEIGHT_SIZE				// advance to the next weight
		movaps		xmm1, xmm2

		mulps		xmm0, [edi+ebx+ 0]					// xmm0 = m0, m1, m2, t0
		mulps		xmm1, [edi+ebx+16]					// xmm1 = m3, m4, m5, t1
		mulps		xmm2, [edi+ebx+32]					// xmm2 = m6, m7, m8, t2

		cmp			dword ptr [edx-4], 0				// second int of the pair just consumed

		jne			doneWeight							// non-zero: that was the only weight of this vertex

	loopWeight:
		mov			ebx, [edx]							// joint matrix byte offset for the next weight
		movaps		xmm5, [esi]							// xmm5 = next weight vector
		add			edx, 8
		movaps		xmm3, xmm5
		add			esi, JOINTWEIGHT_SIZE
		movaps		xmm4, xmm5

		mulps		xmm3, [edi+ebx+ 0]					// xmm3 = m0, m1, m2, t0
		mulps		xmm4, [edi+ebx+16]					// xmm4 = m3, m4, m5, t1
		mulps		xmm5, [edi+ebx+32]					// xmm5 = m6, m7, m8, t2

		cmp			dword ptr [edx-4], 0				// flags for the je below; the addps in between must leave them intact

		addps		xmm0, xmm3							// accumulate the per-row products
		addps		xmm1, xmm4
		addps		xmm2, xmm5

		je			loopWeight							// zero: more weights follow for this vertex

	doneWeight:
		add			eax, DRAWVERT_SIZE					// step to the next vertex; sets the flags read by jl at the bottom

		haddps(		_xmm0, _xmm1 )						// xmm0 = pairwise sums of row 0, then of row 1
		haddps(		_xmm2, _xmm0 )						// xmm2 = pairwise sums of row 2, then full sums of rows 0 and 1

		movhps		[ecx+eax-DRAWVERT_SIZE+0], xmm2		// store the two high lanes at byte offsets 0 and 4 of the vertex

		haddps(		_xmm2, _xmm2 )						// low lane of xmm2 = full sum of row 2

		movss		[ecx+eax-DRAWVERT_SIZE+8], xmm2		// store the low lane at byte offset 8 of the vertex

		jl			loopVert							// loop while the offset in eax is still negative
	done:
	}

#else

	// reference implementation in plain C++
	int i, j;
	const byte *jointsPtr = (byte *)joints;

	for( j = i = 0; i < numVerts; i++ ) {
		idVec3 v;

		// first weight of the vertex; index[j*2+0] is a byte offset into joints
		v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
		// index[j*2+1] == 0 means another weight follows for the same vertex
		while( index[j*2+1] == 0 ) {
			j++;
			v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
		}
		j++;

		verts[i].xyz = v;
	}

#endif
}