// *************************************************************************** void CMeshMRMGeom::applyArrayRawSkinNormal1(CRawVertexNormalSkin1 *src, uint8 *destVertexPtr, CMatrix3x4 *boneMat3x4, uint nInf) { // must write contigously in AGP, and ASM is hardcoded... nlctassert(NL3D_RAWSKIN_NORMAL_OFF==12); nlctassert(NL3D_RAWSKIN_UV_OFF==24); /*extern uint TESTYOYO_NumRawSkinVertices1; TESTYOYO_NumRawSkinVertices1+= nInf; H_AUTO( TestYoyo_RawSkin1 );*/ #ifdef NL3D_RAWSKIN_PRECACHE for(;nInf>0;) { // number of vertices to process for this block. uint nBlockInf= min(NumCacheVertexNormal1, nInf); // next block. nInf-= nBlockInf; // cache the data in L1 cache. CFastMem::precache(src, nBlockInf * sizeof(CRawVertexNormalSkin1)); #else { uint nBlockInf= nInf; #endif #ifndef NL3D_RAWSKIN_ASM // for all InfluencedVertices only. for(;nBlockInf>0;nBlockInf--, src++, destVertexPtr+=NL3D_RAWSKIN_VERTEX_SIZE) { CVector *dstVertex= (CVector*)(destVertexPtr); CVector *dstNormal= (CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF); // For 1 matrix, can write directly to AGP (if destVertexPtr is AGP...) // Vertex. boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex.Pos, *(CVector*)(destVertexPtr) ); // Normal. boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Vertex.Normal, *(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF) ); // UV copy. *(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->Vertex.UV; } #else // ASM harcoded for 36 nlctassert(sizeof(CRawVertexNormalSkin1)==36); /* 116 cycles / loop typical 58 cycles / loop in theory (no memory problem) */ __asm { mov ecx, nBlockInf mov esi, src mov edi, destVertexPtr mov edx, boneMat3x4 theLoop: // Vertex. // **** boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex.Pos, *(CVector*)(destVertexPtr) ); // eax= matrix mov eax, [esi]src.MatrixId // uop: 0/1 lea eax, [eax*2+eax] shl eax, 4 add eax, edx // uop: 1/0 // load x y z fld [esi]src.Vertex.Pos.x // uop: 0/1 fld [esi]src.Vertex.Pos.y // uop: 0/1 fld [esi]src.Vertex.Pos.z // uop: 0/1 // vout.x= (a11*vin.x + a12*vin.y + a13*vin.z + a14); fld [eax]CMatrix3x4.a11 // uop: 0/1 fmul st, st(3) // uop: 1/0 (5) fld [eax]CMatrix3x4.a12 // uop: 0/1 fmul st, st(3) // uop: 1/0 (5) faddp st(1), st // uop: 1/0 (3) fld [eax]CMatrix3x4.a13 // uop: 0/1 fmul st, st(2) // uop: 1/0 (5) faddp st(1), st // uop: 1/0 (3) fld [eax]CMatrix3x4.a14 // uop: 0/1 faddp st(1), st // uop: 1/0 (3) fstp dword ptr[edi] // uop: 0/0/1/1 // vout.y= (a21*vin.x + a22*vin.y + a23*vin.z + a24); fld [eax]CMatrix3x4.a21 fmul st, st(3) fld [eax]CMatrix3x4.a22 fmul st, st(3) faddp st(1), st fld [eax]CMatrix3x4.a23 fmul st, st(2) faddp st(1), st fld [eax]CMatrix3x4.a24 faddp st(1), st fstp dword ptr[edi+4] // vout.z= (a31*vin.x + a32*vin.y + a33*vin.z + a34); fld [eax]CMatrix3x4.a31 fmul st, st(3) fld [eax]CMatrix3x4.a32 fmul st, st(3) faddp st(1), st fld [eax]CMatrix3x4.a33 fmul st, st(2) faddp st(1), st fld [eax]CMatrix3x4.a34 faddp st(1), st fstp dword ptr[edi+8] // free x y z fstp st // uop: 1/0 fstp st // uop: 1/0 fstp st // uop: 1/0 // Normal // **** boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Vertex.Normal, *(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF) ); // load x y z fld [esi]src.Vertex.Normal.x fld [esi]src.Vertex.Normal.y fld [esi]src.Vertex.Normal.z // vout.x= (a11*vin.x + a12*vin.y + a13*vin.z + a14); fld [eax]CMatrix3x4.a11 // uop: 0/1 fmul st, st(3) // uop: 1/0 (5) fld [eax]CMatrix3x4.a12 // uop: 0/1 fmul st, st(3) // uop: 1/0 (5) faddp st(1), st // uop: 1/0 (3) fld [eax]CMatrix3x4.a13 // uop: 0/1 fmul st, st(2) // uop: 1/0 (5) faddp st(1), st // uop: 1/0 (3) fstp dword ptr[edi+12] // uop: 0/0/1/1 // vout.y= (a21*vin.x + a22*vin.y + a23*vin.z + a24); fld [eax]CMatrix3x4.a21 fmul st, st(3) fld [eax]CMatrix3x4.a22 fmul st, st(3) faddp st(1), st fld [eax]CMatrix3x4.a23 fmul st, st(2) faddp st(1), st fstp dword ptr[edi+16] // vout.z= (a31*vin.x + a32*vin.y + a33*vin.z + a34); fld [eax]CMatrix3x4.a31 fmul st, st(3) fld [eax]CMatrix3x4.a32 fmul st, st(3) faddp st(1), st fld [eax]CMatrix3x4.a33 fmul st, st(2) faddp st(1), st fstp dword ptr[edi+20] // free x y z fstp st fstp st fstp st // UV copy. // **** *(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->Vertex.UV; mov eax, [esi]src.Vertex.UV.U // uop: 0/1 mov dword ptr[edi+24], eax // uop: 0/0/1/1 mov eax, [esi]src.Vertex.UV.V // uop: 0/1 mov dword ptr[edi+28], eax // uop: 0/0/1/1 // **** next add esi, 36 // uop: 1/0 add edi, NL3D_RAWSKIN_VERTEX_SIZE // uop: 1/0 dec ecx // uop: 1/0 jnz theLoop // uop: 1/1 (p1) mov nBlockInf, ecx mov src, esi mov destVertexPtr, edi } #endif } } // *************************************************************************** void CMeshMRMGeom::applyArrayRawSkinNormal2(CRawVertexNormalSkin2 *src, uint8 *destVertexPtr, CMatrix3x4 *boneMat3x4, uint nInf) { // must write contigously in AGP, and ASM is hardcoded... nlctassert(NL3D_RAWSKIN_NORMAL_OFF==12); nlctassert(NL3D_RAWSKIN_UV_OFF==24); /*extern uint TESTYOYO_NumRawSkinVertices2; TESTYOYO_NumRawSkinVertices2+= nInf; H_AUTO( TestYoyo_RawSkin2 );*/ // Since VertexPtr may be a AGP Ram, MUST NOT read into it! (mulAdd*() do it!) CVector tmpVert; #ifdef NL3D_RAWSKIN_PRECACHE for(;nInf>0;) { // number of vertices to process for this block. uint nBlockInf= min(NumCacheVertexNormal2, nInf); // next block. nInf-= nBlockInf; // cache the data in L1 cache. CFastMem::precache(src, nBlockInf * sizeof(CRawVertexNormalSkin2)); #else { uint nBlockInf= nInf; #endif #ifndef NL3D_RAWSKIN_ASM // for all InfluencedVertices only. for(;nBlockInf>0;nBlockInf--, src++, destVertexPtr+=NL3D_RAWSKIN_VERTEX_SIZE) { // Vertex. boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex.Pos, src->Weights[0], tmpVert); boneMat3x4[ src->MatrixId[1] ].mulAddPoint( src->Vertex.Pos, src->Weights[1], tmpVert); *(CVector*)(destVertexPtr)= tmpVert; // Normal. boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Vertex.Normal, src->Weights[0], tmpVert); boneMat3x4[ src->MatrixId[1] ].mulAddVector( src->Vertex.Normal, src->Weights[1], tmpVert); *(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF)= tmpVert; // UV copy. *(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->Vertex.UV; } #else // ASM harcoded for 48 nlctassert(sizeof(CRawVertexNormalSkin2)==48); /* 154 cycles / loop typical 124 cycles / loop in theory (no memory problem) */ __asm { mov ecx, nBlockInf mov esi, src mov edi, destVertexPtr mov edx, boneMat3x4 theLoop: // Vertex. // **** boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex.Pos, *(CVector*)(destVertexPtr) ); // eax= matrix0 mov eax, [esi+0]src.MatrixId // uop: 0/1 lea eax, [eax*2+eax] shl eax, 4 add eax, edx // uop: 1/0 // ebx= matrix1 mov ebx, [esi+4]src.MatrixId // uop: 0/1 lea ebx, [ebx*2+ebx] shl ebx, 4 add ebx, edx // uop: 1/0 // load x y z fld [esi]src.Vertex.Pos.x // uop: 0/1 fld [esi]src.Vertex.Pos.y // uop: 0/1 fld [esi]src.Vertex.Pos.z // uop: 0/1 // **** vout.x= (a11*vin.x + a12*vin.y + a13*vin.z + a14); // 1st Matrix fld [eax]CMatrix3x4.a11 // uop: 0/1 fmul st, st(3) // uop: 1/0 (5) fld [eax]CMatrix3x4.a12 // uop: 0/1 fmul st, st(3) // uop: 1/0 (5) faddp st(1), st // uop: 1/0 (3) fld [eax]CMatrix3x4.a13 // uop: 0/1 fmul st, st(2) // uop: 1/0 (5) faddp st(1), st // uop: 1/0 (3) fld [eax]CMatrix3x4.a14 // uop: 0/1 faddp st(1), st // uop: 1/0 (3) // mul by scale fmul [esi+0]src.Weights // 2nd matrix fld [ebx]CMatrix3x4.a11 fmul st, st(4) fld [ebx]CMatrix3x4.a12 fmul st, st(4) faddp st(1), st fld [ebx]CMatrix3x4.a13 fmul st, st(3) faddp st(1), st fld [ebx]CMatrix3x4.a14 faddp st(1), st // mul by scale, and append fmul [esi+4]src.Weights faddp st(1), st // store fstp dword ptr[edi] // uop: 0/0/1/1 // **** vout.y= (a21*vin.x + a22*vin.y + a23*vin.z + a24); fld [eax]CMatrix3x4.a21 fmul st, st(3) fld [eax]CMatrix3x4.a22 fmul st, st(3) faddp st(1), st fld [eax]CMatrix3x4.a23 fmul st, st(2) faddp st(1), st fld [eax]CMatrix3x4.a24 faddp st(1), st // mul by scale fmul [esi+0]src.Weights // 2nd matrix fld [ebx]CMatrix3x4.a21 fmul st, st(4) fld [ebx]CMatrix3x4.a22 fmul st, st(4) faddp st(1), st fld [ebx]CMatrix3x4.a23 fmul st, st(3) faddp st(1), st fld [ebx]CMatrix3x4.a24 faddp st(1), st // mul by scale, and append fmul [esi+4]src.Weights faddp st(1), st // store fstp dword ptr[edi+4] // **** vout.z= (a31*vin.x + a32*vin.y + a33*vin.z + a34); fld [eax]CMatrix3x4.a31 fmul st, st(3) fld [eax]CMatrix3x4.a32 fmul st, st(3) faddp st(1), st fld [eax]CMatrix3x4.a33 fmul st, st(2) faddp st(1), st fld [eax]CMatrix3x4.a34 faddp st(1), st // mul by scale fmul [esi+0]src.Weights // 2nd matrix fld [ebx]CMatrix3x4.a31 fmul st, st(4) fld [ebx]CMatrix3x4.a32 fmul st, st(4) faddp st(1), st fld [ebx]CMatrix3x4.a33 fmul st, st(3) faddp st(1), st fld [ebx]CMatrix3x4.a34 faddp st(1), st // mul by scale, and append fmul [esi+4]src.Weights faddp st(1), st // store fstp dword ptr[edi+8] // free x y z fstp st // uop: 1/0 fstp st // uop: 1/0 fstp st // uop: 1/0 // Normal // **** boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Vertex.Normal, *(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF) ); // load x y z fld [esi]src.Vertex.Normal.x fld [esi]src.Vertex.Normal.y fld [esi]src.Vertex.Normal.z // **** vout.x= (a11*vin.x + a12*vin.y + a13*vin.z + a14); fld [eax]CMatrix3x4.a11 // uop: 0/1 fmul st, st(3) // uop: 1/0 (5) fld [eax]CMatrix3x4.a12 // uop: 0/1 fmul st, st(3) // uop: 1/0 (5) faddp st(1), st // uop: 1/0 (3) fld [eax]CMatrix3x4.a13 // uop: 0/1 fmul st, st(2) // uop: 1/0 (5) faddp st(1), st // uop: 1/0 (3) // mul by scale fmul [esi+0]src.Weights // 2nd matrix fld [ebx]CMatrix3x4.a11 fmul st, st(4) fld [ebx]CMatrix3x4.a12 fmul st, st(4) faddp st(1), st fld [ebx]CMatrix3x4.a13 fmul st, st(3) faddp st(1), st // mul by scale, and append fmul [esi+4]src.Weights faddp st(1), st // store fstp dword ptr[edi+12] // uop: 0/0/1/1 // **** vout.y= (a21*vin.x + a22*vin.y + a23*vin.z + a24); fld [eax]CMatrix3x4.a21 fmul st, st(3) fld [eax]CMatrix3x4.a22 fmul st, st(3) faddp st(1), st fld [eax]CMatrix3x4.a23 fmul st, st(2) faddp st(1), st // mul by scale fmul [esi+0]src.Weights // 2nd matrix fld [ebx]CMatrix3x4.a21 fmul st, st(4) fld [ebx]CMatrix3x4.a22 fmul st, st(4) faddp st(1), st fld [ebx]CMatrix3x4.a23 fmul st, st(3) faddp st(1), st // mul by scale, and append fmul [esi+4]src.Weights faddp st(1), st // store fstp dword ptr[edi+16] // **** vout.z= (a31*vin.x + a32*vin.y + a33*vin.z + a34); fld [eax]CMatrix3x4.a31 fmul st, st(3) fld [eax]CMatrix3x4.a32 fmul st, st(3) faddp st(1), st fld [eax]CMatrix3x4.a33 fmul st, st(2) faddp st(1), st // mul by scale fmul [esi+0]src.Weights // 2nd matrix fld [ebx]CMatrix3x4.a31 fmul st, st(4) fld [ebx]CMatrix3x4.a32 fmul st, st(4) faddp st(1), st fld [ebx]CMatrix3x4.a33 fmul st, st(3) faddp st(1), st // mul by scale, and append fmul [esi+4]src.Weights faddp st(1), st // store fstp dword ptr[edi+20] // free x y z fstp st fstp st fstp st // UV copy. // **** *(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->Vertex.UV; mov eax, [esi]src.Vertex.UV.U // uop: 0/1 mov dword ptr[edi+24], eax // uop: 0/0/1/1 mov eax, [esi]src.Vertex.UV.V // uop: 0/1 mov dword ptr[edi+28], eax // uop: 0/0/1/1 // **** next add esi, 48 // uop: 1/0 add edi, NL3D_RAWSKIN_VERTEX_SIZE // uop: 1/0 dec ecx // uop: 1/0 jnz theLoop // uop: 1/1 (p1) mov nBlockInf, ecx mov src, esi mov destVertexPtr, edi } #endif } } // *************************************************************************** void CMeshMRMGeom::applyArrayRawSkinNormal3(CRawVertexNormalSkin3 *src, uint8 *destVertexPtr, CMatrix3x4 *boneMat3x4, uint nInf) { // must write contigously in AGP, and ASM is hardcoded... nlctassert(NL3D_RAWSKIN_NORMAL_OFF==12); nlctassert(NL3D_RAWSKIN_UV_OFF==24); /*extern uint TESTYOYO_NumRawSkinVertices3; TESTYOYO_NumRawSkinVertices3+= nInf; H_AUTO( TestYoyo_RawSkin3 );*/ // Since VertexPtr may be a AGP Ram, MUST NOT read into it! (mulAdd*() do it!) CVector tmpVert; #ifdef NL3D_RAWSKIN_PRECACHE for(;nInf>0;) { // number of vertices to process for this block. uint nBlockInf= min(NumCacheVertexNormal3, nInf); // next block. nInf-= nBlockInf; // cache the data in L1 cache. CFastMem::precache(src, nBlockInf * sizeof(CRawVertexNormalSkin3)); #else { uint nBlockInf= nInf; #endif #ifndef NL3D_RAWSKIN_ASM // for all InfluencedVertices only. for(;nBlockInf>0;nBlockInf--, src++, destVertexPtr+=NL3D_RAWSKIN_VERTEX_SIZE) { // Vertex. boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex.Pos, src->Weights[0], tmpVert); boneMat3x4[ src->MatrixId[1] ].mulAddPoint( src->Vertex.Pos, src->Weights[1], tmpVert); boneMat3x4[ src->MatrixId[2] ].mulAddPoint( src->Vertex.Pos, src->Weights[2], tmpVert); *(CVector*)(destVertexPtr)= tmpVert; // Normal. boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Vertex.Normal, src->Weights[0], tmpVert); boneMat3x4[ src->MatrixId[1] ].mulAddVector( src->Vertex.Normal, src->Weights[1], tmpVert); boneMat3x4[ src->MatrixId[2] ].mulAddVector( src->Vertex.Normal, src->Weights[2], tmpVert); *(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF)= tmpVert; // UV copy. *(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->Vertex.UV; } #else // ASM harcoded for 56 nlctassert(sizeof(CRawVertexNormalSkin3)==56); /* 226 cycles / loop typical 192 cycles / loop in theory (no memory problem) 148 optimal */ __asm { mov ecx, nBlockInf mov esi, src mov edi, destVertexPtr theLoop: // Vertex. // **** boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex.Pos, *(CVector*)(destVertexPtr) ); // eax= matrix0 mov eax, [esi+0]src.MatrixId // uop: 0/1 lea eax, [eax*2+eax] shl eax, 4 add eax, boneMat3x4 // uop: 1/0 // ebx= matrix1 mov ebx, [esi+4]src.MatrixId // uop: 0/1 lea ebx, [ebx*2+ebx] shl ebx, 4 add ebx, boneMat3x4 // uop: 1/0 // edx= matrix2 mov edx, [esi+8]src.MatrixId // uop: 0/1 lea edx, [edx*2+edx] shl edx, 4 add edx, boneMat3x4 // uop: 1/0 // load x y z fld [esi]src.Vertex.Pos.x // uop: 0/1 fld [esi]src.Vertex.Pos.y // uop: 0/1 fld [esi]src.Vertex.Pos.z // uop: 0/1 // **** vout.x= (a11*vin.x + a12*vin.y + a13*vin.z + a14); // 1st Matrix fld [eax]CMatrix3x4.a11 // uop: 0/1 fmul st, st(3) // uop: 1/0 (5) fld [eax]CMatrix3x4.a12 // uop: 0/1 fmul st, st(3) // uop: 1/0 (5) faddp st(1), st // uop: 1/0 (3) fld [eax]CMatrix3x4.a13 // uop: 0/1 fmul st, st(2) // uop: 1/0 (5) faddp st(1), st // uop: 1/0 (3) fld [eax]CMatrix3x4.a14 // uop: 0/1 faddp st(1), st // uop: 1/0 (3) // mul by scale fmul [esi+0]src.Weights // 2nd matrix fld [ebx]CMatrix3x4.a11 fmul st, st(4) fld [ebx]CMatrix3x4.a12 fmul st, st(4) faddp st(1), st fld [ebx]CMatrix3x4.a13 fmul st, st(3) faddp st(1), st fld [ebx]CMatrix3x4.a14 faddp st(1), st // mul by scale, and append fmul [esi+4]src.Weights faddp st(1), st // 3rd matrix fld [edx]CMatrix3x4.a11 fmul st, st(4) fld [edx]CMatrix3x4.a12 fmul st, st(4) faddp st(1), st fld [edx]CMatrix3x4.a13 fmul st, st(3) faddp st(1), st fld [edx]CMatrix3x4.a14 faddp st(1), st // mul by scale, and append fmul [esi+8]src.Weights faddp st(1), st // store fstp dword ptr[edi] // uop: 0/0/1/1 // **** vout.y= (a21*vin.x + a22*vin.y + a23*vin.z + a24); fld [eax]CMatrix3x4.a21 fmul st, st(3) fld [eax]CMatrix3x4.a22 fmul st, st(3) faddp st(1), st fld [eax]CMatrix3x4.a23 fmul st, st(2) faddp st(1), st fld [eax]CMatrix3x4.a24 faddp st(1), st // mul by scale fmul [esi+0]src.Weights // 2nd matrix fld [ebx]CMatrix3x4.a21 fmul st, st(4) fld [ebx]CMatrix3x4.a22 fmul st, st(4) faddp st(1), st fld [ebx]CMatrix3x4.a23 fmul st, st(3) faddp st(1), st fld [ebx]CMatrix3x4.a24 faddp st(1), st // mul by scale, and append fmul [esi+4]src.Weights faddp st(1), st // 3rd matrix fld [edx]CMatrix3x4.a21 fmul st, st(4) fld [edx]CMatrix3x4.a22 fmul st, st(4) faddp st(1), st fld [edx]CMatrix3x4.a23 fmul st, st(3) faddp st(1), st fld [edx]CMatrix3x4.a24 faddp st(1), st // mul by scale, and append fmul [esi+8]src.Weights faddp st(1), st // store fstp dword ptr[edi+4] // **** vout.z= (a31*vin.x + a32*vin.y + a33*vin.z + a34); fld [eax]CMatrix3x4.a31 fmul st, st(3) fld [eax]CMatrix3x4.a32 fmul st, st(3) faddp st(1), st fld [eax]CMatrix3x4.a33 fmul st, st(2) faddp st(1), st fld [eax]CMatrix3x4.a34 faddp st(1), st // mul by scale fmul [esi+0]src.Weights // 2nd matrix fld [ebx]CMatrix3x4.a31 fmul st, st(4) fld [ebx]CMatrix3x4.a32 fmul st, st(4) faddp st(1), st fld [ebx]CMatrix3x4.a33 fmul st, st(3) faddp st(1), st fld [ebx]CMatrix3x4.a34 faddp st(1), st // mul by scale, and append fmul [esi+4]src.Weights faddp st(1), st // 3rd matrix fld [edx]CMatrix3x4.a31 fmul st, st(4) fld [edx]CMatrix3x4.a32 fmul st, st(4) faddp st(1), st fld [edx]CMatrix3x4.a33 fmul st, st(3) faddp st(1), st fld [edx]CMatrix3x4.a34 faddp st(1), st // mul by scale, and append fmul [esi+8]src.Weights faddp st(1), st // store fstp dword ptr[edi+8] // free x y z fstp st // uop: 1/0 fstp st // uop: 1/0 fstp st // uop: 1/0 // Normal // **** boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Vertex.Normal, *(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF) ); // load x y z fld [esi]src.Vertex.Normal.x fld [esi]src.Vertex.Normal.y fld [esi]src.Vertex.Normal.z // **** vout.x= (a11*vin.x + a12*vin.y + a13*vin.z + a14); fld [eax]CMatrix3x4.a11 // uop: 0/1 fmul st, st(3) // uop: 1/0 (5) fld [eax]CMatrix3x4.a12 // uop: 0/1 fmul st, st(3) // uop: 1/0 (5) faddp st(1), st // uop: 1/0 (3) fld [eax]CMatrix3x4.a13 // uop: 0/1 fmul st, st(2) // uop: 1/0 (5) faddp st(1), st // uop: 1/0 (3) // mul by scale fmul [esi+0]src.Weights // 2nd matrix fld [ebx]CMatrix3x4.a11 fmul st, st(4) fld [ebx]CMatrix3x4.a12 fmul st, st(4) faddp st(1), st fld [ebx]CMatrix3x4.a13 fmul st, st(3) faddp st(1), st // mul by scale, and append fmul [esi+4]src.Weights faddp st(1), st // 3rd matrix fld [edx]CMatrix3x4.a11 fmul st, st(4) fld [edx]CMatrix3x4.a12 fmul st, st(4) faddp st(1), st fld [edx]CMatrix3x4.a13 fmul st, st(3) faddp st(1), st // mul by scale, and append fmul [esi+8]src.Weights faddp st(1), st // store fstp dword ptr[edi+12] // uop: 0/0/1/1 // **** vout.y= (a21*vin.x + a22*vin.y + a23*vin.z + a24); fld [eax]CMatrix3x4.a21 fmul st, st(3) fld [eax]CMatrix3x4.a22 fmul st, st(3) faddp st(1), st fld [eax]CMatrix3x4.a23 fmul st, st(2) faddp st(1), st // mul by scale fmul [esi+0]src.Weights // 2nd matrix fld [ebx]CMatrix3x4.a21 fmul st, st(4) fld [ebx]CMatrix3x4.a22 fmul st, st(4) faddp st(1), st fld [ebx]CMatrix3x4.a23 fmul st, st(3) faddp st(1), st // mul by scale, and append fmul [esi+4]src.Weights faddp st(1), st // 3rd matrix fld [edx]CMatrix3x4.a21 fmul st, st(4) fld [edx]CMatrix3x4.a22 fmul st, st(4) faddp st(1), st fld [edx]CMatrix3x4.a23 fmul st, st(3) faddp st(1), st // mul by scale, and append fmul [esi+8]src.Weights faddp st(1), st // store fstp dword ptr[edi+16] // **** vout.z= (a31*vin.x + a32*vin.y + a33*vin.z + a34); fld [eax]CMatrix3x4.a31 fmul st, st(3) fld [eax]CMatrix3x4.a32 fmul st, st(3) faddp st(1), st fld [eax]CMatrix3x4.a33 fmul st, st(2) faddp st(1), st // mul by scale fmul [esi+0]src.Weights // 2nd matrix fld [ebx]CMatrix3x4.a31 fmul st, st(4) fld [ebx]CMatrix3x4.a32 fmul st, st(4) faddp st(1), st fld [ebx]CMatrix3x4.a33 fmul st, st(3) faddp st(1), st // mul by scale, and append fmul [esi+4]src.Weights faddp st(1), st // 3rd matrix fld [edx]CMatrix3x4.a31 fmul st, st(4) fld [edx]CMatrix3x4.a32 fmul st, st(4) faddp st(1), st fld [edx]CMatrix3x4.a33 fmul st, st(3) faddp st(1), st // mul by scale, and append fmul [esi+8]src.Weights faddp st(1), st // store fstp dword ptr[edi+20] // free x y z fstp st fstp st fstp st // UV copy. // **** *(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->Vertex.UV; mov eax, [esi]src.Vertex.UV.U // uop: 0/1 mov dword ptr[edi+24], eax // uop: 0/0/1/1 mov eax, [esi]src.Vertex.UV.V // uop: 0/1 mov dword ptr[edi+28], eax // uop: 0/0/1/1 // **** next add esi, 56 // uop: 1/0 add edi, NL3D_RAWSKIN_VERTEX_SIZE // uop: 1/0 dec ecx // uop: 1/0 jnz theLoop // uop: 1/1 (p1) mov nBlockInf, ecx mov src, esi mov destVertexPtr, edi } #endif } } // *************************************************************************** void CMeshMRMGeom::applyArrayRawSkinNormal4(CRawVertexNormalSkin4 *src, uint8 *destVertexPtr, CMatrix3x4 *boneMat3x4, uint nInf) { // must write contigously in AGP, and ASM is hardcoded... nlctassert(NL3D_RAWSKIN_NORMAL_OFF==12); nlctassert(NL3D_RAWSKIN_UV_OFF==24); /*extern uint TESTYOYO_NumRawSkinVertices4; TESTYOYO_NumRawSkinVertices4+= nInf; H_AUTO( TestYoyo_RawSkin4 );*/ // Since VertexPtr may be a AGP Ram, MUST NOT read into it! (mulAdd*() do it!) CVector tmpVert; #ifdef NL3D_RAWSKIN_PRECACHE for(;nInf>0;) { // number of vertices to process for this block. uint nBlockInf= min(NumCacheVertexNormal4, nInf); // next block. nInf-= nBlockInf; // cache the data in L1 cache. CFastMem::precache(src, nBlockInf * sizeof(CRawVertexNormalSkin4)); #else { uint nBlockInf= nInf; #endif // for all InfluencedVertices only. for(;nBlockInf>0;nBlockInf--, src++, destVertexPtr+=NL3D_RAWSKIN_VERTEX_SIZE) { // Vertex. boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex.Pos, src->Weights[0], tmpVert); boneMat3x4[ src->MatrixId[1] ].mulAddPoint( src->Vertex.Pos, src->Weights[1], tmpVert); boneMat3x4[ src->MatrixId[2] ].mulAddPoint( src->Vertex.Pos, src->Weights[2], tmpVert); boneMat3x4[ src->MatrixId[3] ].mulAddPoint( src->Vertex.Pos, src->Weights[3], tmpVert); *(CVector*)(destVertexPtr)= tmpVert; // Normal. boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Vertex.Normal, src->Weights[0], tmpVert); boneMat3x4[ src->MatrixId[1] ].mulAddVector( src->Vertex.Normal, src->Weights[1], tmpVert); boneMat3x4[ src->MatrixId[2] ].mulAddVector( src->Vertex.Normal, src->Weights[2], tmpVert); boneMat3x4[ src->MatrixId[3] ].mulAddVector( src->Vertex.Normal, src->Weights[3], tmpVert); *(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF)= tmpVert; // UV copy. *(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->Vertex.UV; } // NB: ASM not done for 4 vertices, cause very rare and negligeable ... } } // *************************************************************************** void CMeshMRMGeom::applyRawSkinWithNormal(CLod &lod, CRawSkinNormalCache &rawSkinLod, const CSkeletonModel *skeleton, uint8 *vbHard, float alphaLod) { nlassert(_Skinned); if(_SkinWeights.size()==0) return; // Some assert //=========================== // must have XYZ, Normal and UV only nlassert( _VBufferFinal.getVertexFormat() == (CVertexBuffer::PositionFlag | CVertexBuffer::NormalFlag | CVertexBuffer::TexCoord0Flag) ); nlassert( _VBufferFinal.getValueType(CVertexBuffer::TexCoord0) == CVertexBuffer::Float2 ); nlassert( _VBufferFinal.getVertexSize() ==NL3D_RAWSKIN_VERTEX_SIZE); // HardCoded for normalOff==12 (see applyArrayRawSkinNormal*) nlassert( _VBufferFinal.getNormalOff()==NL3D_RAWSKIN_NORMAL_OFF ); nlassert( _VBufferFinal.getTexCoordOff()==NL3D_RAWSKIN_UV_OFF ); // assert, code below is written especially for 4 per vertex. nlassert( NL3D_MESH_SKINNING_MAX_MATRIX==4 ); // Compute useful Matrix for this lod. //=========================== // Those arrays map the array of bones in skeleton. static vector<CMatrix3x4> boneMat3x4; computeBoneMatrixes3x4(boneMat3x4, lod.MatrixInfluences, skeleton); // TestYoyo /*extern uint TESTYOYO_NumRawSkinVertices; TESTYOYO_NumRawSkinVertices+= rawSkinLod.Vertices1.size(); TESTYOYO_NumRawSkinVertices+= rawSkinLod.Vertices2.size(); TESTYOYO_NumRawSkinVertices+= rawSkinLod.Vertices3.size(); TESTYOYO_NumRawSkinVertices+= rawSkinLod.Vertices4.size();*/ uint nInf; // Manage "SoftVertices" if(rawSkinLod.TotalSoftVertices) { // apply skinning into Temp RAM for vertices that are Src of Geomorph //=========================== static vector<uint8> tempSkin; uint tempVbSize= rawSkinLod.TotalSoftVertices*NL3D_RAWSKIN_VERTEX_SIZE; if(tempSkin.size() < tempVbSize) tempSkin.resize(tempVbSize); uint8 *destVertexPtr= &tempSkin[0]; // 1 Matrix nInf= rawSkinLod.SoftVertices[0]; if(nInf>0) { applyArrayRawSkinNormal1(&rawSkinLod.Vertices1[0], destVertexPtr, &boneMat3x4[0], nInf); destVertexPtr+= nInf * NL3D_RAWSKIN_VERTEX_SIZE; } // 2 Matrix nInf= rawSkinLod.SoftVertices[1]; if(nInf>0) { applyArrayRawSkinNormal2(&rawSkinLod.Vertices2[0], destVertexPtr, &boneMat3x4[0], nInf); destVertexPtr+= nInf * NL3D_RAWSKIN_VERTEX_SIZE; } // 3 Matrix nInf= rawSkinLod.SoftVertices[2]; if(nInf>0) { applyArrayRawSkinNormal3(&rawSkinLod.Vertices3[0], destVertexPtr, &boneMat3x4[0], nInf); destVertexPtr+= nInf * NL3D_RAWSKIN_VERTEX_SIZE; } // 4 Matrix nInf= rawSkinLod.SoftVertices[3]; if(nInf>0) { applyArrayRawSkinNormal4(&rawSkinLod.Vertices4[0], destVertexPtr, &boneMat3x4[0], nInf); destVertexPtr+= nInf * NL3D_RAWSKIN_VERTEX_SIZE; } // Fast Copy this into AGP Ram. NB: done before Geomorphs, because ensure some precaching this way!! //=========================== // Skin geomorphs. uint8 *vbHardStart= vbHard + rawSkinLod.Geomorphs.size()*NL3D_RAWSKIN_VERTEX_SIZE; // fast copy CFastMem::memcpy(vbHardStart, &tempSkin[0], tempVbSize); // Geomorphs directly into AGP Ram //=========================== clamp(alphaLod, 0.f, 1.f); float a= alphaLod; float a1= 1 - alphaLod; // Fast Geomorph applyGeomorphPosNormalUV0(rawSkinLod.Geomorphs, &tempSkin[0], vbHard, NL3D_RAWSKIN_VERTEX_SIZE, a, a1); } // Manage HardVertices if(rawSkinLod.TotalHardVertices) { // apply skinning directly into AGP RAM for vertices that are not Src of Geomorph //=========================== uint startId; // Skip Geomorphs and SoftVertices. uint8 *destVertexPtr= vbHard + (rawSkinLod.Geomorphs.size()+rawSkinLod.TotalSoftVertices)*NL3D_RAWSKIN_VERTEX_SIZE; // 1 Matrix nInf= rawSkinLod.HardVertices[0]; startId= rawSkinLod.SoftVertices[0]; if(nInf>0) { applyArrayRawSkinNormal1(&rawSkinLod.Vertices1[startId], destVertexPtr, &boneMat3x4[0], nInf); destVertexPtr+= nInf * NL3D_RAWSKIN_VERTEX_SIZE; } // 2 Matrix nInf= rawSkinLod.HardVertices[1]; startId= rawSkinLod.SoftVertices[1]; if(nInf>0) { applyArrayRawSkinNormal2(&rawSkinLod.Vertices2[startId], destVertexPtr, &boneMat3x4[0], nInf); destVertexPtr+= nInf * NL3D_RAWSKIN_VERTEX_SIZE; } // 3 Matrix nInf= rawSkinLod.HardVertices[2]; startId= rawSkinLod.SoftVertices[2]; if(nInf>0) { applyArrayRawSkinNormal3(&rawSkinLod.Vertices3[startId], destVertexPtr, &boneMat3x4[0], nInf); destVertexPtr+= nInf * NL3D_RAWSKIN_VERTEX_SIZE; } // 4 Matrix nInf= rawSkinLod.HardVertices[3]; startId= rawSkinLod.SoftVertices[3]; if(nInf>0) { applyArrayRawSkinNormal4(&rawSkinLod.Vertices4[startId], destVertexPtr, &boneMat3x4[0], nInf); destVertexPtr+= nInf * NL3D_RAWSKIN_VERTEX_SIZE; } } }
// *************************************************************************** void CMeshMRMGeom::applySkinWithNormal(CLod &lod, const CSkeletonModel *skeleton) { nlassert(_Skinned); if(_SkinWeights.size()==0) return; // get vertexPtr / normalOff. //=========================== CVertexBufferReadWrite vba; _VBufferFinal.lock (vba); uint8 *destVertexPtr= (uint8*)vba.getVertexCoordPointer(); uint flags= _VBufferFinal.getVertexFormat(); sint32 vertexSize= _VBufferFinal.getVertexSize(); // must have XYZ and Normal. nlassert((flags & CVertexBuffer::PositionFlag) && (flags & CVertexBuffer::NormalFlag) ); // Compute offset of each component of the VB. sint32 normalOff; normalOff= _VBufferFinal.getNormalOff(); // compute src array. CMesh::CSkinWeight *srcSkinPtr; CVector *srcVertexPtr; CVector *srcNormalPtr= NULL; srcSkinPtr= &_SkinWeights[0]; srcVertexPtr= &_OriginalSkinVertices[0]; srcNormalPtr= &(_OriginalSkinNormals[0]); // Compute useful Matrix for this lod. //=========================== // Those arrays map the array of bones in skeleton. static vector<CMatrix3x4> boneMat3x4; computeBoneMatrixes3x4(boneMat3x4, lod.MatrixInfluences, skeleton); // apply skinning. //=========================== // assert, code below is written especially for 4 per vertex. nlassert(NL3D_MESH_SKINNING_MAX_MATRIX==4); for(uint i=0;i<NL3D_MESH_SKINNING_MAX_MATRIX;i++) { uint nInf= (uint)lod.InfluencedVertices[i].size(); if( nInf==0 ) continue; uint32 *infPtr= &(lod.InfluencedVertices[i][0]); // TestYoyo /*extern uint TESTYOYO_NumStdSkinVertices; TESTYOYO_NumStdSkinVertices+= nInf;*/ // apply the skin to the vertices applyArraySkinNormalT(i, infPtr, srcSkinPtr, srcVertexPtr, srcNormalPtr, normalOff, destVertexPtr, boneMat3x4, vertexSize, nInf); } }
// *************************************************************************** void CMeshMRMGeom::applySkinWithTangentSpace(CLod &lod, const CSkeletonModel *skeleton, uint tangentSpaceTexCoord) { nlassert(_Skinned); if(_SkinWeights.size()==0) return; // get vertexPtr / normalOff / tangent space offset. //=========================== CVertexBufferReadWrite vba; _VBufferFinal.lock (vba); uint8 *destVertexPtr= (uint8*)vba.getVertexCoordPointer(); uint flags= _VBufferFinal.getVertexFormat(); sint32 vertexSize= _VBufferFinal.getVertexSize(); // must have XYZ. // if there's tangent space, there also must be a normal there. nlassert((flags & CVertexBuffer::PositionFlag) && (flags & CVertexBuffer::NormalFlag) ); // Compute offset of each component of the VB. sint32 normalOff; normalOff= _VBufferFinal.getNormalOff(); // tg space offset sint32 tgSpaceOff = _VBufferFinal.getTexCoordOff((uint8) tangentSpaceTexCoord); // compute src array. CMesh::CSkinWeight *srcSkinPtr; CVector *srcVertexPtr; CVector *srcNormalPtr; CVector *tgSpacePtr; // srcSkinPtr= &_SkinWeights[0]; srcVertexPtr= &_OriginalSkinVertices[0]; srcNormalPtr= &(_OriginalSkinNormals[0]); tgSpacePtr = &(_OriginalTGSpace[0]); // Compute useful Matrix for this lod. //=========================== // Those arrays map the array of bones in skeleton. static vector<CMatrix3x4> boneMat3x4; computeBoneMatrixes3x4(boneMat3x4, lod.MatrixInfluences, skeleton); // apply skinning (with tangent space added) //=========================== // assert, code below is written especially for 4 per vertex. nlassert(NL3D_MESH_SKINNING_MAX_MATRIX==4); for(uint i=0;i<NL3D_MESH_SKINNING_MAX_MATRIX;i++) { uint nInf= (uint)lod.InfluencedVertices[i].size(); if( nInf==0 ) continue; uint32 *infPtr= &(lod.InfluencedVertices[i][0]); // apply the skin to the vertices applyArraySkinTangentSpaceT(i, infPtr, srcSkinPtr, srcVertexPtr, srcNormalPtr, tgSpacePtr, normalOff, tgSpaceOff, destVertexPtr, boneMat3x4, vertexSize, nInf); } }
// *************************************************************************** void CMeshMRMGeom::applySkin(CLod &lod, const CSkeletonModel *skeleton) { nlassert(_Skinned); if(_SkinWeights.size()==0) return; // get vertexPtr. //=========================== CVertexBufferReadWrite vba; _VBufferFinal.lock (vba); uint8 *destVertexPtr= (uint8*)vba.getVertexCoordPointer(); uint flags= _VBufferFinal.getVertexFormat(); sint32 vertexSize= _VBufferFinal.getVertexSize(); // must have XYZ. nlassert(flags & CVertexBuffer::PositionFlag); // compute src array. CMesh::CSkinWeight *srcSkinPtr; CVector *srcVertexPtr; srcSkinPtr= &_SkinWeights[0]; srcVertexPtr= &_OriginalSkinVertices[0]; // Compute useful Matrix for this lod. //=========================== // Those arrays map the array of bones in skeleton. static vector<CMatrix3x4> boneMat3x4; computeBoneMatrixes3x4(boneMat3x4, lod.MatrixInfluences, skeleton); // apply skinning. //=========================== // assert, code below is written especially for 4 per vertex. nlassert(NL3D_MESH_SKINNING_MAX_MATRIX==4); for(uint i=0;i<NL3D_MESH_SKINNING_MAX_MATRIX;i++) { uint nInf= (uint)lod.InfluencedVertices[i].size(); if( nInf==0 ) continue; uint32 *infPtr= &(lod.InfluencedVertices[i][0]); // apply the skin to the vertices switch(i) { //========= case 0: // Special case for Vertices influenced by one matrix. Just copy result of mul. // for all InfluencedVertices only. for(;nInf>0;nInf--, infPtr++) { uint index= *infPtr; CMesh::CSkinWeight *srcSkin= srcSkinPtr + index; CVector *srcVertex= srcVertexPtr + index; uint8 *dstVertexVB= destVertexPtr + index * vertexSize; CVector *dstVertex= (CVector*)(dstVertexVB); // Vertex. boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, *dstVertex); } break; //========= case 1: // for all InfluencedVertices only. for(;nInf>0;nInf--, infPtr++) { uint index= *infPtr; CMesh::CSkinWeight *srcSkin= srcSkinPtr + index; CVector *srcVertex= srcVertexPtr + index; uint8 *dstVertexVB= destVertexPtr + index * vertexSize; CVector *dstVertex= (CVector*)(dstVertexVB); // Vertex. boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], *dstVertex); boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], *dstVertex); } break; //========= case 2: // for all InfluencedVertices only. for(;nInf>0;nInf--, infPtr++) { uint index= *infPtr; CMesh::CSkinWeight *srcSkin= srcSkinPtr + index; CVector *srcVertex= srcVertexPtr + index; uint8 *dstVertexVB= destVertexPtr + index * vertexSize; CVector *dstVertex= (CVector*)(dstVertexVB); // Vertex. boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], *dstVertex); boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], *dstVertex); boneMat3x4[ srcSkin->MatrixId[2] ].mulAddPoint( *srcVertex, srcSkin->Weights[2], *dstVertex); } break; //========= case 3: // for all InfluencedVertices only. for(;nInf>0;nInf--, infPtr++) { uint index= *infPtr; CMesh::CSkinWeight *srcSkin= srcSkinPtr + index; CVector *srcVertex= srcVertexPtr + index; uint8 *dstVertexVB= destVertexPtr + index * vertexSize; CVector *dstVertex= (CVector*)(dstVertexVB); // Vertex. boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], *dstVertex); boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], *dstVertex); boneMat3x4[ srcSkin->MatrixId[2] ].mulAddPoint( *srcVertex, srcSkin->Weights[2], *dstVertex); boneMat3x4[ srcSkin->MatrixId[3] ].mulAddPoint( *srcVertex, srcSkin->Weights[3], *dstVertex); } break; } } }