void EmitColorIndices_Intrinsics( const byte *colorBlock, const byte *minColor, const byte *maxColor, byte *&outData ) { ALIGN16( byte color0[16] ); ALIGN16( byte color1[16] ); ALIGN16( byte color2[16] ); ALIGN16( byte color3[16] ); ALIGN16( byte result[16] ); // mov esi, maxColor // mov edi, minColor __m128i t0, t1, t2, t3, t4, t5, t6, t7; t7 = _mm_setzero_si128(); //t7 = _mm_xor_si128(t7, t7); _mm_store_si128 ( (__m128i*) &result, t7 ); //t0 = _mm_load_si128 ( (__m128i*) maxColor ); t0 = _mm_cvtsi32_si128( *(int*)maxColor); // Bitwise AND __m128i tt = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_colorMask ); t0 = _mm_and_si128(t0, tt); t0 = _mm_unpacklo_epi8(t0, t7); t4 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 0, 3, 2, 3 )); t5 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 3, 1, 3, 3 )); t4 = _mm_srli_epi16(t4, 5); t5 = _mm_srli_epi16(t5, 6); // Bitwise Logical OR t0 = _mm_or_si128(t0, t4); t0 = _mm_or_si128(t0, t5); // t0 contains color0 in 565 //t1 = _mm_load_si128 ( (__m128i*) minColor ); t1 = _mm_cvtsi32_si128( *(int*)minColor); t1 = _mm_and_si128(t1, tt); t1 = _mm_unpacklo_epi8(t1, t7); t4 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 0, 3, 2, 3 )); t5 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 3, 1, 3, 3 )); t4 = _mm_srli_epi16(t4, 5); t5 = _mm_srli_epi16(t5, 6); t1 = _mm_or_si128(t1, t4); t1 = _mm_or_si128(t1, t5); // t1 contains color1 in 565 t2 = t0; t2 = _mm_packus_epi16(t2, t7); t2 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 0, 1, 0, 1 )); _mm_store_si128 ( (__m128i*) &color0, t2 ); t6 = t0; t6 = _mm_add_epi16(t6, t0); t6 = _mm_add_epi16(t6, t1); // Multiply Packed Signed Integers and Store High Result __m128i tw3 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_div_by_3 ); t6 = _mm_mulhi_epi16(t6, tw3); t6 = _mm_packus_epi16(t6, t7); t6 = _mm_shuffle_epi32( t6, R_SHUFFLE_D( 0, 1, 0, 1 )); _mm_store_si128 ( (__m128i*) &color2, t6 ); t3 = t1; t3 = _mm_packus_epi16(t3, t7); t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 1, 0, 1 )); _mm_store_si128 ( (__m128i*) &color1, t3 ); t1 = _mm_add_epi16(t1, t1); t0 = _mm_add_epi16(t0, t1); t0 = _mm_mulhi_epi16(t0, tw3); t0 = _mm_packus_epi16(t0, t7); t0 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 0, 1, 0, 1 )); _mm_store_si128 ( (__m128i*) &color3, t0 ); __m128i w0 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_0); __m128i w1 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_1); __m128i w2 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_2); // mov eax, 32 // mov esi, colorBlock int x = 32; //const byte *c = colorBlock; while (x >= 0) { t3 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+0)); t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 2, 1, 3 )); t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+8)); t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 )); t0 = t3; t6 = t5; // Compute Sum of Absolute Difference __m128i c0 = _mm_load_si128 ( (__m128i*) color0 ); t0 = _mm_sad_epu8(t0, c0); t6 = _mm_sad_epu8(t6, c0); // Pack with Signed Saturation t0 = _mm_packs_epi32 (t0, t6); t1 = t3; t6 = t5; __m128i c1 = _mm_load_si128 ( (__m128i*) color1 ); t1 = _mm_sad_epu8(t1, c1); t6 = _mm_sad_epu8(t6, c1); t1 = _mm_packs_epi32 (t1, t6); t2 = t3; t6 = t5; __m128i c2 = _mm_load_si128 ( (__m128i*) color2 ); t2 = _mm_sad_epu8(t2, c2); t6 = _mm_sad_epu8(t6, c2); t2 = _mm_packs_epi32 (t2, t6); __m128i c3 = _mm_load_si128 ( (__m128i*) color3 ); t3 = _mm_sad_epu8(t3, c3); t5 = _mm_sad_epu8(t5, c3); t3 = _mm_packs_epi32 (t3, t5); t4 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+16)); t4 = _mm_shuffle_epi32( t4, R_SHUFFLE_D( 0, 2, 1, 3 )); t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+24)); t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 )); t6 = t4; t7 = t5; t6 = _mm_sad_epu8(t6, c0); t7 = _mm_sad_epu8(t7, c0); t6 = _mm_packs_epi32 (t6, t7); t0 = _mm_packs_epi32 (t0, t6); // d0 t6 = t4; t7 = t5; t6 = _mm_sad_epu8(t6, c1); t7 = _mm_sad_epu8(t7, c1); t6 = _mm_packs_epi32 (t6, t7); t1 = _mm_packs_epi32 (t1, t6); // d1 t6 = t4; t7 = t5; t6 = _mm_sad_epu8(t6, c2); t7 = _mm_sad_epu8(t7, c2); t6 = _mm_packs_epi32 (t6, t7); t2 = _mm_packs_epi32 (t2, t6); // d2 t4 = _mm_sad_epu8(t4, c3); t5 = _mm_sad_epu8(t5, c3); t4 = _mm_packs_epi32 (t4, t5); t3 = _mm_packs_epi32 (t3, t4); // d3 t7 = _mm_load_si128 ( (__m128i*) result ); t7 = _mm_slli_epi32( t7, 16); t4 = t0; t5 = t1; // Compare Packed Signed Integers for Greater Than t0 = _mm_cmpgt_epi16(t0, t3); // b0 t1 = _mm_cmpgt_epi16(t1, t2); // b1 t4 = _mm_cmpgt_epi16(t4, t2); // b2 t5 = _mm_cmpgt_epi16(t5, t3); // b3 t2 = _mm_cmpgt_epi16(t2, t3); // b4 t4 = _mm_and_si128(t4, t1); // x0 t5 = _mm_and_si128(t5, t0); // x1 t2 = _mm_and_si128(t2, t0); // x2 t4 = _mm_or_si128(t4, t5); t2 = _mm_and_si128(t2, w1); t4 = _mm_and_si128(t4, w2); t2 = _mm_or_si128(t2, t4); t5 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 2, 3, 0, 1 )); // Unpack Low Data t2 = _mm_unpacklo_epi16 ( t2, w0); t5 = _mm_unpacklo_epi16 ( t5, w0); //t5 = _mm_slli_si128 ( t5, 8); t5 = _mm_slli_epi32( t5, 8); t7 = _mm_or_si128(t7, t5); t7 = _mm_or_si128(t7, t2); _mm_store_si128 ( (__m128i*) &result, t7 ); x -=32; } t4 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 1, 2, 3, 0 )); t5 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 2, 3, 0, 1 )); t6 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 3, 0, 1, 2 )); t4 = _mm_slli_epi32 ( t4, 2); t5 = _mm_slli_epi32 ( t5, 4); t6 = _mm_slli_epi32 ( t6, 6); t7 = _mm_or_si128(t7, t4); t7 = _mm_or_si128(t7, t5); t7 = _mm_or_si128(t7, t6); //_mm_store_si128 ( (__m128i*) outData, t7 ); int r = _mm_cvtsi128_si32 (t7); memcpy(outData, &r, 4); // Anything better ? outData += 4; }
void GetMinMaxColors_Intrinsics( const byte *colorBlock, byte *minColor, byte *maxColor ) { __m128i t0, t1, t3, t4, t6, t7; // get bounding box // ---------------- // load the first row t0 = _mm_load_si128 ( (__m128i*) colorBlock ); t1 = _mm_load_si128 ( (__m128i*) colorBlock ); __m128i t16 = _mm_load_si128 ( (__m128i*) (colorBlock+16) ); // Minimum of Packed Unsigned Byte Integers t0 = _mm_min_epu8 ( t0, t16); // Maximum of Packed Unsigned Byte Integers t1 = _mm_max_epu8 ( t1, t16); __m128i t32 = _mm_load_si128 ( (__m128i*) (colorBlock+32) ); t0 = _mm_min_epu8 ( t0, t32); t1 = _mm_max_epu8 ( t1, t32); __m128i t48 = _mm_load_si128 ( (__m128i*) (colorBlock+48) ); t0 = _mm_min_epu8 ( t0, t48); t1 = _mm_max_epu8 ( t1, t48); // Shuffle Packed Doublewords t3 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) ); t4 = _mm_shuffle_epi32( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) ); t0 = _mm_min_epu8 ( t0, t3); t1 = _mm_max_epu8 ( t1, t4); // Shuffle Packed Low Words t6 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) ); t7 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) ); t0 = _mm_min_epu8 ( t0, t6); t1 = _mm_max_epu8 ( t1, t7); // inset the bounding box // ---------------------- // Unpack Low Data //__m128i t66 = _mm_set1_epi8( 0 ); __m128i t66 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_0 ); t0 = _mm_unpacklo_epi8(t0, t66); t1 = _mm_unpacklo_epi8(t1, t66); // copy (movdqa) //__m128i t2 = _mm_load_si128 ( &t1 ); __m128i t2 = t1; // Subtract Packed Integers t2 = _mm_sub_epi16(t2, t0); // Shift Packed Data Right Logical t2 = _mm_srli_epi16(t2, INSET_SHIFT); // Add Packed Integers t0 = _mm_add_epi16(t0, t2); t1 = _mm_sub_epi16(t1, t2); // Pack with Unsigned Saturation t0 = _mm_packus_epi16(t0, t0); t1 = _mm_packus_epi16(t1, t1); // store bounding box extents // -------------------------- _mm_store_si128 ( (__m128i*) minColor, t0 ); _mm_store_si128 ( (__m128i*) maxColor, t1 ); }
void Matrix4x4FromQuaternionAndTransform(Matrix4x4SSE &matrix, Vector4SSE &q, Vector4SSE &t) { Vector4SSE pq = q; Matrix4x4SSE *pm = &matrix; __asm { mov edi, pm // edi = &matrix movaps xmm0, [pq] // xmm0 = q.x, q.y, q.z, q.w //movaps xmm6, [g_VectorSSE_One] // xmm6 = t.x, t.y, t.z, t.w //movaps xmm6, [g_VectorSSE_Zero] // xmm6 = 0, 0, 0, 0 movaps xmm1, xmm0 // xmm1 = x, y, z, w addps xmm1, xmm1 // xmm1 = x2, y2, z2, w2 #ifdef _ENABLE_SSE2_ // SSE2 version pshufd xmm2, xmm0, R_SHUFFLE_D( 1, 0, 0, 1 ) // xmm2 = y, x, x, y pshufd xmm3, xmm1, R_SHUFFLE_D( 1, 1, 2, 2 ) // xmm3 = y2, y2, z2, z2 #else // SSE Version movaps xmm2, xmm0 shufps xmm2, xmm0, R_SHUFFLE_PS( 1, 0, 0, 1 ) // xmm2 = y, x, x, y movaps xmm3, xmm1 shufps xmm3, xmm1, R_SHUFFLE_PS( 1, 1, 2, 2 ) // xmm3 = y2, y2, z2, z2 #endif // _ENABLE_SSE2_ mulps xmm2, xmm3 // xmm2 = yy2, xy2, xz2, yz2 #ifdef _ENABLE_SSE2_ // SSE2 version pshufd xmm4, xmm0, R_SHUFFLE_D( 2, 3, 3, 3 ) // xmm4 = z, w, w, w pshufd xmm5, xmm1, R_SHUFFLE_D( 2, 2, 1, 0 ) // xmm5 = z2, z2, y2, x2 #else // SSE version movaps xmm4, xmm0 shufps xmm4, xmm0, R_SHUFFLE_PS( 2, 3, 3, 3 ) // xmm4 = z, w, w, w movaps xmm5, xmm1 shufps xmm5, xmm1, R_SHUFFLE_PS( 2, 2, 1, 0 ) // xmm5 = z2, z2, y2, x2 #endif // _ENABLE_SSE2_ mulps xmm4, xmm5 // xmm4 = zz2, wz2, wy2, wx2 mulss xmm0, xmm1 // xmm0 = xx2, y2, z2, w2 // calculate the last two elements of the third row movss xmm7, [g_VectorSSE_One] // xmm7 = 1, 0, 0, 0 subss xmm7, xmm0 // xmm7 = -xx2+1, 0, 0, 0 subss xmm7, xmm2 // xmm7 = -xx2-yy2+1, 0, 0, 0 //shufps xmm7, xmm6, R_SHUFFLE_PS( 0, 1, 2, 3 ) // xmm7 = -xx2-yy2+1, 0, t.z, t.w // calculate first row xorps xmm2, [g_VectorSSE_quat2mat_x0] // xmm2 = yy2, -xy2, -xz2, -yz2 xorps xmm4, [g_VectorSSE_quat2mat_x1] // xmm4 = -zz2, wz2, -wy2, -wx2 addss xmm4, [g_VectorSSE_One] // xmm4 = -zz2+1, wz2, -wy2, -wx2 movaps xmm3, xmm4 // xmm3 = -zz2+1, wz2, -wy2, -wx2 subps xmm3, xmm2 // xmm3 = -yy2-zz2+1, xy2+wz2, xz2-wy2, yz2-wx2 //movaps [matrix.m_Vec0], xmm3 // row0 = -yy2-zz2+1, xy2+wz2, xz2-wy2, yz2-wx2 //mov [matrix.m_Vec0.w], 0x00000000 movaps [edi], xmm3 // row0 = -yy2-zz2+1, xy2+wz2, xz2-wy2, yz2-wx2 mov dword ptr [edi + 3*4], 0x00000000 // not working, why? //movss [edi + 3*4], xmm6 // calculate second row movss xmm2, xmm0 // xmm2 = xx2, -xy2, -xz2, -yz2 xorps xmm4, [g_VectorSSE_quat2mat_x2] // xmm4 = -zz2+1, -wz2, wy2, wx2 subps xmm4, xmm2 // xmm4 = -xx2-zz2+1, xy2-wz2, xz2+wy2, yz2+wx2 //shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 2, 3, 0 ) // xmm6 = t.y, t.z, t.w, t.x shufps xmm4, xmm4, R_SHUFFLE_PS( 1, 0, 3, 2 ) // xmm4 = xy2-wz2, -xx2-zz2+1, yz2+wx2, xz2+wy2 // movaps [matrix.m_Vec1], xmm4 // row1 = xy2-wz2, -xx2-zz2+1, yz2+wx2, xz2+wy2 // mov [matrix.m_Vec1.w], 0x00000000 movaps [edi + 16], xmm4 // row1 = xy2-wz2, -xx2-zz2+1, yz2+wx2, xz2+wy2 mov dword ptr [edi + 16 + 3*4], 0x00000000 //movss [edi + 16 + 3*4], xmm6 //movss [edi+1*16+3*4], xmm6 // row1 = xy2-wz2, -xx2-zz2+1, yz2+wx2, t.y // calculate third row movhlps xmm3, xmm4 // xmm3 = yz2+wx2, xz2+wy2, xz2-wy2, yz2-wx2 shufps xmm3, xmm7, R_SHUFFLE_PS( 1, 3, 0, 2 ) // xmm3 = xz2+wy2, yz2-wx2, -xx2-yy2+1, t.z //movaps [matrix.m_Vec2], xmm3 // row2 = xz2+wy2, yz2-wx2, -xx2-yy2+1, t.z //mov [matrix.m_Vec2.w], 0x00000000 movaps [edi + 32], xmm3 // row2 = xz2+wy2, yz2-wx2, -xx2-yy2+1, t.z mov dword ptr [edi + 32 + 3*4], 0x00000000 //movss [edi + 32 + 3*4], xmm6 } matrix.m_Vec3 = t; }