/*
 * Unshuffle a buffer whose elements are 16 bytes wide (SSE2 version).
 *
 * `orig` is read as 16 byte planes: plane j supplies byte j of each element.
 * The planes are consumed one tile (16 elements, 256 bytes) at a time; every
 * tile is transposed as a 16x16 byte matrix so that each output vector holds
 * one complete 16-byte element.
 *
 *   dest - output buffer, accessed through __m128i pointers
 *   orig - shuffled input, accessed through __m128i pointers
 *   size - total number of bytes in the buffer
 *
 * Only size / 256 complete tiles are handled (plane j starts at vector index
 * j * (size / 256)); bytes that do not belong to a complete tile are neither
 * read nor written.
 */
static void unshuffle16(uint8_t* dest, uint8_t* orig, size_t size)
{
  /* After the four interleave rounds, output element `e` of a tile lives in
     vector result_slot[e]. */
  static const unsigned char result_slot[16] = {
    0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15
  };
  const size_t nelems = size / 16;
  const size_t ntiles = nelems / 16;
  const __m128i *planes = (const __m128i *)orig;
  __m128i *out = (__m128i *)dest;
  __m128i a[16], b[16];
  size_t tile, r;

  for (tile = 0; tile < ntiles; tile++, out += 16) {
    /* One vector from each of the 16 byte planes. */
    for (r = 0; r < 16; r++) {
      a[r] = planes[r * ntiles + tile];
    }

    /* Round 1: interleave single bytes of neighbouring vectors. */
    for (r = 0; r < 8; r++) {
      b[r]     = _mm_unpacklo_epi8(a[2 * r], a[2 * r + 1]);
      b[r + 8] = _mm_unpackhi_epi8(a[2 * r], a[2 * r + 1]);
    }
    /* Round 2: interleave 2-byte groups. */
    for (r = 0; r < 8; r++) {
      a[r]     = _mm_unpacklo_epi16(b[2 * r], b[2 * r + 1]);
      a[r + 8] = _mm_unpackhi_epi16(b[2 * r], b[2 * r + 1]);
    }
    /* Round 3: interleave 4-byte groups. */
    for (r = 0; r < 8; r++) {
      b[r]     = _mm_unpacklo_epi32(a[2 * r], a[2 * r + 1]);
      b[r + 8] = _mm_unpackhi_epi32(a[2 * r], a[2 * r + 1]);
    }
    /* Round 4: interleave 8-byte groups; each vector is now one element. */
    for (r = 0; r < 8; r++) {
      a[r]     = _mm_unpacklo_epi64(b[2 * r], b[2 * r + 1]);
      a[r + 8] = _mm_unpackhi_epi64(b[2 * r], b[2 * r + 1]);
    }

    /* Write the 16 elements of this tile in element order. */
    for (r = 0; r < 16; r++) {
      out[r] = a[result_slot[r]];
    }
  }
}
void EmitColorIndices_Intrinsics( const byte *colorBlock, const byte *minColor, const byte *maxColor, byte *&outData ) { ALIGN16( byte color0[16] ); ALIGN16( byte color1[16] ); ALIGN16( byte color2[16] ); ALIGN16( byte color3[16] ); ALIGN16( byte result[16] ); // mov esi, maxColor // mov edi, minColor __m128i t0, t1, t2, t3, t4, t5, t6, t7; t7 = _mm_setzero_si128(); //t7 = _mm_xor_si128(t7, t7); _mm_store_si128 ( (__m128i*) &result, t7 ); //t0 = _mm_load_si128 ( (__m128i*) maxColor ); t0 = _mm_cvtsi32_si128( *(int*)maxColor); // Bitwise AND __m128i tt = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_colorMask ); t0 = _mm_and_si128(t0, tt); t0 = _mm_unpacklo_epi8(t0, t7); t4 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 0, 3, 2, 3 )); t5 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 3, 1, 3, 3 )); t4 = _mm_srli_epi16(t4, 5); t5 = _mm_srli_epi16(t5, 6); // Bitwise Logical OR t0 = _mm_or_si128(t0, t4); t0 = _mm_or_si128(t0, t5); // t0 contains color0 in 565 //t1 = _mm_load_si128 ( (__m128i*) minColor ); t1 = _mm_cvtsi32_si128( *(int*)minColor); t1 = _mm_and_si128(t1, tt); t1 = _mm_unpacklo_epi8(t1, t7); t4 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 0, 3, 2, 3 )); t5 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 3, 1, 3, 3 )); t4 = _mm_srli_epi16(t4, 5); t5 = _mm_srli_epi16(t5, 6); t1 = _mm_or_si128(t1, t4); t1 = _mm_or_si128(t1, t5); // t1 contains color1 in 565 t2 = t0; t2 = _mm_packus_epi16(t2, t7); t2 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 0, 1, 0, 1 )); _mm_store_si128 ( (__m128i*) &color0, t2 ); t6 = t0; t6 = _mm_add_epi16(t6, t0); t6 = _mm_add_epi16(t6, t1); // Multiply Packed Signed Integers and Store High Result __m128i tw3 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_div_by_3 ); t6 = _mm_mulhi_epi16(t6, tw3); t6 = _mm_packus_epi16(t6, t7); t6 = _mm_shuffle_epi32( t6, R_SHUFFLE_D( 0, 1, 0, 1 )); _mm_store_si128 ( (__m128i*) &color2, t6 ); t3 = t1; t3 = _mm_packus_epi16(t3, t7); t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 1, 0, 1 )); _mm_store_si128 ( (__m128i*) &color1, t3 ); t1 = 
_mm_add_epi16(t1, t1); t0 = _mm_add_epi16(t0, t1); t0 = _mm_mulhi_epi16(t0, tw3); t0 = _mm_packus_epi16(t0, t7); t0 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 0, 1, 0, 1 )); _mm_store_si128 ( (__m128i*) &color3, t0 ); __m128i w0 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_0); __m128i w1 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_1); __m128i w2 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_2); // mov eax, 32 // mov esi, colorBlock int x = 32; //const byte *c = colorBlock; while (x >= 0) { t3 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+0)); t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 2, 1, 3 )); t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+8)); t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 )); t0 = t3; t6 = t5; // Compute Sum of Absolute Difference __m128i c0 = _mm_load_si128 ( (__m128i*) color0 ); t0 = _mm_sad_epu8(t0, c0); t6 = _mm_sad_epu8(t6, c0); // Pack with Signed Saturation t0 = _mm_packs_epi32 (t0, t6); t1 = t3; t6 = t5; __m128i c1 = _mm_load_si128 ( (__m128i*) color1 ); t1 = _mm_sad_epu8(t1, c1); t6 = _mm_sad_epu8(t6, c1); t1 = _mm_packs_epi32 (t1, t6); t2 = t3; t6 = t5; __m128i c2 = _mm_load_si128 ( (__m128i*) color2 ); t2 = _mm_sad_epu8(t2, c2); t6 = _mm_sad_epu8(t6, c2); t2 = _mm_packs_epi32 (t2, t6); __m128i c3 = _mm_load_si128 ( (__m128i*) color3 ); t3 = _mm_sad_epu8(t3, c3); t5 = _mm_sad_epu8(t5, c3); t3 = _mm_packs_epi32 (t3, t5); t4 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+16)); t4 = _mm_shuffle_epi32( t4, R_SHUFFLE_D( 0, 2, 1, 3 )); t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+24)); t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 )); t6 = t4; t7 = t5; t6 = _mm_sad_epu8(t6, c0); t7 = _mm_sad_epu8(t7, c0); t6 = _mm_packs_epi32 (t6, t7); t0 = _mm_packs_epi32 (t0, t6); // d0 t6 = t4; t7 = t5; t6 = _mm_sad_epu8(t6, c1); t7 = _mm_sad_epu8(t7, c1); t6 = _mm_packs_epi32 (t6, t7); t1 = _mm_packs_epi32 (t1, t6); // d1 t6 = t4; t7 = t5; t6 = _mm_sad_epu8(t6, c2); t7 = _mm_sad_epu8(t7, c2); t6 = _mm_packs_epi32 (t6, t7); t2 = _mm_packs_epi32 (t2, 
t6); // d2 t4 = _mm_sad_epu8(t4, c3); t5 = _mm_sad_epu8(t5, c3); t4 = _mm_packs_epi32 (t4, t5); t3 = _mm_packs_epi32 (t3, t4); // d3 t7 = _mm_load_si128 ( (__m128i*) result ); t7 = _mm_slli_epi32( t7, 16); t4 = t0; t5 = t1; // Compare Packed Signed Integers for Greater Than t0 = _mm_cmpgt_epi16(t0, t3); // b0 t1 = _mm_cmpgt_epi16(t1, t2); // b1 t4 = _mm_cmpgt_epi16(t4, t2); // b2 t5 = _mm_cmpgt_epi16(t5, t3); // b3 t2 = _mm_cmpgt_epi16(t2, t3); // b4 t4 = _mm_and_si128(t4, t1); // x0 t5 = _mm_and_si128(t5, t0); // x1 t2 = _mm_and_si128(t2, t0); // x2 t4 = _mm_or_si128(t4, t5); t2 = _mm_and_si128(t2, w1); t4 = _mm_and_si128(t4, w2); t2 = _mm_or_si128(t2, t4); t5 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 2, 3, 0, 1 )); // Unpack Low Data t2 = _mm_unpacklo_epi16 ( t2, w0); t5 = _mm_unpacklo_epi16 ( t5, w0); //t5 = _mm_slli_si128 ( t5, 8); t5 = _mm_slli_epi32( t5, 8); t7 = _mm_or_si128(t7, t5); t7 = _mm_or_si128(t7, t2); _mm_store_si128 ( (__m128i*) &result, t7 ); x -=32; } t4 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 1, 2, 3, 0 )); t5 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 2, 3, 0, 1 )); t6 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 3, 0, 1, 2 )); t4 = _mm_slli_epi32 ( t4, 2); t5 = _mm_slli_epi32 ( t5, 4); t6 = _mm_slli_epi32 ( t6, 6); t7 = _mm_or_si128(t7, t4); t7 = _mm_or_si128(t7, t5); t7 = _mm_or_si128(t7, t6); //_mm_store_si128 ( (__m128i*) outData, t7 ); int r = _mm_cvtsi128_si32 (t7); memcpy(outData, &r, 4); // Anything better ? outData += 4; }
// Does one or two inverse transforms. static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, int do_two) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 // // To be able to use signed 16-bit integers, we use the following trick to // have constants within range: // - Associated constants are obtained by subtracting the 16-bit fixed point // version of one: // k = K - (1 << 16) => K = k + (1 << 16) // K1 = 85267 => k1 = 20091 // K2 = 35468 => k2 = -30068 // - The multiplication of a variable by a constant become the sum of the // variable and the multiplication of that variable by the associated // constant: // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x const __m128i k1 = _mm_set1_epi16(20091); const __m128i k2 = _mm_set1_epi16(-30068); __m128i T0, T1, T2, T3; // Load and concatenate the transform coefficients (we'll do two inverse // transforms in parallel). In the case of only one inverse transform, the // second half of the vectors will just contain random value we'll never // use nor store. 
__m128i in0, in1, in2, in3; { in0 = _mm_loadl_epi64((const __m128i*)&in[0]); in1 = _mm_loadl_epi64((const __m128i*)&in[4]); in2 = _mm_loadl_epi64((const __m128i*)&in[8]); in3 = _mm_loadl_epi64((const __m128i*)&in[12]); // a00 a10 a20 a30 x x x x // a01 a11 a21 a31 x x x x // a02 a12 a22 a32 x x x x // a03 a13 a23 a33 x x x x if (do_two) { const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]); const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]); const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]); const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]); in0 = _mm_unpacklo_epi64(in0, inB0); in1 = _mm_unpacklo_epi64(in1, inB1); in2 = _mm_unpacklo_epi64(in2, inB2); in3 = _mm_unpacklo_epi64(in3, inB3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } } // Vertical pass and subsequent transpose. { // First pass, c and d calculations are longer because of the "trick" // multiplications. const __m128i a = _mm_add_epi16(in0, in2); const __m128i b = _mm_sub_epi16(in0, in2); // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 const __m128i c1 = _mm_mulhi_epi16(in1, k2); const __m128i c2 = _mm_mulhi_epi16(in3, k1); const __m128i c3 = _mm_sub_epi16(in1, in3); const __m128i c4 = _mm_sub_epi16(c1, c2); const __m128i c = _mm_add_epi16(c3, c4); // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 const __m128i d1 = _mm_mulhi_epi16(in1, k1); const __m128i d2 = _mm_mulhi_epi16(in3, k2); const __m128i d3 = _mm_add_epi16(in1, in3); const __m128i d4 = _mm_add_epi16(d1, d2); const __m128i d = _mm_add_epi16(d3, d4); // Second pass. const __m128i tmp0 = _mm_add_epi16(a, d); const __m128i tmp1 = _mm_add_epi16(b, c); const __m128i tmp2 = _mm_sub_epi16(b, c); const __m128i tmp3 = _mm_sub_epi16(a, d); // Transpose the two 4x4. 
// a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1); const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3); const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1); const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Horizontal pass and subsequent transpose. { // First pass, c and d calculations are longer because of the "trick" // multiplications. 
const __m128i four = _mm_set1_epi16(4); const __m128i dc = _mm_add_epi16(T0, four); const __m128i a = _mm_add_epi16(dc, T2); const __m128i b = _mm_sub_epi16(dc, T2); // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 const __m128i c1 = _mm_mulhi_epi16(T1, k2); const __m128i c2 = _mm_mulhi_epi16(T3, k1); const __m128i c3 = _mm_sub_epi16(T1, T3); const __m128i c4 = _mm_sub_epi16(c1, c2); const __m128i c = _mm_add_epi16(c3, c4); // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 const __m128i d1 = _mm_mulhi_epi16(T1, k1); const __m128i d2 = _mm_mulhi_epi16(T3, k2); const __m128i d3 = _mm_add_epi16(T1, T3); const __m128i d4 = _mm_add_epi16(d1, d2); const __m128i d = _mm_add_epi16(d3, d4); // Second pass. const __m128i tmp0 = _mm_add_epi16(a, d); const __m128i tmp1 = _mm_add_epi16(b, c); const __m128i tmp2 = _mm_sub_epi16(b, c); const __m128i tmp3 = _mm_sub_epi16(a, d); const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); // Transpose the two 4x4. 
// a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1); const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3); const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1); const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Add inverse transform to 'ref' and store. { const __m128i zero = _mm_setzero_si128(); // Load the reference(s). __m128i ref0, ref1, ref2, ref3; if (do_two) { // Load eight bytes/pixels per line. ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); } else { // Load four bytes/pixels per line. 
ref0 = _mm_cvtsi32_si128(*(const int*)&ref[0 * BPS]); ref1 = _mm_cvtsi32_si128(*(const int*)&ref[1 * BPS]); ref2 = _mm_cvtsi32_si128(*(const int*)&ref[2 * BPS]); ref3 = _mm_cvtsi32_si128(*(const int*)&ref[3 * BPS]); } // Convert to 16b. ref0 = _mm_unpacklo_epi8(ref0, zero); ref1 = _mm_unpacklo_epi8(ref1, zero); ref2 = _mm_unpacklo_epi8(ref2, zero); ref3 = _mm_unpacklo_epi8(ref3, zero); // Add the inverse transform(s). ref0 = _mm_add_epi16(ref0, T0); ref1 = _mm_add_epi16(ref1, T1); ref2 = _mm_add_epi16(ref2, T2); ref3 = _mm_add_epi16(ref3, T3); // Unsigned saturate to 8b. ref0 = _mm_packus_epi16(ref0, ref0); ref1 = _mm_packus_epi16(ref1, ref1); ref2 = _mm_packus_epi16(ref2, ref2); ref3 = _mm_packus_epi16(ref3, ref3); // Store the results. if (do_two) { // Store eight bytes/pixels per line. _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0); _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1); _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2); _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3); } else { // Store four bytes/pixels per line. *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(ref0); *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(ref1); *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(ref2); *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(ref3); } } }
/*
 * Converts a vector of signed 16-bit values into signed 32-bit values
 * (sign extension), using SSE2 for the bulk of the data.
 *
 *   z - destination, n elements of mlib_s32
 *   x - source, n elements of mlib_s16
 *   n - number of elements
 *
 * Returns MLIB_FAILURE when n < 1, MLIB_SUCCESS otherwise.
 *
 * Layout of the work: a scalar head brings the source pointer to a 16-byte
 * boundary, whole 8-element vectors follow, and a scalar tail finishes the
 * remainder.  If fewer than one whole vector is left after the head, the
 * entire input is converted with scalar code.
 */
mlib_status
__mlib_VectorConvert_S32_S16_Mod(
	mlib_s32 *z,
	const mlib_s16 *x,
	mlib_s32 n)
{
	if (n < 1)
		return (MLIB_FAILURE);

	mlib_s16 *src = (mlib_s16 *)x;
	mlib_s32 *dst = (mlib_s32 *)z;
	mlib_s32 k;

	const __m128i zero = _mm_setzero_si128();
	const mlib_s32 src_off = (mlib_addr)x & 15;
	const mlib_s32 dst_off = (mlib_addr)z & 15;
	const mlib_s32 lanes = 16 / sizeof (mlib_s16);
	const mlib_s32 head = ((16 - src_off) & 15) / sizeof (mlib_s16);
	const mlib_s32 blocks = (n - head) / lanes;
	const mlib_s32 tail = n - head - blocks * lanes;

	/* Not even one whole vector: plain scalar conversion. */
	if (blocks < 1) {
		for (k = 0; k < n; k++)
			dst[k] = src[k];
		return (MLIB_SUCCESS);
	}

	/* Scalar head: afterwards src sits on a 16-byte boundary. */
	for (k = 0; k < head; k++)
		*dst++ = *src++;

	/*
	 * Each source byte consumed produces two destination bytes, so the
	 * destination is 16-byte aligned here exactly when its original
	 * offset equals twice the source offset (mod 16).
	 */
	if (((src_off * 2) & 15) == dst_off) {
		for (k = 0; k < blocks; k++, src += lanes, dst += lanes) {
			const __m128i v = _mm_load_si128((__m128i *)src);
			/*
			 * Put each value in the upper half of a 32-bit lane,
			 * then shift right arithmetically to sign-extend.
			 */
			const __m128i lo =
			    _mm_srai_epi32(_mm_unpacklo_epi16(zero, v), 16);
			const __m128i hi =
			    _mm_srai_epi32(_mm_unpackhi_epi16(zero, v), 16);
			_mm_store_si128((__m128i *)dst, lo);
			_mm_store_si128((__m128i *)dst + 1, hi);
		}
	} else {
		for (k = 0; k < blocks; k++, src += lanes, dst += lanes) {
			const __m128i v = _mm_load_si128((__m128i *)src);
			const __m128i lo =
			    _mm_srai_epi32(_mm_unpacklo_epi16(zero, v), 16);
			const __m128i hi =
			    _mm_srai_epi32(_mm_unpackhi_epi16(zero, v), 16);
			_mm_storeu_si128((__m128i *)dst, lo);
			_mm_storeu_si128((__m128i *)dst + 1, hi);
		}
	}

	/* Scalar tail. */
	for (k = 0; k < tail; k++)
		*dst++ = *src++;

	return (MLIB_SUCCESS);
}
FourColorVals(const uint16_t *p) { assert(((uintptr_t)p & 7) == 0);//assert aligned vec = _mm_unpacklo_epi16(_mm_loadl_epi64((__m128i*)p),_mm_setzero_si128()); }
/*
 * 5x5 convolution of an 8-bit plane, SSE2 version.
 *
 *   ch      - filter settings; m[0..24] (coefficients), rdiv, bias and
 *             saturate are read
 *   buff    - scratch memory holding a ring of five line buffers that are
 *             bstride bytes apart, the first one starting at buff + 16
 *   width   - pixels per row; processed 16 at a time
 *   height  - number of rows
 *   stride  - byte distance between rows of both srcp and dstp
 *   dstp    - destination plane; written with aligned 16-byte stores
 *   srcp    - source plane
 *
 * The two rows above the first row are taken from rows 2 and 1, and once the
 * bottom is reached the source pointer steps backwards, so rows outside the
 * image are mirrored.  For every output pixel the 25 products are summed in
 * 32-bit integers, converted to float, multiplied by rdiv, offset by bias,
 * made absolute unless ch->saturate is set, truncated back to integer and
 * saturated to 8 bits.
 *
 * NOTE(review): line_copy8() and mm_abs_ps() are defined elsewhere; the last
 * argument of line_copy8 (2) is presumably the border width - confirm.
 */
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width,
               int height, int stride, uint8_t *dstp, const uint8_t *srcp)
{
    /* row[0] is the oldest line of the window, row[4] the newest. */
    uint8_t *row[5];
    row[0] = buff + 16;
    for (int r = 1; r < 5; r++) {
        row[r] = row[r - 1] + bstride;
    }
    uint8_t *const ring_first = row[0];
    uint8_t *const ring_last = row[4];

    line_copy8(row[0], srcp + 2 * stride, width, 2);
    line_copy8(row[1], srcp + stride, width, 2);
    line_copy8(row[2], srcp, width, 2);
    srcp += stride;
    line_copy8(row[3], srcp, width, 2);

    const __m128i zero = _mm_setzero_si128();
    const __m128 rdiv = _mm_set1_ps((float)ch->rdiv);
    const __m128 bias = _mm_set1_ps((float)ch->bias);

    /* Each coefficient sits in the low 16 bits of every 32-bit lane, so a
       madd against zero-extended pixels yields pixel * coefficient. */
    __m128i coef[25];
    for (int k = 0; k < 25; k++) {
        coef[k] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m[k]), zero);
    }

    for (int y = 0; y < height; y++) {
        /* Advance through the source; walk backwards near the bottom. */
        if (y < height - 2) {
            srcp += stride;
        } else {
            srcp -= stride;
        }
        line_copy8(row[4], srcp, width, 2);

        /* Start address of each of the 25 taps for this output row. */
        uint8_t *tap[25];
        for (int r = 0; r < 5; r++) {
            for (int c = 0; c < 5; c++) {
                tap[r * 5 + c] = row[r] + (c - 2);
            }
        }

        for (int x = 0; x < width; x += 16) {
            __m128i acc[4] = { zero, zero, zero, zero };

            for (int k = 0; k < 25; k++) {
                const __m128i px = _mm_loadu_si128((__m128i *)(tap[k] + x));
                const __m128i lo16 = _mm_unpacklo_epi8(px, zero);
                const __m128i hi16 = _mm_unpackhi_epi8(px, zero);

                acc[0] = _mm_add_epi32(acc[0],
                    _mm_madd_epi16(_mm_unpacklo_epi16(lo16, zero), coef[k]));
                acc[1] = _mm_add_epi32(acc[1],
                    _mm_madd_epi16(_mm_unpackhi_epi16(lo16, zero), coef[k]));
                acc[2] = _mm_add_epi32(acc[2],
                    _mm_madd_epi16(_mm_unpacklo_epi16(hi16, zero), coef[k]));
                acc[3] = _mm_add_epi32(acc[3],
                    _mm_madd_epi16(_mm_unpackhi_epi16(hi16, zero), coef[k]));
            }

            /* Scale and offset in float, then truncate back to integers. */
            for (int q = 0; q < 4; q++) {
                __m128 f = _mm_cvtepi32_ps(acc[q]);
                f = _mm_add_ps(_mm_mul_ps(f, rdiv), bias);
                if (!ch->saturate) {
                    f = mm_abs_ps(f);
                }
                acc[q] = _mm_cvttps_epi32(f);
            }

            const __m128i lo = _mm_packs_epi32(acc[0], acc[1]);
            const __m128i hi = _mm_packs_epi32(acc[2], acc[3]);
            _mm_store_si128((__m128i *)(dstp + x), _mm_packus_epi16(lo, hi));
        }

        dstp += stride;

        /* Slide the window down one line; the oldest slot is recycled. */
        uint8_t *const recycled =
            (row[4] == ring_last) ? ring_first : row[4] + bstride;
        for (int r = 0; r < 4; r++) {
            row[r] = row[r + 1];
        }
        row[4] = recycled;
    }
}
/*
 * Scores a query profile against a database sequence with a striped SSE2
 * recurrence on 16-bit lanes, rescaling the stored values when they approach
 * the signed 16-bit ceiling.
 *
 *   queryLength - length of the query; must be >= 1 (iter would otherwise be
 *                 0 and is used as a divisor)
 *   profile     - scoring profile, read as __m128i vectors: for residue r the
 *                 iter vectors starting at vector index r * iter are used
 *   dbSeq       - database sequence; each byte indexes the profile
 *   dbLength    - number of bytes of dbSeq to process
 *   gapOpen     - gap penalty applied to H when deriving E and F
 *   gapExtend   - only used in the final score adjustment (the broadcast
 *                 vector vGapExtend is built but not read again here)
 *   ceiling     - headroom subtracted from 0x7fff to form the rescale
 *                 threshold
 *   f_str       - f_str->workspace supplies the H and E arrays: 2 * iter
 *                 vectors accessed with aligned loads and stores
 *
 * Returns OVERFLOW_SCORE when values still exceed the threshold after a
 * rescale; otherwise
 *   lane + SHORT_BIAS - 2 * gapOpen - (queryLength + dbLength) * gapExtend
 *        + scale
 * where lane is the 16-bit value selected from the final H vector and scale
 * is the accumulated rescale amount.
 *
 * NOTE(review): max_epu16(), OVERFLOW_SCORE and SHORT_BIAS are defined
 * elsewhere.
 */
int global_sse2_word(int queryLength,
                     unsigned short *profile,
                     const unsigned char *dbSeq,
                     int dbLength,
                     unsigned short gapOpen,
                     unsigned short gapExtend,
                     unsigned short ceiling,
                     struct f_struct *f_str)
{
  int i, j;

  int score;
  int scale;
  int temp;
  int distance;

  int offset;
  int position;

  int cmp;
  int iter;

  __m128i *pvH;
  __m128i *pvE;

  __m128i vE, vF, vH;
  __m128i vHNext;
  __m128i vFPrev;

  __m128i vGapOpen;
  __m128i vGapExtend;
  __m128i vCeiling;

  __m128i vScale;
  __m128i vScaleAmt;
  __m128i vScaleTmp;

  __m128i vTemp;
  __m128i vNull;

  __m128i *pvScore;

  scale = 0;
  /* number of 8-lane vectors needed to hold the query */
  iter = (queryLength + 7) / 8;
  /* vector index and lane shift that locate the last query position */
  offset = (queryLength - 1) % iter;
  position = 7 - (queryLength - 1) / iter;

  /* H occupies the first iter vectors of the workspace, E the next iter */
  pvH = (__m128i *)f_str->workspace;
  pvE = pvH + iter;

  /* Load gap opening penalty to all elements of a constant */
  vGapOpen = _mm_setzero_si128();	/* transferred from Apple Devel smith_waterman_sse2.c fix */
  vGapOpen = _mm_insert_epi16 (vGapOpen, gapOpen, 0);
  vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);
  vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);

  /* Load gap extension penalty to all elements of a constant */
  vGapExtend = _mm_setzero_si128();	/* transferred from Apple Devel smith_waterman_sse2.c fix */
  vGapExtend = _mm_insert_epi16 (vGapExtend, gapExtend, 0);
  vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);
  vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);

  /* Generate the ceiling before scaling:
     vCeiling = 0x7fff - ceiling - gapOpen (saturating) in every lane */
  vTemp = _mm_setzero_si128();	/* transferred from Apple Devel smith_waterman_sse2.c fix */
  vTemp = _mm_insert_epi16 (vTemp, ceiling, 0);
  vTemp = _mm_shufflelo_epi16 (vTemp, 0);
  vTemp = _mm_shuffle_epi32 (vTemp, 0);
  vCeiling = _mm_cmpeq_epi16 (vTemp, vTemp);
  vCeiling = _mm_srli_epi16 (vCeiling, 1);
  vCeiling = _mm_subs_epi16 (vCeiling, vTemp);
  vCeiling = _mm_subs_epi16 (vCeiling, vGapOpen);

  /* vNull = 0x8000 in every lane; XOR with it toggles between the signed
     representation and an unsigned, biased one */
  vNull = _mm_cmpeq_epi16 (vTemp, vTemp);
  vNull = _mm_slli_epi16 (vNull, 15);
  vScaleAmt = _mm_xor_si128 (vNull, vNull);

  /* Initialise the storage vectors: H = vNull + gapOpen, E = vNull */
  vTemp = _mm_adds_epi16 (vNull, vGapOpen);
  for (i = 0; i < iter; i++) {
    _mm_store_si128 (pvH + i, vTemp);
    _mm_store_si128 (pvE + i, vNull);
  }

  /* initialize F */
  vF = vNull;
  vFPrev = vNull;

  /* load and scale H for the next round */
  vTemp = _mm_srli_si128 (vGapOpen, 14);
  vH = _mm_load_si128 (pvH + iter - 1);
  vH = _mm_adds_epi16 (vH, vTemp);

  for (i = 0; i < dbLength; ++i) {
    /* fetch first data asap. */
    pvScore = (__m128i *) profile + dbSeq[i] * iter;

    vF = vNull;

    vH = _mm_max_epi16 (vH, vFPrev);
    for (j = 0; j < iter; j++) {
      /* correct H from the previous columns F */
      vHNext = _mm_load_si128 (pvH + j);
      vHNext = _mm_max_epi16 (vHNext, vFPrev);

      /* load and correct E value */
      vE = _mm_load_si128 (pvE + j);
      vTemp = _mm_subs_epi16 (vHNext, vGapOpen);
      vE = _mm_max_epi16 (vE, vTemp);
      _mm_store_si128 (pvE + j, vE);

      /* add score to vH */
      vH = _mm_adds_epi16 (vH, *pvScore++);

      /* get max from vH, vE and vF */
      vH = _mm_max_epi16 (vH, vE);
      vH = _mm_max_epi16 (vH, vF);
      _mm_store_si128 (pvH + j, vH);

      /* update vF value */
      vH = _mm_subs_epi16 (vH, vGapOpen);
      vF = _mm_max_epi16 (vF, vH);

      /* load the next h values */
      vH = vHNext;
    }

    /* check if we need to scale before the next round */
    vTemp = _mm_cmpgt_epi16 (vF, vCeiling);
    cmp = _mm_movemask_epi8 (vTemp);

    /* broadcast F values: switch to the unsigned representation, then
       propagate across lanes with shifts of 1, 2 and 4 lanes */
    vF = _mm_xor_si128 (vF, vNull);

    vTemp = _mm_slli_si128 (vF, 2);
    vTemp = _mm_subs_epu16 (vTemp, vScaleAmt);
    vF = max_epu16 (vF, vTemp);

    vTemp = _mm_slli_si128 (vF, 4);
    vScaleTmp = _mm_slli_si128 (vScaleAmt, 2);
    vScaleTmp = _mm_adds_epu16 (vScaleTmp, vScaleAmt);
    vTemp = _mm_subs_epu16 (vTemp, vScaleTmp);
    vF = max_epu16 (vF, vTemp);

    vTemp = _mm_slli_si128 (vScaleTmp, 4);
    vScaleTmp = _mm_adds_epu16 (vScaleTmp, vTemp);
    vTemp = _mm_slli_si128 (vF, 8);
    vTemp = _mm_subs_epu16 (vTemp, vScaleTmp);
    vF = max_epu16 (vF, vTemp);

    /* scale if necessary */
    if (cmp != 0x0000) {
      __m128i vScale1;
      __m128i vScale2;

      vScale = _mm_slli_si128 (vF, 2);
      vScale = _mm_subs_epu16 (vScale, vGapOpen);
      vScale = _mm_subs_epu16 (vScale, vScaleAmt);

      vTemp = _mm_slli_si128 (vScale, 2);
      vTemp = _mm_subs_epu16 (vScale, vTemp);
      vScaleAmt = _mm_adds_epu16 (vScaleAmt, vTemp);
      vTemp = _mm_slli_si128 (vScale, 2);
      vTemp = _mm_subs_epu16 (vTemp, vScale);
      vScaleAmt = _mm_subs_epu16 (vScaleAmt, vTemp);

      /* rescale the previous F */
      vF = _mm_subs_epu16 (vF, vScale);

      /* check if we can continue in signed 16-bits */
      vTemp = _mm_xor_si128 (vF, vNull);
      vTemp = _mm_cmpgt_epi16 (vTemp, vCeiling);
      cmp = _mm_movemask_epi8 (vTemp);
      if (cmp != 0x0000) {
        return OVERFLOW_SCORE;
      }

      /* split the scale into two parts that are subtracted one after the
         other with signed saturating subtraction */
      vTemp = _mm_adds_epi16 (vCeiling, vCeiling);
      vScale1 = _mm_subs_epu16 (vScale, vTemp);
      vScale2 = _mm_subs_epu16 (vScale, vScale1);

      /* scale all the vectors */
      for (j = 0; j < iter; j++) {
        /* load H and E */
        vH = _mm_load_si128 (pvH + j);
        vE = _mm_load_si128 (pvE + j);

        /* subtract both parts of the scale from H and E */
        vH = _mm_subs_epi16 (vH, vScale1);
        vH = _mm_subs_epi16 (vH, vScale2);
        vE = _mm_subs_epi16 (vE, vScale1);
        vE = _mm_subs_epi16 (vE, vScale2);

        /* save the H and E */
        _mm_store_si128 (pvH + j, vH);
        _mm_store_si128 (pvE + j, vE);
      }

      vScale = vScaleAmt;
      for (j = 0; j < position; ++j) {
        vScale = _mm_slli_si128 (vScale, 2);
      }

      /* calculate the final scaling amount: widen the lanes to 32 bits and
         add them all together */
      vTemp = _mm_xor_si128 (vTemp, vTemp);
      vScale1 = _mm_unpacklo_epi16 (vScale, vTemp);
      vScale2 = _mm_unpackhi_epi16 (vScale, vTemp);
      vScale = _mm_add_epi32 (vScale1, vScale2);
      vTemp = _mm_srli_si128 (vScale, 8);
      vScale = _mm_add_epi32 (vScale, vTemp);
      vTemp = _mm_srli_si128 (vScale, 4);
      vScale = _mm_add_epi32 (vScale, vTemp);
      /* reassemble the 32-bit sum from its two 16-bit halves */
      scale = (int) (unsigned short) _mm_extract_epi16 (vScale, 0);
      temp = (int) (unsigned short) _mm_extract_epi16 (vScale, 1);
      scale = scale + (temp << 16);
    }

    /* scale the F value for the next round */
    vFPrev = _mm_slli_si128 (vF, 2);
    vFPrev = _mm_subs_epu16 (vFPrev, vScaleAmt);
    vFPrev = _mm_xor_si128 (vFPrev, vNull);

    /* load and scale H for the next round */
    vH = _mm_load_si128 (pvH + iter - 1);
    vH = _mm_xor_si128 (vH, vNull);
    vH = _mm_slli_si128 (vH, 2);
    vH = _mm_subs_epu16 (vH, vScaleAmt);
    vH = _mm_insert_epi16 (vH, gapOpen, 0);
    vH = _mm_xor_si128 (vH, vNull);
  }

  /* pick the H vector that holds the last query position and move that
     position into lane 7 */
  vH = _mm_load_si128 (pvH + offset);
  vH = _mm_max_epi16 (vH, vFPrev);
  for (j = 0; j < position; ++j) {
    vH = _mm_slli_si128 (vH, 2);
  }
  score = (int) (signed short) _mm_extract_epi16 (vH, 7);
  score = score + SHORT_BIAS;

  /* undo the gap adjustments and add back the accumulated scale */
  distance = (queryLength + dbLength) * gapExtend;
  score = score - (gapOpen * 2) - distance + scale;

  return score;
}
/*
 * Converts eight 48-bit pixels (three 16-bit components each, 48 bytes read
 * from lw48_ptr with unaligned loads) to three vectors of eight signed 16-bit
 * values, written to rgb_buffer with aligned stores:
 *
 *   rgb_buffer +  0 : B
 *   rgb_buffer + 16 : G
 *   rgb_buffer + 32 : R
 *
 * The components are de-interleaved with blends and byte shuffles into Y, Cb
 * and Cr vectors, re-centred by subtracting 32768, and combined with integer
 * weights:
 *
 *   y_tmp = (y - 4096) * 9539
 *         = (y - 32768) * 9539 + 28672 * 9539
 *   G = (y_tmp + (cb - 32768) * -3203 + (cr - 32768) * -6808 + (1 << 20)) >> 21
 *   R = (y_tmp + (cr - 32768) * 13074 + (1 << 20)) >> 21
 *   B = (y_tmp + (cb - 32768) * 16531 + (1 << 20)) >> 21
 *
 * Results are packed to 16 bits with signed saturation.
 */
static LW_FORCEINLINE void fill_rgb_buffer_sse41( BYTE *rgb_buffer, BYTE *lw48_ptr )
{
    static const USHORT LW_ALIGN(16) PW_32768[8] =
        { 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768 };
    static const short  LW_ALIGN(16) PW_28672[8] =
        { 28672, 28672, 28672, 28672, 28672, 28672, 28672, 28672 };
    static const short  LW_ALIGN(16) PW_9539[8] =
        { 9539, 9539, 9539, 9539, 9539, 9539, 9539, 9539 };
    static const short  LW_ALIGN(16) PW_13074[8] =
        { 13074, 13074, 13074, 13074, 13074, 13074, 13074, 13074 };
    static const short  LW_ALIGN(16) PW_16531[8] =
        { 16531, 16531, 16531, 16531, 16531, 16531, 16531, 16531 };
    static const short  LW_ALIGN(16) PW_M3203_M6808[8] =
        { -3203, -6808, -3203, -6808, -3203, -6808, -3203, -6808 };
    static const int    LW_ALIGN(16) PD_1_20[4] =
        { (1<<20), (1<<20), (1<<20), (1<<20) };
    static const char   LW_ALIGN(16) LW48_SHUFFLE[3][16] = {
        { 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11 },
        { 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13 },
        { 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15 }
    };

    /* The three input vectors holding the interleaved components. */
    const __m128i in0 = _mm_loadu_si128((__m128i *)(lw48_ptr +  0));
    const __m128i in1 = _mm_loadu_si128((__m128i *)(lw48_ptr + 16));
    const __m128i in2 = _mm_loadu_si128((__m128i *)(lw48_ptr + 32));

    /* Gather the words of one component per vector, then put them in order. */
    __m128i y  = _mm_blend_epi16(in0, in1, 0x80+0x10+0x02);
    y          = _mm_blend_epi16(y,   in2, 0x20+0x04);
    __m128i cb = _mm_blend_epi16(in0, in1, 0x20+0x04);
    cb         = _mm_blend_epi16(cb,  in2, 0x40+0x08+0x01);
    __m128i cr = _mm_blend_epi16(in0, in1, 0x40+0x08+0x01);
    cr         = _mm_blend_epi16(cr,  in2, 0x80+0x10+0x02);

    y  = _mm_shuffle_epi8(y,  _mm_load_si128((__m128i*)LW48_SHUFFLE[0])); /* Y  */
    cb = _mm_shuffle_epi8(cb, _mm_load_si128((__m128i*)LW48_SHUFFLE[1])); /* Cb */
    cr = _mm_shuffle_epi8(cr, _mm_load_si128((__m128i*)LW48_SHUFFLE[2])); /* Cr */

    /* Re-centre so the values fit signed 16-bit arithmetic. */
    const __m128i half = _mm_load_si128((__m128i*)PW_32768);
    y  = _mm_sub_epi16(y,  half);
    cb = _mm_sub_epi16(cb, half);
    cr = _mm_sub_epi16(cr, half);

    /* y_tmp: pair each luma word with 28672 so that one madd by 9539 gives
       (y - 32768) * 9539 + 28672 * 9539. */
    const __m128i y_offset = _mm_load_si128((__m128i*)PW_28672);
    const __m128i y_weight = _mm_load_si128((__m128i*)PW_9539);
    const __m128i ytmp_lo = _mm_madd_epi16(_mm_unpacklo_epi16(y, y_offset), y_weight);
    const __m128i ytmp_hi = _mm_madd_epi16(_mm_unpackhi_epi16(y, y_offset), y_weight);

    const __m128i rounding = _mm_load_si128((__m128i*)PD_1_20);

    /* G: both chroma weights are applied with a single madd. */
    {
        const __m128i g_weight = _mm_load_si128((__m128i*)PW_M3203_M6808);
        __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi16(cb, cr), g_weight);
        __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi16(cb, cr), g_weight);
        lo = _mm_add_epi32(_mm_add_epi32(lo, ytmp_lo), rounding);
        hi = _mm_add_epi32(_mm_add_epi32(hi, ytmp_hi), rounding);
        lo = _mm_srai_epi32(lo, 21);
        hi = _mm_srai_epi32(hi, 21);
        _mm_store_si128((__m128i*)(rgb_buffer + 16), _mm_packs_epi32(lo, hi));
    }

    /* R: the full 32-bit products come from the low/high 16-bit halves. */
    {
        const __m128i r_weight = _mm_load_si128((__m128i*)PW_13074);
        const __m128i prod_lo16 = _mm_mullo_epi16(cr, r_weight);
        const __m128i prod_hi16 = _mm_mulhi_epi16(cr, r_weight);
        __m128i lo = _mm_unpacklo_epi16(prod_lo16, prod_hi16);
        __m128i hi = _mm_unpackhi_epi16(prod_lo16, prod_hi16);
        lo = _mm_add_epi32(_mm_add_epi32(lo, ytmp_lo), rounding);
        hi = _mm_add_epi32(_mm_add_epi32(hi, ytmp_hi), rounding);
        lo = _mm_srai_epi32(lo, 21);
        hi = _mm_srai_epi32(hi, 21);
        _mm_store_si128((__m128i*)(rgb_buffer + 32), _mm_packs_epi32(lo, hi));
    }

    /* B: same scheme as R, applied to Cb. */
    {
        const __m128i b_weight = _mm_load_si128((__m128i*)PW_16531);
        const __m128i prod_lo16 = _mm_mullo_epi16(cb, b_weight);
        const __m128i prod_hi16 = _mm_mulhi_epi16(cb, b_weight);
        __m128i lo = _mm_unpacklo_epi16(prod_lo16, prod_hi16);
        __m128i hi = _mm_unpackhi_epi16(prod_lo16, prod_hi16);
        lo = _mm_add_epi32(_mm_add_epi32(lo, ytmp_lo), rounding);
        hi = _mm_add_epi32(_mm_add_epi32(hi, ytmp_hi), rounding);
        lo = _mm_srai_epi32(lo, 21);
        hi = _mm_srai_epi32(hi, 21);
        _mm_store_si128((__m128i*)(rgb_buffer +  0), _mm_packs_epi32(lo, hi));
    }
}
void tuned_ConvertULY4ToRGB(uint8_t *pDstBegin, uint8_t *pDstEnd, const uint8_t *pYBegin, const uint8_t *pUBegin, const uint8_t *pVBegin, size_t cbWidth, ssize_t scbStride) { const int shift = 13; __m128i xy2rgb = _mm_set2_epi16_shift((-16 * C::Y2RGB + 0.5) / 0xff, C::Y2RGB, shift); __m128i vu2r = _mm_set2_epi16_shift(C::V2R, 0, shift); __m128i vu2g = _mm_set2_epi16_shift(C::V2G, C::U2G, shift); __m128i vu2b = _mm_set2_epi16_shift(0, C::U2B, shift); auto y = pYBegin; auto u = pUBegin; auto v = pVBegin; for (auto p = pDstBegin; p != pDstEnd; p += scbStride) { auto pp = p; for (; pp <= p + cbWidth - 16; pp += T::BYPP * 4) { __m128i yy = _mm_cvtsi32_si128(*(const int *)y); __m128i uu = _mm_cvtsi32_si128(*(const int *)u); __m128i vv = _mm_cvtsi32_si128(*(const int *)v); __m128i xy = _mm_unpacklo_epi8(_mm_unpacklo_epi8(yy, _mm_setone_si128()), _mm_setzero_si128()); // 00 ff 00 Y3 00 ff 00 Y2 00 ff 00 Y1 00 ff 00 Y0 __m128i vu = _mm_unpacklo_epi8(_mm_unpacklo_epi8(uu, vv), _mm_setzero_si128()); // 00 V3 00 U3 00 V2 00 U2 00 V1 00 U1 00 V0 00 U0 vu = _mm_sub_epi16(vu, _mm_set1_epi16(128)); __m128i rgbtmp = _mm_madd_epi16(xy, xy2rgb); auto xyuv2rgb = [rgbtmp, vu, shift](__m128i vu2rgb) -> __m128i { __m128i rgb = _mm_add_epi32(rgbtmp, _mm_madd_epi16(vu, vu2rgb)); rgb = _mm_srai_epi32(rgb, shift); rgb = _mm_packs_epi32(rgb, rgb); rgb = _mm_packus_epi16(rgb, rgb); return rgb; }; __m128i rr = xyuv2rgb(vu2r); __m128i gg = xyuv2rgb(vu2g); __m128i bb = xyuv2rgb(vu2b); if (std::is_same<T, CBGRAColorOrder>::value) { __m128i bgrx = _mm_unpacklo_epi16(_mm_unpacklo_epi8(bb, gg), _mm_unpacklo_epi8(rr, _mm_setone_si128())); _mm_storeu_si128((__m128i *)pp, bgrx); } #ifdef __SSSE3__ else if (std::is_same<T, CBGRColorOrder>::value) { __m128i bgrx = _mm_unpacklo_epi16(_mm_unpacklo_epi8(bb, gg), _mm_unpacklo_epi8(rr, rr)); __m128i bgr = _mm_shuffle_epi8(bgrx, _mm_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0)); _mm_storeu_si128((__m128i *)pp, bgr); } #endif else if 
(std::is_same<T, CARGBColorOrder>::value) { __m128i xrgb = _mm_unpacklo_epi16(_mm_unpacklo_epi8(rr, rr), _mm_unpacklo_epi8(gg, bb)); _mm_storeu_si128((__m128i *)pp, xrgb); } #ifdef __SSSE3__ else if (std::is_same<T, CRGBColorOrder>::value) { __m128i xrgb = _mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_setone_si128(), rr), _mm_unpacklo_epi8(gg, bb)); __m128i rgb = _mm_shuffle_epi8(xrgb, _mm_set_epi8(-1, -1, -1, -1, 15, 14, 13, 11, 10, 9, 7, 6, 5, 3, 2, 1)); _mm_storeu_si128((__m128i *)pp, rgb); } #endif y += 4; u += 4; v += 4; } for (; pp < p + cbWidth; pp += T::BYPP) { __m128i xy = _mm_cvtsi32_si128(*y | 0x00ff0000); __m128i uu = _mm_cvtsi32_si128(*u); __m128i vv = _mm_cvtsi32_si128(*v); __m128i vu = _mm_unpacklo_epi8(_mm_unpacklo_epi8(uu, vv), _mm_setzero_si128()); // 00 V3 00 U3 00 V2 00 U2 00 V1 00 U1 00 V0 00 U0 vu = _mm_sub_epi16(vu, _mm_set1_epi16(128)); __m128i rgbtmp = _mm_madd_epi16(xy, xy2rgb); auto xyuv2rgb = [rgbtmp, vu, shift](__m128i vu2rgb) -> __m128i { __m128i rgb = _mm_add_epi32(rgbtmp, _mm_madd_epi16(vu, vu2rgb)); rgb = _mm_srai_epi32(rgb, shift); rgb = _mm_packs_epi32(rgb, rgb); rgb = _mm_packus_epi16(rgb, rgb); return rgb; }; __m128i rr = xyuv2rgb(vu2r); __m128i gg = xyuv2rgb(vu2g); __m128i bb = xyuv2rgb(vu2b); if (std::is_same<T, CBGRAColorOrder>::value) { __m128i bgrx = _mm_unpacklo_epi16(_mm_unpacklo_epi8(bb, gg), _mm_unpacklo_epi8(rr, _mm_setone_si128())); *(uint32_t *)pp = _mm_cvtsi128_si32(bgrx); } else if (std::is_same<T, CARGBColorOrder>::value) { __m128i xrgb = _mm_unpacklo_epi16(_mm_unpacklo_epi8(rr, rr), _mm_unpacklo_epi8(gg, bb)); *(uint32_t *)pp = _mm_cvtsi128_si32(xrgb); } else if (std::is_same<T, CBGRColorOrder>::value || std::is_same<T, CRGBColorOrder>::value) { *(pp + T::B) = (uint8_t)_mm_cvtsi128_si32(bb); *(pp + T::G) = (uint8_t)_mm_cvtsi128_si32(gg); *(pp + T::R) = (uint8_t)_mm_cvtsi128_si32(rr); } y += 1; u += 1; v += 1; } } }
void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, // as the first pass results are transposed, we tranpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). const int stride = pitch >> 1; int pass; // Constants // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); const __m128i kOne = _mm_set1_epi16(1); __m128i in0, in1, in2, in3; // Load inputs. { in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); // x = x << 4 in0 = _mm_slli_epi16(in0, 4); in1 = _mm_slli_epi16(in1, 4); in2 = _mm_slli_epi16(in2, 4); in3 = _mm_slli_epi16(in3, 4); // if (i == 0 && input[0]) input[0] += 1; { // The mask will only contain wether the first value is zero, all // other comparison will fail as something shifted by 4 (above << 4) // can never be equal to one. 
To increment in the non-zero case, we // add the mask and one for the first element: // - if zero, mask = -1, v = v - 1 + 1 = v // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); in0 = _mm_add_epi16(in0, mask); in0 = _mm_add_epi16(in0, k__nonzero_bias_b); } } // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { // Transform 1/2: Add/substract const __m128i r0 = _mm_add_epi16(in0, in3); const __m128i r1 = _mm_add_epi16(in1, in2); const __m128i r2 = _mm_sub_epi16(in1, in2); const __m128i r3 = _mm_sub_epi16(in0, in3); // Transform 1/2: Interleave to do the multiply by constants which gets us // into 32 bits. const __m128i t0 = _mm_unpacklo_epi16(r0, r1); const __m128i t2 = _mm_unpacklo_epi16(r2, r3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); // Combine and transpose const __m128i res0 = _mm_packs_epi32(w0, w2); const __m128i res1 = _mm_packs_epi32(w4, w6); // 00 01 02 03 20 21 22 23 // 10 11 12 13 30 31 32 33 const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); in2 = _mm_unpackhi_epi32(tr0_0, tr0_1); // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1 // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 
3 if (0 == pass) { // Extract values in the high part for second pass as transform code // only uses the first four values. in1 = _mm_unpackhi_epi64(in0, in0); in3 = _mm_unpackhi_epi64(in2, in2); } else { // Post-condition output and store it (v + 1) >> 2, taking advantage // of the fact 1/3 are stored just after 0/2. __m128i out01 = _mm_add_epi16(in0, kOne); __m128i out23 = _mm_add_epi16(in2, kOne); out01 = _mm_srai_epi16(out01, 2); out23 = _mm_srai_epi16(out23, 2); _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); } } }
void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, // as the first pass results are transposed, we tranpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). const int stride = pitch >> 1; int pass; // We need an intermediate buffer between passes. int16_t intermediate[256]; int16_t *in = input; int16_t *out = intermediate; // Constants // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); const __m128i 
k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i kOne = _mm_set1_epi16(1); // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { // We process eight columns (transposed rows in second pass) at a time. int column_start; for (column_start = 0; column_start < 16; column_start += 8) { __m128i in00, in01, in02, in03, in04, in05, in06, in07; __m128i in08, in09, in10, in11, in12, in13, in14, in15; __m128i input0, input1, input2, input3, input4, input5, input6, input7; __m128i step1_0, step1_1, step1_2, step1_3; __m128i step1_4, step1_5, step1_6, step1_7; __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; __m128i step3_0, step3_1, step3_2, step3_3; __m128i step3_4, step3_5, step3_6, step3_7; __m128i res00, res01, res02, res03, res04, res05, res06, res07; __m128i res08, res09, res10, res11, res12, res13, res14, res15; // Load and pre-condition input. 
if (0 == pass) { in00 = _mm_loadu_si128((const __m128i *)(in + 0 * stride)); in01 = _mm_loadu_si128((const __m128i *)(in + 1 * stride)); in02 = _mm_loadu_si128((const __m128i *)(in + 2 * stride)); in03 = _mm_loadu_si128((const __m128i *)(in + 3 * stride)); in04 = _mm_loadu_si128((const __m128i *)(in + 4 * stride)); in05 = _mm_loadu_si128((const __m128i *)(in + 5 * stride)); in06 = _mm_loadu_si128((const __m128i *)(in + 6 * stride)); in07 = _mm_loadu_si128((const __m128i *)(in + 7 * stride)); in08 = _mm_loadu_si128((const __m128i *)(in + 8 * stride)); in09 = _mm_loadu_si128((const __m128i *)(in + 9 * stride)); in10 = _mm_loadu_si128((const __m128i *)(in + 10 * stride)); in11 = _mm_loadu_si128((const __m128i *)(in + 11 * stride)); in12 = _mm_loadu_si128((const __m128i *)(in + 12 * stride)); in13 = _mm_loadu_si128((const __m128i *)(in + 13 * stride)); in14 = _mm_loadu_si128((const __m128i *)(in + 14 * stride)); in15 = _mm_loadu_si128((const __m128i *)(in + 15 * stride)); // x = x << 2 in00 = _mm_slli_epi16(in00, 2); in01 = _mm_slli_epi16(in01, 2); in02 = _mm_slli_epi16(in02, 2); in03 = _mm_slli_epi16(in03, 2); in04 = _mm_slli_epi16(in04, 2); in05 = _mm_slli_epi16(in05, 2); in06 = _mm_slli_epi16(in06, 2); in07 = _mm_slli_epi16(in07, 2); in08 = _mm_slli_epi16(in08, 2); in09 = _mm_slli_epi16(in09, 2); in10 = _mm_slli_epi16(in10, 2); in11 = _mm_slli_epi16(in11, 2); in12 = _mm_slli_epi16(in12, 2); in13 = _mm_slli_epi16(in13, 2); in14 = _mm_slli_epi16(in14, 2); in15 = _mm_slli_epi16(in15, 2); } else { in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 16)); in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 16)); in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 16)); in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 16)); in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 16)); in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 16)); in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 16)); in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 16)); in08 = _mm_loadu_si128((const 
__m128i *)(in + 8 * 16)); in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 16)); in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 16)); in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 16)); in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 16)); in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 16)); in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 16)); in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 16)); // x = (x + 1) >> 2 in00 = _mm_add_epi16(in00, kOne); in01 = _mm_add_epi16(in01, kOne); in02 = _mm_add_epi16(in02, kOne); in03 = _mm_add_epi16(in03, kOne); in04 = _mm_add_epi16(in04, kOne); in05 = _mm_add_epi16(in05, kOne); in06 = _mm_add_epi16(in06, kOne); in07 = _mm_add_epi16(in07, kOne); in08 = _mm_add_epi16(in08, kOne); in09 = _mm_add_epi16(in09, kOne); in10 = _mm_add_epi16(in10, kOne); in11 = _mm_add_epi16(in11, kOne); in12 = _mm_add_epi16(in12, kOne); in13 = _mm_add_epi16(in13, kOne); in14 = _mm_add_epi16(in14, kOne); in15 = _mm_add_epi16(in15, kOne); in00 = _mm_srai_epi16(in00, 2); in01 = _mm_srai_epi16(in01, 2); in02 = _mm_srai_epi16(in02, 2); in03 = _mm_srai_epi16(in03, 2); in04 = _mm_srai_epi16(in04, 2); in05 = _mm_srai_epi16(in05, 2); in06 = _mm_srai_epi16(in06, 2); in07 = _mm_srai_epi16(in07, 2); in08 = _mm_srai_epi16(in08, 2); in09 = _mm_srai_epi16(in09, 2); in10 = _mm_srai_epi16(in10, 2); in11 = _mm_srai_epi16(in11, 2); in12 = _mm_srai_epi16(in12, 2); in13 = _mm_srai_epi16(in13, 2); in14 = _mm_srai_epi16(in14, 2); in15 = _mm_srai_epi16(in15, 2); } in += 8; // Calculate input for the first 8 results. { input0 = _mm_add_epi16(in00, in15); input1 = _mm_add_epi16(in01, in14); input2 = _mm_add_epi16(in02, in13); input3 = _mm_add_epi16(in03, in12); input4 = _mm_add_epi16(in04, in11); input5 = _mm_add_epi16(in05, in10); input6 = _mm_add_epi16(in06, in09); input7 = _mm_add_epi16(in07, in08); } // Calculate input for the next 8 results. 
{ step1_0 = _mm_sub_epi16(in07, in08); step1_1 = _mm_sub_epi16(in06, in09); step1_2 = _mm_sub_epi16(in05, in10); step1_3 = _mm_sub_epi16(in04, in11); step1_4 = _mm_sub_epi16(in03, in12); step1_5 = _mm_sub_epi16(in02, in13); step1_6 = _mm_sub_epi16(in01, in14); step1_7 = _mm_sub_epi16(in00, in15); } // Work on the first eight values; fdct8_1d(input, even_results); { // Add/substract const __m128i q0 = _mm_add_epi16(input0, input7); const __m128i q1 = _mm_add_epi16(input1, input6); const __m128i q2 = _mm_add_epi16(input2, input5); const __m128i q3 = _mm_add_epi16(input3, input4); const __m128i q4 = _mm_sub_epi16(input3, input4); const __m128i q5 = _mm_sub_epi16(input2, input5); const __m128i q6 = _mm_sub_epi16(input1, input6); const __m128i q7 = _mm_sub_epi16(input0, input7); // Work on first four results { // Add/substract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); const __m128i r3 = _mm_sub_epi16(q0, q3); // Interleave to do the multiply by constants which gets us // into 32 bits. 
const __m128i t0 = _mm_unpacklo_epi16(r0, r1); const __m128i t1 = _mm_unpackhi_epi16(r0, r1); const __m128i t2 = _mm_unpacklo_epi16(r2, r3); const __m128i t3 = _mm_unpackhi_epi16(r2, r3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res00 = _mm_packs_epi32(w0, w1); res08 = _mm_packs_epi32(w2, w3); res04 = _mm_packs_epi32(w4, w5); res12 = _mm_packs_epi32(w6, w7); } // Work on next four results { // Interleave to do the multiply by constants which gets us // into 32 bits. 
const __m128i d0 = _mm_unpacklo_epi16(q6, q5); const __m128i d1 = _mm_unpackhi_epi16(q6, q5); const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); // dct_const_round_shift const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); // Add/substract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = _mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); const __m128i x3 = _mm_add_epi16(q7, r1); // Interleave to do the multiply by constants which gets us // into 32 bits. 
const __m128i t0 = _mm_unpacklo_epi16(x0, x3); const __m128i t1 = _mm_unpackhi_epi16(x0, x3); const __m128i t2 = _mm_unpacklo_epi16(x1, x2); const __m128i t3 = _mm_unpackhi_epi16(x1, x2); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res02 = _mm_packs_epi32(w0, w1); res14 = _mm_packs_epi32(w2, w3); res10 = _mm_packs_epi32(w4, w5); res06 = _mm_packs_epi32(w6, w7); } } // Work on the next eight values; step1 -> odd_results { // step 2 { const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); const __m128i 
t3 = _mm_unpackhi_epi16(step1_4, step1_3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_2 = _mm_packs_epi32(w0, w1); step2_3 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_5 = _mm_packs_epi32(w0, w1); step2_4 = _mm_packs_epi32(w2, w3); } // step 3 { step3_0 = _mm_add_epi16(step1_0, step2_3); step3_1 = _mm_add_epi16(step1_1, step2_2); step3_2 = 
_mm_sub_epi16(step1_1, step2_2); step3_3 = _mm_sub_epi16(step1_0, step2_3); step3_4 = _mm_sub_epi16(step1_7, step2_4); step3_5 = _mm_sub_epi16(step1_6, step2_5); step3_6 = _mm_add_epi16(step1_6, step2_5); step3_7 = _mm_add_epi16(step1_7, step2_4); } // step 4 { const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_1 = _mm_packs_epi32(w0, w1); step2_2 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, 
k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_6 = _mm_packs_epi32(w0, w1); step2_5 = _mm_packs_epi32(w2, w3); } // step 5 { step1_0 = _mm_add_epi16(step3_0, step2_1); step1_1 = _mm_sub_epi16(step3_0, step2_1); step1_2 = _mm_sub_epi16(step3_3, step2_2); step1_3 = _mm_add_epi16(step3_3, step2_2); step1_4 = _mm_add_epi16(step3_4, step2_5); step1_5 = _mm_sub_epi16(step3_4, step2_5); step1_6 = _mm_sub_epi16(step3_7, step2_6); step1_7 = _mm_add_epi16(step3_7, step2_6); } // step 6 { const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res01 = _mm_packs_epi32(w0, w1); res09 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); const __m128i u0 = _mm_madd_epi16(t0, 
k__cospi_p22_p10); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res05 = _mm_packs_epi32(w0, w1); res13 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res11 = _mm_packs_epi32(w0, w1); res03 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); const __m128i t3 = 
_mm_unpackhi_epi16(step1_1, step1_6); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res15 = _mm_packs_epi32(w0, w1); res07 = _mm_packs_epi32(w2, w3); } } // Transpose the results, do it as two 8x8 transposes. { // 00 01 02 03 04 05 06 07 // 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 // 30 31 32 33 34 35 36 37 // 40 41 42 43 44 45 46 47 // 50 51 52 53 54 55 56 57 // 60 61 62 63 64 65 66 67 // 70 71 72 73 74 75 76 77 const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01); const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03); const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01); const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03); const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05); const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07); const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05); const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 // 04 14 05 15 06 16 07 17 // 24 34 25 35 26 36 27 37 // 40 50 41 51 42 52 43 53 // 60 70 61 71 62 72 63 73 // 54 54 55 55 56 56 57 57 // 64 74 65 75 66 76 67 77 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 
const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); // 00 10 20 30 01 11 21 31 // 40 50 60 70 41 51 61 71 // 02 12 22 32 03 13 23 33 // 42 52 62 72 43 53 63 73 // 04 14 24 34 05 15 21 36 // 44 54 64 74 45 55 61 76 // 06 16 26 36 07 17 27 37 // 46 56 66 76 47 57 67 77 const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 // 07 17 27 37 47 57 67 77 _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0); _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1); _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2); _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3); _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4); _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5); _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6); _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7); } { // 00 01 02 03 04 05 06 07 // 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 // 30 31 32 33 34 35 36 37 // 40 41 42 43 44 45 46 47 // 50 51 52 53 54 55 56 57 // 60 61 62 63 64 65 66 67 // 70 71 72 73 74 75 76 77 const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09); const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11); const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09); const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11); const __m128i tr0_4 = 
_mm_unpacklo_epi16(res12, res13); const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15); const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13); const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 // 04 14 05 15 06 16 07 17 // 24 34 25 35 26 36 27 37 // 40 50 41 51 42 52 43 53 // 60 70 61 71 62 72 63 73 // 54 54 55 55 56 56 57 57 // 64 74 65 75 66 76 67 77 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); // 00 10 20 30 01 11 21 31 // 40 50 60 70 41 51 61 71 // 02 12 22 32 03 13 23 33 // 42 52 62 72 43 53 63 73 // 04 14 24 34 05 15 21 36 // 44 54 64 74 45 55 61 76 // 06 16 26 36 07 17 27 37 // 46 56 66 76 47 57 67 77 const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 // 07 17 27 37 47 57 67 77 // Store results _mm_storeu_si128((__m128i *)(out + 8 + 0 * 16), tr2_0); _mm_storeu_si128((__m128i *)(out + 8 + 1 * 16), tr2_1); _mm_storeu_si128((__m128i *)(out + 8 + 2 * 16), tr2_2); _mm_storeu_si128((__m128i *)(out + 8 + 3 * 16), tr2_3); 
_mm_storeu_si128((__m128i *)(out + 8 + 4 * 16), tr2_4); _mm_storeu_si128((__m128i *)(out + 8 + 5 * 16), tr2_5); _mm_storeu_si128((__m128i *)(out + 8 + 6 * 16), tr2_6); _mm_storeu_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); } out += 8*16; } // Setup in/out for next pass. in = intermediate; out = output; } }
void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { const int stride = pitch >> 1; int pass; // Constants // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); // Load input __m128i in0 = _mm_loadu_si128((const __m128i *)(input + 0 * stride)); __m128i in1 = _mm_loadu_si128((const __m128i *)(input + 1 * stride)); __m128i in2 = _mm_loadu_si128((const __m128i *)(input + 2 * stride)); __m128i in3 = _mm_loadu_si128((const __m128i *)(input + 3 * stride)); __m128i in4 = _mm_loadu_si128((const __m128i *)(input + 4 * stride)); __m128i in5 = _mm_loadu_si128((const __m128i *)(input + 5 * stride)); __m128i in6 = _mm_loadu_si128((const __m128i *)(input + 6 * stride)); __m128i in7 = _mm_loadu_si128((const __m128i *)(input + 7 * stride)); // Pre-condition input (shift by two) in0 = _mm_slli_epi16(in0, 2); in1 = _mm_slli_epi16(in1, 2); in2 = _mm_slli_epi16(in2, 2); in3 = _mm_slli_epi16(in3, 2); in4 = _mm_slli_epi16(in4, 2); in5 = _mm_slli_epi16(in5, 2); in6 = _mm_slli_epi16(in6, 2); in7 = _mm_slli_epi16(in7, 2); // We do two passes, first the columns, then the rows. The results of the // first pass are transposed so that the same column code can be reused. 
The // results of the second pass are also transposed so that the rows (processed // as columns) are put back in row positions. for (pass = 0; pass < 2; pass++) { // To store results of each pass before the transpose. __m128i res0, res1, res2, res3, res4, res5, res6, res7; // Add/substract const __m128i q0 = _mm_add_epi16(in0, in7); const __m128i q1 = _mm_add_epi16(in1, in6); const __m128i q2 = _mm_add_epi16(in2, in5); const __m128i q3 = _mm_add_epi16(in3, in4); const __m128i q4 = _mm_sub_epi16(in3, in4); const __m128i q5 = _mm_sub_epi16(in2, in5); const __m128i q6 = _mm_sub_epi16(in1, in6); const __m128i q7 = _mm_sub_epi16(in0, in7); // Work on first four results { // Add/substract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); const __m128i r3 = _mm_sub_epi16(q0, q3); // Interleave to do the multiply by constants which gets us into 32bits const __m128i t0 = _mm_unpacklo_epi16(r0, r1); const __m128i t1 = _mm_unpackhi_epi16(r0, r1); const __m128i t2 = _mm_unpacklo_epi16(r2, r3); const __m128i t3 = _mm_unpackhi_epi16(r2, r3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = 
_mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res0 = _mm_packs_epi32(w0, w1); res4 = _mm_packs_epi32(w2, w3); res2 = _mm_packs_epi32(w4, w5); res6 = _mm_packs_epi32(w6, w7); } // Work on next four results { // Interleave to do the multiply by constants which gets us into 32bits const __m128i d0 = _mm_unpacklo_epi16(q6, q5); const __m128i d1 = _mm_unpackhi_epi16(q6, q5); const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); // dct_const_round_shift const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); // Add/substract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = _mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); const __m128i x3 = _mm_add_epi16(q7, r1); // Interleave to do the multiply by constants which gets us into 32bits const __m128i t0 = _mm_unpacklo_epi16(x0, 
x3); const __m128i t1 = _mm_unpackhi_epi16(x0, x3); const __m128i t2 = _mm_unpacklo_epi16(x1, x2); const __m128i t3 = _mm_unpackhi_epi16(x1, x2); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res1 = _mm_packs_epi32(w0, w1); res7 = _mm_packs_epi32(w2, w3); res5 = _mm_packs_epi32(w4, w5); res3 = _mm_packs_epi32(w6, w7); } // Transpose the 8x8. 
{ // 00 01 02 03 04 05 06 07 // 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 // 30 31 32 33 34 35 36 37 // 40 41 42 43 44 45 46 47 // 50 51 52 53 54 55 56 57 // 60 61 62 63 64 65 66 67 // 70 71 72 73 74 75 76 77 const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 // 04 14 05 15 06 16 07 17 // 24 34 25 35 26 36 27 37 // 40 50 41 51 42 52 43 53 // 60 70 61 71 62 72 63 73 // 54 54 55 55 56 56 57 57 // 64 74 65 75 66 76 67 77 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); // 00 10 20 30 01 11 21 31 // 40 50 60 70 41 51 61 71 // 02 12 22 32 03 13 23 33 // 42 52 62 72 43 53 63 73 // 04 14 24 34 05 15 21 36 // 44 54 64 74 45 55 61 76 // 06 16 26 36 07 17 27 37 // 46 56 66 76 47 57 67 77 in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 
45 55 65 75 // 06 16 26 36 46 56 66 76 // 07 17 27 37 47 57 67 77 } } // Post-condition output and store it { // Post-condition (division by two) // division of two 16 bits signed numbers using shifts // n / 2 = (n - (n >> 15)) >> 1 const __m128i sign_in0 = _mm_srai_epi16(in0, 15); const __m128i sign_in1 = _mm_srai_epi16(in1, 15); const __m128i sign_in2 = _mm_srai_epi16(in2, 15); const __m128i sign_in3 = _mm_srai_epi16(in3, 15); const __m128i sign_in4 = _mm_srai_epi16(in4, 15); const __m128i sign_in5 = _mm_srai_epi16(in5, 15); const __m128i sign_in6 = _mm_srai_epi16(in6, 15); const __m128i sign_in7 = _mm_srai_epi16(in7, 15); in0 = _mm_sub_epi16(in0, sign_in0); in1 = _mm_sub_epi16(in1, sign_in1); in2 = _mm_sub_epi16(in2, sign_in2); in3 = _mm_sub_epi16(in3, sign_in3); in4 = _mm_sub_epi16(in4, sign_in4); in5 = _mm_sub_epi16(in5, sign_in5); in6 = _mm_sub_epi16(in6, sign_in6); in7 = _mm_sub_epi16(in7, sign_in7); in0 = _mm_srai_epi16(in0, 1); in1 = _mm_srai_epi16(in1, 1); in2 = _mm_srai_epi16(in2, 1); in3 = _mm_srai_epi16(in3, 1); in4 = _mm_srai_epi16(in4, 1); in5 = _mm_srai_epi16(in5, 1); in6 = _mm_srai_epi16(in6, 1); in7 = _mm_srai_epi16(in7, 1); // store results _mm_storeu_si128((__m128i *)(output + 0 * 8), in0); _mm_storeu_si128((__m128i *)(output + 1 * 8), in1); _mm_storeu_si128((__m128i *)(output + 2 * 8), in2); _mm_storeu_si128((__m128i *)(output + 3 * 8), in3); _mm_storeu_si128((__m128i *)(output + 4 * 8), in4); _mm_storeu_si128((__m128i *)(output + 5 * 8), in5); _mm_storeu_si128((__m128i *)(output + 6 * 8), in6); _mm_storeu_si128((__m128i *)(output + 7 * 8), in7); } }
/* ******************************************************************************** * * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients * of a 16x16 intra prediction macroblock, and then performs scaling. * prediction buffer * * @par Description: * The DC coefficients pass through a 2-stage inverse hadamard transform. * This inverse transformed content is scaled to based on Qp value. * * @param[in] pi2_src * input 4x4 block of DC coefficients * * @param[out] pi2_out * output 4x4 block * * @param[in] pu2_iscal_mat * pointer to scaling list * * @param[in] pu2_weigh_mat * pointer to weight matrix * * @param[in] u4_qp_div_6 * Floor (qp/6) * * @param[in] pi4_tmp * temporary buffer of size 1*16 * * @returns none * * @remarks none * ******************************************************************************* */ void ih264_ihadamard_scaling_4x4_ssse3(WORD16* pi2_src, WORD16* pi2_out, const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6, WORD32* pi4_tmp) { int val = 0xFFFF; __m128i src_r0_r1, src_r2_r3, sign_reg, zero_8x16b = _mm_setzero_si128(); __m128i src_r0, src_r1, src_r2, src_r3; __m128i temp0, temp1, temp2, temp3; __m128i add_rshift = _mm_set1_epi32((1 << (5 - u4_qp_div_6))); __m128i mult_val = _mm_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]); __m128i mask = _mm_set1_epi32(val); UNUSED (pi4_tmp); mult_val = _mm_and_si128(mult_val, mask); src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1); src_r0 = _mm_unpacklo_epi16(src_r0_r1, sign_reg); src_r1 = _mm_unpackhi_epi16(src_r0_r1, sign_reg); sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r2_r3); src_r2 = _mm_unpacklo_epi16(src_r2_r3, sign_reg); src_r3 = _mm_unpackhi_epi16(src_r2_r3, sign_reg); /* Perform Inverse 
transform */ /*-------------------------------------------------------------*/ /* IDCT [ Horizontal transformation ] */ /*-------------------------------------------------------------*/ // Matrix transpose /* * a0 a1 a2 a3 * b0 b1 b2 b3 * c0 c1 c2 c3 * d0 d1 d2 d3 */ temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 b0 a1 b1 temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //c0 d0 c1 d1 temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //a2 b2 a3 b3 temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 d2 c3 d3 src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 b0 c0 d0 src_r1 = _mm_unpackhi_epi64(temp0, temp2); //a1 b1 c1 d1 src_r2 = _mm_unpacklo_epi64(temp1, temp3); //a2 b2 c2 d2 src_r3 = _mm_unpackhi_epi64(temp1, temp3); //a3 b3 c3 d3 temp0 = _mm_add_epi32(src_r0, src_r3); temp1 = _mm_add_epi32(src_r1, src_r2); temp2 = _mm_sub_epi32(src_r1, src_r2); temp3 = _mm_sub_epi32(src_r0, src_r3); src_r0 = _mm_add_epi32(temp0, temp1); src_r1 = _mm_add_epi32(temp2, temp3); src_r2 = _mm_sub_epi32(temp0, temp1); src_r3 = _mm_sub_epi32(temp3, temp2); /*-------------------------------------------------------------*/ /* IDCT [ Vertical transformation ] */ /*-------------------------------------------------------------*/ // Matrix transpose /* * a0 b0 c0 d0 * a1 b1 c1 d1 * a2 b2 c2 d2 * a3 b3 c3 d3 */ temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 a1 b0 b1 temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //a2 a3 b2 b3 temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //c0 c1 d0 d1 temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 c3 d2 d3 src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 a1 a2 a3 src_r1 = _mm_unpackhi_epi64(temp0, temp2); //b0 b1 b2 b3 src_r2 = _mm_unpacklo_epi64(temp1, temp3); //c0 c1 c2 c3 src_r3 = _mm_unpackhi_epi64(temp1, temp3); //d0 d1 d2 d3 temp0 = _mm_add_epi32(src_r0, src_r3); temp1 = _mm_add_epi32(src_r1, src_r2); temp2 = _mm_sub_epi32(src_r1, src_r2); temp3 = _mm_sub_epi32(src_r0, src_r3); src_r0 = _mm_add_epi32(temp0, temp1); src_r1 = _mm_add_epi32(temp2, temp3); src_r2 = 
_mm_sub_epi32(temp0, temp1); src_r3 = _mm_sub_epi32(temp3, temp2); src_r0 = _mm_and_si128(src_r0, mask); src_r1 = _mm_and_si128(src_r1, mask); src_r2 = _mm_and_si128(src_r2, mask); src_r3 = _mm_and_si128(src_r3, mask); src_r0 = _mm_madd_epi16(src_r0, mult_val); src_r1 = _mm_madd_epi16(src_r1, mult_val); src_r2 = _mm_madd_epi16(src_r2, mult_val); src_r3 = _mm_madd_epi16(src_r3, mult_val); //Scaling if(u4_qp_div_6 >= 6) { src_r0 = _mm_slli_epi32(src_r0, u4_qp_div_6 - 6); src_r1 = _mm_slli_epi32(src_r1, u4_qp_div_6 - 6); src_r2 = _mm_slli_epi32(src_r2, u4_qp_div_6 - 6); src_r3 = _mm_slli_epi32(src_r3, u4_qp_div_6 - 6); } else { temp0 = _mm_add_epi32(src_r0, add_rshift); temp1 = _mm_add_epi32(src_r1, add_rshift); temp2 = _mm_add_epi32(src_r2, add_rshift); temp3 = _mm_add_epi32(src_r3, add_rshift); src_r0 = _mm_srai_epi32(temp0, 6 - u4_qp_div_6); src_r1 = _mm_srai_epi32(temp1, 6 - u4_qp_div_6); src_r2 = _mm_srai_epi32(temp2, 6 - u4_qp_div_6); src_r3 = _mm_srai_epi32(temp3, 6 - u4_qp_div_6); } src_r0_r1 = _mm_packs_epi32(src_r0, src_r1); src_r2_r3 = _mm_packs_epi32(src_r2, src_r3); _mm_storeu_si128((__m128i *) (&pi2_out[0]), src_r0_r1); _mm_storeu_si128((__m128i *) (&pi2_out[8]), src_r2_r3); }
/* motion templates */ CV_IMPL void cvUpdateMotionHistory( const void* silhouette, void* mhimg, double timestamp, double mhi_duration ) { CvMat silhstub, *silh = cvGetMat(silhouette, &silhstub); CvMat mhistub, *mhi = cvGetMat(mhimg, &mhistub); if( !CV_IS_MASK_ARR( silh )) CV_Error( CV_StsBadMask, "" ); if( CV_MAT_TYPE( mhi->type ) != CV_32FC1 ) CV_Error( CV_StsUnsupportedFormat, "" ); if( !CV_ARE_SIZES_EQ( mhi, silh )) CV_Error( CV_StsUnmatchedSizes, "" ); CvSize size = cvGetMatSize( mhi ); if( CV_IS_MAT_CONT( mhi->type & silh->type )) { size.width *= size.height; size.height = 1; } float ts = (float)timestamp; float delbound = (float)(timestamp - mhi_duration); int x, y; #if CV_SSE2 volatile bool useSIMD = cv::checkHardwareSupport(CV_CPU_SSE2); #endif for( y = 0; y < size.height; y++ ) { const uchar* silhData = silh->data.ptr + silh->step*y; float* mhiData = (float*)(mhi->data.ptr + mhi->step*y); x = 0; #if CV_SSE2 if( useSIMD ) { __m128 ts4 = _mm_set1_ps(ts), db4 = _mm_set1_ps(delbound); for( ; x <= size.width - 8; x += 8 ) { __m128i z = _mm_setzero_si128(); __m128i s = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(silhData + x)), z); __m128 s0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(s, z)), s1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(s, z)); __m128 v0 = _mm_loadu_ps(mhiData + x), v1 = _mm_loadu_ps(mhiData + x + 4); __m128 fz = _mm_setzero_ps(); v0 = _mm_and_ps(v0, _mm_cmpge_ps(v0, db4)); v1 = _mm_and_ps(v1, _mm_cmpge_ps(v1, db4)); __m128 m0 = _mm_and_ps(_mm_xor_ps(v0, ts4), _mm_cmpneq_ps(s0, fz)); __m128 m1 = _mm_and_ps(_mm_xor_ps(v1, ts4), _mm_cmpneq_ps(s1, fz)); v0 = _mm_xor_ps(v0, m0); v1 = _mm_xor_ps(v1, m1); _mm_storeu_ps(mhiData + x, v0); _mm_storeu_ps(mhiData + x + 4, v1); } } #endif for( ; x < size.width; x++ ) { float val = mhiData[x]; val = silhData[x] ? ts : val < delbound ? 0 : val; mhiData[x] = val; } } }
/// CURRENTLY SAME CODE AS SCALAR !! /// REPLACE HERE WITH SSE intrinsics static void partialButterflyInverse16_simd(short *src, short *dst, int shift) { int add = 1<<(shift-1); //we cast the original 16X16 matrix to an SIMD vector type __m128i *g_aiT16_vec = (__m128i *)g_aiT16; //We cast the input source (which is basically random numbers(see the main function for details)) to an SIMD vector type //We also cast the output to an SIMD vector type __m128i *in_vec = (__m128i *) src; __m128i *out_vec = (__m128i *) dst; //we declare an 8X8 array and cast it to an SIMD vector type short gt[8][8] __attribute__ ((aligned (16))); __m128i *gt_vec = (__m128i *)gt; //we declare an 16X16 array and cast it to an SIMD vector type short random[16][16] __attribute__ ((aligned (16))); __m128i *random_vec = (__m128i *)random; trans_g_aiT16(g_aiT16_vec,gt_vec); tranpose8x8(in_vec,2, random_vec,0); tranpose8x8(in_vec,3, random_vec,8); tranpose8x8(in_vec,0, random_vec,16); tranpose8x8(in_vec,1, random_vec,24); for (int j=0; j<16; j++) { /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ __m128i I0 = _mm_load_si128 (&random_vec[j]); __m128i II0 = _mm_load_si128 (&random_vec[j+16]); // for (int k=0; k<8; k++) //here we are loading up the transposed values in the initial matrix //multiplying it with the input numbers to produce intermediate 32-bit integers // we then sum up adjacent pairs of 32-bit integers and store them in the destination register __m128i I1 = _mm_load_si128 (>_vec[0]); __m128i I2 = _mm_madd_epi16 (I1, I0); __m128i I3 = _mm_load_si128 (>_vec[1]); __m128i I4 = _mm_madd_epi16 (I3, I0); __m128i I5 = _mm_load_si128 (>_vec[2]); __m128i I6 = _mm_madd_epi16 (I5, I0); __m128i I7 = _mm_load_si128 (>_vec[3]); __m128i I8 = _mm_madd_epi16 (I7, I0); __m128i I9 = _mm_load_si128 (>_vec[4]); __m128i I10 = _mm_madd_epi16 (I9, I0); __m128i I11 = _mm_load_si128 (>_vec[5]); __m128i I12 = _mm_madd_epi16 (I11, I0); __m128i I13 = _mm_load_si128 
(>_vec[6]); __m128i I14 = _mm_madd_epi16 (I13, I0); __m128i I15 = _mm_load_si128 (>_vec[7]); __m128i I16 = _mm_madd_epi16 (I15, I0); //horizontally add the partial results obtained from thee previous step __m128i A1 =_mm_hadd_epi32 (I2, I4); __m128i A2 =_mm_hadd_epi32 (I6, I8); __m128i R1 =_mm_hadd_epi32 (A1, A2); __m128i A3 =_mm_hadd_epi32 (I10, I12); __m128i A4 =_mm_hadd_epi32 (I14, I16); __m128i R2 =_mm_hadd_epi32 (A3, A4); // O[k] = T[0]+T[1]+T[2]+T[3]; // for (int k=0; k<4; k++) // { //load the original matrix values, multiply it with the random values //store the low bits to I2 and the hi bits to I3 I1 = _mm_load_si128 (>_vec[8]); I2 = _mm_mullo_epi16 (I1, II0); I3 = _mm_mulhi_epi16 (I1, II0); __m128i lowI23 = _mm_unpacklo_epi16(I2,I3); __m128i hiI23 = _mm_unpackhi_epi16(I2,I3); __m128i temp1 = _mm_add_epi32(lowI23,hiI23); __m128i temp5 = _mm_hsub_epi32 (lowI23, hiI23); I4 = _mm_load_si128 (>_vec[9]); I5 = _mm_mullo_epi16 (I4, II0); I6 = _mm_mulhi_epi16 (I4, II0); __m128i lowI56 = _mm_unpacklo_epi16(I5,I6); __m128i hiI56 = _mm_unpackhi_epi16(I5,I6); __m128i temp2 = _mm_add_epi32(lowI56,hiI56); __m128i temp6 = _mm_hsub_epi32 (lowI56, hiI56); I7 = _mm_load_si128 (>_vec[10]); I8 = _mm_mullo_epi16 (I7, II0); I9 = _mm_mulhi_epi16 (I7, II0); __m128i lowI89 = _mm_unpacklo_epi16(I8,I9); __m128i hiI89 = _mm_unpackhi_epi16(I8,I9); __m128i temp3 = _mm_add_epi32(lowI89,hiI89); __m128i temp7 = _mm_hsub_epi32 (lowI89, hiI89); I10 = _mm_load_si128 (>_vec[11]); I11 = _mm_mullo_epi16 (I10, II0); I12 = _mm_mulhi_epi16 (I10, II0); __m128i lowI1112 = _mm_unpacklo_epi16(I11,I12); __m128i hiI1112 = _mm_unpackhi_epi16(I11,I12); __m128i temp4 = _mm_add_epi32(lowI1112,hiI1112); __m128i temp8 = _mm_hsub_epi32 (lowI1112, hiI1112); __m128i A5 =_mm_hadd_epi32 (temp1, temp2); __m128i A6 =_mm_hadd_epi32 (temp3, temp4); __m128i R3 =_mm_hadd_epi32 (A5, A6); __m128i A7 =_mm_hadd_epi32 (temp8, temp7); __m128i A8 =_mm_hadd_epi32 (temp6, temp5); __m128i R4 =_mm_hadd_epi32 (A7, A8); 
/////////////////////////// __m128i add_reg = _mm_set1_epi32(add); __m128i sum_vec0 = _mm_add_epi32(R3,R1); sum_vec0 = _mm_add_epi32(sum_vec0,add_reg); sum_vec0 = _mm_srai_epi32(sum_vec0, shift); // shift right __m128i sum_vec1 = _mm_add_epi32(R4,R2); sum_vec1 = _mm_add_epi32(sum_vec1,add_reg); sum_vec1 = _mm_srai_epi32(sum_vec1, shift); // shift right __m128i finalres0 = _mm_packs_epi32(sum_vec0, sum_vec1); // shrink packed 32bit to packed 16 bit and saturate _mm_store_si128 (&out_vec[2*j], finalres0); __m128i sum_vec2 = _mm_sub_epi32(R4, R2); sum_vec2 = _mm_add_epi32(sum_vec2,add_reg); sum_vec2 = _mm_srai_epi32(sum_vec2, shift); // shift right __m128i sum_vec3 = _mm_sub_epi32(R3, R1); sum_vec3 = _mm_add_epi32(sum_vec3,add_reg); sum_vec3 = _mm_srai_epi32(sum_vec3, shift); // shift right I5 = _mm_unpackhi_epi32(sum_vec2, sum_vec3); I6 = _mm_unpacklo_epi32(sum_vec2, sum_vec3); I7 = _mm_unpackhi_epi32(I5, I6); I8 = _mm_unpacklo_epi32(I5, I6); I9 = _mm_unpacklo_epi32(I7, I8); I10 = _mm_unpackhi_epi32(I7, I8); sum_vec3 = _mm_packs_epi32(I9, I10); // shrink packed 32bit to packed 16 bit and saturate _mm_store_si128 (&out_vec[2*j+1], sum_vec3); } }
// Compares a saved 16-bit CLUT with the live CLUT buffer.
//
// The live buffer is addressed as 32-bit entries. The first ("right") part of
// the comparison checks the even-indexed u16 of each entry, starting at entry
// block `csa`; the remaining ("left") part checks the odd-indexed u16,
// starting from block 0 when csa < 16 and continuing from block `csa`
// otherwise. Each step consumes 16 saved entries and 32 u16 of the buffer and
// counts as 32 units of `clutsize`, which must be a multiple of 32.
//
// Returns true as soon as a difference is found, false when everything
// compared is equal. The SSE2 path uses aligned loads on both buffers.
/*__forceinline*/ bool Cmp_ClutBuffer_SavedClut<u16>(u16* saved_clut, u32 csa, u32 clutsize)
{
    assert((clutsize&31) == 0);

#ifdef ZEROGS_SSE2
    const __m128i zero_128 = _mm_setzero_si128();
#endif

    u16* clut = (u16*)GetClutBufferAddress<u32>(csa); // Keep aligned version for sse2

    // Split the work between the two halves of the 32-bit entries.
    u32 right_remaining = 0;
    u32 left_remaining = clutsize;
    if (csa < 16) {
        right_remaining = min(clutsize, (16-csa)*32);
        left_remaining = clutsize - right_remaining;
    }

    // Right part: the saved value sits in the lower 16 bits of each dword.
    for (; right_remaining > 0; right_remaining -= 32, saved_clut += 16, clut += 32) {
#ifdef ZEROGS_SSE2
        const __m128i* live = (const __m128i*)clut;
        const __m128i* saved = (const __m128i*)saved_clut;

        const __m128i saved_lo = _mm_load_si128(saved);
        const __m128i saved_hi = _mm_load_si128(saved + 1);

        // Widen each saved u16 to a dword (value in the lower half) and
        // compare lane by lane with the four live vectors.
        const __m128i eq0 = _mm_cmpeq_epi16(_mm_unpacklo_epi16(saved_lo, zero_128), _mm_load_si128(live));
        const __m128i eq1 = _mm_cmpeq_epi16(_mm_unpackhi_epi16(saved_lo, zero_128), _mm_load_si128(live + 1));
        const __m128i eq2 = _mm_cmpeq_epi16(_mm_unpacklo_epi16(saved_hi, zero_128), _mm_load_si128(live + 2));
        const __m128i eq3 = _mm_cmpeq_epi16(_mm_unpackhi_epi16(saved_hi, zero_128), _mm_load_si128(live + 3));

        const __m128i all_eq = _mm_and_si128(_mm_and_si128(_mm_and_si128(eq0, eq1), eq2), eq3);
        const u32 eq_bits = _mm_movemask_epi8(all_eq);

        // Only the two bytes of the lower u16 in each dword matter.
        if ((eq_bits & 0x3333) != 0x3333)
            return true;
#else
        for (int i = 0; i < 16; ++i) {
            if (saved_clut[i] != clut[2*i])
                return true;
        }
#endif
    }

    if (csa < 16) {
        // The left column starts again from the base of the buffer.
        clut = (u16*)GetClutBufferAddress<u32>(0); // Keep aligned version for sse2
    }

    // Left part: the saved value sits in the higher 16 bits of each dword.
    for (; left_remaining > 0; left_remaining -= 32, saved_clut += 16, clut += 32) {
#ifdef ZEROGS_SSE2
        const __m128i* live = (const __m128i*)clut;
        const __m128i* saved = (const __m128i*)saved_clut;

        const __m128i saved_lo = _mm_load_si128(saved);
        const __m128i saved_hi = _mm_load_si128(saved + 1);

        // Widen each saved u16 to a dword with zero in the lower half.
        const __m128i eq0 = _mm_cmpeq_epi16(_mm_unpacklo_epi16(zero_128, saved_lo), _mm_load_si128(live));
        const __m128i eq1 = _mm_cmpeq_epi16(_mm_unpackhi_epi16(zero_128, saved_lo), _mm_load_si128(live + 1));
        const __m128i eq2 = _mm_cmpeq_epi16(_mm_unpacklo_epi16(zero_128, saved_hi), _mm_load_si128(live + 2));
        const __m128i eq3 = _mm_cmpeq_epi16(_mm_unpackhi_epi16(zero_128, saved_hi), _mm_load_si128(live + 3));

        const __m128i all_eq = _mm_and_si128(_mm_and_si128(_mm_and_si128(eq0, eq1), eq2), eq3);
        const u32 eq_bits = _mm_movemask_epi8(all_eq);

        // Only the two bytes of the higher u16 in each dword matter.
        if ((eq_bits & 0xCCCC) != 0xCCCC)
            return true;
#else
        // Note +1 because we compare the higher 16 bits
        for (int i = 0; i < 16; ++i) {
            if (saved_clut[i] != clut[2*i+1])
                return true;
        }
#endif
    }

    return false;
}
/*
 * 5x5 convolution of a plane of 16-bit samples (SSE2), 8 pixels per step.
 *
 *   ch      : convolution settings; m[0..24] (row-major 5x5 kernel), rdiv,
 *             bias and saturate are read.
 *   buff    : scratch memory for five line buffers; the first one starts
 *             8 samples into buff and the others follow bstride apart.
 *   bstride : distance between line buffers; halved here to count samples.
 *   stride  : distance between plane rows; halved here to count samples.
 *   d, s    : destination and source planes, accessed as uint16_t.
 *
 * Every result is sum * rdiv + bias, made absolute when ch->saturate is 0,
 * converted to integer, limited to 0xFFFF and forced to 0 when not positive.
 *
 * NOTE(review): line_copy16() is defined elsewhere; presumably it copies one
 * source row into a line buffer with 2 samples of edge padding -- confirm.
 */
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width,
                int height, int stride, uint8_t *d, const uint8_t *s)
{
    const uint16_t *src = (uint16_t *)s;
    uint16_t *dst = (uint16_t *)d;
    stride /= 2;
    bstride /= 2;

    /* Ring of five line buffers; line[2] is the row being filtered. */
    uint16_t *line[5];
    line[0] = (uint16_t *)buff + 8;
    for (int r = 1; r < 5; r++) {
        line[r] = line[r - 1] + bstride;
    }
    uint16_t *const ring_first = line[0];
    uint16_t *const ring_last = line[4];

    /* The two rows above the first one are taken from rows 2 and 1. */
    line_copy16(line[0], src + 2 * stride, width, 2);
    line_copy16(line[1], src + stride, width, 2);
    line_copy16(line[2], src, width, 2);
    src += stride;
    line_copy16(line[3], src, width, 2);

    const __m128i zero = _mm_setzero_si128();
    const __m128 rdiv = _mm_set1_ps((float)ch->rdiv);
    const __m128 bias = _mm_set1_ps((float)ch->bias);
    const __m128i max = _mm_set1_epi32(0xFFFF);
    __m128 matrix[25];
    for (int i = 0; i < 25; i++) {
        matrix[i] = _mm_set1_ps((float)ch->m[i]);
    }

    for (int y = 0; y < height; y++) {
        /* Step forward, except for the last two rows where we step back. */
        if (y < height - 2) {
            src += stride;
        } else {
            src -= stride;
        }
        line_copy16(line[4], src, width, 2);

        /* One pointer per kernel tap: 5 rows x horizontal offsets -2..+2. */
        uint16_t *tap[25];
        for (int r = 0; r < 5; r++) {
            for (int c = 0; c < 5; c++) {
                tap[5 * r + c] = line[r] + (c - 2);
            }
        }

        for (int x = 0; x < width; x += 8) {
            /* acc[0]: pixels x..x+3, acc[1]: pixels x+4..x+7 */
            __m128 acc[2] = { _mm_setzero_ps(), _mm_setzero_ps() };

            for (int i = 0; i < 25; i++) {
                const __m128i px = _mm_loadu_si128((__m128i *)(tap[i] + x));
                const __m128 lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px, zero));
                const __m128 hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px, zero));
                acc[0] = _mm_add_ps(acc[0], _mm_mul_ps(lo, matrix[i]));
                acc[1] = _mm_add_ps(acc[1], _mm_mul_ps(hi, matrix[i]));
            }

            __m128i out32[2];
            for (int half = 0; half < 2; half++) {
                __m128 value = _mm_mul_ps(acc[half], rdiv);
                value = _mm_add_ps(value, bias);
                if (!ch->saturate) {
                    value = mm_abs_ps(value);
                }
                __m128i rounded = _mm_cvtps_epi32(value);
                rounded = mm_min_epi32(rounded, max);
                /* keep strictly positive values, zero everything else */
                const __m128i positive = _mm_cmpgt_epi32(rounded, zero);
                out32[half] = _mm_and_si128(rounded, positive);
            }

            const __m128i packed = mm_cast_epi32(out32[0], out32[1]);
            _mm_store_si128((__m128i *)(dst + x), packed);
        }

        dst += stride;

        /* Rotate the ring; the oldest buffer receives the next row. */
        line[0] = line[1];
        line[1] = line[2];
        line[2] = line[3];
        line[3] = line[4];
        line[4] = (line[4] == ring_last) ? ring_first : line[4] + bstride;
    }
}
static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, int16_t* out) { const __m128i zero = _mm_setzero_si128(); const __m128i seven = _mm_set1_epi16(7); const __m128i k7500 = _mm_set1_epi32(7500); const __m128i k14500 = _mm_set1_epi32(14500); const __m128i k51000 = _mm_set1_epi32(51000); const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217); const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352); __m128i v01, v32; // Difference between src and ref and initial transpose. { // Load src and convert to 16b. const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]); const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]); const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]); const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]); const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); // Load ref and convert to 16b. const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]); const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]); const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]); const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]); const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); // Compute difference. const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); // Transpose. 
// 00 01 02 03 0 0 0 0 // 10 11 12 13 0 0 0 0 // 20 21 22 23 0 0 0 0 // 30 31 32 33 0 0 0 0 const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1); const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // a02 a12 a22 a32 a03 a13 a23 a33 // a00 a10 a20 a30 a01 a11 a21 a31 // a03 a13 a23 a33 a02 a12 a22 a32 } // First pass and subsequent transpose. { // Same operations are done on the (0,3) and (1,2) pairs. // b0 = (a0 + a3) << 3 // b1 = (a1 + a2) << 3 // b3 = (a0 - a3) << 3 // b2 = (a1 - a2) << 3 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i b01 = _mm_slli_epi16(a01, 3); const __m128i b32 = _mm_slli_epi16(a32, 3); const __m128i b11 = _mm_unpackhi_epi64(b01, b01); const __m128i b22 = _mm_unpackhi_epi64(b32, b32); // e0 = b0 + b1 // e2 = b0 - b1 const __m128i e0 = _mm_add_epi16(b01, b11); const __m128i e2 = _mm_sub_epi16(b01, b11); const __m128i e02 = _mm_unpacklo_epi64(e0, e2); // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12 // e3 = (b3 * 2217 - b2 * 5352 + 7500) >> 12 const __m128i b23 = _mm_unpacklo_epi16(b22, b32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k14500); const __m128i d3 = _mm_add_epi32(c3, k7500); const __m128i e1 = _mm_srai_epi32(d1, 12); const __m128i e3 = _mm_srai_epi32(d3, 12); const __m128i e13 = _mm_packs_epi32(e1, e3); // Transpose. 
// 00 01 02 03 20 21 22 23 // 10 11 12 13 30 31 32 33 const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13); const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 02 12 22 32 03 13 23 33 // 00 10 20 30 01 11 21 31 // 03 13 23 33 02 12 22 32 } // Second pass { // Same operations are done on the (0,3) and (1,2) pairs. // a0 = v0 + v3 // a1 = v1 + v2 // a3 = v0 - v3 // a2 = v1 - v2 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i a11 = _mm_unpackhi_epi64(a01, a01); const __m128i a22 = _mm_unpackhi_epi64(a32, a32); // d0 = (a0 + a1 + 7) >> 4; // d2 = (a0 - a1 + 7) >> 4; const __m128i b0 = _mm_add_epi16(a01, a11); const __m128i b2 = _mm_sub_epi16(a01, a11); const __m128i c0 = _mm_add_epi16(b0, seven); const __m128i c2 = _mm_add_epi16(b2, seven); const __m128i d0 = _mm_srai_epi16(c0, 4); const __m128i d2 = _mm_srai_epi16(c2, 4); // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) const __m128i b23 = _mm_unpacklo_epi16(a22, a32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); const __m128i d3 = _mm_add_epi32(c3, k51000); const __m128i e1 = _mm_srai_epi32(d1, 16); const __m128i e3 = _mm_srai_epi32(d3, 16); const __m128i f1 = _mm_packs_epi32(e1, e1); const __m128i f3 = _mm_packs_epi32(e3, e3); // f1 = f1 + (a3 != 0); // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the // desired (0, 1), we add one earlier through k12000_plus_one. 
const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); _mm_storel_epi64((__m128i*)&out[ 0], d0); _mm_storel_epi64((__m128i*)&out[ 4], g1); _mm_storel_epi64((__m128i*)&out[ 8], d2); _mm_storel_epi64((__m128i*)&out[12], f3); } }
// Vertical 4-tap filter for a column that is 4 pixels wide (SSSE3).
//
// Only taps filter[2..5] are applied. The taps are halved and packed to
// signed bytes before use, and results are rounded with +32 and shifted right
// by 6. Two output rows are produced per iteration; when output_height is odd
// the last row is not written.
//
//   src_ptr       : source pixels; the first rows read are
//                   src_ptr + 2 * src_pitch .. src_ptr + 4 * src_pitch.
//                   Eight bytes are loaded per row, four are used.
//   src_pitch     : distance in bytes between source rows.
//   output_ptr    : destination; 4 bytes are written per output row.
//   out_pitch     : distance in bytes between destination rows.
//   output_height : number of rows to produce.
//   filter        : eight 16-bit taps.
//
// NOTE(review): the 4-byte results are stored through a uint32_t pointer, so
// output_ptr is presumably expected to be suitably aligned -- confirm.
static void aom_filter_block1d4_v4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  const __m128i round32 = _mm_set1_epi16(32);
  const __m128i no_bits = _mm_setzero_si128();

  // Halve the 16-bit taps, saturate them to bytes, then replicate bytes 2..5
  // (taps 2..5) across the whole register.
  __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  taps = _mm_srai_epi16(taps, 1);
  taps = _mm_packs_epi16(taps, taps);
  const __m128i taps_2345 = _mm_shuffle_epi8(taps, _mm_set1_epi32(0x5040302u));

  // Two rows are consumed and produced per iteration.
  const ptrdiff_t src_step = src_pitch << 1;
  const ptrdiff_t dst_step = out_pitch << 1;

  const __m128i row2 =
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i row3 =
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  __m128i row4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));

  // Low 8 bytes: rows 2/3 interleaved per pixel (first output row);
  // high 8 bytes: rows 3/4 interleaved per pixel (second output row).
  __m128i pairs_23_34 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(row2, row3),
                                          _mm_unpacklo_epi32(row3, row4));

  unsigned int rows_left;
  for (rows_left = output_height; rows_left > 1; rows_left -= 2) {
    const __m128i row5 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i row6 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));

    // Same layout as pairs_23_34, for rows 4/5 and 5/6.
    const __m128i pairs_45_56 = _mm_unpacklo_epi8(
        _mm_unpacklo_epi32(row4, row5), _mm_unpacklo_epi32(row5, row6));

    // Four consecutive source rows per pixel: rows 2..5 for the first output
    // row, rows 3..6 for the second one.
    const __m128i quad_first = _mm_unpacklo_epi16(pairs_23_34, pairs_45_56);
    const __m128i quad_second = _mm_unpackhi_epi16(pairs_23_34, pairs_45_56);

    // Multiply by the taps (pairwise sums), then add the two partial sums of
    // each pixel.
    __m128i res_first = _mm_maddubs_epi16(quad_first, taps_2345);
    __m128i res_second = _mm_maddubs_epi16(quad_second, taps_2345);
    res_first = _mm_hadds_epi16(res_first, no_bits);
    res_second = _mm_hadds_epi16(res_second, no_bits);

    // Round and shift each 16-bit value by 6 bits.
    res_first = _mm_srai_epi16(_mm_adds_epi16(res_first, round32), 6);
    res_second = _mm_srai_epi16(_mm_adds_epi16(res_second, round32), 6);

    // Saturate to unsigned bytes; the four results sit in the low 32 bits.
    res_first = _mm_packus_epi16(res_first, res_first);
    res_second = _mm_packus_epi16(res_second, res_second);

    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(res_first);
    *((uint32_t *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(res_second);

    src_ptr += src_step;
    output_ptr += dst_step;

    // Rows 4/5/6 become rows 2/3/4 of the next iteration.
    pairs_23_34 = pairs_45_56;
    row4 = row6;
  }
}
// Hadamard transform // Returns the difference between the weighted sum of the absolute value of // transformed coefficients. static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m128i three = _mm_set1_epi16(3); // Load, combine and tranpose inputs. { const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]); const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]); const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]); const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]); const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]); const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]); const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]); const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); // a00 b00 a01 b01 a02 b03 a03 b03 0 0 0 0 0 0 0 0 // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0 // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0 // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0 // Transpose the two 4x4, discarding the filling zeroes. 
const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23 // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33 const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33 // Convert to 16b. tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero); tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero); tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero); tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Horizontal pass and subsequent transpose. { // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2); const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2); const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2); const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2); // b0_extra = (a0 != 0); const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one); const __m128i b0_base = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); const __m128i b0 = _mm_add_epi16(b0_base, b0_extra); // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. 
const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3); const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1); const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Vertical pass and difference of weighted sums. { // Load all inputs. // TODO(cduvivier): Make variable declarations and allocations aligned so // we can use _mm_load_si128 instead of _mm_loadu_si128. const __m128i w_0 = _mm_loadu_si128((__m128i*)&w[0]); const __m128i w_8 = _mm_loadu_si128((__m128i*)&w[8]); // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // Separate the transforms of inA and inB. 
__m128i A_b0 = _mm_unpacklo_epi64(b0, b1); __m128i A_b2 = _mm_unpacklo_epi64(b2, b3); __m128i B_b0 = _mm_unpackhi_epi64(b0, b1); __m128i B_b2 = _mm_unpackhi_epi64(b2, b3); { // sign(b) = b >> 15 (0x0000 if positive, 0xffff if negative) const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15); const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15); const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15); const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15); // b = abs(b) = (b ^ sign) - sign A_b0 = _mm_xor_si128(A_b0, sign_A_b0); A_b2 = _mm_xor_si128(A_b2, sign_A_b2); B_b0 = _mm_xor_si128(B_b0, sign_B_b0); B_b2 = _mm_xor_si128(B_b2, sign_B_b2); A_b0 = _mm_sub_epi16(A_b0, sign_A_b0); A_b2 = _mm_sub_epi16(A_b2, sign_A_b2); B_b0 = _mm_sub_epi16(B_b0, sign_B_b0); B_b2 = _mm_sub_epi16(B_b2, sign_B_b2); } // b = abs(b) + 3 A_b0 = _mm_add_epi16(A_b0, three); A_b2 = _mm_add_epi16(A_b2, three); B_b0 = _mm_add_epi16(B_b0, three); B_b2 = _mm_add_epi16(B_b2, three); // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3 // b = (abs(b) + 3) >> 3 A_b0 = _mm_srai_epi16(A_b0, 3); A_b2 = _mm_srai_epi16(A_b2, 3); B_b0 = _mm_srai_epi16(B_b0, 3); B_b2 = _mm_srai_epi16(B_b2, 3); // weighted sums A_b0 = _mm_madd_epi16(A_b0, w_0); A_b2 = _mm_madd_epi16(A_b2, w_8); B_b0 = _mm_madd_epi16(B_b0, w_0); B_b2 = _mm_madd_epi16(B_b2, w_8); A_b0 = _mm_add_epi32(A_b0, A_b2); B_b0 = _mm_add_epi32(B_b0, B_b2); // difference of weighted sums A_b0 = _mm_sub_epi32(A_b0, B_b0); _mm_storeu_si128((__m128i*)&sum[0], A_b0); } return sum[0] + sum[1] + sum[2] + sum[3]; }
// 16-bit wrapper around MulDiv32(): the eight unsigned 16-bit lanes of
// 'dividend' and 'divisor' are zero-extended to two groups of four 32-bit
// lanes, each group is passed to MulDiv32() together with KF_255_DIV_6, and
// the two 32-bit results are packed back to 16 bits with signed saturation.
// NOTE(review): MulDiv32() is defined elsewhere; judging by the names it
// scales dividend / divisor by the given factor -- confirm there.
SIMD_INLINE __m128i MulDiv16(__m128i dividend, __m128i divisor, const __m128 & KF_255_DIV_6)
{
    // Lanes 0..3
    const __m128i dividendLo = _mm_unpacklo_epi16(dividend, K_ZERO);
    const __m128i divisorLo = _mm_unpacklo_epi16(divisor, K_ZERO);
    const __m128i resultLo = MulDiv32(dividendLo, divisorLo, KF_255_DIV_6);

    // Lanes 4..7
    const __m128i dividendHi = _mm_unpackhi_epi16(dividend, K_ZERO);
    const __m128i divisorHi = _mm_unpackhi_epi16(divisor, K_ZERO);
    const __m128i resultHi = MulDiv32(dividendHi, divisorHi, KF_255_DIV_6);

    return _mm_packs_epi32(resultLo, resultHi);
}
// Simple quantization static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16], int n, const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(2047); const __m128i zero = _mm_set1_epi16(0); __m128i sign0, sign8; __m128i coeff0, coeff8; __m128i out0, out8; __m128i packed_out; // Load all inputs. // TODO(cduvivier): Make variable declarations and allocations aligned so that // we can use _mm_load_si128 instead of _mm_loadu_si128. __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]); const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]); const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]); const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]); const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]); const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]); const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]); const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]); const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]); const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]); // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative) sign0 = _mm_srai_epi16(in0, 15); sign8 = _mm_srai_epi16(in8, 15); // coeff = abs(in) = (in ^ sign) - sign coeff0 = _mm_xor_si128(in0, sign0); coeff8 = _mm_xor_si128(in8, sign8); coeff0 = _mm_sub_epi16(coeff0, sign0); coeff8 = _mm_sub_epi16(coeff8, sign8); // coeff = abs(in) + sharpen coeff0 = _mm_add_epi16(coeff0, sharpen0); coeff8 = _mm_add_epi16(coeff8, sharpen8); // if (coeff > 2047) coeff = 2047 coeff0 = _mm_min_epi16(coeff0, max_coeff_2047); coeff8 = _mm_min_epi16(coeff8, max_coeff_2047); // out = (coeff * iQ + B) >> QFIX; { // doing calculations with 32b precision (QFIX=17) // out = (coeff * iQ) __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); __m128i 
coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H); __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H); // expand bias from 16b to 32b __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero); __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero); __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero); __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero); // out = (coeff * iQ + B) out_00 = _mm_add_epi32(out_00, bias_00); out_04 = _mm_add_epi32(out_04, bias_04); out_08 = _mm_add_epi32(out_08, bias_08); out_12 = _mm_add_epi32(out_12, bias_12); // out = (coeff * iQ + B) >> QFIX; out_00 = _mm_srai_epi32(out_00, QFIX); out_04 = _mm_srai_epi32(out_04, QFIX); out_08 = _mm_srai_epi32(out_08, QFIX); out_12 = _mm_srai_epi32(out_12, QFIX); // pack result as 16b out0 = _mm_packs_epi32(out_00, out_04); out8 = _mm_packs_epi32(out_08, out_12); } // get sign back (if (sign[j]) out_n = -out_n) out0 = _mm_xor_si128(out0, sign0); out8 = _mm_xor_si128(out8, sign8); out0 = _mm_sub_epi16(out0, sign0); out8 = _mm_sub_epi16(out8, sign8); // in = out * Q in0 = _mm_mullo_epi16(out0, q0); in8 = _mm_mullo_epi16(out8, q8); // if (coeff <= mtx->zthresh_) {in=0; out=0;} { __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0); __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8); in0 = _mm_and_si128(in0, cmp0); in8 = _mm_and_si128(in8, cmp8); _mm_storeu_si128((__m128i*)&in[0], in0); _mm_storeu_si128((__m128i*)&in[8], in8); out0 = _mm_and_si128(out0, cmp0); out8 = _mm_and_si128(out8, cmp8); } // zigzag the output before storing it. // // The zigzag pattern can almost be reproduced with a small sequence of // shuffles. After it, we only need to swap the 7th (ending up in third // position instead of twelfth) and 8th values. 
{ __m128i outZ0, outZ8; outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0)); outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0)); outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2)); outZ8 = _mm_shufflelo_epi16(out8, _MM_SHUFFLE(3, 0, 2, 1)); outZ8 = _mm_shuffle_epi32 (outZ8, _MM_SHUFFLE(3, 1, 2, 0)); outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0)); _mm_storeu_si128((__m128i*)&out[0], outZ0); _mm_storeu_si128((__m128i*)&out[8], outZ8); packed_out = _mm_packs_epi16(outZ0, outZ8); } { const int16_t outZ_12 = out[12]; const int16_t outZ_3 = out[3]; out[3] = outZ_12; out[12] = outZ_3; } // detect if all 'out' values are zeroes or not { int32_t tmp[4]; _mm_storeu_si128((__m128i*)tmp, packed_out); if (n) { tmp[0] &= ~0xff; } return (tmp[3] || tmp[2] || tmp[1] || tmp[0]); } }
// Averages a prediction with an 8x-upsampled reference (high bit depth, SSE2).
//
// For every output sample:
//   comp_pred[k] = (pred[k] + ref[8 * k] + 1) >> 1
// where both additions saturate at 16 bits.
//
//   comp_pred  : output, written contiguously (width samples per row).
//   pred8      : prediction, converted with CONVERT_TO_SHORTPTR() and read
//                contiguously (width samples per row).
//   width      : samples per row; processed 8 at a time when width >= 8,
//                otherwise 4 at a time.
//   height     : number of rows.
//   ref8       : reference, converted with CONVERT_TO_SHORTPTR(); every 8th
//                sample is used.
//   ref_stride : reference row pitch before upsampling; the reference
//                advances by 8 * ref_stride samples per row.
//
// Each reference sample is fetched with a plain 16-bit read. A wider 32-bit
// read would alias the uint16_t buffer through an incompatible type and touch
// a neighbouring sample that never reaches the output.
void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
                                             const uint8_t *pred8, int width,
                                             int height, const uint8_t *ref8,
                                             const int ref_stride) {
  const __m128i one = _mm_set1_epi16(1);
  const int stride = ref_stride << 3;
  int i, j;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);

  if (width >= 8) {
    // read 8 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 8) {
        // One reference sample per register, in lane 0 (other lanes zero).
        const __m128i s0 = _mm_cvtsi32_si128(ref[0]);
        const __m128i s1 = _mm_cvtsi32_si128(ref[8]);
        const __m128i s2 = _mm_cvtsi32_si128(ref[16]);
        const __m128i s3 = _mm_cvtsi32_si128(ref[24]);
        const __m128i s4 = _mm_cvtsi32_si128(ref[32]);
        const __m128i s5 = _mm_cvtsi32_si128(ref[40]);
        const __m128i s6 = _mm_cvtsi32_si128(ref[48]);
        const __m128i s7 = _mm_cvtsi32_si128(ref[56]);
        __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
        __m128i t0, t1, t2, t3;

        // Gather the eight samples into one register, in order.
        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t2 = _mm_unpacklo_epi16(s4, s5);
        t3 = _mm_unpacklo_epi16(s6, s7);
        t0 = _mm_unpacklo_epi32(t0, t1);
        t2 = _mm_unpacklo_epi32(t2, t3);
        t0 = _mm_unpacklo_epi64(t0, t2);

        // Rounded average.
        p0 = _mm_adds_epu16(t0, p0);
        p0 = _mm_adds_epu16(p0, one);
        p0 = _mm_srli_epi16(p0, 1);

        _mm_storeu_si128((__m128i *)(comp_pred), p0);
        comp_pred += 8;
        pred += 8;
        ref += 8 * 8;
      }
      // Move to the start of the next reference row.
      ref += stride - (width << 3);
    }
  } else {
    // read 4 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 4) {
        const __m128i s0 = _mm_cvtsi32_si128(ref[0]);
        const __m128i s1 = _mm_cvtsi32_si128(ref[8]);
        const __m128i s2 = _mm_cvtsi32_si128(ref[16]);
        const __m128i s3 = _mm_cvtsi32_si128(ref[24]);
        __m128i p0 = _mm_loadl_epi64((const __m128i *)pred);
        __m128i t0, t1;

        // The four samples end up in the low 64 bits, which is all that is
        // stored below.
        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t0 = _mm_unpacklo_epi32(t0, t1);

        // Rounded average.
        p0 = _mm_adds_epu16(t0, p0);
        p0 = _mm_adds_epu16(p0, one);
        p0 = _mm_srli_epi16(p0, 1);

        _mm_storel_epi64((__m128i *)(comp_pred), p0);
        comp_pred += 4;
        pred += 4;
        ref += 4 * 8;
      }
      // Move to the start of the next reference row.
      ref += stride - (width << 3);
    }
  }
}
/*
 * Horizontal inverse DWT step (SSE2), 8 coefficients per iteration.
 *
 * For each of the subband_width rows:
 *   even outputs: dst[2n]     = l[n] - ((h[n-1] + h[n] + 1) >> 1)
 *   odd outputs:  dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1)
 * At the row edges the missing neighbour is replaced by the edge value
 * itself (h[0] on the left, dst[2n] of the last column on the right).
 *
 *   l, h          : input sub-bands, subband_width x subband_width values,
 *                   read with aligned loads. l is overwritten with the even
 *                   outputs.
 *   dst           : output, 2 * subband_width values per row, written with
 *                   aligned stores.
 *   subband_width : row length and row count; the loops step by 8.
 *
 * The edge vectors are built from the aligned vector already loaded, so
 * nothing is read before the first or after the last value of a row.
 * The definition carries an explicit void return type: nothing is returned.
 */
void rfx_dwt_2d_decode_block_horiz_sse2(INT16* l, INT16* h, INT16* dst, int subband_width)
{
	const __m128i one = _mm_set1_epi16(1);
	INT16* l_ptr = l;
	INT16* h_ptr = h;
	INT16* dst_ptr = dst;
	int y, n;

	for (y = 0; y < subband_width; y++)
	{
		/* Even coefficients, stored in place over l */
		for (n = 0; n < subband_width; n += 8)
		{
			/* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
			const __m128i l_n = _mm_load_si128((__m128i*) l_ptr);
			const __m128i h_n = _mm_load_si128((__m128i*) h_ptr);
			__m128i h_n_m;
			__m128i tmp_n;

			if (n == 0)
			{
				/* h[-1] does not exist: use { h0, h0, h1, ..., h6 } */
				h_n_m = _mm_slli_si128(h_n, 2);
				h_n_m = _mm_insert_epi16(h_n_m, _mm_extract_epi16(h_n, 0), 0);
			}
			else
			{
				h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1));
			}

			tmp_n = _mm_add_epi16(h_n, h_n_m);
			tmp_n = _mm_add_epi16(tmp_n, one);
			tmp_n = _mm_srai_epi16(tmp_n, 1);
			_mm_store_si128((__m128i*) l_ptr, _mm_sub_epi16(l_n, tmp_n));

			l_ptr += 8;
			h_ptr += 8;
		}

		/* Back to the start of the row for the second pass */
		l_ptr -= subband_width;
		h_ptr -= subband_width;

		/* Odd coefficients, interleaved with the even ones into dst */
		for (n = 0; n < subband_width; n += 8)
		{
			/* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
			const __m128i h_n = _mm_slli_epi16(_mm_load_si128((__m128i*) h_ptr), 1);
			const __m128i dst_n = _mm_load_si128((__m128i*) l_ptr);
			__m128i dst_n_p;
			__m128i tmp_n;

			if (n == subband_width - 8)
			{
				/* no value after the last one: use { d1, ..., d7, d7 } */
				dst_n_p = _mm_srli_si128(dst_n, 2);
				dst_n_p = _mm_insert_epi16(dst_n_p, _mm_extract_epi16(dst_n, 7), 7);
			}
			else
			{
				dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1));
			}

			tmp_n = _mm_add_epi16(dst_n_p, dst_n);
			tmp_n = _mm_srai_epi16(tmp_n, 1);
			tmp_n = _mm_add_epi16(tmp_n, h_n);

			/* even0 odd0 even1 odd1 ... */
			_mm_store_si128((__m128i*) dst_ptr, _mm_unpacklo_epi16(dst_n, tmp_n));
			_mm_store_si128((__m128i*) (dst_ptr + 8), _mm_unpackhi_epi16(dst_n, tmp_n));

			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 16;
		}
	}
}
static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16], const uint16_t* const sharpen, const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL); const __m128i zero = _mm_setzero_si128(); __m128i out0, out8; __m128i packed_out; // Load all inputs. __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]); const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]); const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]); const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]); // coeff = abs(in) __m128i coeff0 = _mm_abs_epi16(in0); __m128i coeff8 = _mm_abs_epi16(in8); // coeff = abs(in) + sharpen if (sharpen != NULL) { const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]); const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]); coeff0 = _mm_add_epi16(coeff0, sharpen0); coeff8 = _mm_add_epi16(coeff8, sharpen8); } // out = (coeff * iQ + B) >> QFIX { // doing calculations with 32b precision (QFIX=17) // out = (coeff * iQ) const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H); __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H); // out = (coeff * iQ + B) const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]); const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]); const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]); const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]); out_00 = _mm_add_epi32(out_00, bias_00); out_04 = 
_mm_add_epi32(out_04, bias_04); out_08 = _mm_add_epi32(out_08, bias_08); out_12 = _mm_add_epi32(out_12, bias_12); // out = QUANTDIV(coeff, iQ, B, QFIX) out_00 = _mm_srai_epi32(out_00, QFIX); out_04 = _mm_srai_epi32(out_04, QFIX); out_08 = _mm_srai_epi32(out_08, QFIX); out_12 = _mm_srai_epi32(out_12, QFIX); // pack result as 16b out0 = _mm_packs_epi32(out_00, out_04); out8 = _mm_packs_epi32(out_08, out_12); // if (coeff > 2047) coeff = 2047 out0 = _mm_min_epi16(out0, max_coeff_2047); out8 = _mm_min_epi16(out8, max_coeff_2047); } // put sign back out0 = _mm_sign_epi16(out0, in0); out8 = _mm_sign_epi16(out8, in8); // in = out * Q in0 = _mm_mullo_epi16(out0, q0); in8 = _mm_mullo_epi16(out8, q8); _mm_storeu_si128((__m128i*)&in[0], in0); _mm_storeu_si128((__m128i*)&in[8], in8); // zigzag the output before storing it. The re-ordering is: // 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15 // There's only two misplaced entries ([8] and [7]) that are crossing the // reg's boundaries. // We use pshufb instead of pshuflo/pshufhi. { const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6); const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1); const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo); const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7); // extract #7 const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7); const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1); const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi); const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8); // extract #8 const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8); const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7); _mm_storeu_si128((__m128i*)&out[0], out_z0); _mm_storeu_si128((__m128i*)&out[8], out_z8); packed_out = _mm_packs_epi16(out_z0, out_z8); } // detect if all 'out' values are zeroes or not return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff); }
/*
 * Transposes an 8x8 block of 16-bit elements.
 *
 *   input, i_indx      : row r (r = 0..7) of the block is the vector
 *                        input[i_indx + 4 * r], i.e. consecutive rows are
 *                        four vectors apart.
 *   Transposed, t_indx : column c (c = 0..7) of the block is written to
 *                        Transposed[t_indx + c], so that lane r of that
 *                        vector holds element (r, c).
 *
 * Aligned loads and stores are used, so both arrays must be 16-byte aligned.
 * All eight rows are read before anything is written.
 */
static void tranpose8x8(__m128i *input, int i_indx, __m128i *Transposed, int t_indx)
{
    __m128i *src = input + i_indx;
    __m128i *dst = Transposed + t_indx;

    const __m128i a = _mm_load_si128(&src[0]);
    const __m128i b = _mm_load_si128(&src[4]);
    const __m128i c = _mm_load_si128(&src[8]);
    const __m128i d = _mm_load_si128(&src[12]);
    const __m128i e = _mm_load_si128(&src[16]);
    const __m128i f = _mm_load_si128(&src[20]);
    const __m128i g = _mm_load_si128(&src[24]);
    const __m128i h = _mm_load_si128(&src[28]);

    /* Stage 1: interleave 16-bit elements of neighbouring rows. */
    const __m128i ab_lo = _mm_unpacklo_epi16(a, b);   /* a0 b0 a1 b1 a2 b2 a3 b3 */
    const __m128i cd_lo = _mm_unpacklo_epi16(c, d);
    const __m128i ef_lo = _mm_unpacklo_epi16(e, f);
    const __m128i gh_lo = _mm_unpacklo_epi16(g, h);
    const __m128i ab_hi = _mm_unpackhi_epi16(a, b);   /* a4 b4 a5 b5 a6 b6 a7 b7 */
    const __m128i cd_hi = _mm_unpackhi_epi16(c, d);
    const __m128i ef_hi = _mm_unpackhi_epi16(e, f);
    const __m128i gh_hi = _mm_unpackhi_epi16(g, h);

    /* Stage 2: interleave 32-bit pairs, giving four rows per column. */
    const __m128i abcd_01 = _mm_unpacklo_epi32(ab_lo, cd_lo);   /* a0 b0 c0 d0 a1 b1 c1 d1 */
    const __m128i abcd_23 = _mm_unpackhi_epi32(ab_lo, cd_lo);
    const __m128i efgh_01 = _mm_unpacklo_epi32(ef_lo, gh_lo);
    const __m128i efgh_23 = _mm_unpackhi_epi32(ef_lo, gh_lo);
    const __m128i abcd_45 = _mm_unpacklo_epi32(ab_hi, cd_hi);
    const __m128i abcd_67 = _mm_unpackhi_epi32(ab_hi, cd_hi);
    const __m128i efgh_45 = _mm_unpacklo_epi32(ef_hi, gh_hi);
    const __m128i efgh_67 = _mm_unpackhi_epi32(ef_hi, gh_hi);

    /* Stage 3: join the 64-bit halves into complete columns and store them. */
    _mm_store_si128(&dst[0], _mm_unpacklo_epi64(abcd_01, efgh_01));   /* a0 b0 c0 d0 e0 f0 g0 h0 */
    _mm_store_si128(&dst[1], _mm_unpackhi_epi64(abcd_01, efgh_01));
    _mm_store_si128(&dst[2], _mm_unpacklo_epi64(abcd_23, efgh_23));
    _mm_store_si128(&dst[3], _mm_unpackhi_epi64(abcd_23, efgh_23));
    _mm_store_si128(&dst[4], _mm_unpacklo_epi64(abcd_45, efgh_45));
    _mm_store_si128(&dst[5], _mm_unpackhi_epi64(abcd_45, efgh_45));
    _mm_store_si128(&dst[6], _mm_unpacklo_epi64(abcd_67, efgh_67));
    _mm_store_si128(&dst[7], _mm_unpackhi_epi64(abcd_67, efgh_67));
}
/* Inclusive prefix sum over the four 32-bit lanes of v, plus a per-lane carry. */
static inline __m128i integral_scan4_epi32(__m128i v, __m128i carry)
{
    v = _mm_add_epi32(v, _mm_slli_si128(v, 4));
    v = _mm_add_epi32(v, _mm_slli_si128(v, 8));
    return _mm_add_epi32(v, carry);
}

/* Lane-wise (a - b)^2 on 16-bit lanes, keeping the low 16 bits of each square. */
static inline __m128i integral_sqdiff_epi16(__m128i a, __m128i b)
{
    const __m128i d = _mm_sub_epi16(a, b);
    return _mm_mullo_epi16(d, d);
}

/* Builds the integral image of the squared differences between current_image
   and compare_image shifted by (dx,dy), for rows hStart <= y < hEnd.

   The input images must be large enough to have valid pixels for the offset
   (dx,dy). I.e., with (dx,dy)=(-10,8), x-values down to -10 and y-values up to
   (h-1)+8 will be accessed. Pixels are consumed 16 at a time, so each row is
   read and written up to the next multiple of 16 columns.

   The integral image will be accessed with (x,y) in [-1,w)x[-1,h): the row at
   y=-1 (w+1 entries starting at x=-1) is cleared and the entry at x=-1 of every
   processed row is set to 0. Rows are written with aligned 128-bit stores, so
   integral_image + y*integral_stride must be 16-byte aligned.

   Note also that 32 bits are used for the integral image even though the
   values may overflow that range. The modulo arithmetic used when computing
   the block sums later is still correct as long as the block size is not too
   large. */
static void buildIntegralImage_SSE(uint32_t* integral_image, int integral_stride,
                                   const uint8_t* current_image, int current_image_stride,
                                   const uint8_t* compare_image, int compare_image_stride,
                                   int w, int hStart, int hEnd,
                                   int dx, int dy)
{
    const __m128i zero = _mm_setzero_si128();
    const int step = 16;

    memset(integral_image - 1 - integral_stride, 0, (w + 1) * sizeof(uint32_t));

    for (int y = hStart; y < hEnd; y++) {
        const uint8_t* cur = current_image + y * current_image_stride;
        const uint8_t* cmp = compare_image + (y + dy) * compare_image_stride + dx;
        uint32_t* row = integral_image + y * integral_stride;
        __m128i carry = zero;   /* running row sum, broadcast to all lanes */

        row[-1] = 0;

        /* Horizontal pass: running sum of squared differences along the row. */
        for (int x = 0; x < w; x += step) {
            const __m128i a = _mm_loadu_si128((const __m128i*)(cur + x));
            const __m128i b = _mm_loadu_si128((const __m128i*)(cmp + x));

            /* Squared differences of pixels 0..7 and 8..15 as 16-bit lanes. */
            const __m128i sq_lo = integral_sqdiff_epi16(_mm_unpacklo_epi8(a, zero),
                                                        _mm_unpacklo_epi8(b, zero));
            const __m128i sq_hi = integral_sqdiff_epi16(_mm_unpackhi_epi8(a, zero),
                                                        _mm_unpackhi_epi8(b, zero));

            /* Widen to 32 bits and scan each group of four, chaining the
               last lane of one group into the next as the carry. */
            const __m128i s0 = integral_scan4_epi32(_mm_unpacklo_epi16(sq_lo, zero), carry);
            const __m128i s1 = integral_scan4_epi32(_mm_unpackhi_epi16(sq_lo, zero),
                                                    _mm_shuffle_epi32(s0, 0xff));
            const __m128i s2 = integral_scan4_epi32(_mm_unpacklo_epi16(sq_hi, zero),
                                                    _mm_shuffle_epi32(s1, 0xff));
            const __m128i s3 = integral_scan4_epi32(_mm_unpackhi_epi16(sq_hi, zero),
                                                    _mm_shuffle_epi32(s2, 0xff));
            carry = _mm_shuffle_epi32(s3, 0xff);

            _mm_store_si128((__m128i*)(row + x),      s0);
            _mm_store_si128((__m128i*)(row + x + 4),  s1);
            _mm_store_si128((__m128i*)(row + x + 8),  s2);
            _mm_store_si128((__m128i*)(row + x + 12), s3);
        }

        /* Vertical pass: accumulate the row above (skipped for y <= 0). */
        if (y > 0) {
            const uint32_t* above = row - integral_stride;

            for (int x = 0; x < w; x += step) {
                for (int k = 0; k < step; k += 4) {
                    const __m128i up   = _mm_load_si128((const __m128i*)(above + x + k));
                    const __m128i here = _mm_load_si128((const __m128i*)(row + x + k));
                    _mm_store_si128((__m128i*)(row + x + k), _mm_add_epi32(up, here));
                }
            }
        }
    }
}
/*
 * Rearranges selected vectors of 'input' (8 x 16-bit lanes each) into
 * 'Transposed':
 *
 *   Transposed[0..7]  : plain copies of input[2], input[6], ..., input[30].
 *   Transposed[8..11] : columns 0..3 of the 8x8 block whose row r is
 *                       input[4 * r] (r = 0..7); lane r of Transposed[8 + c]
 *                       holds lane c of input[4 * r].
 *
 * Aligned loads and stores are used, so both arrays must be 16-byte aligned.
 * The copy is completed (all loads, then all stores) before the second group
 * of inputs is read.
 */
static void trans_g_aiT16(__m128i *input, __m128i *Transposed)
{
    __m128i row[8];
    __m128i ab_lo, cd_lo, ef_lo, gh_lo;
    __m128i abcd_01, abcd_23, efgh_01, efgh_23;
    int r;

    /* Straight copy of every fourth vector, starting at input[2]. */
    for (r = 0; r < 8; r++) {
        row[r] = _mm_load_si128(&input[2 + 4 * r]);
    }
    for (r = 0; r < 8; r++) {
        _mm_store_si128(&Transposed[r], row[r]);
    }

    /* Rows of the block to transpose: input[0], input[4], ..., input[28]. */
    for (r = 0; r < 8; r++) {
        row[r] = _mm_load_si128(&input[4 * r]);
    }

    /* Only the low four lanes of each row are needed for columns 0..3. */
    ab_lo = _mm_unpacklo_epi16(row[0], row[1]);   /* a0 b0 a1 b1 a2 b2 a3 b3 */
    cd_lo = _mm_unpacklo_epi16(row[2], row[3]);
    ef_lo = _mm_unpacklo_epi16(row[4], row[5]);
    gh_lo = _mm_unpacklo_epi16(row[6], row[7]);

    abcd_01 = _mm_unpacklo_epi32(ab_lo, cd_lo);   /* a0 b0 c0 d0 a1 b1 c1 d1 */
    abcd_23 = _mm_unpackhi_epi32(ab_lo, cd_lo);
    efgh_01 = _mm_unpacklo_epi32(ef_lo, gh_lo);
    efgh_23 = _mm_unpackhi_epi32(ef_lo, gh_lo);

    _mm_store_si128(&Transposed[8],  _mm_unpacklo_epi64(abcd_01, efgh_01));
    _mm_store_si128(&Transposed[9],  _mm_unpackhi_epi64(abcd_01, efgh_01));
    _mm_store_si128(&Transposed[10], _mm_unpacklo_epi64(abcd_23, efgh_23));
    _mm_store_si128(&Transposed[11], _mm_unpackhi_epi64(abcd_23, efgh_23));
}
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { const __m128i zero = _mm_setzero_si128(); const __m128i seven = _mm_set1_epi16(7); const __m128i k937 = _mm_set1_epi32(937); const __m128i k1812 = _mm_set1_epi32(1812); const __m128i k51000 = _mm_set1_epi32(51000); const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217); const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352); const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8); const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8); const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352, 2217, 5352, 2217, 5352); const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217, -5352, 2217, -5352, 2217); __m128i v01, v32; // Difference between src and ref and initial transpose. { // Load src and convert to 16b. const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]); const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]); const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]); const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); // Load ref and convert to 16b. const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); // Compute difference. 
-> 00 01 02 03 00 00 00 00 const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); // Unpack and shuffle // 00 01 02 03 0 0 0 0 // 10 11 12 13 0 0 0 0 // 20 21 22 23 0 0 0 0 // 30 31 32 33 0 0 0 0 const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1); const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3); // 00 01 10 11 02 03 12 13 // 20 21 30 31 22 23 32 33 const __m128i shuf01_p = _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1)); const __m128i shuf23_p = _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1)); // 00 01 10 11 03 02 13 12 // 20 21 30 31 23 22 33 32 const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p); const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p); // 00 01 10 11 20 21 30 31 // 03 02 13 12 23 22 33 32 const __m128i a01 = _mm_add_epi16(s01, s32); const __m128i a32 = _mm_sub_epi16(s01, s32); // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ] // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ] const __m128i tmp0 = _mm_madd_epi16(a01, k88p); // [ (a0 + a1) << 3, ... ] const __m128i tmp2 = _mm_madd_epi16(a01, k88m); // [ (a0 - a1) << 3, ... ] const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p); const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m); const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812); const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937); const __m128i tmp1 = _mm_srai_epi32(tmp1_2, 9); const __m128i tmp3 = _mm_srai_epi32(tmp3_2, 9); const __m128i s03 = _mm_packs_epi32(tmp0, tmp2); const __m128i s12 = _mm_packs_epi32(tmp1, tmp3); const __m128i s_lo = _mm_unpacklo_epi16(s03, s12); // 0 1 0 1 0 1... const __m128i s_hi = _mm_unpackhi_epi16(s03, s12); // 2 3 2 3 2 3 const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi); v01 = _mm_unpacklo_epi32(s_lo, s_hi); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2.. 
} // Second pass { // Same operations are done on the (0,3) and (1,2) pairs. // a0 = v0 + v3 // a1 = v1 + v2 // a3 = v0 - v3 // a2 = v1 - v2 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i a11 = _mm_unpackhi_epi64(a01, a01); const __m128i a22 = _mm_unpackhi_epi64(a32, a32); const __m128i a01_plus_7 = _mm_add_epi16(a01, seven); // d0 = (a0 + a1 + 7) >> 4; // d2 = (a0 - a1 + 7) >> 4; const __m128i c0 = _mm_add_epi16(a01_plus_7, a11); const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11); const __m128i d0 = _mm_srai_epi16(c0, 4); const __m128i d2 = _mm_srai_epi16(c2, 4); // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) const __m128i b23 = _mm_unpacklo_epi16(a22, a32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); const __m128i d3 = _mm_add_epi32(c3, k51000); const __m128i e1 = _mm_srai_epi32(d1, 16); const __m128i e3 = _mm_srai_epi32(d3, 16); const __m128i f1 = _mm_packs_epi32(e1, e1); const __m128i f3 = _mm_packs_epi32(e3, e3); // f1 = f1 + (a3 != 0); // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the // desired (0, 1), we add one earlier through k12000_plus_one. // -> f1 = f1 + 1 - (a3 == 0) const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1); const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3); _mm_storeu_si128((__m128i*)&out[0], d0_g1); _mm_storeu_si128((__m128i*)&out[8], d2_f3); } }
/*
 * Vertical 6-tap interpolation over six consecutive vectors temp[0..5]
 * (8 signed 16-bit lanes each, 16-byte aligned), averaged with the rounded
 * value of row 3.
 *
 * For every lane, with r0..r5 the six row values:
 *
 *   f   = clamp((r0 - 5*r1 + 20*r2 + 20*r3 - 5*r4 + r5 + 512) >> 10, 0, 255)
 *   p3  = clamp((r3 + 16) >> 5, 0, 255)
 *   out = (f + p3 + 1) >> 1
 *
 * The pair sums r0+r5, r1+r4 and r2+r3 and the sum r3+16 are formed with
 * wrapping 16-bit additions; the weighted sum is evaluated in 32 bits.
 *
 * Returns the eight resulting bytes (lane 0 in the lowest byte) as an __m64.
 */
__m64 interpolvline128_3(__m128i* temp)
{
    const __m128i zero = _mm_setzero_si128();
    /* Each 32-bit lane holds the 16-bit pair (-5, 20): low half -5, high half 20. */
    const __m128i taps_m5_20 = _mm_set1_epi32(0x0014FFFB);
    const __m128i round_10 = _mm_set1_epi32(0x00000200);
    const __m128i round_5 = _mm_set1_epi16(0x0010);

    const __m128i row0 = _mm_load_si128(&temp[0]);
    const __m128i row1 = _mm_load_si128(&temp[1]);
    const __m128i row2 = _mm_load_si128(&temp[2]);
    const __m128i row3 = _mm_load_si128(&temp[3]);
    const __m128i row4 = _mm_load_si128(&temp[4]);
    const __m128i row5 = _mm_load_si128(&temp[5]);

    /* The filter is symmetric, so rows are paired before weighting. */
    const __m128i outer  = _mm_add_epi16(row0, row5);   /* weight   1 */
    const __m128i inner  = _mm_add_epi16(row1, row4);   /* weight  -5 */
    const __m128i centre = _mm_add_epi16(row2, row3);   /* weight  20 */

    /* 0x0000 / 0xFFFF per lane: the upper half of 'outer' sign-extended to 32 bits. */
    const __m128i outer_sign = _mm_srai_epi16(outer, 15);

    /* -5 * inner + 20 * centre, evaluated in 32 bits (lanes 0..3 and 4..7). */
    __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(inner, centre), taps_m5_20);
    __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(inner, centre), taps_m5_20);
    __m128i filtered;
    __m128i below;
    __m64 ret;

    sum_lo = _mm_add_epi32(sum_lo, _mm_unpacklo_epi16(outer, outer_sign));
    sum_hi = _mm_add_epi32(sum_hi, _mm_unpackhi_epi16(outer, outer_sign));

    /* Round and scale. The magnitude of the sum stays below 2^20, so after
       the arithmetic shift the value fits in 16 bits and the saturating
       packs below clamp it to [0, 255]. */
    sum_lo = _mm_srai_epi32(_mm_add_epi32(sum_lo, round_10), 10);
    sum_hi = _mm_srai_epi32(_mm_add_epi32(sum_hi, round_10), 10);
    filtered = _mm_packus_epi16(_mm_packs_epi32(sum_lo, sum_hi), zero);

    /* Rounded row 3; negative values are zeroed before the logical shift. */
    below = _mm_add_epi16(row3, round_5);
    below = _mm_max_epi16(below, zero);
    below = _mm_srli_epi16(below, 5);
    below = _mm_packus_epi16(below, zero);

    /* Rounded-up average of the filtered sample and row 3. */
    ret = _mm_movepi64_pi64(_mm_avg_epu8(filtered, below));

    /* Clear the MMX state before returning, as the original code does. */
    _mm_empty();
    return ret;
}