void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, // as the first pass results are transposed, we tranpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). const int stride = pitch >> 1; int pass; // Constants // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); const __m128i kOne = _mm_set1_epi16(1); __m128i in0, in1, in2, in3; // Load inputs. { in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); // x = x << 4 in0 = _mm_slli_epi16(in0, 4); in1 = _mm_slli_epi16(in1, 4); in2 = _mm_slli_epi16(in2, 4); in3 = _mm_slli_epi16(in3, 4); // if (i == 0 && input[0]) input[0] += 1; { // The mask will only contain wether the first value is zero, all // other comparison will fail as something shifted by 4 (above << 4) // can never be equal to one. 
To increment in the non-zero case, we // add the mask and one for the first element: // - if zero, mask = -1, v = v - 1 + 1 = v // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); in0 = _mm_add_epi16(in0, mask); in0 = _mm_add_epi16(in0, k__nonzero_bias_b); } } // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { // Transform 1/2: Add/substract const __m128i r0 = _mm_add_epi16(in0, in3); const __m128i r1 = _mm_add_epi16(in1, in2); const __m128i r2 = _mm_sub_epi16(in1, in2); const __m128i r3 = _mm_sub_epi16(in0, in3); // Transform 1/2: Interleave to do the multiply by constants which gets us // into 32 bits. const __m128i t0 = _mm_unpacklo_epi16(r0, r1); const __m128i t2 = _mm_unpacklo_epi16(r2, r3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); // Combine and transpose const __m128i res0 = _mm_packs_epi32(w0, w2); const __m128i res1 = _mm_packs_epi32(w4, w6); // 00 01 02 03 20 21 22 23 // 10 11 12 13 30 31 32 33 const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); in2 = _mm_unpackhi_epi32(tr0_0, tr0_1); // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1 // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 
3 if (0 == pass) { // Extract values in the high part for second pass as transform code // only uses the first four values. in1 = _mm_unpackhi_epi64(in0, in0); in3 = _mm_unpackhi_epi64(in2, in2); } else { // Post-condition output and store it (v + 1) >> 2, taking advantage // of the fact 1/3 are stored just after 0/2. __m128i out01 = _mm_add_epi16(in0, kOne); __m128i out23 = _mm_add_epi16(in2, kOne); out01 = _mm_srai_epi16(out01, 2); out23 = _mm_srai_epi16(out23, 2); _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); } } }

// Quantizes and dequantizes `n_coeffs` coefficients, 16 per iteration, and
// writes the end-of-block count to *eob_ptr (SSE2 version).
//
// For every coefficient x whose magnitude exceeds (zbin - 1):
//   t  = sat16(|x| + round)
//   q  = (((t * quant) >> 16) + t) * quant_shift >> 16, with the sign of x
//   dq = q * dequant (low 16 bits)
// All other coefficients produce q = dq = 0.
//
// Each of zbin_ptr / round_ptr / quant_ptr / quant_shift_ptr / dequant_ptr is
// read as one aligned 128-bit vector: its low four 16-bit lanes are used for
// the first eight coefficients of the block (the original comments call this
// the "DC" setting), and its upper 64 bits are used for everything after
// that ("AC").
//
// *eob_ptr becomes the largest (iscan + 1) over all lanes whose dequantized
// value is non-zero, or 0 if there is none. When skip_block is non-zero both
// outputs are simply cleared and *eob_ptr is 0. scan_ptr is unused.
void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         int skip_block, const int16_t *zbin_ptr,
                         const int16_t *round_ptr, const int16_t *quant_ptr,
                         const int16_t *quant_shift_ptr,
                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                         const int16_t *dequant_ptr, uint16_t *eob_ptr,
                         const int16_t *scan_ptr, const int16_t *iscan_ptr) {
  const __m128i zero = _mm_setzero_si128();
  // Parameters applied to the first eight lanes of the current group ...
  __m128i zbin_lo, round_lo, quant_lo, shift_lo, dequant_lo;
  // ... and to the last eight lanes (always the "AC" values).
  __m128i zbin_ac, round_ac, quant_ac, shift_ac, dequant_ac;
  __m128i eob, eob_shuffled;

  (void)scan_ptr;

  // Point every buffer at its end and walk a negative index up towards zero.
  coeff_ptr += n_coeffs;
  iscan_ptr += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;

  if (skip_block) {
    do {
      store_coefficients(zero, dqcoeff_ptr + n_coeffs);
      store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
      store_coefficients(zero, qcoeff_ptr + n_coeffs);
      store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
    return;
  }

  // The threshold test is "greater than", so compare against zbin - 1.
  zbin_lo = _mm_sub_epi16(_mm_load_si128((const __m128i *)zbin_ptr),
                          _mm_set1_epi16(1));
  round_lo = _mm_load_si128((const __m128i *)round_ptr);
  quant_lo = _mm_load_si128((const __m128i *)quant_ptr);
  shift_lo = _mm_load_si128((const __m128i *)quant_shift_ptr);
  dequant_lo = _mm_load_si128((const __m128i *)dequant_ptr);

  // Broadcast the upper 64 bits: these are the values used once the first
  // eight coefficients have been handled.
  zbin_ac = _mm_unpackhi_epi64(zbin_lo, zbin_lo);
  round_ac = _mm_unpackhi_epi64(round_lo, round_lo);
  quant_ac = _mm_unpackhi_epi64(quant_lo, quant_lo);
  shift_ac = _mm_unpackhi_epi64(shift_lo, shift_lo);
  dequant_ac = _mm_unpackhi_epi64(dequant_lo, dequant_lo);

  // INT16_MIN is the identity element of the signed max below, so the first
  // group's result is taken as is.
  eob = _mm_set1_epi16(-32768);

  do {
    const __m128i in0 = load_coefficients(coeff_ptr + n_coeffs);
    const __m128i in1 = load_coefficients(coeff_ptr + n_coeffs + 8);
    // Per-lane sign mask: 0 for non-negative, -1 for negative values.
    const __m128i sign0 = _mm_srai_epi16(in0, 15);
    const __m128i sign1 = _mm_srai_epi16(in1, 15);
    // |x| = (x ^ sign) - sign
    const __m128i abs0 = _mm_sub_epi16(_mm_xor_si128(in0, sign0), sign0);
    const __m128i abs1 = _mm_sub_epi16(_mm_xor_si128(in1, sign1), sign1);
    // Lanes above the zbin threshold are the only ones that survive.
    const __m128i keep0 = _mm_cmpgt_epi16(abs0, zbin_lo);
    const __m128i keep1 = _mm_cmpgt_epi16(abs1, zbin_ac);
    const __m128i rounded0 = _mm_adds_epi16(abs0, round_lo);
    const __m128i rounded1 = _mm_adds_epi16(abs1, round_ac);
    const __m128i scaled0 =
        _mm_add_epi16(_mm_mulhi_epi16(rounded0, quant_lo), rounded0);
    const __m128i scaled1 =
        _mm_add_epi16(_mm_mulhi_epi16(rounded1, quant_ac), rounded1);
    const __m128i mag0 = _mm_mulhi_epi16(scaled0, shift_lo);
    const __m128i mag1 = _mm_mulhi_epi16(scaled1, shift_ac);
    // Put the signs back, then zero the lanes below the threshold.
    const __m128i q0 = _mm_and_si128(
        _mm_sub_epi16(_mm_xor_si128(mag0, sign0), sign0), keep0);
    const __m128i q1 = _mm_and_si128(
        _mm_sub_epi16(_mm_xor_si128(mag1, sign1), sign1), keep1);
    const __m128i dq0 = _mm_mullo_epi16(q0, dequant_lo);
    const __m128i dq1 = _mm_mullo_epi16(q1, dequant_ac);

    store_coefficients(q0, qcoeff_ptr + n_coeffs);
    store_coefficients(q1, qcoeff_ptr + n_coeffs + 8);
    store_coefficients(dq0, dqcoeff_ptr + n_coeffs);
    store_coefficients(dq1, dqcoeff_ptr + n_coeffs + 8);

    {
      // End-of-block scan: -1 in every lane whose dequantized value is
      // non-zero, 0 elsewhere.
      const __m128i nz0 = _mm_cmpeq_epi16(_mm_cmpeq_epi16(dq0, zero), zero);
      const __m128i nz1 = _mm_cmpeq_epi16(_mm_cmpeq_epi16(dq1, zero), zero);
      const __m128i iscan0 =
          _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
      const __m128i iscan1 =
          _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
      // Subtracting the mask adds one (index -> count) in non-zero lanes;
      // the AND then discards every other lane.
      const __m128i count0 = _mm_and_si128(_mm_sub_epi16(iscan0, nz0), nz0);
      const __m128i count1 = _mm_and_si128(_mm_sub_epi16(iscan1, nz1), nz1);
      eob = _mm_max_epi16(eob, _mm_max_epi16(count0, count1));
    }

    // From the second group on, all sixteen lanes use the AC values.
    zbin_lo = zbin_ac;
    round_lo = round_ac;
    quant_lo = quant_ac;
    shift_lo = shift_ac;
    dequant_lo = dequant_ac;

    n_coeffs += 8 * 2;
  } while (n_coeffs < 0);

  // Horizontal max over the eight lanes; the result is read from lane 1.
  eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
  eob = _mm_max_epi16(eob, eob_shuffled);
  eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
  eob = _mm_max_epi16(eob, eob_shuffled);
  eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
  eob = _mm_max_epi16(eob, eob_shuffled);
  *eob_ptr = _mm_extract_epi16(eob, 1);
}

static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 // // To be able to use signed 16-bit integers, we use the following trick to // have constants within range: // - Associated constants are obtained by subtracting the 16-bit fixed point // version of one: // k = K - (1 << 16) => K = k + (1 << 16) // K1 = 85267 => k1 = 20091 // K2 = 35468 => k2 = -30068 // - The multiplication of a variable by a constant become the sum of the // variable and the multiplication of that variable by the associated // constant: // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x const __m128i k1 = _mm_set1_epi16(20091); const __m128i k2 = _mm_set1_epi16(-30068); __m128i T0, T1, T2, T3; // Load and concatenate the transform coefficients (we'll do two transforms // in parallel). In the case of only one transform, the second half of the // vectors will just contain random value we'll never use nor store. __m128i in0, in1, in2, in3; { in0 = _mm_loadl_epi64((__m128i*)&in[0]); in1 = _mm_loadl_epi64((__m128i*)&in[4]); in2 = _mm_loadl_epi64((__m128i*)&in[8]); in3 = _mm_loadl_epi64((__m128i*)&in[12]); // a00 a10 a20 a30 x x x x // a01 a11 a21 a31 x x x x // a02 a12 a22 a32 x x x x // a03 a13 a23 a33 x x x x if (do_two) { const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]); const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]); const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]); const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]); in0 = _mm_unpacklo_epi64(in0, inB0); in1 = _mm_unpacklo_epi64(in1, inB1); in2 = _mm_unpacklo_epi64(in2, inB2); in3 = _mm_unpacklo_epi64(in3, inB3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } } // Vertical pass and subsequent transpose. 
{ // First pass, c and d calculations are longer because of the "trick" // multiplications. const __m128i a = _mm_add_epi16(in0, in2); const __m128i b = _mm_sub_epi16(in0, in2); // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 const __m128i c1 = _mm_mulhi_epi16(in1, k2); const __m128i c2 = _mm_mulhi_epi16(in3, k1); const __m128i c3 = _mm_sub_epi16(in1, in3); const __m128i c4 = _mm_sub_epi16(c1, c2); const __m128i c = _mm_add_epi16(c3, c4); // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 const __m128i d1 = _mm_mulhi_epi16(in1, k1); const __m128i d2 = _mm_mulhi_epi16(in3, k2); const __m128i d3 = _mm_add_epi16(in1, in3); const __m128i d4 = _mm_add_epi16(d1, d2); const __m128i d = _mm_add_epi16(d3, d4); // Second pass. const __m128i tmp0 = _mm_add_epi16(a, d); const __m128i tmp1 = _mm_add_epi16(b, c); const __m128i tmp2 = _mm_sub_epi16(b, c); const __m128i tmp3 = _mm_sub_epi16(a, d); // Transpose the two 4x4. // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1); const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3); const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1); const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 T0 = _mm_unpacklo_epi64(transpose1_0, 
transpose1_1); T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Horizontal pass and subsequent transpose. { // First pass, c and d calculations are longer because of the "trick" // multiplications. const __m128i four = _mm_set1_epi16(4); const __m128i dc = _mm_add_epi16(T0, four); const __m128i a = _mm_add_epi16(dc, T2); const __m128i b = _mm_sub_epi16(dc, T2); // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 const __m128i c1 = _mm_mulhi_epi16(T1, k2); const __m128i c2 = _mm_mulhi_epi16(T3, k1); const __m128i c3 = _mm_sub_epi16(T1, T3); const __m128i c4 = _mm_sub_epi16(c1, c2); const __m128i c = _mm_add_epi16(c3, c4); // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 const __m128i d1 = _mm_mulhi_epi16(T1, k1); const __m128i d2 = _mm_mulhi_epi16(T3, k2); const __m128i d3 = _mm_add_epi16(T1, T3); const __m128i d4 = _mm_add_epi16(d1, d2); const __m128i d = _mm_add_epi16(d3, d4); // Second pass. const __m128i tmp0 = _mm_add_epi16(a, d); const __m128i tmp1 = _mm_add_epi16(b, c); const __m128i tmp2 = _mm_sub_epi16(b, c); const __m128i tmp3 = _mm_sub_epi16(a, d); const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); // Transpose the two 4x4. 
// a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1); const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3); const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1); const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Add inverse transform to 'dst' and store. { const __m128i zero = _mm_setzero_si128(); // Load the reference(s). __m128i dst0, dst1, dst2, dst3; if (do_two) { // Load eight bytes/pixels per line. dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]); dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]); dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]); dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]); } else { // Load four bytes/pixels per line. 
dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]); dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]); dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]); dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]); } // Convert to 16b. dst0 = _mm_unpacklo_epi8(dst0, zero); dst1 = _mm_unpacklo_epi8(dst1, zero); dst2 = _mm_unpacklo_epi8(dst2, zero); dst3 = _mm_unpacklo_epi8(dst3, zero); // Add the inverse transform(s). dst0 = _mm_add_epi16(dst0, T0); dst1 = _mm_add_epi16(dst1, T1); dst2 = _mm_add_epi16(dst2, T2); dst3 = _mm_add_epi16(dst3, T3); // Unsigned saturate to 8b. dst0 = _mm_packus_epi16(dst0, dst0); dst1 = _mm_packus_epi16(dst1, dst1); dst2 = _mm_packus_epi16(dst2, dst2); dst3 = _mm_packus_epi16(dst3, dst3); // Store the results. if (do_two) { // Store eight bytes/pixels per line. _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0); _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1); _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2); _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3); } else { // Store four bytes/pixels per line. *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0); *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1); *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2); *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3); } } }

/*
 * Vertical pass of the inverse DWT for one block (SSE2).
 *
 * l, h           low and high band inputs. Each holds subband_width rows of
 *                (2 * subband_width) INT16 values, stored back to back.
 * dst            output, (2 * subband_width) rows of (2 * subband_width)
 *                INT16 values. Row 2n is reconstructed from l/h row n in the
 *                first loop, row 2n + 1 in the second loop.
 * subband_width  number of rows in l and h.
 *
 * Rows are processed eight values at a time using aligned 128-bit loads and
 * stores, so l, h and dst must be 16-byte aligned and the row width
 * (2 * subband_width) must be a multiple of 8.
 *
 * The function returns nothing. The definition previously had no return type
 * at all (implicit int without a return value).
 */
void rfx_dwt_2d_decode_block_vert_sse2(INT16* l, INT16* h, INT16* dst, int subband_width)
{
    int x, n;
    INT16* l_ptr = l;
    INT16* h_ptr = h;
    INT16* dst_ptr = dst;
    __m128i l_n;
    __m128i h_n;
    __m128i tmp_n;
    __m128i h_n_m;
    __m128i dst_n;
    __m128i dst_n_m;
    __m128i dst_n_p;
    /* Rounding term of the even-row formula, built once instead of per step. */
    const __m128i one = _mm_set1_epi16(1);
    int total_width = subband_width + subband_width;

    /* Even coefficients */
    for (n = 0; n < subband_width; n++)
    {
        for (x = 0; x < total_width; x += 8)
        {
            /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
            l_n = _mm_load_si128((__m128i*) l_ptr);
            h_n = _mm_load_si128((__m128i*) h_ptr);

            tmp_n = _mm_add_epi16(h_n, one);
            if (n == 0)
            {
                /* There is no row above the first one: h[n] stands in for h[n-1]. */
                tmp_n = _mm_add_epi16(tmp_n, h_n);
            }
            else
            {
                h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width));
                tmp_n = _mm_add_epi16(tmp_n, h_n_m);
            }
            tmp_n = _mm_srai_epi16(tmp_n, 1);

            dst_n = _mm_sub_epi16(l_n, tmp_n);
            _mm_store_si128((__m128i*) dst_ptr, dst_n);

            l_ptr += 8;
            h_ptr += 8;
            dst_ptr += 8;
        }
        /* Skip the odd row; it is filled in by the second loop. */
        dst_ptr += total_width;
    }

    h_ptr = h;
    dst_ptr = dst + total_width;

    /* Odd coefficients */
    for (n = 0; n < subband_width; n++)
    {
        for (x = 0; x < total_width; x += 8)
        {
            /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
            h_n = _mm_load_si128((__m128i*) h_ptr);
            dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width));
            h_n = _mm_slli_epi16(h_n, 1);

            tmp_n = dst_n_m;
            if (n == subband_width - 1)
            {
                /* There is no even row below the last odd one: dst[2n] is used twice. */
                tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
            }
            else
            {
                dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width));
                tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
            }
            tmp_n = _mm_srai_epi16(tmp_n, 1);

            dst_n = _mm_add_epi16(tmp_n, h_n);
            _mm_store_si128((__m128i*) dst_ptr, dst_n);

            h_ptr += 8;
            dst_ptr += 8;
        }
        /* Skip the even row that follows. */
        dst_ptr += total_width;
    }
}

// Warps eight horizontally adjacent 8-bit pixels starting at column x of the
// current row and stores them to dstp + x.
//
// The displacement of each pixel comes from the edge mask: the difference of
// its left/right neighbours (horizontal) and of its above/below neighbours
// (vertical), scaled by `depth`. The rows above/below are replaced by the
// current row when y is 0 or height - 1 respectively. The integer part of the
// displacement selects a source position, the low seven bits are the weight
// (out of 128) for blending with the pixel to the right and on the next
// source line.
//
// Every vector parameter is a constant prepared by the caller; SOURCE only
// shows how each one is used:
//   x_limit_min/max, y_limit_min/max - clamp the displacements
//   one_stride - multiplied (madd) with interleaved (v, h) pairs to turn a
//                displacement into a byte offset from srcp
//                NOTE(review): presumably (src_stride, 1) pairs - confirm
//   word_64/127/128/255, zero - the corresponding 16-bit constants
//
// srcp, edgep and dstp are used as pointers to the current row of their
// planes.
static FORCE_INLINE void warp_mmword_u8_sse2(const uint8_t *srcp, const uint8_t *edgep, uint8_t *dstp, int src_stride, int edge_stride, int height, int x, int y, const __m128i &depth, const __m128i &zero, const __m128i &x_limit_min, const __m128i &x_limit_max, const __m128i &y_limit_min, const __m128i &y_limit_max, const __m128i &word_64, const __m128i &word_127, const __m128i &word_128, const __m128i &word_255, const __m128i &one_stride) {
    const int SMAG = 1 << SMAGL;

    // calculate displacement

    __m128i above = _mm_loadl_epi64((const __m128i *)(edgep + x - (y ? edge_stride : 0)));
    __m128i below = _mm_loadl_epi64((const __m128i *)(edgep + x + (y < height - 1 ? edge_stride : 0)));

    __m128i left = _mm_loadl_epi64((const __m128i *)(edgep + x - 1));
    __m128i right = _mm_loadl_epi64((const __m128i *)(edgep + x + 1));

    above = _mm_unpacklo_epi8(above, zero);
    below = _mm_unpacklo_epi8(below, zero);
    left = _mm_unpacklo_epi8(left, zero);
    right = _mm_unpacklo_epi8(right, zero);

    __m128i h = _mm_sub_epi16(left, right);
    __m128i v = _mm_sub_epi16(above, below);

    h = _mm_slli_epi16(h, 7);
    v = _mm_slli_epi16(v, 7);

    h = _mm_mulhi_epi16(h, depth);
    v = _mm_mulhi_epi16(v, depth);

    v = _mm_max_epi16(v, y_limit_min);
    v = _mm_min_epi16(v, y_limit_max);

    // Fractional part of the displacement: the low seven bits, taken after
    // the optional extra scaling by 1 << SMAGL.
    __m128i remainder_h = h;
    __m128i remainder_v = v;

    if (SMAGL) {
        remainder_h = _mm_slli_epi16(remainder_h, SMAGL);
        remainder_v = _mm_slli_epi16(remainder_v, SMAGL);
    }

    remainder_h = _mm_and_si128(remainder_h, word_127);
    remainder_v = _mm_and_si128(remainder_v, word_127);

    // Integer part of the displacement.
    h = _mm_srai_epi16(h, 7 - SMAGL);
    v = _mm_srai_epi16(v, 7 - SMAGL);

    // Make the horizontal displacement absolute by adding the x position.
    __m128i xx = _mm_set1_epi32(x << SMAGL);
    xx = _mm_packs_epi32(xx, xx);

    h = _mm_adds_epi16(h, xx);

    // Where h is about to be clamped there is nothing to interpolate with:
    // drop the fractional part for h >= x_limit_max and for h < x_limit_min.
    remainder_h = _mm_and_si128(remainder_h, _mm_cmpgt_epi16(x_limit_max, h));
    remainder_h = _mm_andnot_si128(_mm_cmpgt_epi16(x_limit_min, h), remainder_h);

    h = _mm_max_epi16(h, x_limit_min);
    h = _mm_min_epi16(h, x_limit_max);

    // h and v contain the displacement now.

    __m128i disp_lo = _mm_unpacklo_epi16(v, h);
    __m128i disp_hi = _mm_unpackhi_epi16(v, h);
    disp_lo = _mm_madd_epi16(disp_lo, one_stride);
    disp_hi = _mm_madd_epi16(disp_hi, one_stride);

    __m128i line0 = _mm_setzero_si128();
    __m128i line1 = _mm_setzero_si128();

    // Gather, for one output lane, the displaced source pixel together with
    // its right-hand neighbour, from the displaced line (line0) and from the
    // line below it (line1); then move on to the next 32-bit offset.
    //
    // The two bytes are combined explicitly (low byte first, i.e. the same
    // 16-bit pattern a little-endian word load yields). The source address
    // has no alignment guarantee and points at uint8_t data, so it must not
    // be dereferenced through an int16_t pointer.
    //
    // A macro is used because _mm_insert_epi16 needs a literal lane index.
#define WARP_GATHER_LANE(disp, lane) \
    do { \
        const uint8_t *warp_px = srcp + _mm_cvtsi128_si32(disp) + (lane) * SMAG; \
        line0 = _mm_insert_epi16(line0, warp_px[0] | (warp_px[1] << 8), lane); \
        line1 = _mm_insert_epi16(line1, warp_px[src_stride] | (warp_px[src_stride + 1] << 8), lane); \
        disp = _mm_srli_si128(disp, 4); \
    } while (0)

    WARP_GATHER_LANE(disp_lo, 0);
    WARP_GATHER_LANE(disp_lo, 1);
    WARP_GATHER_LANE(disp_lo, 2);
    WARP_GATHER_LANE(disp_lo, 3);
    WARP_GATHER_LANE(disp_hi, 4);
    WARP_GATHER_LANE(disp_hi, 5);
    WARP_GATHER_LANE(disp_hi, 6);
    WARP_GATHER_LANE(disp_hi, 7);

#undef WARP_GATHER_LANE

    // Low byte of every lane is the left pixel, high byte the right pixel.
    __m128i left0 = _mm_and_si128(line0, word_255);
    __m128i left1 = _mm_and_si128(line1, word_255);

    __m128i right0 = _mm_srli_epi16(line0, 8);
    __m128i right1 = _mm_srli_epi16(line1, 8);

    // Horizontal blend: (left * (128 - rh) + right * rh + 64) >> 7
    left0 = _mm_mullo_epi16(left0, _mm_sub_epi16(word_128, remainder_h));
    left1 = _mm_mullo_epi16(left1, _mm_sub_epi16(word_128, remainder_h));

    right0 = _mm_mullo_epi16(right0, remainder_h);
    right1 = _mm_mullo_epi16(right1, remainder_h);

    line0 = _mm_add_epi16(left0, right0);
    line1 = _mm_add_epi16(left1, right1);

    line0 = _mm_add_epi16(line0, word_64);
    line1 = _mm_add_epi16(line1, word_64);

    line0 = _mm_srai_epi16(line0, 7);
    line1 = _mm_srai_epi16(line1, 7);

    // Vertical blend: (line0 * (128 - rv) + line1 * rv + 64) >> 7
    line0 = _mm_mullo_epi16(line0, _mm_sub_epi16(word_128, remainder_v));
    line1 = _mm_mullo_epi16(line1, remainder_v);

    __m128i result = _mm_add_epi16(line0, line1);

    result = _mm_add_epi16(result, word_64);

    result = _mm_srai_epi16(result, 7);

    result = _mm_packus_epi16(result, result);

    _mm_storel_epi64((__m128i *)(dstp + x), result);
}

// Hadamard transform // Returns the difference between the weighted sum of the absolute value of // transformed coefficients. static int TTransform(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; const __m128i zero = _mm_setzero_si128(); // Load, combine and transpose inputs. { const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]); const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]); const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]); const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]); const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]); const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]); const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]); const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); // a00 b00 a01 b01 a02 b03 a03 b03 0 0 0 0 0 0 0 0 // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0 // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0 // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0 // Transpose the two 4x4, discarding the filling zeroes. 
const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23 // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33 const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33 // Convert to 16b. tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero); tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero); tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero); tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Horizontal pass and subsequent transpose. { // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. 
const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3); const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1); const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Vertical pass and difference of weighted sums. { // Load all inputs. // TODO(cduvivier): Make variable declarations and allocations aligned so // we can use _mm_load_si128 instead of _mm_loadu_si128. const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]); const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]); // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // Separate the transforms of inA and inB. 
__m128i A_b0 = _mm_unpacklo_epi64(b0, b1); __m128i A_b2 = _mm_unpacklo_epi64(b2, b3); __m128i B_b0 = _mm_unpackhi_epi64(b0, b1); __m128i B_b2 = _mm_unpackhi_epi64(b2, b3); { // sign(b) = b >> 15 (0x0000 if positive, 0xffff if negative) const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15); const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15); const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15); const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15); // b = abs(b) = (b ^ sign) - sign A_b0 = _mm_xor_si128(A_b0, sign_A_b0); A_b2 = _mm_xor_si128(A_b2, sign_A_b2); B_b0 = _mm_xor_si128(B_b0, sign_B_b0); B_b2 = _mm_xor_si128(B_b2, sign_B_b2); A_b0 = _mm_sub_epi16(A_b0, sign_A_b0); A_b2 = _mm_sub_epi16(A_b2, sign_A_b2); B_b0 = _mm_sub_epi16(B_b0, sign_B_b0); B_b2 = _mm_sub_epi16(B_b2, sign_B_b2); } // weighted sums A_b0 = _mm_madd_epi16(A_b0, w_0); A_b2 = _mm_madd_epi16(A_b2, w_8); B_b0 = _mm_madd_epi16(B_b0, w_0); B_b2 = _mm_madd_epi16(B_b2, w_8); A_b0 = _mm_add_epi32(A_b0, A_b2); B_b0 = _mm_add_epi32(B_b0, B_b2); // difference of weighted sums A_b0 = _mm_sub_epi32(A_b0, B_b0); _mm_storeu_si128((__m128i*)&sum[0], A_b0); } return sum[0] + sum[1] + sum[2] + sum[3]; }

// Computes the inset bounding box (per-byte minimum and maximum) of a block
// of sixteen 4-byte pixels.
//
// colorBlock  64 bytes, read as four rows of 16 bytes with aligned loads, so
//             the pointer must be 16-byte aligned.
// minColor    receives the per-channel minimum moved up by the inset.
// maxColor    receives the per-channel maximum moved down by the inset.
//
// The inset is (max - min) >> INSET_SHIFT per channel. Both outputs are
// written with a full aligned 16-byte store, so each destination must be
// 16-byte aligned and have 16 writable bytes.
// NOTE(review): only the first four bytes of each store are expected to be
// the fully reduced result; this assumes R_SHUFFLE_D( 2, 3, 2, 3 ) selects
// source elements 2,3,2,3 -- confirm against the macro definition.
void GetMinMaxColors_Intrinsics( const byte *colorBlock, byte *minColor, byte *maxColor ) {
	const __m128i *rows = (const __m128i *) colorBlock;

	// get bounding box
	// ----------------

	// start from the first row, then merge the remaining three rows
	__m128i lo = _mm_load_si128( rows );
	__m128i hi = lo;
	for ( int r = 1; r < 4; r++ ) {
		const __m128i row = _mm_load_si128( rows + r );
		lo = _mm_min_epu8( lo, row );
		hi = _mm_max_epu8( hi, row );
	}

	// reduce across doublewords
	lo = _mm_min_epu8( lo, _mm_shuffle_epi32( lo, R_SHUFFLE_D( 2, 3, 2, 3 ) ) );
	hi = _mm_max_epu8( hi, _mm_shuffle_epi32( hi, R_SHUFFLE_D( 2, 3, 2, 3 ) ) );

	// reduce across the low words
	lo = _mm_min_epu8( lo, _mm_shufflelo_epi16( lo, R_SHUFFLE_D( 2, 3, 2, 3 ) ) );
	hi = _mm_max_epu8( hi, _mm_shufflelo_epi16( hi, R_SHUFFLE_D( 2, 3, 2, 3 ) ) );

	// inset the bounding box
	// ----------------------

	// widen the low eight bytes to 16-bit lanes by interleaving with the
	// SIMD_SSE2_byte_0 constant (presumably all zero bytes -- confirm)
	const __m128i widen = _mm_load_si128( (const __m128i *) SIMD_SSE2_byte_0 );
	lo = _mm_unpacklo_epi8( lo, widen );
	hi = _mm_unpacklo_epi8( hi, widen );

	// inset = ( max - min ) >> INSET_SHIFT
	const __m128i inset = _mm_srli_epi16( _mm_sub_epi16( hi, lo ), INSET_SHIFT );
	lo = _mm_add_epi16( lo, inset );
	hi = _mm_sub_epi16( hi, inset );

	// store bounding box extents
	// --------------------------

	// narrow back to bytes with unsigned saturation before storing
	_mm_store_si128( (__m128i *) minColor, _mm_packus_epi16( lo, lo ) );
	_mm_store_si128( (__m128i *) maxColor, _mm_packus_epi16( hi, hi ) );
}

// Simple quantization static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16], int n, const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(2047); const __m128i zero = _mm_set1_epi16(0); __m128i sign0, sign8; __m128i coeff0, coeff8; __m128i out0, out8; __m128i packed_out; // Load all inputs. // TODO(cduvivier): Make variable declarations and allocations aligned so that // we can use _mm_load_si128 instead of _mm_loadu_si128. __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]); const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]); const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]); const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]); const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]); const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]); const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]); const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]); const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]); const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]); // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative) sign0 = _mm_srai_epi16(in0, 15); sign8 = _mm_srai_epi16(in8, 15); // coeff = abs(in) = (in ^ sign) - sign coeff0 = _mm_xor_si128(in0, sign0); coeff8 = _mm_xor_si128(in8, sign8); coeff0 = _mm_sub_epi16(coeff0, sign0); coeff8 = _mm_sub_epi16(coeff8, sign8); // coeff = abs(in) + sharpen coeff0 = _mm_add_epi16(coeff0, sharpen0); coeff8 = _mm_add_epi16(coeff8, sharpen8); // if (coeff > 2047) coeff = 2047 coeff0 = _mm_min_epi16(coeff0, max_coeff_2047); coeff8 = _mm_min_epi16(coeff8, max_coeff_2047); // out = (coeff * iQ + B) >> QFIX; { // doing calculations with 32b precision (QFIX=17) // out = (coeff * iQ) __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); __m128i 
coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H); __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H); // expand bias from 16b to 32b __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero); __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero); __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero); __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero); // out = (coeff * iQ + B) out_00 = _mm_add_epi32(out_00, bias_00); out_04 = _mm_add_epi32(out_04, bias_04); out_08 = _mm_add_epi32(out_08, bias_08); out_12 = _mm_add_epi32(out_12, bias_12); // out = (coeff * iQ + B) >> QFIX; out_00 = _mm_srai_epi32(out_00, QFIX); out_04 = _mm_srai_epi32(out_04, QFIX); out_08 = _mm_srai_epi32(out_08, QFIX); out_12 = _mm_srai_epi32(out_12, QFIX); // pack result as 16b out0 = _mm_packs_epi32(out_00, out_04); out8 = _mm_packs_epi32(out_08, out_12); } // get sign back (if (sign[j]) out_n = -out_n) out0 = _mm_xor_si128(out0, sign0); out8 = _mm_xor_si128(out8, sign8); out0 = _mm_sub_epi16(out0, sign0); out8 = _mm_sub_epi16(out8, sign8); // in = out * Q in0 = _mm_mullo_epi16(out0, q0); in8 = _mm_mullo_epi16(out8, q8); // if (coeff <= mtx->zthresh_) {in=0; out=0;} { __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0); __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8); in0 = _mm_and_si128(in0, cmp0); in8 = _mm_and_si128(in8, cmp8); _mm_storeu_si128((__m128i*)&in[0], in0); _mm_storeu_si128((__m128i*)&in[8], in8); out0 = _mm_and_si128(out0, cmp0); out8 = _mm_and_si128(out8, cmp8); } // zigzag the output before storing it. // // The zigzag pattern can almost be reproduced with a small sequence of // shuffles. After it, we only need to swap the 7th (ending up in third // position instead of twelfth) and 8th values. 
{ __m128i outZ0, outZ8; outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0)); outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0)); outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2)); outZ8 = _mm_shufflelo_epi16(out8, _MM_SHUFFLE(3, 0, 2, 1)); outZ8 = _mm_shuffle_epi32 (outZ8, _MM_SHUFFLE(3, 1, 2, 0)); outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0)); _mm_storeu_si128((__m128i*)&out[0], outZ0); _mm_storeu_si128((__m128i*)&out[8], outZ8); packed_out = _mm_packs_epi16(outZ0, outZ8); } { const int16_t outZ_12 = out[12]; const int16_t outZ_3 = out[3]; out[3] = outZ_12; out[12] = outZ_3; } // detect if all 'out' values are zeroes or not { int32_t tmp[4]; _mm_storeu_si128((__m128i*)tmp, packed_out); if (n) { tmp[0] &= ~0xff; } return (tmp[3] || tmp[2] || tmp[1] || tmp[0]); } }

/* Undo the Paeth filter in place on a row made of 4-byte pixels.
 *
 * row_info: only rowbytes is read here; it gives the number of bytes to
 *           process in 'row'.
 * row:      filtered bytes on entry, reconstructed bytes on return.
 * prev:     the previous (already reconstructed) row.
 */
void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
   png_const_bytep prev)
{
   /* Paeth tries to predict pixel d using the pixel to the left of it, a,
    * and two pixels from the previous row, b and c:
    *   prev: c b
    *   row:  a d
    * The prediction is whichever of a, b, or c is nearest to p=a+b-c.
    *
    * The first pixel has no left context, and so uses an Up filter, p = b.
    * The general formula covers that case as long as a and c are zero, which
    * is arranged by starting with b and d zeroed: at the top of each
    * iteration the old b becomes c and the old d becomes a.
    */
   const __m128i zero = _mm_setzero_si128();
   __m128i above = zero;      /* b */
   __m128i current = zero;    /* d */
   png_size_t remaining;

   png_debug(1, "in png_read_filter_row_paeth4_sse2");

   for (remaining = row_info->rowbytes+4; remaining > 4; remaining -= 4)
   {
      /* Slide the window: last iteration's b and d are this one's c and a. */
      const __m128i above_left = above;    /* c */
      const __m128i left = current;        /* a */
      __m128i dist_a, dist_b, dist_c, least, predictor;

      /* Widen to 16-bit lanes; the distances (pc in particular) do not fit
       * in 8 bits.
       */
      above   = _mm_unpacklo_epi8(load4(prev), zero);
      current = _mm_unpacklo_epi8(load4(row ), zero);

      /* (p-a) == (a+b-c - a) == (b-c) */
      dist_a = _mm_sub_epi16(above, above_left);

      /* (p-b) == (a+b-c - b) == (a-c) */
      dist_b = _mm_sub_epi16(left, above_left);

      /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
      dist_c = _mm_add_epi16(dist_a, dist_b);

      dist_a = abs_i16(dist_a);  /* |p-a| */
      dist_b = abs_i16(dist_b);  /* |p-b| */
      dist_c = abs_i16(dist_c);  /* |p-c| */

      least = _mm_min_epi16(dist_c, _mm_min_epi16(dist_a, dist_b));

      /* Paeth breaks ties favoring a over b over c. */
      predictor = if_then_else(_mm_cmpeq_epi16(least, dist_a), left,
                  if_then_else(_mm_cmpeq_epi16(least, dist_b), above,
                                                               above_left));

      /* Note `_epi8`: the addition has to wrap modulo 256. */
      current = _mm_add_epi8(current, predictor);
      store4(row, _mm_packus_epi16(current, current));

      prev += 4;
      row  += 4;
   }
}

// Converts three full-resolution 8-bit planes (Y, U, V: one sample of each
// per pixel) into packed RGB, using 13-bit fixed-point coefficients taken
// from C and the byte layout described by T (T::BYPP bytes per pixel).
//
// pDstBegin / pDstEnd  first destination row and the row position at which
//                      iteration stops; rows are stepped by scbStride bytes.
// pYBegin/pUBegin/pVBegin  source planes, consumed contiguously (the plane
//                      pointers are never adjusted at the end of a row).
// cbWidth              number of destination bytes converted in each row.
//
// Each row is handled by a 4-pixel SIMD loop followed by a 1-pixel tail loop.
//
// NOTE(review): T and C are not declared in this block; presumably they are
// template parameters declared just above it -- confirm.
// NOTE(review): when __SSSE3__ is not defined, the SIMD loop has no branch
// for CBGRColorOrder / CRGBColorOrder: the loop still advances but stores
// nothing for those layouts. Confirm they are never instantiated that way.
// NOTE(review): the CARGBColorOrder paths put the R value in the first byte
// of each pixel, whereas the CBGRAColorOrder paths write 0xff into the fourth
// byte. Confirm that the first ARGB byte is a don't-care.
void tuned_ConvertULY4ToRGB(uint8_t *pDstBegin, uint8_t *pDstEnd, const uint8_t *pYBegin, const uint8_t *pUBegin, const uint8_t *pVBegin, size_t cbWidth, ssize_t scbStride)
{
	const int shift = 13;

	// Coefficient pairs, one pair per 32-bit lane, for use with _mm_madd_epi16.
	// xy2rgb is multiplied with (Y, 0xff) pairs, so its first entry carries the
	// constant term generated by the -16 luma offset (pre-divided by 0xff).
	__m128i xy2rgb = _mm_set2_epi16_shift((-16 * C::Y2RGB + 0.5) / 0xff, C::Y2RGB, shift);
	__m128i vu2r = _mm_set2_epi16_shift(C::V2R, 0, shift);
	__m128i vu2g = _mm_set2_epi16_shift(C::V2G, C::U2G, shift);
	__m128i vu2b = _mm_set2_epi16_shift(0, C::U2B, shift);

	auto y = pYBegin;
	auto u = pUBegin;
	auto v = pVBegin;

	for (auto p = pDstBegin; p != pDstEnd; p += scbStride)
	{
		auto pp = p;

		// Main loop: four pixels per iteration. Every branch stores a full
		// 16 bytes at pp, hence the "- 16" in the bound, even for the 3-byte
		// layouts which only advance by 12.
		for (; pp <= p + cbWidth - 16; pp += T::BYPP * 4)
		{
			// Four consecutive samples from each plane.
			__m128i yy = _mm_cvtsi32_si128(*(const int *)y);
			__m128i uu = _mm_cvtsi32_si128(*(const int *)u);
			__m128i vv = _mm_cvtsi32_si128(*(const int *)v);

			__m128i xy = _mm_unpacklo_epi8(_mm_unpacklo_epi8(yy, _mm_setone_si128()), _mm_setzero_si128()); // 00 ff 00 Y3 00 ff 00 Y2 00 ff 00 Y1 00 ff 00 Y0
			__m128i vu = _mm_unpacklo_epi8(_mm_unpacklo_epi8(uu, vv), _mm_setzero_si128()); // 00 V3 00 U3 00 V2 00 U2 00 V1 00 U1 00 V0 00 U0
			// Centre the chroma samples around zero.
			vu = _mm_sub_epi16(vu, _mm_set1_epi16(128));

			// Luma contribution, shared by the three colour channels.
			__m128i rgbtmp = _mm_madd_epi16(xy, xy2rgb);

			// Adds the chroma contribution for one channel, drops the
			// fixed-point fraction and saturates down to unsigned bytes.
			auto xyuv2rgb = [rgbtmp, vu, shift](__m128i vu2rgb) -> __m128i {
				__m128i rgb = _mm_add_epi32(rgbtmp, _mm_madd_epi16(vu, vu2rgb));
				rgb = _mm_srai_epi32(rgb, shift);
				rgb = _mm_packs_epi32(rgb, rgb);
				rgb = _mm_packus_epi16(rgb, rgb);
				return rgb;
			};
			__m128i rr = xyuv2rgb(vu2r);
			__m128i gg = xyuv2rgb(vu2g);
			__m128i bb = xyuv2rgb(vu2b);

			if (std::is_same<T, CBGRAColorOrder>::value)
			{
				// Bytes per pixel: B, G, R, 0xff.
				__m128i bgrx = _mm_unpacklo_epi16(_mm_unpacklo_epi8(bb, gg), _mm_unpacklo_epi8(rr, _mm_setone_si128()));
				_mm_storeu_si128((__m128i *)pp, bgrx);
			}
#ifdef __SSSE3__
			else if (std::is_same<T, CBGRColorOrder>::value)
			{
				// Build B, G, R, x and let the byte shuffle drop every fourth
				// byte; the top four bytes of the store are zero.
				__m128i bgrx = _mm_unpacklo_epi16(_mm_unpacklo_epi8(bb, gg), _mm_unpacklo_epi8(rr, rr));
				__m128i bgr = _mm_shuffle_epi8(bgrx, _mm_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0));
				_mm_storeu_si128((__m128i *)pp, bgr);
			}
#endif
			else if (std::is_same<T, CARGBColorOrder>::value)
			{
				// Bytes per pixel: R (in the first slot), R, G, B.
				__m128i xrgb = _mm_unpacklo_epi16(_mm_unpacklo_epi8(rr, rr), _mm_unpacklo_epi8(gg, bb));
				_mm_storeu_si128((__m128i *)pp, xrgb);
			}
#ifdef __SSSE3__
			else if (std::is_same<T, CRGBColorOrder>::value)
			{
				// Build x, R, G, B and let the byte shuffle drop the first
				// byte of every pixel; the top four bytes of the store are zero.
				__m128i xrgb = _mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_setone_si128(), rr), _mm_unpacklo_epi8(gg, bb));
				__m128i rgb = _mm_shuffle_epi8(xrgb, _mm_set_epi8(-1, -1, -1, -1, 15, 14, 13, 11, 10, 9, 7, 6, 5, 3, 2, 1));
				_mm_storeu_si128((__m128i *)pp, rgb);
			}
#endif

			y += 4;
			u += 4;
			v += 4;
		}

		// Tail loop: one pixel per iteration, same arithmetic. Only the
		// lowest lane of each vector carries data here.
		for (; pp < p + cbWidth; pp += T::BYPP)
		{
			// The low 16 bits hold Y and the next 16 bits hold 0x00ff,
			// matching the (Y, 0xff) pair layout used in the main loop.
			__m128i xy = _mm_cvtsi32_si128(*y | 0x00ff0000);
			__m128i uu = _mm_cvtsi32_si128(*u);
			__m128i vv = _mm_cvtsi32_si128(*v);

			__m128i vu = _mm_unpacklo_epi8(_mm_unpacklo_epi8(uu, vv), _mm_setzero_si128()); // 00 V3 00 U3 00 V2 00 U2 00 V1 00 U1 00 V0 00 U0
			vu = _mm_sub_epi16(vu, _mm_set1_epi16(128));

			__m128i rgbtmp = _mm_madd_epi16(xy, xy2rgb);

			auto xyuv2rgb = [rgbtmp, vu, shift](__m128i vu2rgb) -> __m128i {
				__m128i rgb = _mm_add_epi32(rgbtmp, _mm_madd_epi16(vu, vu2rgb));
				rgb = _mm_srai_epi32(rgb, shift);
				rgb = _mm_packs_epi32(rgb, rgb);
				rgb = _mm_packus_epi16(rgb, rgb);
				return rgb;
			};
			__m128i rr = xyuv2rgb(vu2r);
			__m128i gg = xyuv2rgb(vu2g);
			__m128i bb = xyuv2rgb(vu2b);

			if (std::is_same<T, CBGRAColorOrder>::value)
			{
				__m128i bgrx = _mm_unpacklo_epi16(_mm_unpacklo_epi8(bb, gg), _mm_unpacklo_epi8(rr, _mm_setone_si128()));
				*(uint32_t *)pp = _mm_cvtsi128_si32(bgrx);
			}
			else if (std::is_same<T, CARGBColorOrder>::value)
			{
				__m128i xrgb = _mm_unpacklo_epi16(_mm_unpacklo_epi8(rr, rr), _mm_unpacklo_epi8(gg, bb));
				*(uint32_t *)pp = _mm_cvtsi128_si32(xrgb);
			}
			else if (std::is_same<T, CBGRColorOrder>::value || std::is_same<T, CRGBColorOrder>::value)
			{
				// 3-byte layouts: write each channel at the offset given by T.
				*(pp + T::B) = (uint8_t)_mm_cvtsi128_si32(bb);
				*(pp + T::G) = (uint8_t)_mm_cvtsi128_si32(gg);
				*(pp + T::R) = (uint8_t)_mm_cvtsi128_si32(rr);
			}

			y += 1;
			u += 1;
			v += 1;
		}
	}
}

static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, int16_t* out) { const __m128i zero = _mm_setzero_si128(); const __m128i seven = _mm_set1_epi16(7); const __m128i k7500 = _mm_set1_epi32(7500); const __m128i k14500 = _mm_set1_epi32(14500); const __m128i k51000 = _mm_set1_epi32(51000); const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217); const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352); __m128i v01, v32; // Difference between src and ref and initial transpose. { // Load src and convert to 16b. const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]); const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]); const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]); const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]); const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); // Load ref and convert to 16b. const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]); const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]); const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]); const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]); const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); // Compute difference. const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); // Transpose. 
// 00 01 02 03 0 0 0 0 // 10 11 12 13 0 0 0 0 // 20 21 22 23 0 0 0 0 // 30 31 32 33 0 0 0 0 const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1); const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // a02 a12 a22 a32 a03 a13 a23 a33 // a00 a10 a20 a30 a01 a11 a21 a31 // a03 a13 a23 a33 a02 a12 a22 a32 } // First pass and subsequent transpose. { // Same operations are done on the (0,3) and (1,2) pairs. // b0 = (a0 + a3) << 3 // b1 = (a1 + a2) << 3 // b3 = (a0 - a3) << 3 // b2 = (a1 - a2) << 3 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i b01 = _mm_slli_epi16(a01, 3); const __m128i b32 = _mm_slli_epi16(a32, 3); const __m128i b11 = _mm_unpackhi_epi64(b01, b01); const __m128i b22 = _mm_unpackhi_epi64(b32, b32); // e0 = b0 + b1 // e2 = b0 - b1 const __m128i e0 = _mm_add_epi16(b01, b11); const __m128i e2 = _mm_sub_epi16(b01, b11); const __m128i e02 = _mm_unpacklo_epi64(e0, e2); // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12 // e3 = (b3 * 2217 - b2 * 5352 + 7500) >> 12 const __m128i b23 = _mm_unpacklo_epi16(b22, b32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k14500); const __m128i d3 = _mm_add_epi32(c3, k7500); const __m128i e1 = _mm_srai_epi32(d1, 12); const __m128i e3 = _mm_srai_epi32(d3, 12); const __m128i e13 = _mm_packs_epi32(e1, e3); // Transpose. 
// 00 01 02 03 20 21 22 23 // 10 11 12 13 30 31 32 33 const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13); const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 02 12 22 32 03 13 23 33 // 00 10 20 30 01 11 21 31 // 03 13 23 33 02 12 22 32 } // Second pass { // Same operations are done on the (0,3) and (1,2) pairs. // a0 = v0 + v3 // a1 = v1 + v2 // a3 = v0 - v3 // a2 = v1 - v2 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i a11 = _mm_unpackhi_epi64(a01, a01); const __m128i a22 = _mm_unpackhi_epi64(a32, a32); // d0 = (a0 + a1 + 7) >> 4; // d2 = (a0 - a1 + 7) >> 4; const __m128i b0 = _mm_add_epi16(a01, a11); const __m128i b2 = _mm_sub_epi16(a01, a11); const __m128i c0 = _mm_add_epi16(b0, seven); const __m128i c2 = _mm_add_epi16(b2, seven); const __m128i d0 = _mm_srai_epi16(c0, 4); const __m128i d2 = _mm_srai_epi16(c2, 4); // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) const __m128i b23 = _mm_unpacklo_epi16(a22, a32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); const __m128i d3 = _mm_add_epi32(c3, k51000); const __m128i e1 = _mm_srai_epi32(d1, 16); const __m128i e3 = _mm_srai_epi32(d3, 16); const __m128i f1 = _mm_packs_epi32(e1, e1); const __m128i f3 = _mm_packs_epi32(e3, e3); // f1 = f1 + (a3 != 0); // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the // desired (0, 1), we add one earlier through k12000_plus_one. 
const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); _mm_storel_epi64((__m128i*)&out[ 0], d0); _mm_storel_epi64((__m128i*)&out[ 4], g1); _mm_storel_epi64((__m128i*)&out[ 8], d2); _mm_storel_epi64((__m128i*)&out[12], f3); } }

/* Convert eight interleaved 16-bit (Y, Cb, Cr) pixels into three vectors of
 * eight signed 16-bit values each.
 *
 * lw48_ptr:   48 bytes of input, read with unaligned loads.
 * rgb_buffer: 48 bytes of output, written with aligned stores (so it must be
 *             16-byte aligned): B at offset 0, G at offset 16, R at offset 32.
 *
 * Needs SSSE3 (_mm_shuffle_epi8) and SSE4.1 (_mm_blend_epi16). */
static LW_FORCEINLINE void fill_rgb_buffer_sse41( BYTE *rgb_buffer, BYTE *lw48_ptr )
{
    static const USHORT LW_ALIGN(16) PW_32768[8]       = { 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768 };
    static const short  LW_ALIGN(16) PW_28672[8]       = { 28672, 28672, 28672, 28672, 28672, 28672, 28672, 28672 };
    static const short  LW_ALIGN(16) PW_9539[8]        = {  9539,  9539,  9539,  9539,  9539,  9539,  9539,  9539 };
    static const short  LW_ALIGN(16) PW_13074[8]       = { 13074, 13074, 13074, 13074, 13074, 13074, 13074, 13074 };
    static const short  LW_ALIGN(16) PW_16531[8]       = { 16531, 16531, 16531, 16531, 16531, 16531, 16531, 16531 };
    static const short  LW_ALIGN(16) PW_M3203_M6808[8] = { -3203, -6808, -3203, -6808, -3203, -6808, -3203, -6808 };
    static const int    LW_ALIGN(16) PD_1_20[4]        = { (1<<20), (1<<20), (1<<20), (1<<20) };
    /* Byte shuffles that put the gathered Y / Cb / Cr words back in pixel order. */
    static const char   LW_ALIGN(16) LW48_SHUFFLE[3][16] = {
        { 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11 },
        { 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13 },
        { 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15 }
    };

    /* Constants, fetched once. */
    const __m128i k_bias  = _mm_load_si128( (const __m128i *)PW_32768 );
    const __m128i k_y_off = _mm_load_si128( (const __m128i *)PW_28672 );
    const __m128i k_y_mul = _mm_load_si128( (const __m128i *)PW_9539 );
    const __m128i k_g_mul = _mm_load_si128( (const __m128i *)PW_M3203_M6808 );
    const __m128i k_r_mul = _mm_load_si128( (const __m128i *)PW_13074 );
    const __m128i k_b_mul = _mm_load_si128( (const __m128i *)PW_16531 );
    const __m128i k_round = _mm_load_si128( (const __m128i *)PD_1_20 );

    /* Twenty-four 16-bit words: Y0 Cb0 Cr0 Y1 Cb1 Cr1 ... Y7 Cb7 Cr7. */
    const __m128i in0 = _mm_loadu_si128( (const __m128i *)(lw48_ptr +  0) );
    const __m128i in1 = _mm_loadu_si128( (const __m128i *)(lw48_ptr + 16) );
    const __m128i in2 = _mm_loadu_si128( (const __m128i *)(lw48_ptr + 32) );

    /* De-interleave: the blends collect the eight words of one component
     * (in scrambled order), the shuffle sorts them, and subtracting 32768
     * re-centres the unsigned samples into the signed 16-bit range. */
    const __m128i y  = _mm_sub_epi16(
        _mm_shuffle_epi8(
            _mm_blend_epi16( _mm_blend_epi16( in0, in1, 0x80+0x10+0x02 ), in2, 0x20+0x04 ),
            _mm_load_si128( (const __m128i *)LW48_SHUFFLE[0] ) ),
        k_bias );                                                   /* Y  */
    const __m128i cb = _mm_sub_epi16(
        _mm_shuffle_epi8(
            _mm_blend_epi16( _mm_blend_epi16( in0, in1, 0x20+0x04 ), in2, 0x40+0x08+0x01 ),
            _mm_load_si128( (const __m128i *)LW48_SHUFFLE[1] ) ),
        k_bias );                                                   /* Cb */
    const __m128i cr = _mm_sub_epi16(
        _mm_shuffle_epi8(
            _mm_blend_epi16( _mm_blend_epi16( in0, in1, 0x40+0x08+0x01 ), in2, 0x80+0x10+0x02 ),
            _mm_load_si128( (const __m128i *)LW48_SHUFFLE[2] ) ),
        k_bias );                                                   /* Cr */

    /* y_tmp = (y - 4096) * 9539
     *       = ((y - 32768) + (32768 - 4096)) * 9539
     *       = (y - 32768) * 9539 + 28672 * 9539
     * Both products come from one multiply-add on (y - 32768, 28672) pairs. */
    const __m128i y_lo = _mm_madd_epi16( _mm_unpacklo_epi16( y, k_y_off ), k_y_mul );
    const __m128i y_hi = _mm_madd_epi16( _mm_unpackhi_epi16( y, k_y_off ), k_y_mul );

    /* G = (y_tmp + (cb-32768) * -3203 + (cr-32768) * -6808 + (1<<20)) >> 21 */
    const __m128i g_lo = _mm_srai_epi32(
        _mm_add_epi32( _mm_add_epi32( _mm_madd_epi16( _mm_unpacklo_epi16( cb, cr ), k_g_mul ), y_lo ), k_round ), 21 );
    const __m128i g_hi = _mm_srai_epi32(
        _mm_add_epi32( _mm_add_epi32( _mm_madd_epi16( _mm_unpackhi_epi16( cb, cr ), k_g_mul ), y_hi ), k_round ), 21 );

    /* R = (y_tmp + (cr-32768) * 13074 + (1<<20)) >> 21
     * The 32-bit products are rebuilt from their low and high 16-bit halves. */
    const __m128i r_mul_lo = _mm_mullo_epi16( cr, k_r_mul );
    const __m128i r_mul_hi = _mm_mulhi_epi16( cr, k_r_mul );
    const __m128i r_lo = _mm_srai_epi32(
        _mm_add_epi32( _mm_add_epi32( _mm_unpacklo_epi16( r_mul_lo, r_mul_hi ), y_lo ), k_round ), 21 );
    const __m128i r_hi = _mm_srai_epi32(
        _mm_add_epi32( _mm_add_epi32( _mm_unpackhi_epi16( r_mul_lo, r_mul_hi ), y_hi ), k_round ), 21 );

    /* B = (y_tmp + (cb-32768) * 16531 + (1<<20)) >> 21 */
    const __m128i b_mul_lo = _mm_mullo_epi16( cb, k_b_mul );
    const __m128i b_mul_hi = _mm_mulhi_epi16( cb, k_b_mul );
    const __m128i b_lo = _mm_srai_epi32(
        _mm_add_epi32( _mm_add_epi32( _mm_unpacklo_epi16( b_mul_lo, b_mul_hi ), y_lo ), k_round ), 21 );
    const __m128i b_hi = _mm_srai_epi32(
        _mm_add_epi32( _mm_add_epi32( _mm_unpackhi_epi16( b_mul_lo, b_mul_hi ), y_hi ), k_round ), 21 );

    /* Narrow to 16 bits with signed saturation and write the three planes. */
    _mm_store_si128( (__m128i *)(rgb_buffer + 16), _mm_packs_epi32( g_lo, g_hi ) );
    _mm_store_si128( (__m128i *)(rgb_buffer + 32), _mm_packs_epi32( r_lo, r_hi ) );
    _mm_store_si128( (__m128i *)(rgb_buffer +  0), _mm_packs_epi32( b_lo, b_hi ) );
}

// Accumulates, row by row, two vertical "combing" energies for 4-byte pixels:
//
//   var      : sum of (A + C - 2*B)^2, with A, B, C taken from three
//              consecutive rows of src1 (combing within the current frame)
//   varshift : sum of (A + C - 2*E)^2, with E taken from the row of src2
//              that lines up with B (combing if that row came from src2)
//
// Only the first three bytes of every pixel take part; the fourth byte is
// masked out.
//
// src1, src2  top-left of the two images; both use the same pitch.
// srcpitch    distance in bytes between consecutive rows.
// w           row width in 4-byte pixels.
// h           number of row positions to evaluate. Position i reads rows
//             i, i+1 and i+2 of src1 and row i+1 of src2, so src1 needs h+2
//             readable rows and src2 needs h+1.
//
// Returns the totals, alternating between index 0 and index 1 of
// mVar / mVarShift on successive rows, starting with index 0. When h is 0
// nothing is read and a zeroed score is returned.
IVTCScore ComputeScanImprovement_X8R8G8B8_SSE2(const void *src1, const void *src2, ptrdiff_t srcpitch, uint32 w, uint32 h) {
	IVTCScore score = {0};

	const __m128i zero = _mm_setzero_si128();

	// Keeps three 16-bit lanes out of every four, i.e. drops the fourth byte
	// of each pixel once the bytes have been widened to words. Built with an
	// intrinsic rather than a brace initializer: the layout of __m128i is
	// compiler-specific, so an aggregate initializer is not portable.
	const __m128i mask = _mm_setr_epi16(-1, -1, -1, 0, -1, -1, -1, 0);

	const uint32 pairs = w >> 1;		// two pixels (8 bytes) per vector step
	bool firstfield = true;

	// Counted loop so that h == 0 performs no iterations (the previous
	// do/while(--h) form wrapped around in that case).
	for(uint32 rowsleft = h; rowsleft != 0; --rowsleft) {
		__m128i var = zero;
		__m128i varshift = zero;

		const uint8 *src1r0 = (const uint8 *)src1;
		const uint8 *src1r1 = src1r0 + srcpitch;
		const uint8 *src1r2 = src1r1 + srcpitch;
		const uint8 *src2r = (const uint8 *)src2 + srcpitch;

		for(uint32 x=0; x<pairs; ++x) {
			// Widen two pixels per row to 16-bit lanes.
			__m128i rA = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src1r0), zero);
			__m128i rB = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src1r1), zero);
			__m128i rC = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src1r2), zero);
			__m128i rE = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src2r), zero);
			__m128i rAC = _mm_add_epi16(rA, rC);
			__m128i d1 = _mm_sub_epi16(rAC, _mm_add_epi16(rB, rB));		// combing in current frame
			__m128i d3 = _mm_sub_epi16(rAC, _mm_add_epi16(rE, rE));		// combing in merged frame

			d1 = _mm_and_si128(d1, mask);
			d3 = _mm_and_si128(d3, mask);

			// Square and accumulate in 32-bit lanes.
			var = _mm_add_epi32(var, _mm_madd_epi16(d1, d1));
			varshift = _mm_add_epi32(varshift, _mm_madd_epi16(d3, d3));

			src1r0 += 8;
			src1r1 += 8;
			src1r2 += 8;
			src2r += 8;
		}

		// Odd width: one trailing pixel (4 bytes) per row.
		if (w & 1) {
			__m128i rA = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)src1r0), zero);
			__m128i rB = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)src1r1), zero);
			__m128i rC = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)src1r2), zero);
			__m128i rE = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)src2r), zero);
			__m128i rAC = _mm_add_epi16(rA, rC);
			__m128i d1 = _mm_sub_epi16(rAC, _mm_add_epi16(rB, rB));		// combing in current frame
			__m128i d3 = _mm_sub_epi16(rAC, _mm_add_epi16(rE, rE));		// combing in merged frame

			d1 = _mm_and_si128(d1, mask);
			d3 = _mm_and_si128(d3, mask);

			var = _mm_add_epi32(var, _mm_madd_epi16(d1, d1));
			varshift = _mm_add_epi32(varshift, _mm_madd_epi16(d3, d3));
		}

		src1 = (const uint8 *)src1 + srcpitch;
		src2 = (const uint8 *)src2 + srcpitch;

		// Horizontal sum of the four 32-bit lanes into lane 0: first fold the
		// upper half onto the lower half (0xee), then add lane 1 (0x55).
		var = _mm_add_epi32(var, _mm_shuffle_epi32(var, 0xee));
		varshift = _mm_add_epi32(varshift, _mm_shuffle_epi32(varshift, 0xee));
		var = _mm_add_epi32(var, _mm_shuffle_epi32(var, 0x55));
		varshift = _mm_add_epi32(varshift, _mm_shuffle_epi32(varshift, 0x55));

		uint32 ivar = _mm_cvtsi128_si32(var);
		uint32 ivarshift = _mm_cvtsi128_si32(varshift);

		// Rows alternate between the two accumulator slots.
		if (firstfield) {
			score.mVar[0] += ivar;
			score.mVarShift[0] += ivarshift;
		} else {
			score.mVar[1] += ivar;
			score.mVarShift[1] += ivarshift;
		}

		firstfield = !firstfield;
	}

	return score;
}

void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, // as the first pass results are transposed, we tranpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). const int stride = pitch >> 1; int pass; // We need an intermediate buffer between passes. int16_t intermediate[256]; int16_t *in = input; int16_t *out = intermediate; // Constants // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); const __m128i 
k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i kOne = _mm_set1_epi16(1); // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { // We process eight columns (transposed rows in second pass) at a time. int column_start; for (column_start = 0; column_start < 16; column_start += 8) { __m128i in00, in01, in02, in03, in04, in05, in06, in07; __m128i in08, in09, in10, in11, in12, in13, in14, in15; __m128i input0, input1, input2, input3, input4, input5, input6, input7; __m128i step1_0, step1_1, step1_2, step1_3; __m128i step1_4, step1_5, step1_6, step1_7; __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; __m128i step3_0, step3_1, step3_2, step3_3; __m128i step3_4, step3_5, step3_6, step3_7; __m128i res00, res01, res02, res03, res04, res05, res06, res07; __m128i res08, res09, res10, res11, res12, res13, res14, res15; // Load and pre-condition input. 
if (0 == pass) { in00 = _mm_loadu_si128((const __m128i *)(in + 0 * stride)); in01 = _mm_loadu_si128((const __m128i *)(in + 1 * stride)); in02 = _mm_loadu_si128((const __m128i *)(in + 2 * stride)); in03 = _mm_loadu_si128((const __m128i *)(in + 3 * stride)); in04 = _mm_loadu_si128((const __m128i *)(in + 4 * stride)); in05 = _mm_loadu_si128((const __m128i *)(in + 5 * stride)); in06 = _mm_loadu_si128((const __m128i *)(in + 6 * stride)); in07 = _mm_loadu_si128((const __m128i *)(in + 7 * stride)); in08 = _mm_loadu_si128((const __m128i *)(in + 8 * stride)); in09 = _mm_loadu_si128((const __m128i *)(in + 9 * stride)); in10 = _mm_loadu_si128((const __m128i *)(in + 10 * stride)); in11 = _mm_loadu_si128((const __m128i *)(in + 11 * stride)); in12 = _mm_loadu_si128((const __m128i *)(in + 12 * stride)); in13 = _mm_loadu_si128((const __m128i *)(in + 13 * stride)); in14 = _mm_loadu_si128((const __m128i *)(in + 14 * stride)); in15 = _mm_loadu_si128((const __m128i *)(in + 15 * stride)); // x = x << 2 in00 = _mm_slli_epi16(in00, 2); in01 = _mm_slli_epi16(in01, 2); in02 = _mm_slli_epi16(in02, 2); in03 = _mm_slli_epi16(in03, 2); in04 = _mm_slli_epi16(in04, 2); in05 = _mm_slli_epi16(in05, 2); in06 = _mm_slli_epi16(in06, 2); in07 = _mm_slli_epi16(in07, 2); in08 = _mm_slli_epi16(in08, 2); in09 = _mm_slli_epi16(in09, 2); in10 = _mm_slli_epi16(in10, 2); in11 = _mm_slli_epi16(in11, 2); in12 = _mm_slli_epi16(in12, 2); in13 = _mm_slli_epi16(in13, 2); in14 = _mm_slli_epi16(in14, 2); in15 = _mm_slli_epi16(in15, 2); } else { in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 16)); in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 16)); in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 16)); in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 16)); in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 16)); in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 16)); in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 16)); in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 16)); in08 = _mm_loadu_si128((const 
__m128i *)(in + 8 * 16)); in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 16)); in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 16)); in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 16)); in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 16)); in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 16)); in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 16)); in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 16)); // x = (x + 1) >> 2 in00 = _mm_add_epi16(in00, kOne); in01 = _mm_add_epi16(in01, kOne); in02 = _mm_add_epi16(in02, kOne); in03 = _mm_add_epi16(in03, kOne); in04 = _mm_add_epi16(in04, kOne); in05 = _mm_add_epi16(in05, kOne); in06 = _mm_add_epi16(in06, kOne); in07 = _mm_add_epi16(in07, kOne); in08 = _mm_add_epi16(in08, kOne); in09 = _mm_add_epi16(in09, kOne); in10 = _mm_add_epi16(in10, kOne); in11 = _mm_add_epi16(in11, kOne); in12 = _mm_add_epi16(in12, kOne); in13 = _mm_add_epi16(in13, kOne); in14 = _mm_add_epi16(in14, kOne); in15 = _mm_add_epi16(in15, kOne); in00 = _mm_srai_epi16(in00, 2); in01 = _mm_srai_epi16(in01, 2); in02 = _mm_srai_epi16(in02, 2); in03 = _mm_srai_epi16(in03, 2); in04 = _mm_srai_epi16(in04, 2); in05 = _mm_srai_epi16(in05, 2); in06 = _mm_srai_epi16(in06, 2); in07 = _mm_srai_epi16(in07, 2); in08 = _mm_srai_epi16(in08, 2); in09 = _mm_srai_epi16(in09, 2); in10 = _mm_srai_epi16(in10, 2); in11 = _mm_srai_epi16(in11, 2); in12 = _mm_srai_epi16(in12, 2); in13 = _mm_srai_epi16(in13, 2); in14 = _mm_srai_epi16(in14, 2); in15 = _mm_srai_epi16(in15, 2); } in += 8; // Calculate input for the first 8 results. { input0 = _mm_add_epi16(in00, in15); input1 = _mm_add_epi16(in01, in14); input2 = _mm_add_epi16(in02, in13); input3 = _mm_add_epi16(in03, in12); input4 = _mm_add_epi16(in04, in11); input5 = _mm_add_epi16(in05, in10); input6 = _mm_add_epi16(in06, in09); input7 = _mm_add_epi16(in07, in08); } // Calculate input for the next 8 results. 
{ step1_0 = _mm_sub_epi16(in07, in08); step1_1 = _mm_sub_epi16(in06, in09); step1_2 = _mm_sub_epi16(in05, in10); step1_3 = _mm_sub_epi16(in04, in11); step1_4 = _mm_sub_epi16(in03, in12); step1_5 = _mm_sub_epi16(in02, in13); step1_6 = _mm_sub_epi16(in01, in14); step1_7 = _mm_sub_epi16(in00, in15); } // Work on the first eight values; fdct8_1d(input, even_results); { // Add/substract const __m128i q0 = _mm_add_epi16(input0, input7); const __m128i q1 = _mm_add_epi16(input1, input6); const __m128i q2 = _mm_add_epi16(input2, input5); const __m128i q3 = _mm_add_epi16(input3, input4); const __m128i q4 = _mm_sub_epi16(input3, input4); const __m128i q5 = _mm_sub_epi16(input2, input5); const __m128i q6 = _mm_sub_epi16(input1, input6); const __m128i q7 = _mm_sub_epi16(input0, input7); // Work on first four results { // Add/substract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); const __m128i r3 = _mm_sub_epi16(q0, q3); // Interleave to do the multiply by constants which gets us // into 32 bits. 
const __m128i t0 = _mm_unpacklo_epi16(r0, r1); const __m128i t1 = _mm_unpackhi_epi16(r0, r1); const __m128i t2 = _mm_unpacklo_epi16(r2, r3); const __m128i t3 = _mm_unpackhi_epi16(r2, r3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res00 = _mm_packs_epi32(w0, w1); res08 = _mm_packs_epi32(w2, w3); res04 = _mm_packs_epi32(w4, w5); res12 = _mm_packs_epi32(w6, w7); } // Work on next four results { // Interleave to do the multiply by constants which gets us // into 32 bits. 
const __m128i d0 = _mm_unpacklo_epi16(q6, q5); const __m128i d1 = _mm_unpackhi_epi16(q6, q5); const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); // dct_const_round_shift const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); // Add/substract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = _mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); const __m128i x3 = _mm_add_epi16(q7, r1); // Interleave to do the multiply by constants which gets us // into 32 bits. 
const __m128i t0 = _mm_unpacklo_epi16(x0, x3); const __m128i t1 = _mm_unpackhi_epi16(x0, x3); const __m128i t2 = _mm_unpacklo_epi16(x1, x2); const __m128i t3 = _mm_unpackhi_epi16(x1, x2); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res02 = _mm_packs_epi32(w0, w1); res14 = _mm_packs_epi32(w2, w3); res10 = _mm_packs_epi32(w4, w5); res06 = _mm_packs_epi32(w6, w7); } } // Work on the next eight values; step1 -> odd_results { // step 2 { const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); const __m128i 
t3 = _mm_unpackhi_epi16(step1_4, step1_3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_2 = _mm_packs_epi32(w0, w1); step2_3 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_5 = _mm_packs_epi32(w0, w1); step2_4 = _mm_packs_epi32(w2, w3); } // step 3 { step3_0 = _mm_add_epi16(step1_0, step2_3); step3_1 = _mm_add_epi16(step1_1, step2_2); step3_2 = 
_mm_sub_epi16(step1_1, step2_2); step3_3 = _mm_sub_epi16(step1_0, step2_3); step3_4 = _mm_sub_epi16(step1_7, step2_4); step3_5 = _mm_sub_epi16(step1_6, step2_5); step3_6 = _mm_add_epi16(step1_6, step2_5); step3_7 = _mm_add_epi16(step1_7, step2_4); } // step 4 { const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_1 = _mm_packs_epi32(w0, w1); step2_2 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, 
k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_6 = _mm_packs_epi32(w0, w1); step2_5 = _mm_packs_epi32(w2, w3); } // step 5 { step1_0 = _mm_add_epi16(step3_0, step2_1); step1_1 = _mm_sub_epi16(step3_0, step2_1); step1_2 = _mm_sub_epi16(step3_3, step2_2); step1_3 = _mm_add_epi16(step3_3, step2_2); step1_4 = _mm_add_epi16(step3_4, step2_5); step1_5 = _mm_sub_epi16(step3_4, step2_5); step1_6 = _mm_sub_epi16(step3_7, step2_6); step1_7 = _mm_add_epi16(step3_7, step2_6); } // step 6 { const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res01 = _mm_packs_epi32(w0, w1); res09 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); const __m128i u0 = _mm_madd_epi16(t0, 
k__cospi_p22_p10); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res05 = _mm_packs_epi32(w0, w1); res13 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res11 = _mm_packs_epi32(w0, w1); res03 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); const __m128i t3 = 
_mm_unpackhi_epi16(step1_1, step1_6); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res15 = _mm_packs_epi32(w0, w1); res07 = _mm_packs_epi32(w2, w3); } } // Transpose the results, do it as two 8x8 transposes. { // 00 01 02 03 04 05 06 07 // 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 // 30 31 32 33 34 35 36 37 // 40 41 42 43 44 45 46 47 // 50 51 52 53 54 55 56 57 // 60 61 62 63 64 65 66 67 // 70 71 72 73 74 75 76 77 const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01); const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03); const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01); const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03); const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05); const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07); const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05); const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 // 04 14 05 15 06 16 07 17 // 24 34 25 35 26 36 27 37 // 40 50 41 51 42 52 43 53 // 60 70 61 71 62 72 63 73 // 54 54 55 55 56 56 57 57 // 64 74 65 75 66 76 67 77 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 
const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); // 00 10 20 30 01 11 21 31 // 40 50 60 70 41 51 61 71 // 02 12 22 32 03 13 23 33 // 42 52 62 72 43 53 63 73 // 04 14 24 34 05 15 21 36 // 44 54 64 74 45 55 61 76 // 06 16 26 36 07 17 27 37 // 46 56 66 76 47 57 67 77 const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 // 07 17 27 37 47 57 67 77 _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0); _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1); _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2); _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3); _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4); _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5); _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6); _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7); } { // 00 01 02 03 04 05 06 07 // 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 // 30 31 32 33 34 35 36 37 // 40 41 42 43 44 45 46 47 // 50 51 52 53 54 55 56 57 // 60 61 62 63 64 65 66 67 // 70 71 72 73 74 75 76 77 const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09); const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11); const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09); const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11); const __m128i tr0_4 = 
_mm_unpacklo_epi16(res12, res13); const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15); const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13); const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 // 04 14 05 15 06 16 07 17 // 24 34 25 35 26 36 27 37 // 40 50 41 51 42 52 43 53 // 60 70 61 71 62 72 63 73 // 54 54 55 55 56 56 57 57 // 64 74 65 75 66 76 67 77 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); // 00 10 20 30 01 11 21 31 // 40 50 60 70 41 51 61 71 // 02 12 22 32 03 13 23 33 // 42 52 62 72 43 53 63 73 // 04 14 24 34 05 15 21 36 // 44 54 64 74 45 55 61 76 // 06 16 26 36 07 17 27 37 // 46 56 66 76 47 57 67 77 const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 // 07 17 27 37 47 57 67 77 // Store results _mm_storeu_si128((__m128i *)(out + 8 + 0 * 16), tr2_0); _mm_storeu_si128((__m128i *)(out + 8 + 1 * 16), tr2_1); _mm_storeu_si128((__m128i *)(out + 8 + 2 * 16), tr2_2); _mm_storeu_si128((__m128i *)(out + 8 + 3 * 16), tr2_3); 
_mm_storeu_si128((__m128i *)(out + 8 + 4 * 16), tr2_4); _mm_storeu_si128((__m128i *)(out + 8 + 5 * 16), tr2_5); _mm_storeu_si128((__m128i *)(out + 8 + 6 * 16), tr2_6); _mm_storeu_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); } out += 8*16; } // Setup in/out for next pass. in = intermediate; out = output; } }

// Computes a score for the pixel at `ptr` from the 25 differences
// d[k] = ptr[0] - ptr[pixel[k]], k = 0..24 (presumably the 16-pixel FAST
// circle with its first 9 offsets repeated -- confirm against the code that
// builds `pixel`).
//
// Over windows of 9 consecutive differences the function finds the largest
// window minimum and the smallest window maximum, and returns
// max(largest minimum, -(smallest maximum)) - 1.
//
//   ptr       - address of the centre pixel.
//   pixel     - at least 25 offsets, each added to `ptr` before dereferencing.
//   threshold - starting lower bound for the scalar path; the SSE2 path
//               overwrites it without reading it.
//
// NOTE(review): standard C++ requires a `template<>` introducer (and a visible
// primary template) for an explicit specialization; neither appears next to
// this definition -- confirm they exist elsewhere in the file.
int cornerScore<16>(const uchar* ptr, const int pixel[], int threshold)
{
    const int K = 8, N = K*3 + 1;
    int k, v = ptr[0];
    short d[N];
    for( k = 0; k < N; k++ )
        d[k] = (short)(v - ptr[pixel[k]]);

#if CV_SSE2
    // q0 tracks the running maximum of window minima, q1 the running minimum
    // of window maxima; each of the 8 lanes handles one window start.
    __m128i q0 = _mm_set1_epi16(-1000), q1 = _mm_set1_epi16(1000);
    for( k = 0; k < 16; k += 8 )
    {
        // Min/max of d[k+1 .. k+8], shared by the two 9-wide windows that
        // extend it by d[k] on the left or d[k+9] on the right.
        __m128i v0 = _mm_loadu_si128((__m128i*)(d+k+1));
        __m128i a = v0;
        __m128i b = v0;
        for( int j = 2; j <= K; j++ )
        {
            v0 = _mm_loadu_si128((__m128i*)(d+k+j));
            a = _mm_min_epi16(a, v0);
            b = _mm_max_epi16(b, v0);
        }

        // Window d[k .. k+8].
        v0 = _mm_loadu_si128((__m128i*)(d+k));
        q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0));
        q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0));

        // Window d[k+1 .. k+9].
        v0 = _mm_loadu_si128((__m128i*)(d+k+9));
        q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0));
        q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0));
    }

    // Combine both polarities, then reduce the 8 lanes to their maximum in
    // lane 0 (halve the active width each step: 64, 32, 16 bits).
    q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1));
    q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0));
    q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4));
    q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2));
    threshold = (short)_mm_cvtsi128_si32(q0) - 1;
#else
    // Largest window minimum, starting from the caller's threshold.
    int a0 = threshold;
    for( k = 0; k < 16; k += 2 )
    {
        int a = std::min((int)d[k+1], (int)d[k+2]);
        a = std::min(a, (int)d[k+3]);
        // The window minimum can only shrink further; skip if it already
        // cannot beat the current best.
        if( a <= a0 )
            continue;
        a = std::min(a, (int)d[k+4]);
        a = std::min(a, (int)d[k+5]);
        a = std::min(a, (int)d[k+6]);
        a = std::min(a, (int)d[k+7]);
        a = std::min(a, (int)d[k+8]);
        a0 = std::max(a0, std::min(a, (int)d[k]));
        a0 = std::max(a0, std::min(a, (int)d[k+9]));
    }

    // Smallest window maximum, bounded above by -a0 so both polarities end up
    // folded into b0.
    int b0 = -a0;
    for( k = 0; k < 16; k += 2 )
    {
        int b = std::max((int)d[k+1], (int)d[k+2]);
        b = std::max(b, (int)d[k+3]);
        b = std::max(b, (int)d[k+4]);
        b = std::max(b, (int)d[k+5]);
        // The window maximum can only grow further; skip if it already
        // cannot go below the current best.
        if( b >= b0 )
            continue;
        b = std::max(b, (int)d[k+6]);
        b = std::max(b, (int)d[k+7]);
        b = std::max(b, (int)d[k+8]);
        b0 = std::min(b0, std::max(b, (int)d[k]));
        b0 = std::min(b0, std::max(b, (int)d[k+9]));
    }

    threshold = -b0-1;
#endif

    return threshold;
}

// Quantizes eight 16-bit coefficients: |coeff| is biased by `round_v` with
// signed saturation, scaled by keeping the high 16 bits of the product with
// `quant_v`, and then given back the sign of the input.
static __m128i fp_quantize8_sse2(__m128i coeff, __m128i round_v,
                                 __m128i quant_v) {
  // Arithmetic shift by 15 replicates the sign bit: 0 or -1 per lane.
  const __m128i sign = _mm_srai_epi16(coeff, 15);
  // (x ^ sign) - sign negates the lanes where sign == -1, i.e. takes |x|.
  const __m128i abs_coeff = _mm_sub_epi16(_mm_xor_si128(coeff, sign), sign);
  const __m128i biased = _mm_adds_epi16(abs_coeff, round_v);
  const __m128i scaled = _mm_mulhi_epi16(biased, quant_v);
  // The same xor/sub pair reinserts the original sign.
  return _mm_sub_epi16(_mm_xor_si128(scaled, sign), sign);
}

// Per-lane end-of-block candidate: iscan + 1 where `dqcoeff` is non-zero,
// 0 elsewhere.
static __m128i fp_eob_candidates8_sse2(__m128i dqcoeff, __m128i iscan,
                                       __m128i zero) {
  const __m128i is_zero = _mm_cmpeq_epi16(dqcoeff, zero);
  const __m128i is_nonzero = _mm_cmpeq_epi16(is_zero, zero);
  // Subtracting the all-ones mask adds one: converts indices to counts.
  const __m128i count = _mm_sub_epi16(iscan, is_nonzero);
  return _mm_and_si128(count, is_nonzero);
}

// Quantizes and dequantizes `n_coeffs` coefficients, 16 per iteration.
//
//   coeff_ptr            - input coefficients.
//   n_coeffs             - number of coefficients; values <= 0 store 0 to
//                          *eob_ptr and touch nothing else. The loops advance
//                          16 at a time with no scalar tail, so this assumes a
//                          multiple of 16 -- confirm against callers.
//   skip_block           - non-zero: zero both outputs and store 0 to *eob_ptr.
//   round_ptr, quant_ptr, dequant_ptr
//                        - 8-entry tables. Lanes 0..7 apply to the first eight
//                          coefficients; lanes 4..7 are then broadcast and
//                          applied to every remaining coefficient.
//   qcoeff_ptr           - receives the signed quantized coefficients.
//   dqcoeff_ptr          - receives the low 16 bits of qcoeff * dequant.
//   eob_ptr              - receives max(iscan[i] + 1) over all i whose
//                          dequantized value is non-zero, or 0 if there is
//                          none.
//   iscan_ptr            - per-coefficient scan position used for *eob_ptr.
//   zbin_ptr, quant_shift_ptr, scan_ptr
//                        - unused.
//
// All loads and stores except *eob_ptr are aligned 128-bit accesses, so the
// table, input and output pointers must be 16-byte aligned.
void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
                          int skip_block, const int16_t* zbin_ptr,
                          const int16_t* round_ptr, const int16_t* quant_ptr,
                          const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
                          int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
                          uint16_t* eob_ptr, const int16_t* scan_ptr,
                          const int16_t* iscan_ptr) {
  const __m128i zero = _mm_setzero_si128();
  (void)scan_ptr;
  (void)zbin_ptr;
  (void)quant_shift_ptr;

  // Empty block: the passes below always process at least 16 coefficients,
  // so bail out before touching any buffer.
  if (n_coeffs <= 0) {
    *eob_ptr = 0;
    return;
  }

  // Point every buffer one past its end and walk a negative offset up to 0,
  // so a single counter serves as both index and loop condition.
  coeff_ptr += n_coeffs;
  iscan_ptr += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;

  if (!skip_block) {
    __m128i rnd = _mm_load_si128((const __m128i*)round_ptr);
    __m128i qnt = _mm_load_si128((const __m128i*)quant_ptr);
    __m128i deq = _mm_load_si128((const __m128i*)dequant_ptr);
    __m128i eob;

    // DC and first 15 AC coefficients.
    {
      __m128i coeff0, coeff1;
      __m128i qcoeff0, qcoeff1;
      __m128i dqcoeff0, dqcoeff1;
      __m128i iscan0, iscan1;

      coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
      coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);

      // The first vector uses the tables as loaded; afterwards the upper
      // half is broadcast so every later lane sees the same entries.
      qcoeff0 = fp_quantize8_sse2(coeff0, rnd, qnt);
      rnd = _mm_unpackhi_epi64(rnd, rnd);
      qnt = _mm_unpackhi_epi64(qnt, qnt);
      qcoeff1 = fp_quantize8_sse2(coeff1, rnd, qnt);

      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);

      dqcoeff0 = _mm_mullo_epi16(qcoeff0, deq);
      deq = _mm_unpackhi_epi64(deq, deq);
      dqcoeff1 = _mm_mullo_epi16(qcoeff1, deq);

      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), dqcoeff0);
      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, dqcoeff1);

      // Scan for eob (based on the dequantized values).
      iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
      iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
      eob = _mm_max_epi16(fp_eob_candidates8_sse2(dqcoeff0, iscan0, zero),
                          fp_eob_candidates8_sse2(dqcoeff1, iscan1, zero));

      n_coeffs += 8 * 2;
    }

    // AC only loop.
    while (n_coeffs < 0) {
      __m128i coeff0, coeff1;
      __m128i qcoeff0, qcoeff1;
      __m128i dqcoeff0, dqcoeff1;
      __m128i iscan0, iscan1;
      __m128i eob0;

      coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
      coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);

      qcoeff0 = fp_quantize8_sse2(coeff0, rnd, qnt);
      qcoeff1 = fp_quantize8_sse2(coeff1, rnd, qnt);

      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);

      dqcoeff0 = _mm_mullo_epi16(qcoeff0, deq);
      dqcoeff1 = _mm_mullo_epi16(qcoeff1, deq);

      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), dqcoeff0);
      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, dqcoeff1);

      // Scan for eob.
      iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
      iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
      eob0 = _mm_max_epi16(fp_eob_candidates8_sse2(dqcoeff0, iscan0, zero),
                           fp_eob_candidates8_sse2(dqcoeff1, iscan1, zero));
      eob = _mm_max_epi16(eob, eob0);

      n_coeffs += 8 * 2;
    }

    // Accumulate EOB: fold the eight lanes down to their maximum. After the
    // three shuffle/max steps lanes 0 and 1 both hold the overall maximum.
    {
      __m128i eob_shuffled;
      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
      eob = _mm_max_epi16(eob, eob_shuffled);
      *eob_ptr = (uint16_t)_mm_extract_epi16(eob, 1);
    }
  } else {
    // Skipped block: both outputs are all zero and there is no end of block.
    do {
      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
  }
}

static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { const __m128i zero = _mm_setzero_si128(); const __m128i seven = _mm_set1_epi16(7); const __m128i k937 = _mm_set1_epi32(937); const __m128i k1812 = _mm_set1_epi32(1812); const __m128i k51000 = _mm_set1_epi32(51000); const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217); const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352); const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8); const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8); const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352, 2217, 5352, 2217, 5352); const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217, -5352, 2217, -5352, 2217); __m128i v01, v32; // Difference between src and ref and initial transpose. { // Load src and convert to 16b. const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]); const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]); const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]); const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); // Load ref and convert to 16b. const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); // Compute difference. 
-> 00 01 02 03 00 00 00 00 const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); // Unpack and shuffle // 00 01 02 03 0 0 0 0 // 10 11 12 13 0 0 0 0 // 20 21 22 23 0 0 0 0 // 30 31 32 33 0 0 0 0 const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1); const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3); // 00 01 10 11 02 03 12 13 // 20 21 30 31 22 23 32 33 const __m128i shuf01_p = _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1)); const __m128i shuf23_p = _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1)); // 00 01 10 11 03 02 13 12 // 20 21 30 31 23 22 33 32 const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p); const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p); // 00 01 10 11 20 21 30 31 // 03 02 13 12 23 22 33 32 const __m128i a01 = _mm_add_epi16(s01, s32); const __m128i a32 = _mm_sub_epi16(s01, s32); // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ] // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ] const __m128i tmp0 = _mm_madd_epi16(a01, k88p); // [ (a0 + a1) << 3, ... ] const __m128i tmp2 = _mm_madd_epi16(a01, k88m); // [ (a0 - a1) << 3, ... ] const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p); const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m); const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812); const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937); const __m128i tmp1 = _mm_srai_epi32(tmp1_2, 9); const __m128i tmp3 = _mm_srai_epi32(tmp3_2, 9); const __m128i s03 = _mm_packs_epi32(tmp0, tmp2); const __m128i s12 = _mm_packs_epi32(tmp1, tmp3); const __m128i s_lo = _mm_unpacklo_epi16(s03, s12); // 0 1 0 1 0 1... const __m128i s_hi = _mm_unpackhi_epi16(s03, s12); // 2 3 2 3 2 3 const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi); v01 = _mm_unpacklo_epi32(s_lo, s_hi); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2.. 
} // Second pass { // Same operations are done on the (0,3) and (1,2) pairs. // a0 = v0 + v3 // a1 = v1 + v2 // a3 = v0 - v3 // a2 = v1 - v2 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i a11 = _mm_unpackhi_epi64(a01, a01); const __m128i a22 = _mm_unpackhi_epi64(a32, a32); const __m128i a01_plus_7 = _mm_add_epi16(a01, seven); // d0 = (a0 + a1 + 7) >> 4; // d2 = (a0 - a1 + 7) >> 4; const __m128i c0 = _mm_add_epi16(a01_plus_7, a11); const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11); const __m128i d0 = _mm_srai_epi16(c0, 4); const __m128i d2 = _mm_srai_epi16(c2, 4); // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) const __m128i b23 = _mm_unpacklo_epi16(a22, a32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); const __m128i d3 = _mm_add_epi32(c3, k51000); const __m128i e1 = _mm_srai_epi32(d1, 16); const __m128i e3 = _mm_srai_epi32(d3, 16); const __m128i f1 = _mm_packs_epi32(e1, e1); const __m128i f3 = _mm_packs_epi32(e3, e3); // f1 = f1 + (a3 != 0); // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the // desired (0, 1), we add one earlier through k12000_plus_one. // -> f1 = f1 + 1 - (a3 == 0) const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1); const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3); _mm_storeu_si128((__m128i*)&out[0], d0_g1); _mm_storeu_si128((__m128i*)&out[8], d2_f3); } }

static int cornerScore(const uchar* ptr, const int pixel[], int threshold) { const int K = 8, N = 16 + K + 1; int k, v = ptr[0]; short d[N]; for( k = 0; k < N; k++ ) d[k] = (short)(v - ptr[pixel[k]]); #if CV_SSE2 __m128i q0 = _mm_set1_epi16(-1000), q1 = _mm_set1_epi16(1000); for( k = 0; k < 16; k += 8 ) { __m128i v0 = _mm_loadu_si128((__m128i*)(d+k+1)); __m128i v1 = _mm_loadu_si128((__m128i*)(d+k+2)); __m128i a = _mm_min_epi16(v0, v1); __m128i b = _mm_max_epi16(v0, v1); v0 = _mm_loadu_si128((__m128i*)(d+k+3)); a = _mm_min_epi16(a, v0); b = _mm_max_epi16(b, v0); v0 = _mm_loadu_si128((__m128i*)(d+k+4)); a = _mm_min_epi16(a, v0); b = _mm_max_epi16(b, v0); v0 = _mm_loadu_si128((__m128i*)(d+k+5)); a = _mm_min_epi16(a, v0); b = _mm_max_epi16(b, v0); v0 = _mm_loadu_si128((__m128i*)(d+k+6)); a = _mm_min_epi16(a, v0); b = _mm_max_epi16(b, v0); v0 = _mm_loadu_si128((__m128i*)(d+k+7)); a = _mm_min_epi16(a, v0); b = _mm_max_epi16(b, v0); v0 = _mm_loadu_si128((__m128i*)(d+k+8)); a = _mm_min_epi16(a, v0); b = _mm_max_epi16(b, v0); v0 = _mm_loadu_si128((__m128i*)(d+k)); q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0)); q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0)); v0 = _mm_loadu_si128((__m128i*)(d+k+9)); q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0)); q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0)); } q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1)); q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0)); q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4)); q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2)); threshold = (short)_mm_cvtsi128_si32(q0) - 1; #else int a0 = threshold; for( k = 0; k < 16; k += 2 ) { int a = std::min((int)d[k+1], (int)d[k+2]); a = std::min(a, (int)d[k+3]); if( a <= a0 ) continue; a = std::min(a, (int)d[k+4]); a = std::min(a, (int)d[k+5]); a = std::min(a, (int)d[k+6]); a = std::min(a, (int)d[k+7]); a = std::min(a, (int)d[k+8]); a0 = std::max(a0, std::min(a, (int)d[k])); a0 = std::max(a0, std::min(a, (int)d[k+9])); } int b0 = -a0; for( k = 0; k < 16; k += 2 ) { int 
b = std::max((int)d[k+1], (int)d[k+2]); b = std::max(b, (int)d[k+3]); b = std::max(b, (int)d[k+4]); b = std::max(b, (int)d[k+5]); if( b >= b0 ) continue; b = std::max(b, (int)d[k+6]); b = std::max(b, (int)d[k+7]); b = std::max(b, (int)d[k+8]); b0 = std::min(b0, std::max(b, (int)d[k])); b0 = std::min(b0, std::max(b, (int)d[k+9])); } threshold = -b0-1; #endif #if 0 // check that with the computed "threshold" the pixel is still a corner // and that with the increased-by-1 "threshold" the pixel is not a corner anymore for( int delta = 0; delta <= 1; delta++ ) { int v0 = std::min(ptr[0] + threshold + delta, 255); int v1 = std::max(ptr[0] - threshold - delta, 0); int c0 = 0, c1 = 0; for( int k = 0; k < N; k++ ) { int x = ptr[pixel[k]]; if(x > v0) { if( ++c0 > K ) break; c1 = 0; } else if( x < v1 ) { if( ++c1 > K ) break; c0 = 0; } else { c0 = c1 = 0; } } CV_Assert( (delta == 0 && std::max(c0, c1) > K) || (delta == 1 && std::max(c0, c1) <= K) ); } #endif return threshold; }

static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16], const uint16_t* const sharpen, const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL); const __m128i zero = _mm_setzero_si128(); __m128i coeff0, coeff8; __m128i out0, out8; __m128i packed_out; // Load all inputs. // TODO(cduvivier): Make variable declarations and allocations aligned so that // we can use _mm_load_si128 instead of _mm_loadu_si128. __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]); const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]); const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]); const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]); // extract sign(in) (0x0000 if positive, 0xffff if negative) const __m128i sign0 = _mm_cmpgt_epi16(zero, in0); const __m128i sign8 = _mm_cmpgt_epi16(zero, in8); // coeff = abs(in) = (in ^ sign) - sign coeff0 = _mm_xor_si128(in0, sign0); coeff8 = _mm_xor_si128(in8, sign8); coeff0 = _mm_sub_epi16(coeff0, sign0); coeff8 = _mm_sub_epi16(coeff8, sign8); // coeff = abs(in) + sharpen if (sharpen != NULL) { const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]); const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]); coeff0 = _mm_add_epi16(coeff0, sharpen0); coeff8 = _mm_add_epi16(coeff8, sharpen8); } // out = (coeff * iQ + B) >> QFIX { // doing calculations with 32b precision (QFIX=17) // out = (coeff * iQ) const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H); __m128i out_12 = 
_mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H); // out = (coeff * iQ + B) const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]); const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]); const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]); const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]); out_00 = _mm_add_epi32(out_00, bias_00); out_04 = _mm_add_epi32(out_04, bias_04); out_08 = _mm_add_epi32(out_08, bias_08); out_12 = _mm_add_epi32(out_12, bias_12); // out = QUANTDIV(coeff, iQ, B, QFIX) out_00 = _mm_srai_epi32(out_00, QFIX); out_04 = _mm_srai_epi32(out_04, QFIX); out_08 = _mm_srai_epi32(out_08, QFIX); out_12 = _mm_srai_epi32(out_12, QFIX); // pack result as 16b out0 = _mm_packs_epi32(out_00, out_04); out8 = _mm_packs_epi32(out_08, out_12); // if (coeff > 2047) coeff = 2047 out0 = _mm_min_epi16(out0, max_coeff_2047); out8 = _mm_min_epi16(out8, max_coeff_2047); } // get sign back (if (sign[j]) out_n = -out_n) out0 = _mm_xor_si128(out0, sign0); out8 = _mm_xor_si128(out8, sign8); out0 = _mm_sub_epi16(out0, sign0); out8 = _mm_sub_epi16(out8, sign8); // in = out * Q in0 = _mm_mullo_epi16(out0, q0); in8 = _mm_mullo_epi16(out8, q8); _mm_storeu_si128((__m128i*)&in[0], in0); _mm_storeu_si128((__m128i*)&in[8], in8); // zigzag the output before storing it. // // The zigzag pattern can almost be reproduced with a small sequence of // shuffles. After it, we only need to swap the 7th (ending up in third // position instead of twelfth) and 8th values. 
{ __m128i outZ0, outZ8; outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0)); outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0)); outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2)); outZ8 = _mm_shufflelo_epi16(out8, _MM_SHUFFLE(3, 0, 2, 1)); outZ8 = _mm_shuffle_epi32 (outZ8, _MM_SHUFFLE(3, 1, 2, 0)); outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0)); _mm_storeu_si128((__m128i*)&out[0], outZ0); _mm_storeu_si128((__m128i*)&out[8], outZ8); packed_out = _mm_packs_epi16(outZ0, outZ8); } { const int16_t outZ_12 = out[12]; const int16_t outZ_3 = out[3]; out[3] = outZ_12; out[12] = outZ_3; } // detect if all 'out' values are zeroes or not return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff); }

// Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. // w[] contains a row-major 4 by 4 symmetric matrix. static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; // Load and combine inputs. { const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]); const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]); const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]); // In SSE4.1, with gcc 4.8 at least (maybe other versions), // _mm_loadu_si128 is faster than _mm_loadl_epi64. But for the last lump // of inA and inB, _mm_loadl_epi64 is still used not to have an out of // bound read. const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]); const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]); const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]); const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]); const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0); const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1); const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2); const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3); tmp_0 = _mm_cvtepu8_epi16(inAB_0); tmp_1 = _mm_cvtepu8_epi16(inAB_1); tmp_2 = _mm_cvtepu8_epi16(inAB_2); tmp_3 = _mm_cvtepu8_epi16(inAB_3); // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 } // Vertical pass first to avoid a transpose (vertical and horizontal passes // are commutative because w/kWeightY is symmetric) and subsequent transpose. { // Calculate a and b (two 4x4 at once). 
const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3); } // Horizontal pass and difference of weighted sums. { // Load all inputs. const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]); const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]); // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // Separate the transforms of inA and inB. __m128i A_b0 = _mm_unpacklo_epi64(b0, b1); __m128i A_b2 = _mm_unpacklo_epi64(b2, b3); __m128i B_b0 = _mm_unpackhi_epi64(b0, b1); __m128i B_b2 = _mm_unpackhi_epi64(b2, b3); A_b0 = _mm_abs_epi16(A_b0); A_b2 = _mm_abs_epi16(A_b2); B_b0 = _mm_abs_epi16(B_b0); B_b2 = _mm_abs_epi16(B_b2); // weighted sums A_b0 = _mm_madd_epi16(A_b0, w_0); A_b2 = _mm_madd_epi16(A_b2, w_8); B_b0 = _mm_madd_epi16(B_b0, w_0); B_b2 = _mm_madd_epi16(B_b2, w_8); A_b0 = _mm_add_epi32(A_b0, A_b2); B_b0 = _mm_add_epi32(B_b0, B_b2); // difference of weighted sums A_b2 = _mm_sub_epi32(A_b0, B_b0); _mm_storeu_si128((__m128i*)&sum[0], A_b2); } return sum[0] + sum[1] + sum[2] + sum[3]; }

/*
 * Horizontal inverse DWT step for one pair of sub-bands.
 *
 * For each of the subband_width rows:
 *   dst[2n]     = l[n] - ((h[n-1] + h[n] + 1) >> 1)
 *   dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1)
 * with h[-1] mirrored to h[0] at the start of a row and dst[2n + 2]
 * mirrored to dst[2n] at the end of a row.
 *
 * l             - low-pass input, subband_width rows of subband_width values.
 *                 It is OVERWRITTEN with the even output coefficients.
 * h             - high-pass input, same geometry as l (read only).
 * dst           - output, subband_width rows of 2 * subband_width values.
 * subband_width - row length; the loops advance 8 values at a time, so it
 *                 is assumed to be a multiple of 8.
 *
 * l, h and dst are accessed with aligned loads/stores and must therefore be
 * 16-byte aligned.
 *
 * The row boundaries are handled without touching memory outside the
 * current chunk: the mirrored neighbour vectors are built from the data that
 * is already in registers instead of loading from h_ptr - 1 / l_ptr + 1,
 * which would read one element before h (first row) or past l (last row).
 */
void
rfx_dwt_2d_decode_block_horiz_sse2(INT16* l, INT16* h, INT16* dst, int subband_width)
{
	int y, n;
	INT16* l_ptr = l;
	INT16* h_ptr = h;
	INT16* dst_ptr = dst;
	const __m128i one = _mm_set1_epi16(1);
	__m128i l_n;
	__m128i h_n;
	__m128i h_n_m;
	__m128i tmp_n;
	__m128i dst_n;
	__m128i dst_n_p;
	__m128i dst1;
	__m128i dst2;

	for (y = 0; y < subband_width; y++)
	{
		/* Even coefficients */
		for (n = 0; n < subband_width; n += 8)
		{
			/* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
			l_n = _mm_load_si128((__m128i*) l_ptr);
			h_n = _mm_load_si128((__m128i*) h_ptr);

			if (n == 0)
			{
				/* Start of row: h[-1] is mirrored to h[0], i.e.
				 * { h0, h0, h1, ..., h6 }. */
				h_n_m = _mm_slli_si128(h_n, 2);
				h_n_m = _mm_insert_epi16(h_n_m, _mm_extract_epi16(h_n, 0), 0);
			}
			else
			{
				h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1));
			}

			tmp_n = _mm_add_epi16(h_n, h_n_m);
			tmp_n = _mm_add_epi16(tmp_n, one);
			tmp_n = _mm_srai_epi16(tmp_n, 1);
			dst_n = _mm_sub_epi16(l_n, tmp_n);

			/* The even results are parked in l; the odd pass reads them back. */
			_mm_store_si128((__m128i*) l_ptr, dst_n);

			l_ptr += 8;
			h_ptr += 8;
		}

		/* Rewind to the start of the row for the second pass. */
		l_ptr -= subband_width;
		h_ptr -= subband_width;

		/* Odd coefficients */
		for (n = 0; n < subband_width; n += 8)
		{
			/* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
			h_n = _mm_load_si128((__m128i*) h_ptr);
			h_n = _mm_slli_epi16(h_n, 1);

			dst_n = _mm_load_si128((__m128i*) (l_ptr));

			if (n == subband_width - 8)
			{
				/* End of row: the value after the last even
				 * coefficient is mirrored, i.e.
				 * { d1, d2, ..., d7, d7 }. */
				dst_n_p = _mm_srli_si128(dst_n, 2);
				dst_n_p = _mm_insert_epi16(dst_n_p, _mm_extract_epi16(dst_n, 7), 7);
			}
			else
			{
				dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1));
			}

			tmp_n = _mm_add_epi16(dst_n_p, dst_n);
			tmp_n = _mm_srai_epi16(tmp_n, 1);
			tmp_n = _mm_add_epi16(tmp_n, h_n);

			/* Interleave even and odd coefficients into the output row. */
			dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
			dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);

			_mm_store_si128((__m128i*) dst_ptr, dst1);
			_mm_store_si128((__m128i*) (dst_ptr + 8), dst2);

			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 16;
		}
	}
}

/* * Calculate the Smith-Waterman score. * * This is basically an SSE2 version of Wozniak's vectored implementation, but * without a score table. Further, we assume a fixed database and query size, * so *nogap and *b_gap must be pre-allocated (the malloc overhead for very * small scans is _huge_). * * NOTE THE FOLLOWING: * * 1) seqA must be padded with 7 bytes at the beginning and end. The first * element of seqA should be the first pad byte. * * 2) seqB must be padded with bytes on the end up to mod 8 characters. * The first element of seqB should be (of course) the first character. * * 3) seqA and seqB's padding _must_ be different, otherwise our logic will * consider the padding as matches! * * 4) These is no _mm_max_epu16 prior to SSE 4! We must use the signed max * function. Unfortunately, this limits our maximum score to 2^15 - 1, or * 32767. Since bad things happen if we roll over, our caller must ensure * that this will not happen. */ static int vect_sw_diff_gap(int8_t *seqA, int lena, int8_t *seqB, int lenb, int8_t *ls_seqA, int initbp, bool is_rna) { int i, j, score = 0; __m128i v_score, v_zero, v_match, v_mismatch; __m128i v_a_gap_ext, v_a_gap_open_ext; #ifndef v_b_gap_open_ext __m128i v_b_gap_ext, v_b_gap_open_ext; #endif __m128i v_a_gap, v_b_gap, v_nogap; __m128i v_last_nogap, v_prev_nogap, v_seq_a, v_seq_b; __m128i v_tmp; /* shut up icc */ (void)ls_seqA; (void)initbp; #define SET16(a, e7, e6, e5, e4, e3, e2, e1, e0) \ _mm_set_epi16((int16_t)a[e7], (int16_t)a[e6], \ (int16_t)a[e5], (int16_t)a[e4], \ (int16_t)a[e3], (int16_t)a[e2], \ (int16_t)a[e1], (int16_t)a[e0]) v_score = _mm_setzero_si128(); v_zero = _mm_setzero_si128(); v_match = SET16((&match), 0, 0, 0, 0, 0, 0, 0, 0); v_mismatch = SET16((&mismatch), 0, 0, 0, 0, 0, 0, 0, 0); v_a_gap_ext = SET16((&a_gap_ext), 0, 0, 0, 0, 0, 0, 0, 0); v_a_gap_open_ext = SET16((&a_gap_open), 0, 0, 0, 0, 0, 0, 0, 0); v_a_gap_open_ext = _mm_add_epi16(v_a_gap_open_ext, v_a_gap_ext); v_b_gap_ext = SET16((&b_gap_ext), 0, 
0, 0, 0, 0, 0, 0, 0); v_b_gap_open_ext = SET16((&b_gap_open), 0, 0, 0, 0, 0, 0, 0, 0); v_b_gap_open_ext = _mm_add_epi16(v_b_gap_open_ext, v_b_gap_ext); for (i = 0; i < lena + 14; i++) { nogap[i] = 0; b_gap[i] = (int16_t)-b_gap_open; } /* * When using colour space reads, we must handle the first row * specially. This is because the read will begin with some marker * base, which will affect matching against the genome. * * For 25mer reads, this actually makes things faster, because our * vectorised portion becomes evenly divisible by 8 again. Yey. */ if (use_colours) { int a_gap, prev_nogap, last_nogap; a_gap = -a_gap_open; last_nogap = prev_nogap = 0; for (i = 7; i < (lena + 7); i++) { int a, ms; a_gap = MAX((last_nogap - a_gap_open - a_gap_ext), (a_gap - a_gap_ext)); b_gap[i] =(uint16_t)MAX((nogap[i] - b_gap_open - b_gap_ext), (b_gap[i] - b_gap_ext)); a = lstocs(ls_seqA[i], initbp, is_rna); ms = (a == seqB[0]) ? match : mismatch; last_nogap = MAX((prev_nogap + ms), 0); last_nogap = MAX(last_nogap, a_gap); last_nogap = MAX(last_nogap, b_gap[i]); prev_nogap = nogap[i]; nogap[i] = (uint16_t)last_nogap; score = MAX(score, last_nogap); } v_score = SET16((&score), 0, 0, 0, 0, 0, 0, 0, 0); score = 0; seqB++; lenb--; assert(lenb != 0); } for (i = 0; i < (lenb + 7)/8; i++) { int k = i * 8; v_b_gap = SET16(b_gap, 6, 6, 5, 4, 3, 2, 1, 0); v_nogap = SET16(nogap, 6, 6, 5, 4, 3, 2, 1, 0); v_seq_a = SET16(seqA, 0, 0, 1, 2, 3, 4, 5, 6); v_seq_b = SET16(seqB, k+7, k+6, k+5, k+4, k+3, k+2, k+1, k+0); v_a_gap = v_a_gap_ext; v_a_gap = _mm_sub_epi16(v_a_gap, v_a_gap_open_ext); v_last_nogap = _mm_setzero_si128(); v_prev_nogap = _mm_setzero_si128(); for (j = 0; j < (lena + 7); j++) { v_b_gap = _mm_slli_si128(v_b_gap, 2); v_b_gap = _mm_insert_epi16(v_b_gap, b_gap[j+7], 0); v_nogap = _mm_slli_si128(v_nogap, 2); v_nogap = _mm_insert_epi16(v_nogap, nogap[j+7], 0); v_seq_a = _mm_slli_si128(v_seq_a, 2); v_seq_a = _mm_insert_epi16(v_seq_a, seqA[j+7], 0); v_tmp = _mm_sub_epi16(v_last_nogap, 
v_a_gap_open_ext); v_a_gap = _mm_sub_epi16(v_a_gap, v_a_gap_ext); v_a_gap = _mm_max_epi16(v_a_gap, v_tmp); v_tmp = _mm_sub_epi16(v_nogap, v_b_gap_open_ext); v_b_gap = _mm_sub_epi16(v_b_gap, v_b_gap_ext); v_b_gap = _mm_max_epi16(v_b_gap, v_tmp); /* compute the score (v_last_nogap is a tmp variable) */ v_last_nogap = _mm_cmpeq_epi16(v_seq_a, v_seq_b); v_tmp = _mm_and_si128(v_last_nogap, v_match); v_last_nogap = _mm_cmpeq_epi16(v_last_nogap, v_zero); v_last_nogap = _mm_and_si128(v_last_nogap, v_mismatch); v_tmp = _mm_or_si128(v_tmp, v_last_nogap); v_last_nogap = _mm_add_epi16(v_prev_nogap, v_tmp); v_last_nogap = _mm_max_epi16(v_last_nogap, v_zero); v_last_nogap = _mm_max_epi16(v_last_nogap, v_a_gap); v_last_nogap = _mm_max_epi16(v_last_nogap, v_b_gap); v_prev_nogap = v_nogap; v_nogap = v_last_nogap; b_gap[j] = (int16_t)_mm_extract_epi16(v_b_gap, 7); nogap[j] = (int16_t)_mm_extract_epi16(v_nogap, 7); v_score = _mm_max_epi16(v_score, v_last_nogap); } } /* * Ugh. Old gcc can't loop and using _mm_store to an int16_t array * breaks strict-aliasing rules. */ assert(score == 0); score = MAX(score, _mm_extract_epi16(v_score, 0)); score = MAX(score, _mm_extract_epi16(v_score, 1)); score = MAX(score, _mm_extract_epi16(v_score, 2)); score = MAX(score, _mm_extract_epi16(v_score, 3)); score = MAX(score, _mm_extract_epi16(v_score, 4)); score = MAX(score, _mm_extract_epi16(v_score, 5)); score = MAX(score, _mm_extract_epi16(v_score, 6)); score = MAX(score, _mm_extract_epi16(v_score, 7)); return (score); }

mlib_status mlib_VideoColorJFIFYCC2RGB444_S16_naligned( mlib_s16 *rgb, const mlib_s16 *y, const mlib_s16 *cb, const mlib_s16 *cr, mlib_s32 n) { /* 0 & 1.402*16384 */ const __m128i x_c1 = _mm_setr_epi16(0, 22970, 0, 22970, 0, 22970, 0, 22970); /* -0.34414*16384 & -0.71414*16384 */ const __m128i x_c2 = _mm_setr_epi16(-5638, -11700, -5638, -11700, -5638, -11700, -5638, -11700); /* 1.772*16384 & 0 */ const __m128i x_c3 = _mm_setr_epi16(29032, 0, 29032, 0, 29032, 0, 29032, 0); const __m128i x_coff = _mm_set1_epi16(2048); const __m128i x_cps1 = _mm_set1_epi32(0x8000); const __m128i x_cps2 = _mm_set1_epi16(0x8000); const __m128i x_zero = _mm_setzero_si128(); const __m128i x_mask1 = _mm_setr_epi32(0xffffffff, 0xffff, 0, 0); const __m128i x_mask2 = _mm_setr_epi32(0, 0xffff0000, 0xffffffff, 0); /* __m128i variables */ __m128i x_y, x_cb, x_cr, x_r, x_g, x_b, x_y1, x_y2; __m128i x_r1, x_r2, x_g1, x_g2, x_b1, x_b2, x_t1, x_t2; __m128i x_rgbl, x_rgbh, x_rgl, x_rgh, x_bbl, x_bbh; __m128i x_cbcr1, x_cbcr2; /* pointers */ __m128i *px_y, *px_cb, *px_cr; mlib_s16 *prgb; /* other var */ mlib_d64 fr, fg, fb, fy, fcb, fcr; mlib_s32 i; px_y = (__m128i *)y; px_cb = (__m128i *)cb; px_cr = (__m128i *)cr; prgb = rgb; i = 0; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (; i <= n - 16; i += 8) { x_y = _mm_loadu_si128(px_y); x_y1 = _mm_unpacklo_epi16(x_y, x_zero); x_y1 = _mm_slli_epi32(x_y1, 4); x_y2 = _mm_unpackhi_epi16(x_y, x_zero); x_y2 = _mm_slli_epi32(x_y2, 4); px_y++; x_cb = _mm_loadu_si128(px_cb); x_cb = _mm_sub_epi16(x_cb, x_coff); px_cb++; x_cr = _mm_loadu_si128(px_cr); x_cr = _mm_sub_epi16(x_cr, x_coff); px_cr++; x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr); x_cbcr2 = _mm_unpackhi_epi16(x_cb, x_cr); /* calc r/g/b */ x_t1 = _mm_madd_epi16(x_cbcr1, x_c1); x_t1 = _mm_srai_epi32(x_t1, 10); x_r1 = _mm_add_epi32(x_t1, x_y1); x_t1 = _mm_madd_epi16(x_cbcr1, x_c2); x_t1 = _mm_srai_epi32(x_t1, 10); x_g1 = _mm_add_epi32(x_t1, x_y1); x_t1 = _mm_madd_epi16(x_cbcr1, x_c3); x_t1 = 
_mm_srai_epi32(x_t1, 10); x_b1 = _mm_add_epi32(x_t1, x_y1); x_t2 = _mm_madd_epi16(x_cbcr2, x_c1); x_t2 = _mm_srai_epi32(x_t2, 10); x_r2 = _mm_add_epi32(x_t2, x_y2); x_t2 = _mm_madd_epi16(x_cbcr2, x_c2); x_t2 = _mm_srai_epi32(x_t2, 10); x_g2 = _mm_add_epi32(x_t2, x_y2); x_t2 = _mm_madd_epi16(x_cbcr2, x_c3); x_t2 = _mm_srai_epi32(x_t2, 10); x_b2 = _mm_add_epi32(x_t2, x_y2); /* signed pack & shift */ x_r1 = _mm_sub_epi32(x_r1, x_cps1); x_r2 = _mm_sub_epi32(x_r2, x_cps1); x_r = _mm_packs_epi32(x_r1, x_r2); x_r = _mm_add_epi16(x_r, x_cps2); x_r = _mm_srli_epi16(x_r, 4); x_g1 = _mm_sub_epi32(x_g1, x_cps1); x_g2 = _mm_sub_epi32(x_g2, x_cps1); x_g = _mm_packs_epi32(x_g1, x_g2); x_g = _mm_add_epi16(x_g, x_cps2); x_g = _mm_srli_epi16(x_g, 4); x_b1 = _mm_sub_epi32(x_b1, x_cps1); x_b2 = _mm_sub_epi32(x_b2, x_cps1); x_b = _mm_packs_epi32(x_b1, x_b2); x_b = _mm_add_epi16(x_b, x_cps2); x_b = _mm_srli_epi16(x_b, 4); /* create rgb sequences */ x_rgl = _mm_unpacklo_epi16(x_r, x_g); x_rgh = _mm_unpackhi_epi16(x_r, x_g); x_bbl = _mm_unpacklo_epi16(x_b, x_b); x_bbh = _mm_unpackhi_epi16(x_b, x_b); /* save */ x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl); PACK_RGB1(x_rgbl); x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl); PACK_RGB1(x_rgbh); x_rgbl = _mm_unpacklo_epi32(x_rgh, x_bbh); PACK_RGB1(x_rgbl); x_rgbh = _mm_unpackhi_epi32(x_rgh, x_bbh); PACK_RGB1(x_rgbh); } if (i <= (n - 8)) { x_y = _mm_loadu_si128(px_y); x_y1 = _mm_unpacklo_epi16(x_y, x_zero); x_y1 = _mm_slli_epi32(x_y1, 4); x_y2 = _mm_unpackhi_epi16(x_y, x_zero); x_y2 = _mm_slli_epi32(x_y2, 4); px_y++; x_cb = _mm_loadu_si128(px_cb); x_cb = _mm_sub_epi16(x_cb, x_coff); px_cb++; x_cr = _mm_loadu_si128(px_cr); x_cr = _mm_sub_epi16(x_cr, x_coff); px_cr++; x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr); x_cbcr2 = _mm_unpackhi_epi16(x_cb, x_cr); /* calc r/g/b */ x_t1 = _mm_madd_epi16(x_cbcr1, x_c1); x_t1 = _mm_srai_epi32(x_t1, 10); x_r1 = _mm_add_epi32(x_t1, x_y1); x_t1 = _mm_madd_epi16(x_cbcr1, x_c2); x_t1 = _mm_srai_epi32(x_t1, 10); x_g1 = 
_mm_add_epi32(x_t1, x_y1); x_t1 = _mm_madd_epi16(x_cbcr1, x_c3); x_t1 = _mm_srai_epi32(x_t1, 10); x_b1 = _mm_add_epi32(x_t1, x_y1); x_t2 = _mm_madd_epi16(x_cbcr2, x_c1); x_t2 = _mm_srai_epi32(x_t2, 10); x_r2 = _mm_add_epi32(x_t2, x_y2); x_t2 = _mm_madd_epi16(x_cbcr2, x_c2); x_t2 = _mm_srai_epi32(x_t2, 10); x_g2 = _mm_add_epi32(x_t2, x_y2); x_t2 = _mm_madd_epi16(x_cbcr2, x_c3); x_t2 = _mm_srai_epi32(x_t2, 10); x_b2 = _mm_add_epi32(x_t2, x_y2); /* signed pack & shift */ x_r1 = _mm_sub_epi32(x_r1, x_cps1); x_r2 = _mm_sub_epi32(x_r2, x_cps1); x_r = _mm_packs_epi32(x_r1, x_r2); x_r = _mm_add_epi16(x_r, x_cps2); x_r = _mm_srli_epi16(x_r, 4); x_g1 = _mm_sub_epi32(x_g1, x_cps1); x_g2 = _mm_sub_epi32(x_g2, x_cps1); x_g = _mm_packs_epi32(x_g1, x_g2); x_g = _mm_add_epi16(x_g, x_cps2); x_g = _mm_srli_epi16(x_g, 4); x_b1 = _mm_sub_epi32(x_b1, x_cps1); x_b2 = _mm_sub_epi32(x_b2, x_cps1); x_b = _mm_packs_epi32(x_b1, x_b2); x_b = _mm_add_epi16(x_b, x_cps2); x_b = _mm_srli_epi16(x_b, 4); /* create rgb sequences */ x_rgl = _mm_unpacklo_epi16(x_r, x_g); x_rgh = _mm_unpackhi_epi16(x_r, x_g); x_bbl = _mm_unpacklo_epi16(x_b, x_b); x_bbh = _mm_unpackhi_epi16(x_b, x_b); /* save */ x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl); PACK_RGB1(x_rgbl); x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl); PACK_RGB1(x_rgbh); x_rgbl = _mm_unpacklo_epi32(x_rgh, x_bbh); PACK_RGB1(x_rgbl); x_rgbh = _mm_unpackhi_epi32(x_rgh, x_bbh); PACK_RGB2(x_rgbh); i += 8; } if (i <= (n - 4)) { x_y = _mm_loadl_epi64(px_y); x_y1 = _mm_unpacklo_epi16(x_y, x_zero); x_y1 = _mm_slli_epi32(x_y1, 4); px_y = (__m128i *)(((__m64 *)px_y) + 1); x_cb = _mm_loadl_epi64(px_cb); x_cb = _mm_sub_epi16(x_cb, x_coff); px_cb = (__m128i *)(((__m64 *)px_cb) + 1); x_cr = _mm_loadl_epi64(px_cr); x_cr = _mm_sub_epi16(x_cr, x_coff); px_cr = (__m128i *)(((__m64 *)px_cr) + 1); x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr); /* calc r/g/b */ x_t1 = _mm_madd_epi16(x_cbcr1, x_c1); x_t1 = _mm_srai_epi32(x_t1, 10); x_r1 = _mm_add_epi32(x_t1, x_y1); x_t1 = 
_mm_madd_epi16(x_cbcr1, x_c2); x_t1 = _mm_srai_epi32(x_t1, 10); x_g1 = _mm_add_epi32(x_t1, x_y1); x_t1 = _mm_madd_epi16(x_cbcr1, x_c3); x_t1 = _mm_srai_epi32(x_t1, 10); x_b1 = _mm_add_epi32(x_t1, x_y1); /* signed pack & shift */ x_r1 = _mm_sub_epi32(x_r1, x_cps1); x_r = _mm_packs_epi32(x_r1, x_zero); x_r = _mm_add_epi16(x_r, x_cps2); x_r = _mm_srli_epi16(x_r, 4); x_g1 = _mm_sub_epi32(x_g1, x_cps1); x_g = _mm_packs_epi32(x_g1, x_zero); x_g = _mm_add_epi16(x_g, x_cps2); x_g = _mm_srli_epi16(x_g, 4); x_b1 = _mm_sub_epi32(x_b1, x_cps1); x_b = _mm_packs_epi32(x_b1, x_zero); x_b = _mm_add_epi16(x_b, x_cps2); x_b = _mm_srli_epi16(x_b, 4); /* create rgb sequences */ x_rgl = _mm_unpacklo_epi16(x_r, x_g); x_bbl = _mm_unpacklo_epi16(x_b, x_b); /* save */ x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl); PACK_RGB1(x_rgbl); x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl); PACK_RGB2(x_rgbh); i += 4; } /* pure C implementation */ for (; i < n; i++) { fy = y[i] * SCALE - SAT; fcb = (mlib_d64)((cb[i] - 2048) << 20); fcr = (mlib_d64)((cr[i] - 2048) << 20); fr = fy + 1.40200f * fcr; fg = fy - 0.34414f * fcb - 0.71414f * fcr; fb = fy + 1.77200f * fcb; rgb[3 * i] = CLAMP_U12(fr); rgb[3 * i + 1] = CLAMP_U12(fg); rgb[3 * i + 2] = CLAMP_U12(fb); } return (MLIB_SUCCESS); }

/*
 * Lane-wise 16-bit subtraction: returns s1 - s2 for each of the eight
 * 16-bit lanes, with wrap-around on overflow (no saturation).
 *
 * The return type is spelled out; without it the function would be
 * implicitly 'int' and could not return a vector.
 */
__m128i
test (__m128i s1, __m128i s2)
{
  return _mm_sub_epi16 (s1, s2);
}

/*
 * Local state and vector constants for a 16-bit, 8-lane alignment kernel.
 * NOTE(review): the enclosing function and the definitions of s1Len, s2Len,
 * open, gap, N and NEG_INF are outside this fragment.
 */
int32_t j = 0;
/* Last valid indices of the two sequences. */
int32_t end_query = s1Len-1;
int32_t end_ref = s2Len-1;
/* Running best score, started at the "minus infinity" sentinel. */
int16_t score = NEG_INF;
/* Scalars broadcast to all eight 16-bit lanes. */
__m128i vNegInf = _mm_set1_epi16(NEG_INF);
__m128i vOpen = _mm_set1_epi16(open);
__m128i vGap = _mm_set1_epi16(gap);
__m128i vOne = _mm_set1_epi16(1);
__m128i vN = _mm_set1_epi16(N);
__m128i vGapN = _mm_set1_epi16(gap*N);
__m128i vNegOne = _mm_set1_epi16(-1);
/*
 * _mm_set_epi16 takes the highest lane first, so lane 7 holds 0 and lane 0
 * holds 7 (vI), respectively 0 and -7 (vJreset).
 */
__m128i vI = _mm_set_epi16(0,1,2,3,4,5,6,7);
__m128i vJreset = _mm_set_epi16(0,-1,-2,-3,-4,-5,-6,-7);
/* Per-lane running maximum, started at the sentinel. */
__m128i vMax = vNegInf;
/* Sequence lengths and (length - 1) broadcast to every lane. */
__m128i vILimit = _mm_set1_epi16(s1Len);
__m128i vILimit1 = _mm_sub_epi16(vILimit, vOne);
__m128i vJLimit = _mm_set1_epi16(s2Len);
__m128i vJLimit1 = _mm_sub_epi16(vJLimit, vOne);
/*
 * Affine gap penalties -open - k*gap for k = 0..7; lane 7 holds k = 0 and
 * lane 0 holds k = 7, matching the lane order of vI.
 */
__m128i vIBoundary = _mm_set_epi16(
        -open-0*gap,
        -open-1*gap,
        -open-2*gap,
        -open-3*gap,
        -open-4*gap,
        -open-5*gap,
        -open-6*gap,
        -open-7*gap
        );
/* convert _s1 from char to int in range 0-23 */

/* * Calculate the Smith-Waterman score. * * This is basically an SSE2 version of Wozniak's vectored implementation, but * without a score table. Further, we assume a fixed database and query size, * so *nogap and *b_gap must be pre-allocated (the malloc overhead for very * small scans is _huge_). * * NOTE THE FOLLOWING: * * 1) seqA must be padded with 7 bytes at the beginning and end. The first * element of seqA should be the first pad byte. * * 2) seqB must be padded with bytes on the end up to mod 8 characters. * The first element of seqB should be (of course) the first character. * * 3) seqA and seqB's padding _must_ be different, otherwise our logic will * consider the padding as matches! * * 4) These is no _mm_max_epu16 prior to SSE 4! We must use the signed max * function. Unfortunately, this limits our maximum score to 2^15 - 1, or * 32767. Since bad things happen if we roll over, our caller must ensure * that this will not happen. */ static int vect_sw_diff_gap(int8_t *seqA, int lena, int8_t *seqB, int lenb, int8_t *ls_seqA, int initbp, bool is_rna) { int i, j, score = 0; __m128i v_score, v_zero, v_match, v_mismatch; __m128i v_a_gap_ext, v_a_gap_open_ext; #ifndef v_b_gap_open_ext __m128i v_b_gap_ext, v_b_gap_open_ext; #endif __m128i v_a_gap, v_b_gap, v_nogap; __m128i v_last_nogap, v_prev_nogap, v_seq_a, v_seq_b; __m128i v_tmp; /* shut up icc */ (void)ls_seqA; (void)initbp; #define SET16(a, e7, e6, e5, e4, e3, e2, e1, e0) \ _mm_set_epi16((int16_t)a[e7], (int16_t)a[e6], \ (int16_t)a[e5], (int16_t)a[e4], \ (int16_t)a[e3], (int16_t)a[e2], \ (int16_t)a[e1], (int16_t)a[e0]) v_score = _mm_setzero_si128(); v_zero = _mm_setzero_si128(); v_match = SET16((&match), 0, 0, 0, 0, 0, 0, 0, 0); v_mismatch = SET16((&mismatch), 0, 0, 0, 0, 0, 0, 0, 0); v_a_gap_ext = SET16((&a_gap_ext), 0, 0, 0, 0, 0, 0, 0, 0); v_a_gap_open_ext = SET16((&a_gap_open), 0, 0, 0, 0, 0, 0, 0, 0); v_a_gap_open_ext = _mm_add_epi16(v_a_gap_open_ext, v_a_gap_ext); v_b_gap_ext = SET16((&b_gap_ext), 0, 
0, 0, 0, 0, 0, 0, 0); v_b_gap_open_ext = SET16((&b_gap_open), 0, 0, 0, 0, 0, 0, 0, 0); v_b_gap_open_ext = _mm_add_epi16(v_b_gap_open_ext, v_b_gap_ext); for (i = 0; i < lena + 14; i++) { nogap[i] = 0; b_gap[i] = (int16_t)-b_gap_open; } for (i = 0; i < (lenb + 7)/8; i++) { int k = i * 8; v_b_gap = SET16(b_gap, 6, 6, 5, 4, 3, 2, 1, 0); v_nogap = SET16(nogap, 6, 6, 5, 4, 3, 2, 1, 0); v_seq_a = SET16(seqA, 0, 0, 1, 2, 3, 4, 5, 6); v_seq_b = SET16(seqB, k+7, k+6, k+5, k+4, k+3, k+2, k+1, k+0); v_a_gap = v_a_gap_ext; v_a_gap = _mm_sub_epi16(v_a_gap, v_a_gap_open_ext); v_last_nogap = _mm_setzero_si128(); v_prev_nogap = _mm_setzero_si128(); for (j = 0; j < (lena + 7); j++) { v_b_gap = _mm_slli_si128(v_b_gap, 2); v_b_gap = _mm_insert_epi16(v_b_gap, b_gap[j+7], 0); v_nogap = _mm_slli_si128(v_nogap, 2); v_nogap = _mm_insert_epi16(v_nogap, nogap[j+7], 0); v_seq_a = _mm_slli_si128(v_seq_a, 2); v_seq_a = _mm_insert_epi16(v_seq_a, seqA[j+7], 0); v_tmp = _mm_sub_epi16(v_last_nogap, v_a_gap_open_ext); v_a_gap = _mm_sub_epi16(v_a_gap, v_a_gap_ext); v_a_gap = _mm_max_epi16(v_a_gap, v_tmp); v_tmp = _mm_sub_epi16(v_nogap, v_b_gap_open_ext); v_b_gap = _mm_sub_epi16(v_b_gap, v_b_gap_ext); v_b_gap = _mm_max_epi16(v_b_gap, v_tmp); /* compute the score (v_last_nogap is a tmp variable) */ v_last_nogap = _mm_cmpeq_epi16(v_seq_a, v_seq_b); v_tmp = _mm_and_si128(v_last_nogap, v_match); v_last_nogap = _mm_cmpeq_epi16(v_last_nogap, v_zero); v_last_nogap = _mm_and_si128(v_last_nogap, v_mismatch); v_tmp = _mm_or_si128(v_tmp, v_last_nogap); v_last_nogap = _mm_add_epi16(v_prev_nogap, v_tmp); v_last_nogap = _mm_max_epi16(v_last_nogap, v_zero); v_last_nogap = _mm_max_epi16(v_last_nogap, v_a_gap); v_last_nogap = _mm_max_epi16(v_last_nogap, v_b_gap); v_prev_nogap = v_nogap; v_nogap = v_last_nogap; b_gap[j] = (int16_t)_mm_extract_epi16(v_b_gap, 7); nogap[j] = (int16_t)_mm_extract_epi16(v_nogap, 7); v_score = _mm_max_epi16(v_score, v_last_nogap); } } /* * Ugh. 
Old gcc can't loop and using _mm_store to an int16_t array * breaks strict-aliasing rules. */ assert(score == 0); score = MAX(score, _mm_extract_epi16(v_score, 0)); score = MAX(score, _mm_extract_epi16(v_score, 1)); score = MAX(score, _mm_extract_epi16(v_score, 2)); score = MAX(score, _mm_extract_epi16(v_score, 3)); score = MAX(score, _mm_extract_epi16(v_score, 4)); score = MAX(score, _mm_extract_epi16(v_score, 5)); score = MAX(score, _mm_extract_epi16(v_score, 6)); score = MAX(score, _mm_extract_epi16(v_score, 7)); return (score); }

/*
 * Gathers SAO edge-offset statistics for the EO-90, EO-135 and EO-45 classes
 * over one LCU of 16-bit samples, using SSE2.
 *
 * The top and bottom rows and the first and last columns of the LCU are not
 * visited: lcuHeight is reduced by 2 and both sample pointers are advanced by
 * one column before the loops start. For every visited sample the code forms
 * diff = input - recon in 16-bit lanes, where the recon "centre" row is read
 * at reconSamplePtr + reconStride. For each EO class it then hands
 * MACRO_CALC_EO_INDEX pointers to the two neighbours of that centre sample:
 *   EO-90  : directly above / directly below
 *   EO-135 : above-left     / below-right
 *   EO-45  : above-right    / below-left
 *
 * Four width layouts are handled: 16, 28, 56 and "everything else". Lanes
 * beyond the last visited column are neutralised by zeroing their diff
 * (byte shift left then right) and by ANDing xmm_eoIndex with xmm_skip_mask.
 *
 * NOTE(review): MACRO_CALC_EO_INDEX[_HALF], MACRO_GATHER_EO[_HALF] and
 * MACRO_SAVE_EO are defined outside this block. They presumably read and
 * write the locals declared below (xmm_sign_*, xmm9..xmm15, rTemp, diff, ...)
 * and MACRO_SAVE_EO presumably fills eoDiff / eoCount -- confirm against the
 * macro definitions before touching any local name or statement order.
 *
 * Returns return_error, which no statement visible in this function changes
 * from EB_ErrorNone.
 */
EB_ERRORTYPE GatherSaoStatisticsLcu_OnlyEo_90_45_135_16bit_SSE2_INTRIN(
    EB_U16 *inputSamplePtr,                              // input parameter, source Picture Ptr
    EB_U32 inputStride,                                  // input parameter, source stride
    EB_U16 *reconSamplePtr,                              // input parameter, deblocked Picture Ptr
    EB_U32 reconStride,                                  // input parameter, deblocked stride
    EB_U32 lcuWidth,                                     // input parameter, LCU width
    EB_U32 lcuHeight,                                    // input parameter, LCU height
    EB_S32 eoDiff[SAO_EO_TYPES][SAO_EO_CATEGORIES + 1],  // output parameter, used to store Edge Offset diff, eoDiff[SAO_EO_TYPES] [SAO_EO_CATEGORIES]
    EB_U16 eoCount[SAO_EO_TYPES][SAO_EO_CATEGORIES + 1]) // output parameter, used to store Edge Offset count, eoCount[SAO_EO_TYPES] [SAO_EO_CATEGORIES]
{
    // Not referenced directly in this function; presumably consumed by the macros -- TODO confirm.
#define boShift 5

    EB_ERRORTYPE return_error = EB_ErrorNone;
    EB_U64 count_x, count_y;
    EB_S32 diff;
    __m128i xmm0, xmm_1, xmm_N1, xmm_N3, xmm_N4, xmm_skip_mask, xmm9, xmm10, xmm11, xmm12, xmm13, xmm15;
    __m128i xmm_temp_input1, xmm_temp_input2, xmm_temp_recon1, xmm_temp_recon2, xmm_diff1, xmm_diff2;
    __m128i xmm_sign_1, xmm_sign_1a, xmm_sign_1b, xmm_sign_2a, xmm_sign_2b, xmm_sign_2, xmm_eoIndex;

    // Constant registers: all-zero, 16-bit ones, and the bytes -1, -3, -4 and +1.
    xmm0 = _mm_setzero_si128();
    xmm12 = _mm_setzero_si128();
    xmm15 = _mm_set1_epi16(0x0001);
    xmm_N1 = _mm_set1_epi8((signed char)0xFF);
    xmm_N3 = _mm_set1_epi8((signed char)0xFD);
    xmm_N4 = _mm_set1_epi8((signed char)0xFC);
    xmm_1 = _mm_sub_epi8(xmm0, xmm_N1); // 0 - (-1) = +1 in every byte

    // Initialize SAO Arrays
    // Zero-initialised 16-byte-aligned scratch buffer; not touched directly here,
    // presumably the accumulator used by the GATHER/SAVE macros -- TODO confirm.
    EB_ALIGN(16) EB_S8 rTemp[512] = { 0 };
    EB_U64 reconStrideTemp;

    // Skip the first and last row, and start one column in. inputSamplePtr now
    // points at the first visited sample; reconSamplePtr stays one row above it
    // so that +reconStride is the centre row and +2*reconStride the row below.
    lcuHeight -= 2;
    inputSamplePtr += inputStride + 1;
    reconSamplePtr++;

    if (lcuWidth == 16) {
        // 14 visited columns per row: one 16-lane group with the last 2 lanes masked.
        // The mask keeps the low 14 bytes of the index register.
        xmm_skip_mask = _mm_srli_si128(xmm_N1, 2);
        for (count_y = 0; count_y < lcuHeight; ++count_y) {
            xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride));
            xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8));
            xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr));
            xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8));
            xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1);
            xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2);
            // Shift out and back 4 bytes: zeroes the two highest 16-bit diffs.
            xmm_diff2 = _mm_slli_si128(xmm_diff2, 4); //skip last 2 samples
            xmm_diff2 = _mm_srli_si128(xmm_diff2, 4); //skip last 2 samples

            // EO-90
            MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr+2*reconStride)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples
            MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1)

            // EO-135
            MACRO_CALC_EO_INDEX(reconSamplePtr-1, reconSamplePtr+2*reconStride+1)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples
            MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2)

            // EO-45
            MACRO_CALC_EO_INDEX(reconSamplePtr+1, reconSamplePtr+2*reconStride-1)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples
            MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3)

            inputSamplePtr += inputStride;
            reconSamplePtr += reconStride;
        }
        lcuWidth = 2; // masked lanes per row, used in the product after the branches
    }
    else if (lcuWidth == 28) {
        // 26 visited columns per row: a full 16-lane group plus a second group
        // in which only the first 10 lanes are kept.
        xmm_skip_mask = _mm_srli_si128(xmm_N1, 6);
        for (count_y = 0; count_y < lcuHeight; ++count_y) {
            //----------- 0-15 -----------
            xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride));
            xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8));
            xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr));
            xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8));
            xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1);
            xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2);

            // EO-90
            MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr+2*reconStride)
            MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1)

            // EO-135
            MACRO_CALC_EO_INDEX(reconSamplePtr-1, reconSamplePtr+2*reconStride+1)
            MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2)

            // EO-45
            MACRO_CALC_EO_INDEX(reconSamplePtr+1, reconSamplePtr+2*reconStride-1)
            MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3)

            //----------- 16-25 -----------
            xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 16));
            xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 24));
            xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 16));
            xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 24));
            xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1);
            xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2);
            // Shift out and back 12 bytes: keeps only the two lowest 16-bit diffs.
            xmm_diff2 = _mm_slli_si128(xmm_diff2, 12); //skip last 6 samples
            xmm_diff2 = _mm_srli_si128(xmm_diff2, 12); //skip last 6 samples

            // EO-90
            MACRO_CALC_EO_INDEX(reconSamplePtr+16, reconSamplePtr+2*reconStride+16)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 6 samples
            MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1)

            // EO-135
            MACRO_CALC_EO_INDEX(reconSamplePtr+15, reconSamplePtr+2*reconStride+17)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 6 samples
            MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2)

            // EO-45
            MACRO_CALC_EO_INDEX(reconSamplePtr+17, reconSamplePtr+2*reconStride+15)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 6 samples
            MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3)

            inputSamplePtr += inputStride;
            reconSamplePtr += reconStride;
        }
        lcuWidth = 6; // masked lanes per row
    }
    else if (lcuWidth == 56) {
        // 54 visited columns per row: three full 16-lane groups (inner loop over
        // 48 columns) plus a half group of 8 lanes in which the first 6 are kept.
        xmm_skip_mask = _mm_srli_si128(xmm_N1, 10);
        lcuWidth -= 8;
        // The inner loop advances both pointers by lcuWidth samples, so the
        // end-of-row step only has to add what is left of the stride.
        // reconStride itself must stay intact: it is still used for row offsets.
        inputStride -= lcuWidth;
        reconStrideTemp = reconStride - lcuWidth;
        for (count_y = 0; count_y < lcuHeight; ++count_y) {
            for (count_x = 0; count_x < lcuWidth; count_x += 16) {
                //----------- 0-15 -----------
                xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride));
                xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8));
                xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr));
                xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8));
                xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1);
                xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2);

                // EO-90
                MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr + 2 * reconStride)
                MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1)

                // EO-135
                MACRO_CALC_EO_INDEX(reconSamplePtr - 1, reconSamplePtr + 2 * reconStride + 1)
                MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2)

                // EO-45
                MACRO_CALC_EO_INDEX(reconSamplePtr + 1, reconSamplePtr + 2 * reconStride - 1)
                MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3)

                inputSamplePtr += 16;
                reconSamplePtr += 16;
            }

            //----------- 48-53 -----------
            xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride));
            xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr));
            xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1);
            // Shift out and back 4 bytes: keeps the six lowest 16-bit diffs.
            xmm_diff1 = _mm_slli_si128(xmm_diff1, 4); //skip last 10 samples
            xmm_diff1 = _mm_srli_si128(xmm_diff1, 4); //skip last 10 samples

            // EO-90
            MACRO_CALC_EO_INDEX_HALF(reconSamplePtr, reconSamplePtr+2*reconStride)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 10 samples
            MACRO_GATHER_EO_HALF(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1)

            // EO-135
            MACRO_CALC_EO_INDEX_HALF(reconSamplePtr-1, reconSamplePtr+2*reconStride+1)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 10 samples
            MACRO_GATHER_EO_HALF(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2)

            // EO-45
            MACRO_CALC_EO_INDEX_HALF(reconSamplePtr+1, reconSamplePtr+2*reconStride-1)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 10 samples
            MACRO_GATHER_EO_HALF(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3)

            inputSamplePtr += inputStride;
            reconSamplePtr += reconStrideTemp;
        }
        lcuWidth = 10; // masked lanes per row
    }
    else {
        // Generic layout: (lcuWidth - 16) columns in full 16-lane groups, then one
        // final group with its last 2 lanes masked.
        // NOTE(review): the pointer bookkeeping below is only consistent when
        // lcuWidth - 16 is a multiple of 16 (the "48-61" label suggests a width
        // of 64) -- confirm which widths callers pass.
        lcuWidth -= 16;
        inputStride -= lcuWidth;
        reconStrideTemp = reconStride - lcuWidth;
        xmm_skip_mask = _mm_srli_si128(xmm_N1, 2);
        for (count_y = 0; count_y < lcuHeight; ++count_y) {
            for (count_x = 0; count_x < lcuWidth; count_x += 16) {
                //----------- 0-15 -----------
                xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride));
                xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8));
                xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr));
                xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8));
                xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1);
                xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2);

                //EO-90
                MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr + 2 * reconStride)
                MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1)

                //EO-135
                MACRO_CALC_EO_INDEX(reconSamplePtr - 1, reconSamplePtr + 2 * reconStride + 1)
                MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2)

                //EO-45
                MACRO_CALC_EO_INDEX(reconSamplePtr + 1, reconSamplePtr + 2 * reconStride - 1)
                MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3)

                inputSamplePtr += 16;
                reconSamplePtr += 16;
            }

            //----------- 48-61 -----------
            xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride));
            xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8));
            xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr));
            xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8));
            xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1);
            xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2);
            // Shift out and back 4 bytes: zeroes the two highest 16-bit diffs.
            xmm_diff2 = _mm_slli_si128(xmm_diff2, 4); //skip last 2 samples
            xmm_diff2 = _mm_srli_si128(xmm_diff2, 4); //skip last 2 samples

            // EO-90
            MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr+2*reconStride)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples
            MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1)

            // EO-135
            MACRO_CALC_EO_INDEX(reconSamplePtr-1, reconSamplePtr+2*reconStride+1)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples
            MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2)

            // EO-45
            MACRO_CALC_EO_INDEX(reconSamplePtr+1, reconSamplePtr+2*reconStride-1)
            xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples
            MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3)

            inputSamplePtr += inputStride;
            reconSamplePtr += reconStrideTemp;
        }
        lcuWidth = 2; // masked lanes per row
    }

    // lcuWidth was set to the number of masked lanes per row in every branch;
    // multiplied by the row count it becomes the total number of masked lanes.
    // NOTE(review): presumably MACRO_SAVE_EO uses this to correct the tallies
    // for the lanes that were forced to index 0 -- confirm in the macro.
    lcuWidth = (EB_U16)lcuWidth * (EB_U16)lcuHeight;

    MACRO_SAVE_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1, 1)
    MACRO_SAVE_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2, 2)
    MACRO_SAVE_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3, 3)

    return return_error;
}

/*
 * In-register inclusive prefix sum of four 32-bit lanes, plus a carry-in.
 * Lane k of the result is v[0] + ... + v[k] + carry[k].
 */
static inline __m128i integral_scan4_epi32(__m128i v, __m128i carry)
{
    v = _mm_add_epi32(v, _mm_slli_si128(v, 4));
    v = _mm_add_epi32(v, _mm_slli_si128(v, 8));
    return _mm_add_epi32(v, carry);
}

/*
 * Builds rows [hStart, hEnd) of the integral image of squared pixel
 * differences between current_image(x, y) and compare_image(x+dx, y+dy).
 *
 * Both input images must have valid pixels for every position reached through
 * the offset (dx,dy): e.g. with (dx,dy)=(-10,8), x down to -10 and y up to
 * (h-1)+8 are read. Pixels are consumed 16 at a time, so each row is read and
 * written up to the next multiple of 16 at or above w.
 *
 * The integral image is accessed with (x,y) in [-1,w)x[-1,h): row -1 and
 * column -1 are written as zero. Rows are written with aligned 16-byte
 * stores, so integral_image + y*integral_stride must be 16-byte aligned.
 *
 * Sums are kept in 32 bits and may wrap. That is acceptable because the
 * block sums derived later use modulo arithmetic and stay correct as long as
 * the block size is not too large.
 */
static void buildIntegralImage_SSE(uint32_t* integral_image, int integral_stride,
                                   const uint8_t* current_image, int current_image_stride,
                                   const uint8_t* compare_image, int compare_image_stride,
                                   int w,int hStart, int hEnd,
                                   int dx,int dy)
{
    enum { kPixelsPerIter = 16 };
    const __m128i zero = _mm_setzero_si128();

    /* Row -1 (including column -1) is all zeros. */
    memset(integral_image - 1 - integral_stride, 0, (w + 1) * sizeof(uint32_t));

    for (int row = hStart; row < hEnd; row++) {
        const uint8_t* cur = current_image + row * current_image_stride;
        const uint8_t* ref = compare_image + (row + dy) * compare_image_stride + dx;
        uint32_t* dst = integral_image + row * integral_stride;
        __m128i carry = _mm_setzero_si128();   /* running row sum, broadcast */

        dst[-1] = 0;                           /* column -1 */

        /* Pass 1: horizontal prefix sums of (cur - ref)^2 for this row. */
        for (int x = 0; x < w; x += kPixelsPerIter) {
            const __m128i a = _mm_loadu_si128((const __m128i*)cur);
            const __m128i b = _mm_loadu_si128((const __m128i*)ref);

            /* Widen to 16 bits, subtract, square. The square (<= 255^2) fits
               in an unsigned 16-bit lane, so zero-extension below is exact. */
            const __m128i d_lo  = _mm_sub_epi16(_mm_unpacklo_epi8(a, zero),
                                                _mm_unpacklo_epi8(b, zero));
            const __m128i d_hi  = _mm_sub_epi16(_mm_unpackhi_epi8(a, zero),
                                                _mm_unpackhi_epi8(b, zero));
            const __m128i sq_lo = _mm_mullo_epi16(d_lo, d_lo);
            const __m128i sq_hi = _mm_mullo_epi16(d_hi, d_hi);

            /* Four groups of four 32-bit values; each group is scanned and its
               last lane (shuffle 0xff) is carried into the next group. */
            const __m128i s0 = integral_scan4_epi32(_mm_unpacklo_epi16(sq_lo, zero), carry);
            const __m128i s1 = integral_scan4_epi32(_mm_unpackhi_epi16(sq_lo, zero),
                                                    _mm_shuffle_epi32(s0, 0xff));
            const __m128i s2 = integral_scan4_epi32(_mm_unpacklo_epi16(sq_hi, zero),
                                                    _mm_shuffle_epi32(s1, 0xff));
            const __m128i s3 = integral_scan4_epi32(_mm_unpackhi_epi16(sq_hi, zero),
                                                    _mm_shuffle_epi32(s2, 0xff));
            carry = _mm_shuffle_epi32(s3, 0xff);

            _mm_store_si128((__m128i*)(dst + 0),  s0);
            _mm_store_si128((__m128i*)(dst + 4),  s1);
            _mm_store_si128((__m128i*)(dst + 8),  s2);
            _mm_store_si128((__m128i*)(dst + 12), s3);

            dst += kPixelsPerIter;
            cur += kPixelsPerIter;
            ref += kPixelsPerIter;
        }

        /* Pass 2: add the finished row above to turn row sums into area sums. */
        if (row > 0) {
            uint32_t* acc = integral_image + row * integral_stride;

            for (int x = 0; x < w; x += kPixelsPerIter) {
                for (int k = 0; k < kPixelsPerIter; k += 4) {
                    const __m128i up   = _mm_load_si128((const __m128i*)(acc + k - integral_stride));
                    const __m128i here = _mm_load_si128((const __m128i*)(acc + k));
                    _mm_store_si128((__m128i*)(acc + k), _mm_add_epi32(up, here));
                }
                acc += kPixelsPerIter;
            }
        }
    }
}

static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) { uint8_t levels[16], ctxs[16]; uint16_t abs_levels[16]; int n = res->first; // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1 const int p0 = res->prob[n][ctx0][0]; CostArrayPtr const costs = res->costs; const uint16_t* t = costs[n][ctx0]; // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0 // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll // be missing during the loop. int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0; if (res->last < 0) { return VP8BitCost(0, p0); } { // precompute clamped levels and contexts, packed to 8b. const __m128i zero = _mm_setzero_si128(); const __m128i kCst2 = _mm_set1_epi8(2); const __m128i kCst67 = _mm_set1_epi8(MAX_VARIABLE_LEVEL); const __m128i c0 = _mm_loadu_si128((const __m128i*)&res->coeffs[0]); const __m128i c1 = _mm_loadu_si128((const __m128i*)&res->coeffs[8]); const __m128i D0_m = _mm_min_epi16(c0, zero); const __m128i D0_p = _mm_max_epi16(c0, zero); const __m128i D1_m = _mm_min_epi16(c1, zero); const __m128i D1_p = _mm_max_epi16(c1, zero); const __m128i E0 = _mm_sub_epi16(D0_p, D0_m); // abs(v), 16b const __m128i E1 = _mm_sub_epi16(D1_p, D1_m); const __m128i F = _mm_packs_epi16(E0, E1); const __m128i G = _mm_min_epu8(F, kCst2); // context = 0,1,2 const __m128i H = _mm_min_epu8(F, kCst67); // clamp_level in [0..67] _mm_storeu_si128((__m128i*)&ctxs[0], G); _mm_storeu_si128((__m128i*)&levels[0], H); _mm_storeu_si128((__m128i*)&abs_levels[0], E0); _mm_storeu_si128((__m128i*)&abs_levels[8], E1); } for (; n < res->last; ++n) { const int ctx = ctxs[n]; const int level = levels[n]; const int flevel = abs_levels[n]; // full level cost += VP8LevelFixedCosts[flevel] + t[level]; // simplified VP8LevelCost() t = costs[n + 1][ctx]; } // Last coefficient is always non-zero { const int level = levels[n]; const int flevel = abs_levels[n]; assert(flevel != 0); cost += VP8LevelFixedCosts[flevel] + 
t[level]; if (n < 15) { const int b = VP8EncBands[n + 1]; const int ctx = ctxs[n]; const int last_p0 = res->prob[b][ctx][0]; cost += VP8BitCost(0, last_p0); } } return cost; }

void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { const int stride = pitch >> 1; int pass; // Constants // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); // Load input __m128i in0 = _mm_loadu_si128((const __m128i *)(input + 0 * stride)); __m128i in1 = _mm_loadu_si128((const __m128i *)(input + 1 * stride)); __m128i in2 = _mm_loadu_si128((const __m128i *)(input + 2 * stride)); __m128i in3 = _mm_loadu_si128((const __m128i *)(input + 3 * stride)); __m128i in4 = _mm_loadu_si128((const __m128i *)(input + 4 * stride)); __m128i in5 = _mm_loadu_si128((const __m128i *)(input + 5 * stride)); __m128i in6 = _mm_loadu_si128((const __m128i *)(input + 6 * stride)); __m128i in7 = _mm_loadu_si128((const __m128i *)(input + 7 * stride)); // Pre-condition input (shift by two) in0 = _mm_slli_epi16(in0, 2); in1 = _mm_slli_epi16(in1, 2); in2 = _mm_slli_epi16(in2, 2); in3 = _mm_slli_epi16(in3, 2); in4 = _mm_slli_epi16(in4, 2); in5 = _mm_slli_epi16(in5, 2); in6 = _mm_slli_epi16(in6, 2); in7 = _mm_slli_epi16(in7, 2); // We do two passes, first the columns, then the rows. The results of the // first pass are transposed so that the same column code can be reused. 
The // results of the second pass are also transposed so that the rows (processed // as columns) are put back in row positions. for (pass = 0; pass < 2; pass++) { // To store results of each pass before the transpose. __m128i res0, res1, res2, res3, res4, res5, res6, res7; // Add/substract const __m128i q0 = _mm_add_epi16(in0, in7); const __m128i q1 = _mm_add_epi16(in1, in6); const __m128i q2 = _mm_add_epi16(in2, in5); const __m128i q3 = _mm_add_epi16(in3, in4); const __m128i q4 = _mm_sub_epi16(in3, in4); const __m128i q5 = _mm_sub_epi16(in2, in5); const __m128i q6 = _mm_sub_epi16(in1, in6); const __m128i q7 = _mm_sub_epi16(in0, in7); // Work on first four results { // Add/substract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); const __m128i r3 = _mm_sub_epi16(q0, q3); // Interleave to do the multiply by constants which gets us into 32bits const __m128i t0 = _mm_unpacklo_epi16(r0, r1); const __m128i t1 = _mm_unpackhi_epi16(r0, r1); const __m128i t2 = _mm_unpacklo_epi16(r2, r3); const __m128i t3 = _mm_unpackhi_epi16(r2, r3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = 
_mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res0 = _mm_packs_epi32(w0, w1); res4 = _mm_packs_epi32(w2, w3); res2 = _mm_packs_epi32(w4, w5); res6 = _mm_packs_epi32(w6, w7); } // Work on next four results { // Interleave to do the multiply by constants which gets us into 32bits const __m128i d0 = _mm_unpacklo_epi16(q6, q5); const __m128i d1 = _mm_unpackhi_epi16(q6, q5); const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); // dct_const_round_shift const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); // Add/substract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = _mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); const __m128i x3 = _mm_add_epi16(q7, r1); // Interleave to do the multiply by constants which gets us into 32bits const __m128i t0 = _mm_unpacklo_epi16(x0, 
x3); const __m128i t1 = _mm_unpackhi_epi16(x0, x3); const __m128i t2 = _mm_unpacklo_epi16(x1, x2); const __m128i t3 = _mm_unpackhi_epi16(x1, x2); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res1 = _mm_packs_epi32(w0, w1); res7 = _mm_packs_epi32(w2, w3); res5 = _mm_packs_epi32(w4, w5); res3 = _mm_packs_epi32(w6, w7); } // Transpose the 8x8. 
{ // 00 01 02 03 04 05 06 07 // 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 // 30 31 32 33 34 35 36 37 // 40 41 42 43 44 45 46 47 // 50 51 52 53 54 55 56 57 // 60 61 62 63 64 65 66 67 // 70 71 72 73 74 75 76 77 const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 // 04 14 05 15 06 16 07 17 // 24 34 25 35 26 36 27 37 // 40 50 41 51 42 52 43 53 // 60 70 61 71 62 72 63 73 // 54 54 55 55 56 56 57 57 // 64 74 65 75 66 76 67 77 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); // 00 10 20 30 01 11 21 31 // 40 50 60 70 41 51 61 71 // 02 12 22 32 03 13 23 33 // 42 52 62 72 43 53 63 73 // 04 14 24 34 05 15 21 36 // 44 54 64 74 45 55 61 76 // 06 16 26 36 07 17 27 37 // 46 56 66 76 47 57 67 77 in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 
45 55 65 75 // 06 16 26 36 46 56 66 76 // 07 17 27 37 47 57 67 77 } } // Post-condition output and store it { // Post-condition (division by two) // division of two 16 bits signed numbers using shifts // n / 2 = (n - (n >> 15)) >> 1 const __m128i sign_in0 = _mm_srai_epi16(in0, 15); const __m128i sign_in1 = _mm_srai_epi16(in1, 15); const __m128i sign_in2 = _mm_srai_epi16(in2, 15); const __m128i sign_in3 = _mm_srai_epi16(in3, 15); const __m128i sign_in4 = _mm_srai_epi16(in4, 15); const __m128i sign_in5 = _mm_srai_epi16(in5, 15); const __m128i sign_in6 = _mm_srai_epi16(in6, 15); const __m128i sign_in7 = _mm_srai_epi16(in7, 15); in0 = _mm_sub_epi16(in0, sign_in0); in1 = _mm_sub_epi16(in1, sign_in1); in2 = _mm_sub_epi16(in2, sign_in2); in3 = _mm_sub_epi16(in3, sign_in3); in4 = _mm_sub_epi16(in4, sign_in4); in5 = _mm_sub_epi16(in5, sign_in5); in6 = _mm_sub_epi16(in6, sign_in6); in7 = _mm_sub_epi16(in7, sign_in7); in0 = _mm_srai_epi16(in0, 1); in1 = _mm_srai_epi16(in1, 1); in2 = _mm_srai_epi16(in2, 1); in3 = _mm_srai_epi16(in3, 1); in4 = _mm_srai_epi16(in4, 1); in5 = _mm_srai_epi16(in5, 1); in6 = _mm_srai_epi16(in6, 1); in7 = _mm_srai_epi16(in7, 1); // store results _mm_storeu_si128((__m128i *)(output + 0 * 8), in0); _mm_storeu_si128((__m128i *)(output + 1 * 8), in1); _mm_storeu_si128((__m128i *)(output + 2 * 8), in2); _mm_storeu_si128((__m128i *)(output + 3 * 8), in3); _mm_storeu_si128((__m128i *)(output + 4 * 8), in4); _mm_storeu_si128((__m128i *)(output + 5 * 8), in5); _mm_storeu_si128((__m128i *)(output + 6 * 8), in6); _mm_storeu_si128((__m128i *)(output + 7 * 8), in7); } }