SIMDValue SIMDInt8x16Operation::OpMul(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue x86tmp1;
    X86SIMDValue x86tmp2;
    X86SIMDValue x86tmp3;
    const _x86_SIMDValue X86_LOWBYTE_MASK = { 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff };
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    // (ah * 2^8 + al) * (bh * 2^8 + bl) = (ah*bh * 2^8 + al*bh + ah*bl) * 2^8 + al*bl
    x86tmp1.m128i_value = _mm_mullo_epi16(tmpaValue.m128i_value, tmpbValue.m128i_value);
    x86tmp2.m128i_value = _mm_and_si128(x86tmp1.m128i_value, X86_LOWBYTE_MASK.m128i_value);

    tmpaValue.m128i_value = _mm_srli_epi16(tmpaValue.m128i_value, 8);
    tmpbValue.m128i_value = _mm_srli_epi16(tmpbValue.m128i_value, 8);
    x86tmp3.m128i_value = _mm_mullo_epi16(tmpaValue.m128i_value, tmpbValue.m128i_value);
    x86tmp3.m128i_value = _mm_slli_epi16(x86tmp3.m128i_value, 8);

    x86Result.m128i_value = _mm_or_si128(x86tmp2.m128i_value, x86tmp3.m128i_value);

    return X86SIMDValue::ToSIMDValue(x86Result);
}
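// A scalar reference for the byte-wise product above (hypothetical helper, not
// part of the original source): each lane keeps only the low 8 bits of
// a[i] * b[i], which is exactly what the PMULLW + mask / shift / OR sequence
// assembles per byte.
#include <stdint.h>
#include <stddef.h>

static void MulInt8x16Scalar(const int8_t a[16], const int8_t b[16], int8_t out[16])
{
    for (size_t i = 0; i < 16; ++i)
    {
        out[i] = (int8_t)(uint8_t)((uint8_t)a[i] * (uint8_t)b[i]);
    }
}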
__m128i interpolhline128(unsigned char* image)
{
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
    unsigned char* imagetmp = image - 2;

    xmm7 = _mm_setzero_si128();
    xmm6 = _mm_loadu_si128((__m128i*)imagetmp);

    xmm0 = _mm_unpacklo_epi8(xmm6, xmm7);
    xmm6 = _mm_srli_si128(xmm6, 1);
    xmm1 = _mm_unpacklo_epi8(xmm6, xmm7);
    xmm6 = _mm_srli_si128(xmm6, 1);
    xmm2 = _mm_unpacklo_epi8(xmm6, xmm7);
    xmm6 = _mm_srli_si128(xmm6, 1);
    xmm3 = _mm_unpacklo_epi8(xmm6, xmm7);
    xmm6 = _mm_srli_si128(xmm6, 1);
    xmm4 = _mm_unpacklo_epi8(xmm6, xmm7);
    xmm6 = _mm_srli_si128(xmm6, 1);
    xmm5 = _mm_unpacklo_epi8(xmm6, xmm7);

    // filter on 8 values
    xmm6 = _mm_add_epi16(xmm2, xmm3);
    xmm6 = _mm_slli_epi16(xmm6, 2);
    xmm6 = _mm_sub_epi16(xmm6, xmm1);
    xmm6 = _mm_sub_epi16(xmm6, xmm4);

    xmm1 = _mm_set_epi32(0x00050005, 0x00050005, 0x00050005, 0x00050005);
    xmm6 = _mm_mullo_epi16(xmm6, xmm1);
    xmm6 = _mm_add_epi16(xmm6, xmm0);
    xmm6 = _mm_add_epi16(xmm6, xmm5);

    //xmm6 = _mm_max_epi16(xmm6, xmm7); // preventing negative values

    _mm_empty();
    return (xmm6);
}
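// The arithmetic above is the 6-tap half-pel filter (1, -5, 20, 20, -5, 1):
// ((c + d) * 4 - b - e) * 5 + a + f == a - 5b + 20c + 20d - 5e + f.
// A scalar sketch for one (unnormalized) output sample; 'p' points at the
// sample the vector code reads into xmm2, i.e. image[0] (illustrative helper):
static int filter6tap_scalar(const unsigned char* p)
{
    int a = p[-2], b = p[-1], c = p[0], d = p[1], e = p[2], f = p[3];
    return ((c + d) * 4 - b - e) * 5 + a + f;
}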
static void TransformColorInverse(const VP8LMultipliers* const m, const uint32_t* const src, int num_pixels, uint32_t* dst) { // sign-extended multiplying constants, pre-shifted by 5. #define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend const __m128i mults_rb = _mm_set_epi16( CST(green_to_red_), CST(green_to_blue_), CST(green_to_red_), CST(green_to_blue_), CST(green_to_red_), CST(green_to_blue_), CST(green_to_red_), CST(green_to_blue_)); const __m128i mults_b2 = _mm_set_epi16( CST(red_to_blue_), 0, CST(red_to_blue_), 0, CST(red_to_blue_), 0, CST(red_to_blue_), 0); #undef CST const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0 const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0 const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1 const __m128i E = _mm_add_epi8(in, D); // x r' x b' const __m128i F = _mm_slli_epi16(E, 8); // r' 0 b' 0 const __m128i G = _mm_mulhi_epi16(F, mults_b2); // x db2 0 0 const __m128i H = _mm_srli_epi32(G, 8); // 0 x db2 0 const __m128i I = _mm_add_epi8(H, F); // r' x b'' 0 const __m128i J = _mm_srli_epi16(I, 8); // 0 r' 0 b'' const __m128i out = _mm_or_si128(J, A); _mm_storeu_si128((__m128i*)&dst[i], out); } // Fall-back to C-version for left-overs. if (i != num_pixels) { VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i); } }
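// A scalar sketch of the per-pixel update performed above, assuming the usual
// WebP-lossless delta form (product of two signed 8-bit values, arithmetically
// shifted right by 5); helper names are illustrative, not from the source:
#include <stdint.h>

static int ColorDelta(uint8_t pred, uint8_t chan) {
  return ((int8_t)pred * (int8_t)chan) >> 5;
}

static uint32_t InverseTransformPixel(uint32_t argb, uint8_t green_to_red,
                                      uint8_t green_to_blue, uint8_t red_to_blue) {
  const uint32_t a = (argb >> 24) & 0xff;
  const uint32_t g = (argb >> 8) & 0xff;
  uint32_t r = (argb >> 16) & 0xff;
  uint32_t b = argb & 0xff;
  r = (r + ColorDelta(green_to_red, g)) & 0xff;
  b = (b + ColorDelta(green_to_blue, g)) & 0xff;
  b = (b + ColorDelta(red_to_blue, r)) & 0xff;   // uses the already-updated red
  return (a << 24) | (r << 16) | (g << 8) | b;
}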
static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, int16_t* out) { const __m128i zero = _mm_setzero_si128(); const __m128i seven = _mm_set1_epi16(7); const __m128i k7500 = _mm_set1_epi32(7500); const __m128i k14500 = _mm_set1_epi32(14500); const __m128i k51000 = _mm_set1_epi32(51000); const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217); const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352); __m128i v01, v32; // Difference between src and ref and initial transpose. { // Load src and convert to 16b. const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]); const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]); const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]); const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]); const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); // Load ref and convert to 16b. const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]); const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]); const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]); const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]); const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); // Compute difference. const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); // Transpose. // 00 01 02 03 0 0 0 0 // 10 11 12 13 0 0 0 0 // 20 21 22 23 0 0 0 0 // 30 31 32 33 0 0 0 0 const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1); const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // a02 a12 a22 a32 a03 a13 a23 a33 // a00 a10 a20 a30 a01 a11 a21 a31 // a03 a13 a23 a33 a02 a12 a22 a32 } // First pass and subsequent transpose. { // Same operations are done on the (0,3) and (1,2) pairs. // b0 = (a0 + a3) << 3 // b1 = (a1 + a2) << 3 // b3 = (a0 - a3) << 3 // b2 = (a1 - a2) << 3 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i b01 = _mm_slli_epi16(a01, 3); const __m128i b32 = _mm_slli_epi16(a32, 3); const __m128i b11 = _mm_unpackhi_epi64(b01, b01); const __m128i b22 = _mm_unpackhi_epi64(b32, b32); // e0 = b0 + b1 // e2 = b0 - b1 const __m128i e0 = _mm_add_epi16(b01, b11); const __m128i e2 = _mm_sub_epi16(b01, b11); const __m128i e02 = _mm_unpacklo_epi64(e0, e2); // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12 // e3 = (b3 * 2217 - b2 * 5352 + 7500) >> 12 const __m128i b23 = _mm_unpacklo_epi16(b22, b32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k14500); const __m128i d3 = _mm_add_epi32(c3, k7500); const __m128i e1 = _mm_srai_epi32(d1, 12); const __m128i e3 = _mm_srai_epi32(d3, 12); const __m128i e13 = _mm_packs_epi32(e1, e3); // Transpose. 
// 00 01 02 03 20 21 22 23 // 10 11 12 13 30 31 32 33 const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13); const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 02 12 22 32 03 13 23 33 // 00 10 20 30 01 11 21 31 // 03 13 23 33 02 12 22 32 } // Second pass { // Same operations are done on the (0,3) and (1,2) pairs. // a0 = v0 + v3 // a1 = v1 + v2 // a3 = v0 - v3 // a2 = v1 - v2 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i a11 = _mm_unpackhi_epi64(a01, a01); const __m128i a22 = _mm_unpackhi_epi64(a32, a32); // d0 = (a0 + a1 + 7) >> 4; // d2 = (a0 - a1 + 7) >> 4; const __m128i b0 = _mm_add_epi16(a01, a11); const __m128i b2 = _mm_sub_epi16(a01, a11); const __m128i c0 = _mm_add_epi16(b0, seven); const __m128i c2 = _mm_add_epi16(b2, seven); const __m128i d0 = _mm_srai_epi16(c0, 4); const __m128i d2 = _mm_srai_epi16(c2, 4); // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) const __m128i b23 = _mm_unpacklo_epi16(a22, a32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); const __m128i d3 = _mm_add_epi32(c3, k51000); const __m128i e1 = _mm_srai_epi32(d1, 16); const __m128i e3 = _mm_srai_epi32(d3, 16); const __m128i f1 = _mm_packs_epi32(e1, e1); const __m128i f3 = _mm_packs_epi32(e3, e3); // f1 = f1 + (a3 != 0); // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the // desired (0, 1), we add one earlier through k12000_plus_one. const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); _mm_storel_epi64((__m128i*)&out[ 0], d0); _mm_storel_epi64((__m128i*)&out[ 4], g1); _mm_storel_epi64((__m128i*)&out[ 8], d2); _mm_storel_epi64((__m128i*)&out[12], f3); } }
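// A scalar sketch of the first pass above for one column of differences,
// following the comments in the code (the 5352/2217 multipliers and the
// 14500/7500 rounding terms are taken from the code; helper name is
// illustrative):
static void FTransformPass1Column(const int d[4] /* src - ref */, int e[4]) {
  const int b0 = (d[0] + d[3]) * 8;   // << 3 in the comments above
  const int b1 = (d[1] + d[2]) * 8;
  const int b2 = (d[1] - d[2]) * 8;
  const int b3 = (d[0] - d[3]) * 8;
  e[0] = b0 + b1;
  e[2] = b0 - b1;
  e[1] = (b3 * 5352 + b2 * 2217 + 14500) >> 12;
  e[3] = (b3 * 2217 - b2 * 5352 +  7500) >> 12;
}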
/** * @brief mux all audio ports to events * @param data * @param offset * @param nevents */ void AmdtpTransmitStreamProcessor::encodeAudioPortsInt24(quadlet_t *data, unsigned int offset, unsigned int nevents) { unsigned int j; quadlet_t *target_event; int i; uint32_t *client_buffers[4]; uint32_t tmp_values[4] __attribute__ ((aligned (16))); // prepare the scratch buffer assert(m_scratch_buffer_size_bytes > nevents * 4); memset(m_scratch_buffer, 0, nevents * 4); const __m128i label = _mm_set_epi32 (0x40000000, 0x40000000, 0x40000000, 0x40000000); const __m128i mask = _mm_set_epi32 (0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF); // this assumes that audio ports are sorted by position, // and that there are no gaps for (i = 0; i < ((int)m_nb_audio_ports)-4; i += 4) { struct _MBLA_port_cache *p; // get the port buffers for (j=0; j<4; j++) { p = &(m_audio_ports.at(i+j)); if(likely(p->buffer && p->enabled)) { client_buffers[j] = (uint32_t *) p->buffer; client_buffers[j] += offset; } else { // if a port is disabled or has no valid // buffer, use the scratch buffer (all zero's) client_buffers[j] = (uint32_t *) m_scratch_buffer; } } // the base event for this position target_event = (quadlet_t *)(data + i); // process the events for (j=0;j < nevents; j += 1) { // read the values tmp_values[0] = *(client_buffers[0]); tmp_values[1] = *(client_buffers[1]); tmp_values[2] = *(client_buffers[2]); tmp_values[3] = *(client_buffers[3]); // now do the SSE based conversion/labeling __m128i *target = (__m128i*)target_event; __m128i v_int = *((__m128i*)tmp_values);; // mask v_int = _mm_and_si128( v_int, mask ); // label it v_int = _mm_or_si128( v_int, label ); // do endian conversion (SSE is always little endian) // do first swap v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) ); // do second swap v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) ); // store the packed int // (target misalignment is assumed since we don't know the m_dimension) _mm_storeu_si128 (target, v_int); // increment the buffer pointers client_buffers[0]++; client_buffers[1]++; client_buffers[2]++; client_buffers[3]++; // go to next target event position target_event += m_dimension; } } // do remaining ports // NOTE: these can be time-SSE'd for (; i < ((int)m_nb_audio_ports); i++) { struct _MBLA_port_cache &p = m_audio_ports.at(i); target_event = (quadlet_t *)(data + i); #ifdef DEBUG assert(nevents + offset <= p.buffer_size ); #endif if(likely(p.buffer && p.enabled)) { uint32_t *buffer = (uint32_t *)(p.buffer); buffer += offset; for (j = 0;j < nevents; j += 4) { // read the values tmp_values[0] = *buffer; buffer++; tmp_values[1] = *buffer; buffer++; tmp_values[2] = *buffer; buffer++; tmp_values[3] = *buffer; buffer++; // now do the SSE based conversion/labeling __m128i v_int = *((__m128i*)tmp_values);; // mask v_int = _mm_and_si128( v_int, mask ); // label it v_int = _mm_or_si128( v_int, label ); // do endian conversion (SSE is always little endian) // do first swap v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) ); // do second swap v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) ); // store the packed int _mm_store_si128 ((__m128i *)(&tmp_values), v_int); // increment the buffer pointers *target_event = tmp_values[0]; target_event += m_dimension; *target_event = tmp_values[1]; target_event += m_dimension; *target_event = tmp_values[2]; target_event += m_dimension; *target_event = tmp_values[3]; target_event += 
                    m_dimension;
            }
            // do the remainder of the events
            for (; j < nevents; j += 1) {
                uint32_t in = (uint32_t)(*buffer);
                *target_event = CondSwapToBus32((quadlet_t)((in & 0x00FFFFFF) | 0x40000000));
                buffer++;
                target_event += m_dimension;
            }
        } else {
            for (j = 0; j < nevents; j += 1) {
                // hardcoded byte-swapped label word (0x40000000 in bus byte order)
                *target_event = 0x00000040;
                target_event += m_dimension;
            }
        }
    }
}
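// A scalar sketch of what both paths above produce per sample: keep 24 bits
// of audio, tag the top byte with the 0x40 label, and swap to bus byte order.
// The unconditional swap mirrors the SSE path, which assumes a little-endian
// host; the helper name is illustrative (the scalar tail uses CondSwapToBus32).
#include <stdint.h>

static inline uint32_t pack_labeled_sample(uint32_t in)
{
    const uint32_t q = (in & 0x00FFFFFFu) | 0x40000000u;
    return ((q & 0x000000FFu) << 24) | ((q & 0x0000FF00u) << 8) |
           ((q & 0x00FF0000u) >> 8)  | ((q & 0xFF000000u) >> 24);
}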
rfx_dwt_2d_decode_block_vert_sse2(INT16* l, INT16* h, INT16* dst, int subband_width)
{
	int x, n;
	INT16* l_ptr = l;
	INT16* h_ptr = h;
	INT16* dst_ptr = dst;
	__m128i l_n;
	__m128i h_n;
	__m128i tmp_n;
	__m128i h_n_m;
	__m128i dst_n;
	__m128i dst_n_m;
	__m128i dst_n_p;

	int total_width = subband_width + subband_width;

	/* Even coefficients */
	for (n = 0; n < subband_width; n++)
	{
		for (x = 0; x < total_width; x += 8)
		{
			/* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
			l_n = _mm_load_si128((__m128i*) l_ptr);
			h_n = _mm_load_si128((__m128i*) h_ptr);

			tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));
			if (n == 0)
				tmp_n = _mm_add_epi16(tmp_n, h_n);
			else
			{
				h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width));
				tmp_n = _mm_add_epi16(tmp_n, h_n_m);
			}
			tmp_n = _mm_srai_epi16(tmp_n, 1);

			dst_n = _mm_sub_epi16(l_n, tmp_n);
			_mm_store_si128((__m128i*) dst_ptr, dst_n);

			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 8;
		}
		dst_ptr += total_width;
	}

	h_ptr = h;
	dst_ptr = dst + total_width;

	/* Odd coefficients */
	for (n = 0; n < subband_width; n++)
	{
		for (x = 0; x < total_width; x += 8)
		{
			/* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
			h_n = _mm_load_si128((__m128i*) h_ptr);
			dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width));
			h_n = _mm_slli_epi16(h_n, 1);

			tmp_n = dst_n_m;
			if (n == subband_width - 1)
				tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
			else
			{
				dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width));
				tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
			}
			tmp_n = _mm_srai_epi16(tmp_n, 1);

			dst_n = _mm_add_epi16(tmp_n, h_n);
			_mm_store_si128((__m128i*) dst_ptr, dst_n);

			h_ptr += 8;
			dst_ptr += 8;
		}
		dst_ptr += total_width;
	}
}
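/* The lifting steps above in scalar form, written per column in 1D (boundary
 * handling mirrors the vector code: h[n-1] is replaced by h[n] on the first
 * row and dst[2n + 2] by dst[2n] on the last); helper name is illustrative: */
#include <stdint.h>

static void idwt_lift_vertical_1d(const int16_t* l, const int16_t* h,
	int16_t* dst, int subband_width)
{
	int n;
	for (n = 0; n < subband_width; n++)
	{
		const int h_prev = (n == 0) ? h[0] : h[n - 1];
		dst[2 * n] = (int16_t)(l[n] - ((h_prev + h[n] + 1) >> 1));
	}
	for (n = 0; n < subband_width; n++)
	{
		const int d_next = (n == subband_width - 1) ? dst[2 * n] : dst[2 * n + 2];
		dst[2 * n + 1] = (int16_t)((h[n] << 1) + ((dst[2 * n] + d_next) >> 1));
	}
}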
// Data and ECC addresses must be properly aligned for SSE. bool RSCoder16::SSE_UpdateECC(uint DataNum, uint ECCNum, const byte *Data, byte *ECC, size_t BlockSize) { // Check data alignment and SSSE3 support. if ((size_t(Data) & (SSE_ALIGNMENT-1))!=0 || (size_t(ECC) & (SSE_ALIGNMENT-1))!=0 || _SSE_Version<SSE_SSSE3) return false; uint M=MX[ECCNum * ND + DataNum]; // Prepare tables containing products of M and 4, 8, 12, 16 bit length // numbers, which have 4 high bits in 0..15 range and other bits set to 0. // Store high and low bytes of resulting 16 bit product in separate tables. __m128i T0L,T1L,T2L,T3L; // Low byte tables. __m128i T0H,T1H,T2H,T3H; // High byte tables. for (uint I=0; I<16; I++) { ((byte *)&T0L)[I]=gfMul(I,M); ((byte *)&T0H)[I]=gfMul(I,M)>>8; ((byte *)&T1L)[I]=gfMul(I<<4,M); ((byte *)&T1H)[I]=gfMul(I<<4,M)>>8; ((byte *)&T2L)[I]=gfMul(I<<8,M); ((byte *)&T2H)[I]=gfMul(I<<8,M)>>8; ((byte *)&T3L)[I]=gfMul(I<<12,M); ((byte *)&T3H)[I]=gfMul(I<<12,M)>>8; } size_t Pos=0; __m128i LowByteMask=_mm_set1_epi16(0xff); // 00ff00ff...00ff __m128i Low4Mask=_mm_set1_epi8(0xf); // 0f0f0f0f...0f0f __m128i High4Mask=_mm_slli_epi16(Low4Mask,4); // f0f0f0f0...f0f0 for (; Pos+2*sizeof(__m128i)<=BlockSize; Pos+=2*sizeof(__m128i)) { // We process two 128 bit chunks of source data at once. __m128i *D=(__m128i *)(Data+Pos); // Place high bytes of both chunks to one variable and low bytes to // another, so we can use the table lookup multiplication for 16 values // 4 bit length each at once. __m128i HighBytes0=_mm_srli_epi16(D[0],8); __m128i LowBytes0=_mm_and_si128(D[0],LowByteMask); __m128i HighBytes1=_mm_srli_epi16(D[1],8); __m128i LowBytes1=_mm_and_si128(D[1],LowByteMask); __m128i HighBytes=_mm_packus_epi16(HighBytes0,HighBytes1); __m128i LowBytes=_mm_packus_epi16(LowBytes0,LowBytes1); // Multiply bits 0..3 of low bytes. Store low and high product bytes // separately in cumulative sum variables. __m128i LowBytesLow4=_mm_and_si128(LowBytes,Low4Mask); __m128i LowBytesMultSum=_mm_shuffle_epi8(T0L,LowBytesLow4); __m128i HighBytesMultSum=_mm_shuffle_epi8(T0H,LowBytesLow4); // Multiply bits 4..7 of low bytes. Store low and high product bytes separately. __m128i LowBytesHigh4=_mm_and_si128(LowBytes,High4Mask); LowBytesHigh4=_mm_srli_epi16(LowBytesHigh4,4); __m128i LowBytesHigh4MultLow=_mm_shuffle_epi8(T1L,LowBytesHigh4); __m128i LowBytesHigh4MultHigh=_mm_shuffle_epi8(T1H,LowBytesHigh4); // Add new product to existing sum, low and high bytes separately. LowBytesMultSum=_mm_xor_si128(LowBytesMultSum,LowBytesHigh4MultLow); HighBytesMultSum=_mm_xor_si128(HighBytesMultSum,LowBytesHigh4MultHigh); // Multiply bits 0..3 of high bytes. Store low and high product bytes separately. __m128i HighBytesLow4=_mm_and_si128(HighBytes,Low4Mask); __m128i HighBytesLow4MultLow=_mm_shuffle_epi8(T2L,HighBytesLow4); __m128i HighBytesLow4MultHigh=_mm_shuffle_epi8(T2H,HighBytesLow4); // Add new product to existing sum, low and high bytes separately. LowBytesMultSum=_mm_xor_si128(LowBytesMultSum,HighBytesLow4MultLow); HighBytesMultSum=_mm_xor_si128(HighBytesMultSum,HighBytesLow4MultHigh); // Multiply bits 4..7 of high bytes. Store low and high product bytes separately. __m128i HighBytesHigh4=_mm_and_si128(HighBytes,High4Mask); HighBytesHigh4=_mm_srli_epi16(HighBytesHigh4,4); __m128i HighBytesHigh4MultLow=_mm_shuffle_epi8(T3L,HighBytesHigh4); __m128i HighBytesHigh4MultHigh=_mm_shuffle_epi8(T3H,HighBytesHigh4); // Add new product to existing sum, low and high bytes separately. 
LowBytesMultSum=_mm_xor_si128(LowBytesMultSum,HighBytesHigh4MultLow); HighBytesMultSum=_mm_xor_si128(HighBytesMultSum,HighBytesHigh4MultHigh); // Combine separate low and high cumulative sum bytes to 16-bit words. __m128i HighBytesHigh4Mult0=_mm_unpacklo_epi8(LowBytesMultSum,HighBytesMultSum); __m128i HighBytesHigh4Mult1=_mm_unpackhi_epi8(LowBytesMultSum,HighBytesMultSum); // Add result to ECC. __m128i *StoreECC=(__m128i *)(ECC+Pos); StoreECC[0]=_mm_xor_si128(StoreECC[0],HighBytesHigh4Mult0); StoreECC[1]=_mm_xor_si128(StoreECC[1],HighBytesHigh4Mult1); } // If we have non 128 bit aligned data in the end of block, process them // in a usual way. We cannot do the same in the beginning of block, // because Data and ECC can have different alignment offsets. for (; Pos<BlockSize; Pos+=2) *(ushort*)(ECC+Pos) ^= gfMul( M, *(ushort*)(Data+Pos) ); return true; }
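// The PSHUFB tables above implement a nibble decomposition of the GF(2^16)
// product: gfMul is linear over XOR, so the 16-bit operand can be split into
// four 4-bit pieces and the partial products XOR-ed together. A scalar sketch
// of the same idea, with T0..T3 holding gfMul(I, M), gfMul(I << 4, M),
// gfMul(I << 8, M) and gfMul(I << 12, M) (helper name is illustrative):
#include <stdint.h>

static uint16_t GFMulByNibbleTables(uint16_t x, const uint16_t T0[16],
                                    const uint16_t T1[16], const uint16_t T2[16],
                                    const uint16_t T3[16])
{
  return (uint16_t)(T0[x & 0xf] ^ T1[(x >> 4) & 0xf] ^
                    T2[(x >> 8) & 0xf] ^ T3[(x >> 12) & 0xf]);
}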
static inline void desc_to_olflags_v(__m128i descs[4], uint8_t vlan_flags, struct rte_mbuf **rx_pkts) { __m128i ptype0, ptype1, vtag0, vtag1, csum; union { uint16_t e[4]; uint64_t dword; } vol; /* mask everything except rss type */ const __m128i rsstype_msk = _mm_set_epi16( 0x0000, 0x0000, 0x0000, 0x0000, 0x000F, 0x000F, 0x000F, 0x000F); /* mask the lower byte of ol_flags */ const __m128i ol_flags_msk = _mm_set_epi16( 0x0000, 0x0000, 0x0000, 0x0000, 0x00FF, 0x00FF, 0x00FF, 0x00FF); /* map rss type to rss hash flag */ const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0, 0, 0, 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0); /* mask everything except vlan present and l4/ip csum error */ const __m128i vlan_csum_msk = _mm_set_epi16( (IXGBE_RXDADV_ERR_TCPE | IXGBE_RXDADV_ERR_IPE) >> 16, (IXGBE_RXDADV_ERR_TCPE | IXGBE_RXDADV_ERR_IPE) >> 16, (IXGBE_RXDADV_ERR_TCPE | IXGBE_RXDADV_ERR_IPE) >> 16, (IXGBE_RXDADV_ERR_TCPE | IXGBE_RXDADV_ERR_IPE) >> 16, IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP); /* map vlan present (0x8), IPE (0x2), L4E (0x1) to ol_flags */ const __m128i vlan_csum_map_lo = _mm_set_epi8( 0, 0, 0, 0, vlan_flags | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD, vlan_flags | PKT_RX_IP_CKSUM_BAD, vlan_flags | PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD, vlan_flags | PKT_RX_IP_CKSUM_GOOD, 0, 0, 0, 0, PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD, PKT_RX_IP_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD); const __m128i vlan_csum_map_hi = _mm_set_epi8( 0, 0, 0, 0, 0, PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0, PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0, 0, 0, 0, 0, PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0, PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t)); ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]); ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]); vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]); vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]); ptype0 = _mm_unpacklo_epi32(ptype0, ptype1); ptype0 = _mm_and_si128(ptype0, rsstype_msk); ptype0 = _mm_shuffle_epi8(rss_flags, ptype0); vtag1 = _mm_unpacklo_epi32(vtag0, vtag1); vtag1 = _mm_and_si128(vtag1, vlan_csum_msk); /* csum bits are in the most significant, to use shuffle we need to * shift them. Change mask to 0xc000 to 0x0003. */ csum = _mm_srli_epi16(vtag1, 14); /* now or the most significant 64 bits containing the checksum * flags with the vlan present flags. */ csum = _mm_srli_si128(csum, 8); vtag1 = _mm_or_si128(csum, vtag1); /* convert VP, IPE, L4E to ol_flags */ vtag0 = _mm_shuffle_epi8(vlan_csum_map_hi, vtag1); vtag0 = _mm_slli_epi16(vtag0, sizeof(uint8_t)); vtag1 = _mm_shuffle_epi8(vlan_csum_map_lo, vtag1); vtag1 = _mm_and_si128(vtag1, ol_flags_msk); vtag1 = _mm_or_si128(vtag0, vtag1); vtag1 = _mm_or_si128(ptype0, vtag1); vol.dword = _mm_cvtsi128_si64(vtag1); rx_pkts[0]->ol_flags = vol.e[0]; rx_pkts[1]->ol_flags = vol.e[1]; rx_pkts[2]->ol_flags = vol.e[2]; rx_pkts[3]->ol_flags = vol.e[3]; }
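/* The flag conversions above use PSHUFB as a 16-entry lookup table: each
 * byte of the (masked) index vector selects a byte of the table, which is
 * how rss_flags and the vlan_csum_map tables are applied. A minimal sketch
 * of that idiom (SSSE3; helper name is illustrative):
 */
#include <tmmintrin.h>

static inline __m128i
lut16_lookup(__m128i table, __m128i values)
{
	/* keep only the low nibble of each byte so it is a valid table index;
	 * a set bit 7 in an index byte would zero that lane instead
	 */
	const __m128i low_nibble = _mm_set1_epi8(0x0f);

	return _mm_shuffle_epi8(table, _mm_and_si128(values, low_nibble));
}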
/* The encodec YCbCr coeffectients are represented as 11.5 fixed-point numbers. See rfx_encode.c */ static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer, sint16* cr_b_buffer) { __m128i min = _mm_set1_epi16(-128 << 5); __m128i max = _mm_set1_epi16(127 << 5); __m128i* y_r_buf = (__m128i*) y_r_buffer; __m128i* cb_g_buf = (__m128i*) cb_g_buffer; __m128i* cr_b_buf = (__m128i*) cr_b_buffer; __m128i y; __m128i cr; __m128i cb; __m128i r; __m128i g; __m128i b; int i; for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i))) { _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA); _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA); _mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA); } for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++) { /* r = y_r_buf[i]; */ r = _mm_load_si128(&y_r_buf[i]); /* g = cb_g_buf[i]; */ g = _mm_load_si128(&cb_g_buf[i]); /* b = cr_b_buf[i]; */ b = _mm_load_si128(&cr_b_buf[i]); /* y = ((r << 3) + (r) + (r >> 1) + (r >> 4) + (r >> 7)) + ((g << 4) + (g << 1) + (g >> 1) + (g >> 2) + (g >> 5)) + ((b << 1) + (b) + (b >> 1) + (b >> 3) + (b >> 6) + (b >> 7)); */ /* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */ y = _mm_add_epi16(_mm_slli_epi16(r, 3), r); y = _mm_add_epi16(y, _mm_srai_epi16(r, 1)); y = _mm_add_epi16(y, _mm_srai_epi16(r, 4)); y = _mm_add_epi16(y, _mm_srai_epi16(r, 7)); y = _mm_add_epi16(y, _mm_slli_epi16(g, 4)); y = _mm_add_epi16(y, _mm_slli_epi16(g, 1)); y = _mm_add_epi16(y, _mm_srai_epi16(g, 1)); y = _mm_add_epi16(y, _mm_srai_epi16(g, 2)); y = _mm_add_epi16(y, _mm_srai_epi16(g, 5)); y = _mm_add_epi16(y, _mm_slli_epi16(b, 1)); y = _mm_add_epi16(y, b); y = _mm_add_epi16(y, _mm_srai_epi16(b, 1)); y = _mm_add_epi16(y, _mm_srai_epi16(b, 3)); y = _mm_add_epi16(y, _mm_srai_epi16(b, 6)); y = _mm_add_epi16(y, _mm_srai_epi16(b, 7)); y = _mm_add_epi16(y, min); _mm_between_epi16(y, min, max); _mm_store_si128(&y_r_buf[i], y); /* cb = 0 - ((r << 2) + (r) + (r >> 2) + (r >> 3) + (r >> 5)) - ((g << 3) + (g << 1) + (g >> 1) + (g >> 4) + (g >> 5) + (g >> 6)) + ((b << 4) + (b >> 6)); */ /* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */ cb = _mm_add_epi16(_mm_slli_epi16(b, 4), _mm_srai_epi16(b, 6)); cb = _mm_sub_epi16(cb, _mm_slli_epi16(r, 2)); cb = _mm_sub_epi16(cb, r); cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 2)); cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 3)); cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 5)); cb = _mm_sub_epi16(cb, _mm_slli_epi16(g, 3)); cb = _mm_sub_epi16(cb, _mm_slli_epi16(g, 1)); cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 1)); cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 4)); cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 5)); cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 6)); _mm_between_epi16(cb, min, max); _mm_store_si128(&cb_g_buf[i], cb); /* cr = ((r << 4) - (r >> 7)) - ((g << 3) + (g << 2) + (g) + (g >> 2) + (g >> 3) + (g >> 6)) - ((b << 1) + (b >> 1) + (b >> 4) + (b >> 5) + (b >> 7)); */ /* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */ cr = _mm_sub_epi16(_mm_slli_epi16(r, 4), _mm_srai_epi16(r, 7)); cr = _mm_sub_epi16(cr, _mm_slli_epi16(g, 3)); cr = _mm_sub_epi16(cr, _mm_slli_epi16(g, 2)); cr = _mm_sub_epi16(cr, g); cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 2)); cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 3)); cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 6)); cr = _mm_sub_epi16(cr, _mm_slli_epi16(b, 1)); cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 1)); cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 4)); cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 5)); cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 7)); 
		_mm_between_epi16(cr, min, max);
		_mm_store_si128(&cr_b_buf[i], cr);
	}
}
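/* The shift-and-add chains above approximate the RGB -> YCbCr coefficients in
 * 11.5 fixed point. A scalar sketch of the luma path, following the comment in
 * the code (helper name is illustrative; >> is the arithmetic shift that
 * _mm_srai_epi16 performs): */
#include <stdint.h>

static int16_t rfx_y_approx(int16_t r, int16_t g, int16_t b)
{
	const int bias = 128 << 5;
	int y = ((r << 3) + r + (r >> 1) + (r >> 4) + (r >> 7))
	      + ((g << 4) + (g << 1) + (g >> 1) + (g >> 2) + (g >> 5))
	      + ((b << 1) + b + (b >> 1) + (b >> 3) + (b >> 6) + (b >> 7));
	y -= bias;                            /* same bias the vector code adds as 'min' */
	if (y < -bias) y = -bias;             /* clamp to the [min, max] range used above */
	if (y > (127 << 5)) y = 127 << 5;
	return (int16_t)y;
}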
void spu_interpreter::SHLHI(SPUThread& CPU, spu_opcode_t op) { CPU.GPR[op.rt].vi = _mm_slli_epi16(CPU.GPR[op.ra].vi, op.si7 & 0x1f); }
void spu_interpreter::ROTHI(SPUThread& CPU, spu_opcode_t op) { const auto a = CPU.GPR[op.ra].vi; const s32 n = op.si7 & 0xf; CPU.GPR[op.rt].vi = _mm_or_si128(_mm_slli_epi16(a, n), _mm_srli_epi16(a, 16 - n)); }
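// SSE2 has no 16-bit rotate, so ROTHI composes a left shift with a right
// shift as above. The per-halfword scalar equivalent (illustrative helper):
#include <cstdint>

inline std::uint16_t rotl16_scalar(std::uint16_t v, unsigned n)
{
	n &= 0xf;
	return static_cast<std::uint16_t>((v << n) | (v >> ((16 - n) & 0xf)));
}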
int global_sse2_word(int queryLength, unsigned short *profile, const unsigned char *dbSeq, int dbLength, unsigned short gapOpen, unsigned short gapExtend, unsigned short ceiling, struct f_struct *f_str) { int i, j; int score; int scale; int temp; int distance; int offset; int position; int cmp; int iter; __m128i *pvH; __m128i *pvE; __m128i vE, vF, vH; __m128i vHNext; __m128i vFPrev; __m128i vGapOpen; __m128i vGapExtend; __m128i vCeiling; __m128i vScale; __m128i vScaleAmt; __m128i vScaleTmp; __m128i vTemp; __m128i vNull; __m128i *pvScore; scale = 0; iter = (queryLength + 7) / 8; offset = (queryLength - 1) % iter; position = 7 - (queryLength - 1) / iter; pvH = (__m128i *)f_str->workspace; pvE = pvH + iter; /* Load gap opening penalty to all elements of a constant */ vGapOpen = _mm_setzero_si128(); /* transfered from Apple Devel smith_waterman_sse2.c fix */ vGapOpen = _mm_insert_epi16 (vGapOpen, gapOpen, 0); vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0); vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0); /* Load gap extension penalty to all elements of a constant */ vGapExtend = _mm_setzero_si128(); /* transfered from Apple Devel smith_waterman_sse2.c fix */ vGapExtend = _mm_insert_epi16 (vGapExtend, gapExtend, 0); vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0); vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0); /* Generate the ceiling before scaling */ vTemp = _mm_setzero_si128(); /* transfered from Apple Devel smith_waterman_sse2.c fix */ vTemp = _mm_insert_epi16 (vTemp, ceiling, 0); vTemp = _mm_shufflelo_epi16 (vTemp, 0); vTemp = _mm_shuffle_epi32 (vTemp, 0); vCeiling = _mm_cmpeq_epi16 (vTemp, vTemp); vCeiling = _mm_srli_epi16 (vCeiling, 1); vCeiling = _mm_subs_epi16 (vCeiling, vTemp); vCeiling = _mm_subs_epi16 (vCeiling, vGapOpen); vNull = _mm_cmpeq_epi16 (vTemp, vTemp); vNull = _mm_slli_epi16 (vNull, 15); vScaleAmt = _mm_xor_si128 (vNull, vNull); /* Zero out the storage vector */ vTemp = _mm_adds_epi16 (vNull, vGapOpen); for (i = 0; i < iter; i++) { _mm_store_si128 (pvH + i, vTemp); _mm_store_si128 (pvE + i, vNull); } /* initialize F */ vF = vNull; vFPrev = vNull; /* load and scale H for the next round */ vTemp = _mm_srli_si128 (vGapOpen, 14); vH = _mm_load_si128 (pvH + iter - 1); vH = _mm_adds_epi16 (vH, vTemp); for (i = 0; i < dbLength; ++i) { /* fetch first data asap. 
*/ pvScore = (__m128i *) profile + dbSeq[i] * iter; vF = vNull; vH = _mm_max_epi16 (vH, vFPrev); for (j = 0; j < iter; j++) { /* correct H from the previous columns F */ vHNext = _mm_load_si128 (pvH + j); vHNext = _mm_max_epi16 (vHNext, vFPrev); /* load and correct E value */ vE = _mm_load_si128 (pvE + j); vTemp = _mm_subs_epi16 (vHNext, vGapOpen); vE = _mm_max_epi16 (vE, vTemp); _mm_store_si128 (pvE + j, vE); /* add score to vH */ vH = _mm_adds_epi16 (vH, *pvScore++); /* get max from vH, vE and vF */ vH = _mm_max_epi16 (vH, vE); vH = _mm_max_epi16 (vH, vF); _mm_store_si128 (pvH + j, vH); /* update vF value */ vH = _mm_subs_epi16 (vH, vGapOpen); vF = _mm_max_epi16 (vF, vH); /* load the next h values */ vH = vHNext; } /* check if we need to scale before the next round */ vTemp = _mm_cmpgt_epi16 (vF, vCeiling); cmp = _mm_movemask_epi8 (vTemp); /* broadcast F values */ vF = _mm_xor_si128 (vF, vNull); vTemp = _mm_slli_si128 (vF, 2); vTemp = _mm_subs_epu16 (vTemp, vScaleAmt); vF = max_epu16 (vF, vTemp); vTemp = _mm_slli_si128 (vF, 4); vScaleTmp = _mm_slli_si128 (vScaleAmt, 2); vScaleTmp = _mm_adds_epu16 (vScaleTmp, vScaleAmt); vTemp = _mm_subs_epu16 (vTemp, vScaleTmp); vF = max_epu16 (vF, vTemp); vTemp = _mm_slli_si128 (vScaleTmp, 4); vScaleTmp = _mm_adds_epu16 (vScaleTmp, vTemp); vTemp = _mm_slli_si128 (vF, 8); vTemp = _mm_subs_epu16 (vTemp, vScaleTmp); vF = max_epu16 (vF, vTemp); /* scale if necessary */ if (cmp != 0x0000) { __m128i vScale1; __m128i vScale2; vScale = _mm_slli_si128 (vF, 2); vScale = _mm_subs_epu16 (vScale, vGapOpen); vScale = _mm_subs_epu16 (vScale, vScaleAmt); vTemp = _mm_slli_si128 (vScale, 2); vTemp = _mm_subs_epu16 (vScale, vTemp); vScaleAmt = _mm_adds_epu16 (vScaleAmt, vTemp); vTemp = _mm_slli_si128 (vScale, 2); vTemp = _mm_subs_epu16 (vTemp, vScale); vScaleAmt = _mm_subs_epu16 (vScaleAmt, vTemp); /* rescale the previous F */ vF = _mm_subs_epu16 (vF, vScale); /* check if we can continue in signed 16-bits */ vTemp = _mm_xor_si128 (vF, vNull); vTemp = _mm_cmpgt_epi16 (vTemp, vCeiling); cmp = _mm_movemask_epi8 (vTemp); if (cmp != 0x0000) { return OVERFLOW_SCORE; } vTemp = _mm_adds_epi16 (vCeiling, vCeiling); vScale1 = _mm_subs_epu16 (vScale, vTemp); vScale2 = _mm_subs_epu16 (vScale, vScale1); /* scale all the vectors */ for (j = 0; j < iter; j++) { /* load H and E */ vH = _mm_load_si128 (pvH + j); vE = _mm_load_si128 (pvE + j); /* get max from vH, vE and vF */ vH = _mm_subs_epi16 (vH, vScale1); vH = _mm_subs_epi16 (vH, vScale2); vE = _mm_subs_epi16 (vE, vScale1); vE = _mm_subs_epi16 (vE, vScale2); /* save the H and E */ _mm_store_si128 (pvH + j, vH); _mm_store_si128 (pvE + j, vE); } vScale = vScaleAmt; for (j = 0; j < position; ++j) { vScale = _mm_slli_si128 (vScale, 2); } /* calculate the final scaling amount */ vTemp = _mm_xor_si128 (vTemp, vTemp); vScale1 = _mm_unpacklo_epi16 (vScale, vTemp); vScale2 = _mm_unpackhi_epi16 (vScale, vTemp); vScale = _mm_add_epi32 (vScale1, vScale2); vTemp = _mm_srli_si128 (vScale, 8); vScale = _mm_add_epi32 (vScale, vTemp); vTemp = _mm_srli_si128 (vScale, 4); vScale = _mm_add_epi32 (vScale, vTemp); scale = (int) (unsigned short) _mm_extract_epi16 (vScale, 0); temp = (int) (unsigned short) _mm_extract_epi16 (vScale, 1); scale = scale + (temp << 16); } /* scale the F value for the next round */ vFPrev = _mm_slli_si128 (vF, 2); vFPrev = _mm_subs_epu16 (vFPrev, vScaleAmt); vFPrev = _mm_xor_si128 (vFPrev, vNull); /* load and scale H for the next round */ vH = _mm_load_si128 (pvH + iter - 1); vH = _mm_xor_si128 (vH, vNull); vH = _mm_slli_si128 
(vH, 2); vH = _mm_subs_epu16 (vH, vScaleAmt); vH = _mm_insert_epi16 (vH, gapOpen, 0); vH = _mm_xor_si128 (vH, vNull); } vH = _mm_load_si128 (pvH + offset); vH = _mm_max_epi16 (vH, vFPrev); for (j = 0; j < position; ++j) { vH = _mm_slli_si128 (vH, 2); } score = (int) (signed short) _mm_extract_epi16 (vH, 7); score = score + SHORT_BIAS; /* return largest score */ distance = (queryLength + dbLength) * gapExtend; score = score - (gapOpen * 2) - distance + scale; return score; }
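/* max_epu16() used above is not a plain SSE2 intrinsic (_mm_max_epu16 needs
 * SSE4.1); on SSE2 it is commonly emulated with unsigned saturating
 * subtraction. A sketch of what such a helper presumably looks like: */
#include <emmintrin.h>

static inline __m128i max_epu16_sse2(__m128i a, __m128i b)
{
  /* subs_epu16(b, a) is (b - a) where b > a and 0 otherwise, so adding it
     back to a yields the unsigned 16-bit maximum of each lane */
  return _mm_adds_epu16(a, _mm_subs_epu16(b, a));
}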
}bool validate_utf8_sse(const char *src, size_t len) { const char *end = src + len; while (src + 16 < end) { __m128i chunk = _mm_loadu_si128((const __m128i *)(src)); int asciiMask = _mm_movemask_epi8(chunk); if (!asciiMask) { src += 16; continue; } __m128i chunk_signed = _mm_add_epi8(chunk, _mm_set1_epi8(0x80)); __m128i cond2 = _mm_cmplt_epi8(_mm_set1_epi8(0xc2 - 1 - 0x80), chunk_signed); __m128i state = _mm_set1_epi8((char)(0x0 | 0x80)); state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x2 | 0xc0)), cond2); __m128i cond3 = _mm_cmplt_epi8(_mm_set1_epi8(0xe0 - 1 - 0x80), chunk_signed); state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x3 | 0xe0)), cond3); __m128i mask3 = _mm_slli_si128(cond3, 1); __m128i cond4 = _mm_cmplt_epi8(_mm_set1_epi8(0xf0 - 1 - 0x80), chunk_signed); // Fall back to the scalar processing if (_mm_movemask_epi8(cond4)) { break; } __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7)); __m128i count_sub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1)); __m128i counts = _mm_add_epi8(count, _mm_slli_si128(count_sub1, 1)); __m128i shifts = count_sub1; shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1)); counts = _mm_add_epi8( counts, _mm_slli_si128(_mm_subs_epu8(counts, _mm_set1_epi8(0x2)), 2)); shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2)); if (asciiMask ^ _mm_movemask_epi8(_mm_cmpgt_epi8(counts, _mm_set1_epi8(0)))) return false; // error shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4)); if (_mm_movemask_epi8(_mm_cmpgt_epi8( _mm_sub_epi8(_mm_slli_si128(counts, 1), counts), _mm_set1_epi8(1)))) return false; // error shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8)); __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8)); shifts = _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2))); // <=1 chunk = _mm_andnot_si128(mask, chunk); // from now on, we only have usefull bits shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1), _mm_srli_si128(_mm_slli_epi16(shifts, 7), 1)); __m128i chunk_right = _mm_slli_si128(chunk, 1); __m128i chunk_low = _mm_blendv_epi8( chunk, _mm_or_si128(chunk, _mm_and_si128(_mm_slli_epi16(chunk_right, 6), _mm_set1_epi8(0xc0))), _mm_cmpeq_epi8(counts, _mm_set1_epi8(1))); __m128i chunk_high = _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2))); shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2), _mm_srli_si128(_mm_slli_epi16(shifts, 6), 2)); chunk_high = _mm_srli_epi32(chunk_high, 2); shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4), _mm_srli_si128(_mm_slli_epi16(shifts, 5), 4)); chunk_high = _mm_or_si128( chunk_high, _mm_and_si128(_mm_and_si128(_mm_slli_epi32(chunk_right, 4), _mm_set1_epi8(0xf0)), mask3)); int c = _mm_extract_epi16(counts, 7); int source_advance = !(c & 0x0200) ? 16 : !(c & 0x02) ? 
15 : 14; __m128i high_bits = _mm_and_si128(chunk_high, _mm_set1_epi8(0xf8)); if (!_mm_testz_si128( mask3, _mm_or_si128(_mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0x00)), _mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0xd8))))) return false; shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 8), _mm_srli_si128(_mm_slli_epi16(shifts, 4), 8)); chunk_high = _mm_slli_si128(chunk_high, 1); __m128i shuf = _mm_add_epi8(shifts, _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); chunk_low = _mm_shuffle_epi8(chunk_low, shuf); chunk_high = _mm_shuffle_epi8(chunk_high, shuf); __m128i utf16_low = _mm_unpacklo_epi8(chunk_low, chunk_high); __m128i utf16_high = _mm_unpackhi_epi8(chunk_low, chunk_high); if (_mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_high, 8, _SIDD_UWORD_OPS | _SIDD_CMP_RANGES) | _mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_low, 8, _SIDD_UWORD_OPS | _SIDD_CMP_RANGES)) { return false; } src += source_advance; } return validate_utf8(src, end - src); }
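// Example driver (hypothetical): the SSE path above only consumes whole
// 16-byte chunks and defers the tail, and any chunk containing 4-byte
// sequences, to the scalar validate_utf8() fallback.
#include <string.h>
#include <stdbool.h>

static bool is_valid_utf8_string(const char *s)
{
    return validate_utf8_sse(s, strlen(s));
}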
void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, // as the first pass results are transposed, we tranpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). const int stride = pitch >> 1; int pass; // We need an intermediate buffer between passes. int16_t intermediate[256]; int16_t *in = input; int16_t *out = intermediate; // Constants // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i kOne = _mm_set1_epi16(1); // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { // We process eight columns (transposed rows in second pass) at a time. int column_start; for (column_start = 0; column_start < 16; column_start += 8) { __m128i in00, in01, in02, in03, in04, in05, in06, in07; __m128i in08, in09, in10, in11, in12, in13, in14, in15; __m128i input0, input1, input2, input3, input4, input5, input6, input7; __m128i step1_0, step1_1, step1_2, step1_3; __m128i step1_4, step1_5, step1_6, step1_7; __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; __m128i step3_0, step3_1, step3_2, step3_3; __m128i step3_4, step3_5, step3_6, step3_7; __m128i res00, res01, res02, res03, res04, res05, res06, res07; __m128i res08, res09, res10, res11, res12, res13, res14, res15; // Load and pre-condition input. 
if (0 == pass) { in00 = _mm_loadu_si128((const __m128i *)(in + 0 * stride)); in01 = _mm_loadu_si128((const __m128i *)(in + 1 * stride)); in02 = _mm_loadu_si128((const __m128i *)(in + 2 * stride)); in03 = _mm_loadu_si128((const __m128i *)(in + 3 * stride)); in04 = _mm_loadu_si128((const __m128i *)(in + 4 * stride)); in05 = _mm_loadu_si128((const __m128i *)(in + 5 * stride)); in06 = _mm_loadu_si128((const __m128i *)(in + 6 * stride)); in07 = _mm_loadu_si128((const __m128i *)(in + 7 * stride)); in08 = _mm_loadu_si128((const __m128i *)(in + 8 * stride)); in09 = _mm_loadu_si128((const __m128i *)(in + 9 * stride)); in10 = _mm_loadu_si128((const __m128i *)(in + 10 * stride)); in11 = _mm_loadu_si128((const __m128i *)(in + 11 * stride)); in12 = _mm_loadu_si128((const __m128i *)(in + 12 * stride)); in13 = _mm_loadu_si128((const __m128i *)(in + 13 * stride)); in14 = _mm_loadu_si128((const __m128i *)(in + 14 * stride)); in15 = _mm_loadu_si128((const __m128i *)(in + 15 * stride)); // x = x << 2 in00 = _mm_slli_epi16(in00, 2); in01 = _mm_slli_epi16(in01, 2); in02 = _mm_slli_epi16(in02, 2); in03 = _mm_slli_epi16(in03, 2); in04 = _mm_slli_epi16(in04, 2); in05 = _mm_slli_epi16(in05, 2); in06 = _mm_slli_epi16(in06, 2); in07 = _mm_slli_epi16(in07, 2); in08 = _mm_slli_epi16(in08, 2); in09 = _mm_slli_epi16(in09, 2); in10 = _mm_slli_epi16(in10, 2); in11 = _mm_slli_epi16(in11, 2); in12 = _mm_slli_epi16(in12, 2); in13 = _mm_slli_epi16(in13, 2); in14 = _mm_slli_epi16(in14, 2); in15 = _mm_slli_epi16(in15, 2); } else { in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 16)); in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 16)); in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 16)); in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 16)); in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 16)); in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 16)); in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 16)); in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 16)); in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 16)); in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 16)); in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 16)); in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 16)); in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 16)); in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 16)); in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 16)); in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 16)); // x = (x + 1) >> 2 in00 = _mm_add_epi16(in00, kOne); in01 = _mm_add_epi16(in01, kOne); in02 = _mm_add_epi16(in02, kOne); in03 = _mm_add_epi16(in03, kOne); in04 = _mm_add_epi16(in04, kOne); in05 = _mm_add_epi16(in05, kOne); in06 = _mm_add_epi16(in06, kOne); in07 = _mm_add_epi16(in07, kOne); in08 = _mm_add_epi16(in08, kOne); in09 = _mm_add_epi16(in09, kOne); in10 = _mm_add_epi16(in10, kOne); in11 = _mm_add_epi16(in11, kOne); in12 = _mm_add_epi16(in12, kOne); in13 = _mm_add_epi16(in13, kOne); in14 = _mm_add_epi16(in14, kOne); in15 = _mm_add_epi16(in15, kOne); in00 = _mm_srai_epi16(in00, 2); in01 = _mm_srai_epi16(in01, 2); in02 = _mm_srai_epi16(in02, 2); in03 = _mm_srai_epi16(in03, 2); in04 = _mm_srai_epi16(in04, 2); in05 = _mm_srai_epi16(in05, 2); in06 = _mm_srai_epi16(in06, 2); in07 = _mm_srai_epi16(in07, 2); in08 = _mm_srai_epi16(in08, 2); in09 = _mm_srai_epi16(in09, 2); in10 = _mm_srai_epi16(in10, 2); in11 = _mm_srai_epi16(in11, 2); in12 = _mm_srai_epi16(in12, 2); in13 = _mm_srai_epi16(in13, 2); in14 = _mm_srai_epi16(in14, 2); in15 = _mm_srai_epi16(in15, 2); } in += 8; // Calculate input for the first 8 results. 
{ input0 = _mm_add_epi16(in00, in15); input1 = _mm_add_epi16(in01, in14); input2 = _mm_add_epi16(in02, in13); input3 = _mm_add_epi16(in03, in12); input4 = _mm_add_epi16(in04, in11); input5 = _mm_add_epi16(in05, in10); input6 = _mm_add_epi16(in06, in09); input7 = _mm_add_epi16(in07, in08); } // Calculate input for the next 8 results. { step1_0 = _mm_sub_epi16(in07, in08); step1_1 = _mm_sub_epi16(in06, in09); step1_2 = _mm_sub_epi16(in05, in10); step1_3 = _mm_sub_epi16(in04, in11); step1_4 = _mm_sub_epi16(in03, in12); step1_5 = _mm_sub_epi16(in02, in13); step1_6 = _mm_sub_epi16(in01, in14); step1_7 = _mm_sub_epi16(in00, in15); } // Work on the first eight values; fdct8_1d(input, even_results); { // Add/substract const __m128i q0 = _mm_add_epi16(input0, input7); const __m128i q1 = _mm_add_epi16(input1, input6); const __m128i q2 = _mm_add_epi16(input2, input5); const __m128i q3 = _mm_add_epi16(input3, input4); const __m128i q4 = _mm_sub_epi16(input3, input4); const __m128i q5 = _mm_sub_epi16(input2, input5); const __m128i q6 = _mm_sub_epi16(input1, input6); const __m128i q7 = _mm_sub_epi16(input0, input7); // Work on first four results { // Add/substract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); const __m128i r3 = _mm_sub_epi16(q0, q3); // Interleave to do the multiply by constants which gets us // into 32 bits. const __m128i t0 = _mm_unpacklo_epi16(r0, r1); const __m128i t1 = _mm_unpackhi_epi16(r0, r1); const __m128i t2 = _mm_unpacklo_epi16(r2, r3); const __m128i t3 = _mm_unpackhi_epi16(r2, r3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res00 = _mm_packs_epi32(w0, w1); res08 = _mm_packs_epi32(w2, w3); res04 = _mm_packs_epi32(w4, w5); res12 = _mm_packs_epi32(w6, w7); } // Work on next four results { // Interleave to do the multiply by constants which gets us // into 32 bits. 
const __m128i d0 = _mm_unpacklo_epi16(q6, q5); const __m128i d1 = _mm_unpackhi_epi16(q6, q5); const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); // dct_const_round_shift const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); // Add/substract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = _mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); const __m128i x3 = _mm_add_epi16(q7, r1); // Interleave to do the multiply by constants which gets us // into 32 bits. const __m128i t0 = _mm_unpacklo_epi16(x0, x3); const __m128i t1 = _mm_unpackhi_epi16(x0, x3); const __m128i t2 = _mm_unpacklo_epi16(x1, x2); const __m128i t3 = _mm_unpackhi_epi16(x1, x2); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res02 = _mm_packs_epi32(w0, w1); res14 = _mm_packs_epi32(w2, w3); res10 = _mm_packs_epi32(w4, w5); res06 = _mm_packs_epi32(w6, w7); } } // Work on the next eight values; step1 -> odd_results { // step 2 { const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = 
_mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_2 = _mm_packs_epi32(w0, w1); step2_3 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_5 = _mm_packs_epi32(w0, w1); step2_4 = _mm_packs_epi32(w2, w3); } // step 3 { step3_0 = _mm_add_epi16(step1_0, step2_3); step3_1 = _mm_add_epi16(step1_1, step2_2); step3_2 = _mm_sub_epi16(step1_1, step2_2); step3_3 = _mm_sub_epi16(step1_0, step2_3); step3_4 = _mm_sub_epi16(step1_7, step2_4); step3_5 = _mm_sub_epi16(step1_6, step2_5); step3_6 = _mm_add_epi16(step1_6, step2_5); step3_7 = _mm_add_epi16(step1_7, step2_4); } // step 4 { const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_1 = _mm_packs_epi32(w0, w1); step2_2 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, 
k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_6 = _mm_packs_epi32(w0, w1); step2_5 = _mm_packs_epi32(w2, w3); } // step 5 { step1_0 = _mm_add_epi16(step3_0, step2_1); step1_1 = _mm_sub_epi16(step3_0, step2_1); step1_2 = _mm_sub_epi16(step3_3, step2_2); step1_3 = _mm_add_epi16(step3_3, step2_2); step1_4 = _mm_add_epi16(step3_4, step2_5); step1_5 = _mm_sub_epi16(step3_4, step2_5); step1_6 = _mm_sub_epi16(step3_7, step2_6); step1_7 = _mm_add_epi16(step3_7, step2_6); } // step 6 { const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res01 = _mm_packs_epi32(w0, w1); res09 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res05 = _mm_packs_epi32(w0, w1); res13 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = 
_mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res11 = _mm_packs_epi32(w0, w1); res03 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res15 = _mm_packs_epi32(w0, w1); res07 = _mm_packs_epi32(w2, w3); } } // Transpose the results, do it as two 8x8 transposes. { // 00 01 02 03 04 05 06 07 // 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 // 30 31 32 33 34 35 36 37 // 40 41 42 43 44 45 46 47 // 50 51 52 53 54 55 56 57 // 60 61 62 63 64 65 66 67 // 70 71 72 73 74 75 76 77 const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01); const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03); const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01); const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03); const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05); const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07); const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05); const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 // 04 14 05 15 06 16 07 17 // 24 34 25 35 26 36 27 37 // 40 50 41 51 42 52 43 53 // 60 70 61 71 62 72 63 73 // 54 54 55 55 56 56 57 57 // 64 74 65 75 66 76 67 77 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); // 00 10 20 30 01 11 21 31 // 40 50 60 70 41 51 61 71 // 02 12 22 32 03 13 23 33 // 42 52 62 72 43 53 63 73 // 04 14 24 34 05 15 21 36 // 44 54 64 74 45 55 61 76 // 06 16 26 36 07 17 27 37 // 46 56 66 76 47 57 67 77 const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 // 07 17 
27 37 47 57 67 77 _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0); _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1); _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2); _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3); _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4); _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5); _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6); _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7); } { // 00 01 02 03 04 05 06 07 // 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 // 30 31 32 33 34 35 36 37 // 40 41 42 43 44 45 46 47 // 50 51 52 53 54 55 56 57 // 60 61 62 63 64 65 66 67 // 70 71 72 73 74 75 76 77 const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09); const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11); const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09); const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11); const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13); const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15); const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13); const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 // 04 14 05 15 06 16 07 17 // 24 34 25 35 26 36 27 37 // 40 50 41 51 42 52 43 53 // 60 70 61 71 62 72 63 73 // 54 54 55 55 56 56 57 57 // 64 74 65 75 66 76 67 77 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); // 00 10 20 30 01 11 21 31 // 40 50 60 70 41 51 61 71 // 02 12 22 32 03 13 23 33 // 42 52 62 72 43 53 63 73 // 04 14 24 34 05 15 21 36 // 44 54 64 74 45 55 61 76 // 06 16 26 36 07 17 27 37 // 46 56 66 76 47 57 67 77 const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 // 07 17 27 37 47 57 67 77 // Store results _mm_storeu_si128((__m128i *)(out + 8 + 0 * 16), tr2_0); _mm_storeu_si128((__m128i *)(out + 8 + 1 * 16), tr2_1); _mm_storeu_si128((__m128i *)(out + 8 + 2 * 16), tr2_2); _mm_storeu_si128((__m128i *)(out + 8 + 3 * 16), tr2_3); _mm_storeu_si128((__m128i *)(out + 8 + 4 * 16), tr2_4); _mm_storeu_si128((__m128i *)(out + 8 + 5 * 16), tr2_5); _mm_storeu_si128((__m128i *)(out + 8 + 6 * 16), tr2_6); _mm_storeu_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); } out += 8*16; } // Setup in/out for next pass. in = intermediate; out = output; } }
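// A minimal, self-contained sketch of the 16-bit 8x8 transpose idiom used in
// the two blocks above (interleave at 16-, then 32-, then 64-bit granularity).
// Function and variable names here are illustrative, not from the original.
#include <emmintrin.h>

static void transpose_8x8_epi16(__m128i r[8]) {
  // Pass 1: interleave 16-bit elements of row pairs.
  const __m128i a0 = _mm_unpacklo_epi16(r[0], r[1]);  // 00 10 01 11 02 12 03 13
  const __m128i a1 = _mm_unpacklo_epi16(r[2], r[3]);
  const __m128i a2 = _mm_unpackhi_epi16(r[0], r[1]);
  const __m128i a3 = _mm_unpackhi_epi16(r[2], r[3]);
  const __m128i a4 = _mm_unpacklo_epi16(r[4], r[5]);
  const __m128i a5 = _mm_unpacklo_epi16(r[6], r[7]);
  const __m128i a6 = _mm_unpackhi_epi16(r[4], r[5]);
  const __m128i a7 = _mm_unpackhi_epi16(r[6], r[7]);
  // Pass 2: interleave 32-bit pairs.
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);      // 00 10 20 30 01 11 21 31
  const __m128i b1 = _mm_unpackhi_epi32(a0, a1);
  const __m128i b2 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
  const __m128i b4 = _mm_unpacklo_epi32(a4, a5);
  const __m128i b5 = _mm_unpackhi_epi32(a4, a5);
  const __m128i b6 = _mm_unpacklo_epi32(a6, a7);
  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
  // Pass 3: interleave 64-bit halves; each row now holds one column.
  r[0] = _mm_unpacklo_epi64(b0, b4);                  // 00 10 20 30 40 50 60 70
  r[1] = _mm_unpackhi_epi64(b0, b4);
  r[2] = _mm_unpacklo_epi64(b1, b5);
  r[3] = _mm_unpackhi_epi64(b1, b5);
  r[4] = _mm_unpacklo_epi64(b2, b6);
  r[5] = _mm_unpackhi_epi64(b2, b6);
  r[6] = _mm_unpacklo_epi64(b3, b7);
  r[7] = _mm_unpackhi_epi64(b3, b7);
}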
void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, // as the first pass results are transposed, we transpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). const int stride = pitch >> 1; int pass; // Constants // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); const __m128i kOne = _mm_set1_epi16(1); __m128i in0, in1, in2, in3; // Load inputs. { in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); // x = x << 4 in0 = _mm_slli_epi16(in0, 4); in1 = _mm_slli_epi16(in1, 4); in2 = _mm_slli_epi16(in2, 4); in3 = _mm_slli_epi16(in3, 4); // if (i == 0 && input[0]) input[0] += 1; { // The mask will only contain whether the first value is zero, all // other comparisons will fail as something shifted by 4 (above << 4) // can never be equal to one. To increment in the non-zero case, we // add the mask and one for the first element: // - if zero, mask = -1, v = v - 1 + 1 = v // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); in0 = _mm_add_epi16(in0, mask); in0 = _mm_add_epi16(in0, k__nonzero_bias_b); } } // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { // Transform 1/2: Add/subtract const __m128i r0 = _mm_add_epi16(in0, in3); const __m128i r1 = _mm_add_epi16(in1, in2); const __m128i r2 = _mm_sub_epi16(in1, in2); const __m128i r3 = _mm_sub_epi16(in0, in3); // Transform 1/2: Interleave to do the multiply by constants which gets us // into 32 bits.
const __m128i t0 = _mm_unpacklo_epi16(r0, r1); const __m128i t2 = _mm_unpacklo_epi16(r2, r3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); // Combine and transpose const __m128i res0 = _mm_packs_epi32(w0, w2); const __m128i res1 = _mm_packs_epi32(w4, w6); // 00 01 02 03 20 21 22 23 // 10 11 12 13 30 31 32 33 const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); in2 = _mm_unpackhi_epi32(tr0_0, tr0_1); // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1 // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3 if (0 == pass) { // Extract values in the high part for second pass as transform code // only uses the first four values. in1 = _mm_unpackhi_epi64(in0, in0); in3 = _mm_unpackhi_epi64(in2, in2); } else { // Post-condition output and store it (v + 1) >> 2, taking advantage // of the fact 1/3 are stored just after 0/2. __m128i out01 = _mm_add_epi16(in0, kOne); __m128i out23 = _mm_add_epi16(in2, kOne); out01 = _mm_srai_epi16(out01, 2); out23 = _mm_srai_epi16(out23, 2); _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); } } }
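// A scalar reference sketch of the "madd, add rounding constant, arithmetic
// shift" pattern used above. The constants are assumptions: the usual 14-bit
// fixed-point cospi tables (so the rounding term is 1 << 13); they are named
// REF_* here to avoid clashing with the real DCT_CONST_* macros.
#include <stdint.h>

#define REF_DCT_CONST_BITS 14
#define REF_DCT_CONST_ROUNDING (1 << (REF_DCT_CONST_BITS - 1))

static int16_t ref_dct_const_round_shift(int32_t input) {
  return (int16_t)((input + REF_DCT_CONST_ROUNDING) >> REF_DCT_CONST_BITS);
}

// Even-output butterfly of the 4-point forward DCT as computed per column
// above: out0 = round((a + b) * cospi_16), out2 = round((a - b) * cospi_16).
static void ref_fdct4_even(int32_t a, int32_t b, int32_t cospi_16,
                           int16_t *out0, int16_t *out2) {
  *out0 = ref_dct_const_round_shift((a + b) * cospi_16);
  *out2 = ref_dct_const_round_shift((a - b) * cospi_16);
}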
void ulsch_channel_compensation(int **rxdataF_ext, int **ul_ch_estimates_ext, int **ul_ch_mag, int **ul_ch_magb, int **rxdataF_comp, LTE_DL_FRAME_PARMS *frame_parms, unsigned char symbol, unsigned char Qm, unsigned short nb_rb, unsigned char output_shift) { unsigned short rb; __m128i *ul_ch128,*ul_ch_mag128,*ul_ch_mag128b,*rxdataF128,*rxdataF_comp128; unsigned char aarx;//,symbol_mod; // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; #ifndef __SSE3__ zeroU = _mm_xor_si128(zeroU,zeroU); #endif // printf("comp: symbol %d\n",symbol); if (Qm == 4) QAM_amp128U = _mm_set1_epi16(QAM16_n1); else if (Qm == 6) { QAM_amp128U = _mm_set1_epi16(QAM64_n1); QAM_amp128bU = _mm_set1_epi16(QAM64_n2); } for (aarx=0;aarx<frame_parms->nb_antennas_rx;aarx++) { ul_ch128 = (__m128i *)&ul_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128 = (__m128i *)&ul_ch_mag[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128b = (__m128i *)&ul_ch_magb[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128 = (__m128i *)&rxdataF_comp[aarx][symbol*frame_parms->N_RB_DL*12]; for (rb=0;rb<nb_rb;rb++) { // printf("comp: symbol %d rb %d\n",symbol,rb); #ifdef OFDMA_ULSCH if (Qm>2) { // get channel amplitude if not QPSK mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); mmtmpU1 = _mm_madd_epi16(ul_ch128[1],ul_ch128[1]); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift); mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1); ul_ch_mag128[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b[0] = ul_ch_mag128[0]; ul_ch_mag128[0] = _mm_mulhi_epi16(ul_ch_mag128[0],QAM_amp128U); ul_ch_mag128[0] = _mm_slli_epi16(ul_ch_mag128[0],2); // 2 to compensate the scale channel estimate ul_ch_mag128[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b[1] = ul_ch_mag128[1]; ul_ch_mag128[1] = _mm_mulhi_epi16(ul_ch_mag128[1],QAM_amp128U); ul_ch_mag128[1] = _mm_slli_epi16(ul_ch_mag128[1],2); // 2 to compensate the scale channel estimate mmtmpU0 = _mm_madd_epi16(ul_ch128[2],ul_ch128[2]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0); ul_ch_mag128[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1); ul_ch_mag128b[2] = ul_ch_mag128[2]; ul_ch_mag128[2] = _mm_mulhi_epi16(ul_ch_mag128[2],QAM_amp128U); ul_ch_mag128[2] = _mm_slli_epi16(ul_ch_mag128[2],2); // 2 to compensate the scale channel estimate ul_ch_mag128b[0] = _mm_mulhi_epi16(ul_ch_mag128b[0],QAM_amp128bU); ul_ch_mag128b[0] = _mm_slli_epi16(ul_ch_mag128b[0],2); // 2 to compensate the scale channel estimate ul_ch_mag128b[1] = _mm_mulhi_epi16(ul_ch_mag128b[1],QAM_amp128bU); ul_ch_mag128b[1] = _mm_slli_epi16(ul_ch_mag128b[1],2); // 2 to compensate the scale channel estimate ul_ch_mag128b[2] = _mm_mulhi_epi16(ul_ch_mag128b[2],QAM_amp128bU); ul_ch_mag128b[2] = _mm_slli_epi16(ul_ch_mag128b[2],2);// 2 to compensate the scale channel estimate } #else mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift-1); mmtmpU1 = _mm_madd_epi16(ul_ch128[1],ul_ch128[1]); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift-1); mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1); ul_ch_mag128[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0); mmtmpU0 = _mm_madd_epi16(ul_ch128[2],ul_ch128[2]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift-1); mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0); ul_ch_mag128[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1); // printf("comp: 
symbol %d rb %d => %d,%d,%d\n",symbol,rb,*((short*)&ul_ch_mag128[0]),*((short*)&ul_ch_mag128[1]),*((short*)&ul_ch_mag128[2])); #endif // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128[0],rxdataF128[0]); // print_ints("re",&mmtmpU0); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[0],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]); // print_ints("im",&mmtmpU1); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[0]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); // print_ints("re(shift)",&mmtmpU0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift); // print_ints("im(shift)",&mmtmpU1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); // print_ints("c0",&mmtmpU2); // print_ints("c1",&mmtmpU3); rxdataF_comp128[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[0]); // print_shorts("ch:",ul_ch128[0]); // print_shorts("pack:",rxdataF_comp128[0]); // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128[1],rxdataF128[1]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[1],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[1]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[1]); // print_shorts("ch:",ul_ch128[1]); // print_shorts("pack:",rxdataF_comp128[1]); // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128[2],rxdataF128[2]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[2],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[2]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[2]); // print_shorts("ch:",ul_ch128[2]); // print_shorts("pack:",rxdataF_comp128[2]); ul_ch128+=3; ul_ch_mag128+=3; ul_ch_mag128b+=3; rxdataF128+=3; rxdataF_comp128+=3; } } _mm_empty(); _m_empty(); }
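// Scalar reference sketch of the "multiply by conjugated channel" pattern
// above (madd for the real part; swap, sign-flip and madd for the imaginary
// part). Interleaved int16 I/Q layout is assumed; names are illustrative.
#include <stdint.h>

static void ref_conj_channel_comp(const int16_t *ch,  // channel estimate, I/Q pairs
                                  const int16_t *rx,  // received samples, I/Q pairs
                                  int16_t *out,       // compensated output, I/Q pairs
                                  int num_cplx, int output_shift) {
  for (int k = 0; k < num_cplx; k++) {
    int32_t hr = ch[2 * k], hi = ch[2 * k + 1];
    int32_t yr = rx[2 * k], yi = rx[2 * k + 1];
    int32_t re = (hr * yr + hi * yi) >> output_shift;  // _mm_madd_epi16 path
    int32_t im = (hr * yi - hi * yr) >> output_shift;  // shuffle + sign + madd path
    out[2 * k]     = (int16_t)re;  // _mm_packs_epi32 would additionally saturate
    out[2 * k + 1] = (int16_t)im;
  }
}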
void ulsch_channel_compensation_alamouti(int **rxdataF_ext, // For Distributed Alamouti Combining int **ul_ch_estimates_ext_0, int **ul_ch_estimates_ext_1, int **ul_ch_mag_0, int **ul_ch_magb_0, int **ul_ch_mag_1, int **ul_ch_magb_1, int **rxdataF_comp_0, int **rxdataF_comp_1, LTE_DL_FRAME_PARMS *frame_parms, unsigned char symbol, unsigned char Qm, unsigned short nb_rb, unsigned char output_shift_0, unsigned char output_shift_1) { unsigned short rb; __m128i *ul_ch128_0,*ul_ch128_1,*ul_ch_mag128_0,*ul_ch_mag128_1,*ul_ch_mag128b_0,*ul_ch_mag128b_1,*rxdataF128,*rxdataF_comp128_0,*rxdataF_comp128_1; unsigned char aarx;//,symbol_mod; // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; #ifndef __SSE3__ zeroU = _mm_xor_si128(zeroU,zeroU); #endif // printf("comp: symbol %d\n",symbol); if (Qm == 4) { QAM_amp128U_0 = _mm_set1_epi16(QAM16_n1); QAM_amp128U_1 = _mm_set1_epi16(QAM16_n1); } else if (Qm == 6) { QAM_amp128U_0 = _mm_set1_epi16(QAM64_n1); QAM_amp128bU_0 = _mm_set1_epi16(QAM64_n2); QAM_amp128U_1 = _mm_set1_epi16(QAM64_n1); QAM_amp128bU_1 = _mm_set1_epi16(QAM64_n2); } for (aarx=0;aarx<frame_parms->nb_antennas_rx;aarx++) { ul_ch128_0 = (__m128i *)&ul_ch_estimates_ext_0[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128_0 = (__m128i *)&ul_ch_mag_0[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128b_0 = (__m128i *)&ul_ch_magb_0[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch128_1 = (__m128i *)&ul_ch_estimates_ext_1[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128_1 = (__m128i *)&ul_ch_mag_1[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128b_1 = (__m128i *)&ul_ch_magb_1[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_0 = (__m128i *)&rxdataF_comp_0[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_1 = (__m128i *)&rxdataF_comp_1[aarx][symbol*frame_parms->N_RB_DL*12]; for (rb=0;rb<nb_rb;rb++) { // printf("comp: symbol %d rb %d\n",symbol,rb); if (Qm>2) { // get channel amplitude if not QPSK mmtmpU0 = _mm_madd_epi16(ul_ch128_0[0],ul_ch128_0[0]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_madd_epi16(ul_ch128_0[1],ul_ch128_0[1]); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1); ul_ch_mag128_0[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_0[0] = ul_ch_mag128_0[0]; ul_ch_mag128_0[0] = _mm_mulhi_epi16(ul_ch_mag128_0[0],QAM_amp128U_0); ul_ch_mag128_0[0] = _mm_slli_epi16(ul_ch_mag128_0[0],2); // 2 to compensate the scale channel estimate ul_ch_mag128_0[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_0[1] = ul_ch_mag128_0[1]; ul_ch_mag128_0[1] = _mm_mulhi_epi16(ul_ch_mag128_0[1],QAM_amp128U_0); ul_ch_mag128_0[1] = _mm_slli_epi16(ul_ch_mag128_0[1],2); // 2 to scale compensate the scale channel estimate mmtmpU0 = _mm_madd_epi16(ul_ch128_0[2],ul_ch128_0[2]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0); ul_ch_mag128_0[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1); ul_ch_mag128b_0[2] = ul_ch_mag128_0[2]; ul_ch_mag128_0[2] = _mm_mulhi_epi16(ul_ch_mag128_0[2],QAM_amp128U_0); ul_ch_mag128_0[2] = _mm_slli_epi16(ul_ch_mag128_0[2],2); // 2 to scale compensate the scale channel estimat ul_ch_mag128b_0[0] = _mm_mulhi_epi16(ul_ch_mag128b_0[0],QAM_amp128bU_0); ul_ch_mag128b_0[0] = _mm_slli_epi16(ul_ch_mag128b_0[0],2); // 2 to scale compensate the scale channel estima ul_ch_mag128b_0[1] = _mm_mulhi_epi16(ul_ch_mag128b_0[1],QAM_amp128bU_0); ul_ch_mag128b_0[1] = 
_mm_slli_epi16(ul_ch_mag128b_0[1],2); // 2 to scale compensate the scale channel estima ul_ch_mag128b_0[2] = _mm_mulhi_epi16(ul_ch_mag128b_0[2],QAM_amp128bU_0); ul_ch_mag128b_0[2] = _mm_slli_epi16(ul_ch_mag128b_0[2],2); // 2 to scale compensate the scale channel estima mmtmpU0 = _mm_madd_epi16(ul_ch128_1[0],ul_ch128_1[0]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_madd_epi16(ul_ch128_1[1],ul_ch128_1[1]); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1); ul_ch_mag128_1[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_1[0] = ul_ch_mag128_1[0]; ul_ch_mag128_1[0] = _mm_mulhi_epi16(ul_ch_mag128_1[0],QAM_amp128U_1); ul_ch_mag128_1[0] = _mm_slli_epi16(ul_ch_mag128_1[0],2); // 2 to compensate the scale channel estimate ul_ch_mag128_1[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_1[1] = ul_ch_mag128_1[1]; ul_ch_mag128_1[1] = _mm_mulhi_epi16(ul_ch_mag128_1[1],QAM_amp128U_1); ul_ch_mag128_1[1] = _mm_slli_epi16(ul_ch_mag128_1[1],2); // 2 to scale compensate the scale channel estimate mmtmpU0 = _mm_madd_epi16(ul_ch128_1[2],ul_ch128_1[2]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0); ul_ch_mag128_1[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1); ul_ch_mag128b_1[2] = ul_ch_mag128_1[2]; ul_ch_mag128_1[2] = _mm_mulhi_epi16(ul_ch_mag128_1[2],QAM_amp128U_0); ul_ch_mag128_1[2] = _mm_slli_epi16(ul_ch_mag128_1[2],2); // 2 to scale compensate the scale channel estimat ul_ch_mag128b_1[0] = _mm_mulhi_epi16(ul_ch_mag128b_1[0],QAM_amp128bU_1); ul_ch_mag128b_1[0] = _mm_slli_epi16(ul_ch_mag128b_1[0],2); // 2 to scale compensate the scale channel estima ul_ch_mag128b_1[1] = _mm_mulhi_epi16(ul_ch_mag128b_1[1],QAM_amp128bU_1); ul_ch_mag128b_1[1] = _mm_slli_epi16(ul_ch_mag128b_1[1],2); // 2 to scale compensate the scale channel estima ul_ch_mag128b_1[2] = _mm_mulhi_epi16(ul_ch_mag128b_1[2],QAM_amp128bU_1); ul_ch_mag128b_1[2] = _mm_slli_epi16(ul_ch_mag128b_1[2],2); // 2 to scale compensate the scale channel estima } /************************For Computing (y)*(h0*)********************************************/ // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128_0[0],rxdataF128[0]); // print_ints("re",&mmtmpU0); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[0],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]); // print_ints("im",&mmtmpU1); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[0]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); // print_ints("re(shift)",&mmtmpU0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); // print_ints("im(shift)",&mmtmpU1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); // print_ints("c0",&mmtmpU2); // print_ints("c1",&mmtmpU3); rxdataF_comp128_0[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[0]); // print_shorts("ch:",ul_ch128_0[0]); // print_shorts("pack:",rxdataF_comp128_0[0]); // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128_0[1],rxdataF128[1]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[1],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[1]); // mmtmpU1 
contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_0[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[1]); // print_shorts("ch:",ul_ch128_0[1]); // print_shorts("pack:",rxdataF_comp128_0[1]); // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128_0[2],rxdataF128[2]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[2],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[2]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_0[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[2]); // print_shorts("ch:",ul_ch128_0[2]); // print_shorts("pack:",rxdataF_comp128_0[2]); /*************************For Computing (y*)*(h1)************************************/ // multiply by conjugated signal mmtmpU0 = _mm_madd_epi16(ul_ch128_1[0],rxdataF128[0]); // print_ints("re",&mmtmpU0); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[0],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]); // print_ints("im",&mmtmpU1); mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[0]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); // print_ints("re(shift)",&mmtmpU0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); // print_ints("im(shift)",&mmtmpU1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); // print_ints("c0",&mmtmpU2); // print_ints("c1",&mmtmpU3); rxdataF_comp128_1[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[0]); // print_shorts("ch_conjugate:",ul_ch128_1[0]); // print_shorts("pack:",rxdataF_comp128_1[0]); // multiply by conjugated signal mmtmpU0 = _mm_madd_epi16(ul_ch128_1[1],rxdataF128[1]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[1],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[1]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_1[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[1]); // print_shorts("ch_conjugate:",ul_ch128_1[1]); // print_shorts("pack:",rxdataF_comp128_1[1]); // multiply by conjugated signal mmtmpU0 = _mm_madd_epi16(ul_ch128_1[2],rxdataF128[2]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[2],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = 
_mm_madd_epi16(mmtmpU1,ul_ch128_1[2]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_1[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[2]); // print_shorts("ch_conjugate:",ul_ch128_0[2]); // print_shorts("pack:",rxdataF_comp128_1[2]); ul_ch128_0+=3; ul_ch_mag128_0+=3; ul_ch_mag128b_0+=3; ul_ch128_1+=3; ul_ch_mag128_1+=3; ul_ch_mag128b_1+=3; rxdataF128+=3; rxdataF_comp128_0+=3; rxdataF_comp128_1+=3; } } _mm_empty(); _m_empty(); }
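// Scalar sketch of the two Alamouti combining branches computed above for a
// single complex sample: branch 0 forms conj(h0) * y, branch 1 forms
// h1 * conj(y). Shifts and layouts mirror the vector code; names are
// illustrative only.
#include <stdint.h>

static void ref_alamouti_branches(int16_t h0r, int16_t h0i,
                                  int16_t h1r, int16_t h1i,
                                  int16_t yr, int16_t yi,
                                  int shift0, int shift1,
                                  int16_t z0[2], int16_t z1[2]) {
  // (y) * (h0*)
  z0[0] = (int16_t)(((int32_t)h0r * yr + (int32_t)h0i * yi) >> shift0);
  z0[1] = (int16_t)(((int32_t)h0r * yi - (int32_t)h0i * yr) >> shift0);
  // (y*) * (h1)
  z1[0] = (int16_t)(((int32_t)h1r * yr + (int32_t)h1i * yi) >> shift1);
  z1[1] = (int16_t)(((int32_t)h1i * yr - (int32_t)h1r * yi) >> shift1);
}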
static inline void fm10k_desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts) { __m128i ptype0, ptype1, vtag0, vtag1, eflag0, eflag1, cksumflag; union { uint16_t e[4]; uint64_t dword; } vol; const __m128i pkttype_msk = _mm_set_epi16( 0x0000, 0x0000, 0x0000, 0x0000, PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT); /* mask everything except rss type */ const __m128i rsstype_msk = _mm_set_epi16( 0x0000, 0x0000, 0x0000, 0x0000, 0x000F, 0x000F, 0x000F, 0x000F); /* mask for HBO and RXE flag flags */ const __m128i rxe_msk = _mm_set_epi16( 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0001, 0x0001, 0x0001); /* mask the lower byte of ol_flags */ const __m128i ol_flags_msk = _mm_set_epi16( 0x0000, 0x0000, 0x0000, 0x0000, 0x00FF, 0x00FF, 0x00FF, 0x00FF); const __m128i l3l4cksum_flag = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> CKSUM_SHIFT, (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD) >> CKSUM_SHIFT, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> CKSUM_SHIFT, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> CKSUM_SHIFT); const __m128i rxe_flag = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); /* map rss type to rss hash flag */ const __m128i rss_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0); /* Calculate RSS_hash and Vlan fields */ ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]); ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]); vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]); vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]); ptype0 = _mm_unpacklo_epi32(ptype0, ptype1); ptype0 = _mm_and_si128(ptype0, rsstype_msk); ptype0 = _mm_shuffle_epi8(rss_flags, ptype0); vtag1 = _mm_unpacklo_epi32(vtag0, vtag1); eflag0 = vtag1; cksumflag = vtag1; vtag1 = _mm_srli_epi16(vtag1, VP_SHIFT); vtag1 = _mm_and_si128(vtag1, pkttype_msk); vtag1 = _mm_or_si128(ptype0, vtag1); /* Process err flags, simply set RECIP_ERR bit if HBO/IXE is set */ eflag1 = _mm_srli_epi16(eflag0, RXEFLAG_SHIFT); eflag0 = _mm_srli_epi16(eflag0, HBOFLAG_SHIFT); eflag0 = _mm_or_si128(eflag0, eflag1); eflag0 = _mm_and_si128(eflag0, rxe_msk); eflag0 = _mm_shuffle_epi8(rxe_flag, eflag0); vtag1 = _mm_or_si128(eflag0, vtag1); /* Process L4/L3 checksum error flags */ cksumflag = _mm_srli_epi16(cksumflag, L3L4EFLAG_SHIFT); cksumflag = _mm_shuffle_epi8(l3l4cksum_flag, cksumflag); /* clean the higher byte and shift back the flag bits */ cksumflag = _mm_and_si128(cksumflag, ol_flags_msk); cksumflag = _mm_slli_epi16(cksumflag, CKSUM_SHIFT); vtag1 = _mm_or_si128(cksumflag, vtag1); vol.dword = _mm_cvtsi128_si64(vtag1); rx_pkts[0]->ol_flags = vol.e[0]; rx_pkts[1]->ol_flags = vol.e[1]; rx_pkts[2]->ol_flags = vol.e[2]; rx_pkts[3]->ol_flags = vol.e[3]; }
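// Minimal sketch of the pshufb-as-lookup-table idiom used above for the
// rss_flags and l3l4cksum_flag tables: the low nibble of each byte indexes a
// 16-entry byte table. The table contents here are caller-supplied
// placeholders, not the driver's actual flag values.
#include <tmmintrin.h>  /* _mm_shuffle_epi8 (SSSE3), as in the code above */
#include <stdint.h>

static __m128i nibble_lookup(__m128i codes, const uint8_t table[16]) {
  const __m128i lut = _mm_loadu_si128((const __m128i *)table);
  const __m128i idx = _mm_and_si128(codes, _mm_set1_epi8(0x0F));
  // For every byte b of idx, the result byte is table[b].
  return _mm_shuffle_epi8(lut, idx);
}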
static INLINE __m128i highbd_max_epi16(int bd) { const __m128i neg_one = _mm_set1_epi16(-1); // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) return _mm_xor_si128(_mm_slli_epi16(neg_one, bd), neg_one); }
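// Quick scalar check of the identity the helper above relies on:
// (1 << bd) - 1 equals "all-ones shifted left by bd, then XOR-ed with
// all-ones" in 16-bit arithmetic (e.g. 255 for bd == 8, 4095 for bd == 12).
#include <assert.h>
#include <stdint.h>

static void check_highbd_max_identity(void) {
  for (unsigned bd = 8; bd <= 12; bd++) {
    uint16_t lhs = (uint16_t)((1u << bd) - 1u);
    uint16_t rhs = (uint16_t)(((0xFFFFu << bd) & 0xFFFFu) ^ 0xFFFFu);
    assert(lhs == rhs);
  }
}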
static inline __m128i byteswap16( __m128i v ) { //rotate each 16 bit quantity by 8 bits return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) ); }
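// Scalar equivalent of byteswap16() for one lane, plus the same rotate idea
// extended to 32-bit lanes (swap bytes within each 16-bit half, then swap the
// halves). The 32-bit variant is a sketch of the two-step swap that reappears
// in the AMDTP encoder further down; names are illustrative.
#include <emmintrin.h>
#include <stdint.h>

static uint16_t bswap16_scalar(uint16_t v) {
  return (uint16_t)((v << 8) | (v >> 8));
}

static __m128i byteswap32_sse2(__m128i v) {
  v = _mm_or_si128(_mm_slli_epi16(v, 8), _mm_srli_epi16(v, 8));       // swap bytes in each half
  return _mm_or_si128(_mm_slli_epi32(v, 16), _mm_srli_epi32(v, 16));  // swap the halves
}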
mlib_status mlib_VideoColorBGR2JFIFYCC444_S16_naligned( mlib_s16 *y, mlib_s16 *cb, mlib_s16 *cr, const mlib_s16 *bgr, mlib_s32 n) { /* 0.299*32768 */ const __m128i x_c11 = _mm_set1_epi16(9798); /* 0.587*32768 */ const __m128i x_c12 = _mm_set1_epi16(19235); /* 0.114*32768 */ const __m128i x_c13 = _mm_set1_epi16(3735); /* -0.16874*32768 */ const __m128i x_c21 = _mm_set1_epi16(-5529); /* -0.33126*32768 */ const __m128i x_c22 = _mm_set1_epi16(-10855); /* 0.5*32768 */ const __m128i x_c23 = _mm_set1_epi16(16384); /* 0.5*32768 */ const __m128i x_c31 = x_c23; /* -0.41869*32768 */ const __m128i x_c32 = _mm_set1_epi16(-13720); /* -0.08131*32768 */ const __m128i x_c33 = _mm_set1_epi16(-2664); /* 2048 */ const __m128i x_coff = _mm_set1_epi16(2048 << 2); const __m128i x_zero = _mm_setzero_si128(); __m128i x_bgr0, x_bgr1, x_bgr2, x_r, x_g, x_b; __m128i x_y, x_cb, x_cr; __m128i x_t0, x_t1, x_t2, x_t3, x_t4, x_t5; __m128i *px_y, *px_cb, *px_cr, *px_bgr; mlib_d64 fr, fg, fb, fy, fcb, fcr; mlib_s32 i; px_y = (__m128i *)y; px_cb = (__m128i *)cb; px_cr = (__m128i *)cr; px_bgr = (__m128i *)bgr; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (n - 8); i += 8) { x_bgr0 = _mm_loadu_si128(px_bgr++); x_bgr0 = _mm_slli_epi16(x_bgr0, 3); x_bgr1 = _mm_loadu_si128(px_bgr++); x_bgr1 = _mm_slli_epi16(x_bgr1, 3); x_bgr2 = _mm_loadu_si128(px_bgr++); x_bgr2 = _mm_slli_epi16(x_bgr2, 3); SeparateBGR48_S16; x_t0 = _mm_mulhi_epi16(x_r, x_c11); x_t1 = _mm_mulhi_epi16(x_g, x_c12); x_t2 = _mm_mulhi_epi16(x_b, x_c13); x_y = _mm_add_epi16(x_t0, x_t1); x_y = _mm_add_epi16(x_y, x_t2); x_t0 = _mm_mulhi_epi16(x_r, x_c21); x_t1 = _mm_mulhi_epi16(x_g, x_c22); x_t2 = _mm_mulhi_epi16(x_b, x_c23); x_cb = _mm_add_epi16(x_t0, x_t1); x_cb = _mm_add_epi16(x_cb, x_coff); x_cb = _mm_add_epi16(x_cb, x_t2); x_t0 = _mm_mulhi_epi16(x_r, x_c31); x_t1 = _mm_mulhi_epi16(x_g, x_c32); x_t2 = _mm_mulhi_epi16(x_b, x_c33); x_cr = _mm_add_epi16(x_t0, x_t1); x_cr = _mm_add_epi16(x_cr, x_coff); x_cr = _mm_add_epi16(x_cr, x_t2); /* save */ x_y = _mm_srli_epi16(x_y, 2); x_cb = _mm_srli_epi16(x_cb, 2); x_cr = _mm_srli_epi16(x_cr, 2); _mm_storeu_si128(px_y++, x_y); _mm_storeu_si128(px_cb++, x_cb); _mm_storeu_si128(px_cr++, x_cr); } if (i <= (n - 4)) { x_bgr0 = _mm_loadu_si128(px_bgr++); x_bgr0 = _mm_slli_epi16(x_bgr0, 3); x_bgr1 = _mm_loadl_epi64(px_bgr); x_bgr1 = _mm_slli_epi16(x_bgr1, 3); px_bgr = (__m128i *)((__m64 *)px_bgr + 1); SeparateBGR24_S16; x_t0 = _mm_mulhi_epi16(x_r, x_c11); x_t1 = _mm_mulhi_epi16(x_g, x_c12); x_t2 = _mm_mulhi_epi16(x_b, x_c13); x_y = _mm_add_epi16(x_t0, x_t1); x_y = _mm_add_epi16(x_y, x_t2); x_t0 = _mm_mulhi_epi16(x_r, x_c21); x_t1 = _mm_mulhi_epi16(x_g, x_c22); x_t2 = _mm_mulhi_epi16(x_b, x_c23); x_cb = _mm_add_epi16(x_t0, x_t1); x_cb = _mm_add_epi16(x_cb, x_coff); x_cb = _mm_add_epi16(x_cb, x_t2); x_t0 = _mm_mulhi_epi16(x_r, x_c31); x_t1 = _mm_mulhi_epi16(x_g, x_c32); x_t2 = _mm_mulhi_epi16(x_b, x_c33); x_cr = _mm_add_epi16(x_t0, x_t1); x_cr = _mm_add_epi16(x_cr, x_coff); x_cr = _mm_add_epi16(x_cr, x_t2); /* save */ x_y = _mm_srli_epi16(x_y, 2); x_cb = _mm_srli_epi16(x_cb, 2); x_cr = _mm_srli_epi16(x_cr, 2); _mm_storel_epi64(px_y, x_y); px_y = (__m128i *)((__m64 *)px_y + 1); _mm_storel_epi64(px_cb, x_cb); px_cb = (__m128i *)((__m64 *)px_cb + 1); _mm_storel_epi64(px_cr, x_cr); px_cr = (__m128i *)((__m64 *)px_cr + 1); i += 4; } for (; i <= (n - 1); i++) { fb = bgr[3 * i]; fg = bgr[3 * i + 1]; fr = bgr[3 * i + 2]; fy = 0.29900f * fr + 0.58700f * fg + 0.11400f * fb; fcb = -0.16874f * fr - 0.33126f * fg + 
0.50000f * fb + 2048; fcr = 0.50000f * fr - 0.41869f * fg - 0.08131f * fb + 2048; y[i] = (mlib_s16)fy; cb[i] = (mlib_s16)fcb; cr[i] = (mlib_s16)fcr; } return (MLIB_SUCCESS); }
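// Scalar reference sketch of the Q15 fixed-point step above: for int16
// operands, _mm_mulhi_epi16(a, c) == (a * c) >> 16, so with coefficients
// scaled by 32768 each product is (sample * weight) / 2; the << 3 before and
// >> 2 after in the vector code restore the overall scale. The coefficients
// below are the same 0.299/0.587/0.114 table used above.
#include <stdint.h>

static int16_t ref_mulhi_q15(int16_t sample, int16_t coeff_q15) {
  return (int16_t)(((int32_t)sample * coeff_q15) >> 16);
}

static int16_t ref_y_from_rgb(int16_t r, int16_t g, int16_t b) {
  // r, g, b assumed already shifted left by 3, mirroring the vector path.
  int16_t y = (int16_t)(ref_mulhi_q15(r, 9798) +
                        ref_mulhi_q15(g, 19235) +
                        ref_mulhi_q15(b, 3735));
  return (int16_t)(y >> 2);  // matches the final _mm_srli_epi16(x_y, 2)
}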
/** * Calculate output of given chromosome and inputs using SSE instructions * @param chr * @param inputs * @param outputs */ void cgp_get_output_sse(ga_chr_t chromosome, __m128i_aligned inputs[CGP_INPUTS], __m128i_aligned outputs[CGP_OUTPUTS]) { #ifdef SSE2 assert(CGP_OUTPUTS == 1); assert(CGP_ROWS == 4); assert(CGP_LBACK == 1); // previous and currently computed column register __m128i prev0, prev1, prev2, prev3; register __m128i current0, current1, current2, current3; // 0xFF constant static __m128i_aligned FF; FF = _mm_set1_epi8(0xFF); cgp_genome_t genome = (cgp_genome_t) chromosome->genome; /* if primary output is connected to primary input, skip evaluation This cannot happen - CGP does not generate circuits like that if (genome->outputs[0] < CGP_INPUTS) { int i = genome->outputs[0]; _mm_store_si128(&outputs[0], inputs[i]); return; } */ #ifdef TEST_EVAL_SSE2 for (int i = 0; i < CGP_INPUTS; i++) { unsigned char *_tmp = (unsigned char*) &inputs[i]; printf("I: %2d = " UCFMT16 "\n", i, UCVAL16(0)); } #endif int offset = -CGP_ROWS; for (int x = 0; x < CGP_COLS; x++) { for (int y = 0; y < CGP_ROWS; y++) { int idx = cgp_node_index(x, y); cgp_node_t *n = &(genome->nodes[idx]); // skip inactive blocks if (!n->is_active) continue; register __m128i A; register __m128i B; register __m128i Y; register __m128i TMP; register __m128i mask; LOAD_INPUT(A, n->inputs[0]); LOAD_INPUT(B, n->inputs[1]); switch (n->function) { case c255: Y = FF; break; case identity: Y = A; break; case inversion: Y = _mm_sub_epi8(FF, A); break; case b_or: Y = _mm_or_si128(A, B); break; case b_not1or2: // we don't have NOT instruction, we need to XOR with FF Y = _mm_xor_si128(FF, A); Y = _mm_or_si128(Y, B); break; case b_and: Y = _mm_and_si128(A, B); break; case b_nand: Y = _mm_and_si128(A, B); Y = _mm_xor_si128(FF, Y); break; case b_xor: Y = _mm_xor_si128(A, B); break; case rshift1: // no SR instruction for 8bit data, we need to shift // 16 bits and apply mask // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H] // SHR: [ 0 1 2 3 4 5 6 7 | 8 A B C D E F G] // MSK: [ 0 1 2 3 4 5 6 7 | 0 A B C D E F G] mask = _mm_set1_epi8(0x7F); Y = _mm_srli_epi16(A, 1); Y = _mm_and_si128(Y, mask); break; case rshift2: // similar to rshift1 // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H] // SHR: [ 0 0 1 2 3 4 5 6 | 7 8 A B C D E F] // MSK: [ 0 0 1 2 3 4 5 6 | 0 0 A B C D E F] mask = _mm_set1_epi8(0x3F); Y = _mm_srli_epi16(A, 2); Y = _mm_and_si128(Y, mask); break; case swap: // SWAP(A, B) (((A & 0x0F) << 4) | ((B & 0x0F))) // Shift A left by 4 bits // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H] // SHL: [ 5 6 7 8 A B C D | E F G H 0 0 0 0] // MSK: [ 5 6 7 8 0 0 0 0 | E F G H 0 0 0 0] mask = _mm_set1_epi8(0xF0); TMP = _mm_slli_epi16(A, 4); TMP = _mm_and_si128(TMP, mask); // Mask B // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H] // MSK: [ 0 0 0 0 5 6 7 8 | 0 0 0 0 E F G H] mask = _mm_set1_epi8(0x0F); Y = _mm_and_si128(B, mask); // Combine Y = _mm_or_si128(Y, TMP); break; case add: Y = _mm_add_epi8(A, B); break; case add_sat: Y = _mm_adds_epu8(A, B); break; case avg: // shift right first, then add, to avoid overflow mask = _mm_set1_epi8(0x7F); TMP = _mm_srli_epi16(A, 1); TMP = _mm_and_si128(TMP, mask); Y = _mm_srli_epi16(B, 1); Y = _mm_and_si128(Y, mask); Y = _mm_add_epi8(Y, TMP); break; case max: Y = _mm_max_epu8(A, B); break; case min: Y = _mm_min_epu8(A, B); break; } #ifdef TEST_EVAL_SSE2 __m128i _tmpval = Y; unsigned char *_tmp = (unsigned char*) &_tmpval; printf("N: %2d = " UCFMT16 "\n", idx + CGP_INPUTS, UCVAL16(0)); bool mismatch = false; for (int i = 1; i 
< 16; i++) { if (_tmp[i] != _tmp[0]) { fprintf(stderr, "Value mismatch on index %2d (%u instead of %u)\n", i, _tmp[i], _tmp[0]); mismatch = true; } } if (mismatch) { abort(); } #endif if (idx + CGP_INPUTS == genome->outputs[0]) { _mm_store_si128(&outputs[0], Y); #ifndef TEST_EVAL_SSE2 return; #endif } ASSIGN_CURRENT(y, Y); } // end of column offset += CGP_ROWS; prev0 = current0; prev1 = current1; prev2 = current2; prev3 = current3; } // end of row #ifdef TEST_EVAL_SSE2 for (int i = 0; i < CGP_OUTPUTS; i++) { unsigned char *_tmp = (unsigned char*) &outputs[i]; printf("O: %2d = " UCFMT16 "\n", i, UCVAL16(0)); } #endif #endif }
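// SSE2 has no per-byte shift, so the rshift1/rshift2/swap/avg cases above
// shift 16-bit lanes and mask off the bits that crossed a byte boundary.
// Two fixed-count sketches of that idiom (illustrative names):
#include <emmintrin.h>

static __m128i srli1_epu8(__m128i v) {   // logical >> 1 of every byte
  return _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(0x7F));
}

static __m128i slli4_epu8(__m128i v) {   // << 4 of every byte (nibble-swap helper)
  return _mm_and_si128(_mm_slli_epi16(v, 4), _mm_set1_epi8((char)0xF0));
}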
static void rfx_dwt_2d_decode_block_horiz_sse2(INT16* l, INT16* h, INT16* dst, int subband_width) { int y, n; INT16* l_ptr = l; INT16* h_ptr = h; INT16* dst_ptr = dst; int first; int last; __m128i l_n; __m128i h_n; __m128i h_n_m; __m128i tmp_n; __m128i dst_n; __m128i dst_n_p; __m128i dst1; __m128i dst2; for (y = 0; y < subband_width; y++) { /* Even coefficients */ for (n = 0; n < subband_width; n += 8) { /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */ l_n = _mm_load_si128((__m128i*) l_ptr); h_n = _mm_load_si128((__m128i*) h_ptr); h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1)); if (n == 0) { first = _mm_extract_epi16(h_n_m, 1); h_n_m = _mm_insert_epi16(h_n_m, first, 0); } tmp_n = _mm_add_epi16(h_n, h_n_m); tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1)); tmp_n = _mm_srai_epi16(tmp_n, 1); dst_n = _mm_sub_epi16(l_n, tmp_n); _mm_store_si128((__m128i*) l_ptr, dst_n); l_ptr += 8; h_ptr += 8; } l_ptr -= subband_width; h_ptr -= subband_width; /* Odd coefficients */ for (n = 0; n < subband_width; n += 8) { /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */ h_n = _mm_load_si128((__m128i*) h_ptr); h_n = _mm_slli_epi16(h_n, 1); dst_n = _mm_load_si128((__m128i*) (l_ptr)); dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1)); if (n == subband_width - 8) { last = _mm_extract_epi16(dst_n_p, 6); dst_n_p = _mm_insert_epi16(dst_n_p, last, 7); } tmp_n = _mm_add_epi16(dst_n_p, dst_n); tmp_n = _mm_srai_epi16(tmp_n, 1); tmp_n = _mm_add_epi16(tmp_n, h_n); dst1 = _mm_unpacklo_epi16(dst_n, tmp_n); dst2 = _mm_unpackhi_epi16(dst_n, tmp_n); _mm_store_si128((__m128i*) dst_ptr, dst1); _mm_store_si128((__m128i*) (dst_ptr + 8), dst2); l_ptr += 8; h_ptr += 8; dst_ptr += 16; } } }
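// Scalar reference sketch of one row of the inverse lifting step vectorised
// above, including the same boundary handling (h[-1] mirrored to h[0]; the
// rightmost dst[2n + 2] replaced by dst[2n]). Written directly into dst for
// clarity; the SSE version stores the even results back into l first.
#include <stdint.h>

static void ref_dwt_inverse_horiz_row(const int16_t *l, const int16_t *h,
                                      int16_t *dst, int subband_width) {
  for (int n = 0; n < subband_width; n++) {  /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1) */
    int hm1 = (n == 0) ? h[0] : h[n - 1];
    dst[2 * n] = (int16_t)(l[n] - ((hm1 + h[n] + 1) >> 1));
  }
  for (int n = 0; n < subband_width; n++) {  /* dst[2n+1] = (h[n] << 1) + ((dst[2n] + dst[2n+2]) >> 1) */
    int next = (n == subband_width - 1) ? dst[2 * n] : dst[2 * n + 2];
    dst[2 * n + 1] = (int16_t)((h[n] << 1) + ((dst[2 * n] + next) >> 1));
  }
}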
static FORCE_INLINE void warp_mmword_u8_sse2(const uint8_t *srcp, const uint8_t *edgep, uint8_t *dstp, int src_stride, int edge_stride, int height, int x, int y, const __m128i &depth, const __m128i &zero, const __m128i &x_limit_min, const __m128i &x_limit_max, const __m128i &y_limit_min, const __m128i &y_limit_max, const __m128i &word_64, const __m128i &word_127, const __m128i &word_128, const __m128i &word_255, const __m128i &one_stride) { int SMAG = 1 << SMAGL; // calculate displacement __m128i above = _mm_loadl_epi64((const __m128i *)(edgep + x - (y ? edge_stride : 0))); __m128i below = _mm_loadl_epi64((const __m128i *)(edgep + x + (y < height - 1 ? edge_stride : 0))); __m128i left = _mm_loadl_epi64((const __m128i *)(edgep + x - 1)); __m128i right = _mm_loadl_epi64((const __m128i *)(edgep + x + 1)); above = _mm_unpacklo_epi8(above, zero); below = _mm_unpacklo_epi8(below, zero); left = _mm_unpacklo_epi8(left, zero); right = _mm_unpacklo_epi8(right, zero); __m128i h = _mm_sub_epi16(left, right); __m128i v = _mm_sub_epi16(above, below); h = _mm_slli_epi16(h, 7); v = _mm_slli_epi16(v, 7); h = _mm_mulhi_epi16(h, depth); v = _mm_mulhi_epi16(v, depth); v = _mm_max_epi16(v, y_limit_min); v = _mm_min_epi16(v, y_limit_max); __m128i remainder_h = h; __m128i remainder_v = v; if (SMAGL) { remainder_h = _mm_slli_epi16(remainder_h, SMAGL); remainder_v = _mm_slli_epi16(remainder_v, SMAGL); } remainder_h = _mm_and_si128(remainder_h, word_127); remainder_v = _mm_and_si128(remainder_v, word_127); h = _mm_srai_epi16(h, 7 - SMAGL); v = _mm_srai_epi16(v, 7 - SMAGL); __m128i xx = _mm_set1_epi32(x << SMAGL); xx = _mm_packs_epi32(xx, xx); h = _mm_adds_epi16(h, xx); remainder_h = _mm_and_si128(remainder_h, _mm_cmpgt_epi16(x_limit_max, h)); remainder_h = _mm_andnot_si128(_mm_cmpgt_epi16(x_limit_min, h), remainder_h); h = _mm_max_epi16(h, x_limit_min); h = _mm_min_epi16(h, x_limit_max); // h and v contain the displacement now. 
__m128i disp_lo = _mm_unpacklo_epi16(v, h); __m128i disp_hi = _mm_unpackhi_epi16(v, h); disp_lo = _mm_madd_epi16(disp_lo, one_stride); disp_hi = _mm_madd_epi16(disp_hi, one_stride); __m128i line0 = _mm_setzero_si128(); __m128i line1 = _mm_setzero_si128(); int offset = _mm_cvtsi128_si32(disp_lo); disp_lo = _mm_srli_si128(disp_lo, 4); line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset), 0); line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride), 0); offset = _mm_cvtsi128_si32(disp_lo); disp_lo = _mm_srli_si128(disp_lo, 4); line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 1 * SMAG), 1); line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 1 * SMAG), 1); offset = _mm_cvtsi128_si32(disp_lo); disp_lo = _mm_srli_si128(disp_lo, 4); line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 2 * SMAG), 2); line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 2 * SMAG), 2); offset = _mm_cvtsi128_si32(disp_lo); disp_lo = _mm_srli_si128(disp_lo, 4); line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 3 * SMAG), 3); line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 3 * SMAG), 3); offset = _mm_cvtsi128_si32(disp_hi); disp_hi = _mm_srli_si128(disp_hi, 4); line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 4 * SMAG), 4); line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 4 * SMAG), 4); offset = _mm_cvtsi128_si32(disp_hi); disp_hi = _mm_srli_si128(disp_hi, 4); line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 5 * SMAG), 5); line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 5 * SMAG), 5); offset = _mm_cvtsi128_si32(disp_hi); disp_hi = _mm_srli_si128(disp_hi, 4); line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 6 * SMAG), 6); line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 6 * SMAG), 6); offset = _mm_cvtsi128_si32(disp_hi); disp_hi = _mm_srli_si128(disp_hi, 4); line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 7 * SMAG), 7); line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 7 * SMAG), 7); __m128i left0 = _mm_and_si128(line0, word_255); __m128i left1 = _mm_and_si128(line1, word_255); __m128i right0 = _mm_srli_epi16(line0, 8); __m128i right1 = _mm_srli_epi16(line1, 8); left0 = _mm_mullo_epi16(left0, _mm_sub_epi16(word_128, remainder_h)); left1 = _mm_mullo_epi16(left1, _mm_sub_epi16(word_128, remainder_h)); right0 = _mm_mullo_epi16(right0, remainder_h); right1 = _mm_mullo_epi16(right1, remainder_h); line0 = _mm_add_epi16(left0, right0); line1 = _mm_add_epi16(left1, right1); line0 = _mm_add_epi16(line0, word_64); line1 = _mm_add_epi16(line1, word_64); line0 = _mm_srai_epi16(line0, 7); line1 = _mm_srai_epi16(line1, 7); line0 = _mm_mullo_epi16(line0, _mm_sub_epi16(word_128, remainder_v)); line1 = _mm_mullo_epi16(line1, remainder_v); __m128i result = _mm_add_epi16(line0, line1); result = _mm_add_epi16(result, word_64); result = _mm_srai_epi16(result, 7); result = _mm_packus_epi16(result, result); _mm_storel_epi64((__m128i *)(dstp + x), result); }
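// Scalar sketch of the final bilinear blend above: horizontal blend of two
// neighbouring pixels with a 7-bit fraction, then the same vertically, each
// with +64 rounding and an arithmetic >> 7; the vector code clamps to 8 bits
// with _mm_packus_epi16. Names are illustrative.
#include <stdint.h>

static uint8_t ref_bilinear_7bit(int tl, int tr, int bl, int br,
                                 int frac_h, int frac_v) {  // fractions in 0..127
  int top    = (tl * (128 - frac_h) + tr * frac_h + 64) >> 7;
  int bottom = (bl * (128 - frac_h) + br * frac_h + 64) >> 7;
  int result = (top * (128 - frac_v) + bottom * frac_v + 64) >> 7;
  if (result < 0) result = 0;
  if (result > 255) result = 255;
  return (uint8_t)result;
}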
/** * @brief mux all audio ports to events * @param data * @param offset * @param nevents */ void AmdtpTransmitStreamProcessor::encodeAudioPortsFloat(quadlet_t *data, unsigned int offset, unsigned int nevents) { unsigned int j; quadlet_t *target_event; int i; float * client_buffers[4]; float tmp_values[4] __attribute__ ((aligned (16))); uint32_t tmp_values_int[4] __attribute__ ((aligned (16))); // prepare the scratch buffer assert(m_scratch_buffer_size_bytes > nevents * 4); memset(m_scratch_buffer, 0, nevents * 4); const __m128i label = _mm_set_epi32 (0x40000000, 0x40000000, 0x40000000, 0x40000000); const __m128i mask = _mm_set_epi32 (0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF); const __m128 mult = _mm_set_ps(AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER); #if AMDTP_CLIP_FLOATS const __m128 v_max = _mm_set_ps(1.0, 1.0, 1.0, 1.0); const __m128 v_min = _mm_set_ps(-1.0, -1.0, -1.0, -1.0); #endif // this assumes that audio ports are sorted by position, // and that there are no gaps for (i = 0; i < ((int)m_nb_audio_ports)-4; i += 4) { struct _MBLA_port_cache *p; // get the port buffers for (j=0; j<4; j++) { p = &(m_audio_ports.at(i+j)); if(likely(p->buffer && p->enabled)) { client_buffers[j] = (float *) p->buffer; client_buffers[j] += offset; } else { // if a port is disabled or has no valid // buffer, use the scratch buffer (all zero's) client_buffers[j] = (float *) m_scratch_buffer; } } // the base event for this position target_event = (quadlet_t *)(data + i); // process the events for (j=0;j < nevents; j += 1) { // read the values tmp_values[0] = *(client_buffers[0]); tmp_values[1] = *(client_buffers[1]); tmp_values[2] = *(client_buffers[2]); tmp_values[3] = *(client_buffers[3]); // now do the SSE based conversion/labeling __m128 v_float = *((__m128*)tmp_values); __m128i *target = (__m128i*)target_event; __m128i v_int; // clip #if AMDTP_CLIP_FLOATS // do SSE clipping v_float = _mm_max_ps(v_float, v_min); v_float = _mm_min_ps(v_float, v_max); #endif // multiply v_float = _mm_mul_ps(v_float, mult); // convert to signed integer v_int = _mm_cvttps_epi32( v_float ); // mask v_int = _mm_and_si128( v_int, mask ); // label it v_int = _mm_or_si128( v_int, label ); // do endian conversion (SSE is always little endian) // do first swap v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) ); // do second swap v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) ); // store the packed int // (target misalignment is assumed since we don't know the m_dimension) _mm_storeu_si128 (target, v_int); // increment the buffer pointers client_buffers[0]++; client_buffers[1]++; client_buffers[2]++; client_buffers[3]++; // go to next target event position target_event += m_dimension; } } // do remaining ports // NOTE: these can be time-SSE'd for (; i < (int)m_nb_audio_ports; i++) { struct _MBLA_port_cache &p = m_audio_ports.at(i); target_event = (quadlet_t *)(data + i); #ifdef DEBUG assert(nevents + offset <= p.buffer_size ); #endif if(likely(p.buffer && p.enabled)) { float *buffer = (float *)(p.buffer); buffer += offset; for (j = 0;j < nevents; j += 4) { // read the values tmp_values[0] = *buffer; buffer++; tmp_values[1] = *buffer; buffer++; tmp_values[2] = *buffer; buffer++; tmp_values[3] = *buffer; buffer++; // now do the SSE based conversion/labeling __m128 v_float = *((__m128*)tmp_values); __m128i v_int; #if AMDTP_CLIP_FLOATS // do SSE clipping v_float = _mm_max_ps(v_float, v_min); v_float = 
_mm_min_ps(v_float, v_max); #endif // multiply v_float = _mm_mul_ps(v_float, mult); // convert to signed integer v_int = _mm_cvttps_epi32( v_float ); // mask v_int = _mm_and_si128( v_int, mask ); // label it v_int = _mm_or_si128( v_int, label ); // do endian conversion (SSE is always little endian) // do first swap v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) ); // do second swap v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) ); // store the packed int _mm_store_si128 ((__m128i *)(&tmp_values_int), v_int); // increment the buffer pointers *target_event = tmp_values_int[0]; target_event += m_dimension; *target_event = tmp_values_int[1]; target_event += m_dimension; *target_event = tmp_values_int[2]; target_event += m_dimension; *target_event = tmp_values_int[3]; target_event += m_dimension; } // do the remainder of the events for(;j < nevents; j += 1) { float *in = (float *)buffer; #if AMDTP_CLIP_FLOATS // clip directly to the value of a maxed event if(unlikely(*in > 1.0)) { *target_event = CONDSWAPTOBUS32_CONST(0x407FFFFF); } else if(unlikely(*in < -1.0)) { *target_event = CONDSWAPTOBUS32_CONST(0x40800001); } else { float v = (*in) * AMDTP_FLOAT_MULTIPLIER; unsigned int tmp = ((int) v); tmp = ( tmp & 0x00FFFFFF ) | 0x40000000; *target_event = CondSwapToBus32((quadlet_t)tmp); } #else float v = (*in) * AMDTP_FLOAT_MULTIPLIER; unsigned int tmp = ((int) v); tmp = ( tmp & 0x00FFFFFF ) | 0x40000000; *target_event = CondSwapToBus32((quadlet_t)tmp); #endif buffer++; target_event += m_dimension; } } else { for (j = 0;j < nevents; j += 1) { // hardcoded byte swapped *target_event = 0x00000040; target_event += m_dimension; } } } }
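// Scalar sketch of the per-sample packing the SSE path above performs:
// truncate the scaled float (as _mm_cvttps_epi32 does), keep the low 24 bits,
// OR in the 0x40 label byte, then swap to bus byte order. bswap32_ref is a
// local stand-in for CondSwapToBus32 on a little-endian host; the multiplier
// is passed in rather than assuming AMDTP_FLOAT_MULTIPLIER's value.
#include <stdint.h>

static uint32_t bswap32_ref(uint32_t v) {
  return (v >> 24) | ((v >> 8) & 0x0000FF00u) |
         ((v << 8) & 0x00FF0000u) | (v << 24);
}

static uint32_t ref_encode_sample(float sample, float multiplier) {
  int32_t  fixed  = (int32_t)(sample * multiplier);          // truncation, like cvttps
  uint32_t packed = ((uint32_t)fixed & 0x00FFFFFFu) | 0x40000000u;
  return bswap32_ref(packed);
}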
static void warp_u8_sse2(const uint8_t *srcp, const uint8_t *edgep, uint8_t *dstp, int src_stride, int edge_stride, int dst_stride, int width, int height, int depth_scalar) { int SMAG = 1 << SMAGL; __m128i depth = _mm_set1_epi32(depth_scalar << 8); depth = _mm_packs_epi32(depth, depth); const int16_t x_limit_min_array[8] = { (int16_t)(0 * SMAG), (int16_t)(-1 * SMAG), (int16_t)(-2 * SMAG), (int16_t)(-3 * SMAG), (int16_t)(-4 * SMAG), (int16_t)(-5 * SMAG), (int16_t)(-6 * SMAG), (int16_t)(-7 * SMAG) }; const int16_t x_limit_max_array[8] = { (int16_t)((width - 1) * SMAG), (int16_t)((width - 2) * SMAG), (int16_t)((width - 3) * SMAG), (int16_t)((width - 4) * SMAG), (int16_t)((width - 5) * SMAG), (int16_t)((width - 6) * SMAG), (int16_t)((width - 7) * SMAG), (int16_t)((width - 8) * SMAG) }; __m128i x_limit_min = _mm_loadu_si128((const __m128i *)x_limit_min_array); __m128i x_limit_max = _mm_loadu_si128((const __m128i *)x_limit_max_array); int width_sse2 = (width & ~7) + 2; if (width_sse2 > dst_stride) width_sse2 -= 8; __m128i zero = _mm_setzero_si128(); __m128i word_255 = _mm_setzero_si128(); word_255 = _mm_cmpeq_epi16(word_255, word_255); word_255 = _mm_srli_epi16(word_255, 8); __m128i word_127 = _mm_setzero_si128(); word_127 = _mm_cmpeq_epi16(word_127, word_127); word_127 = _mm_srli_epi16(word_127, 9); __m128i word_1 = _mm_setzero_si128(); word_1 = _mm_cmpeq_epi16(word_1, word_1); word_1 = _mm_srli_epi16(word_1, 15); __m128i one_stride = _mm_unpacklo_epi16(_mm_set1_epi16(src_stride), word_1); __m128i word_128 = _mm_setzero_si128(); word_128 = _mm_cmpeq_epi16(word_128, word_128); word_128 = _mm_slli_epi16(word_128, 15); word_128 = _mm_srli_epi16(word_128, 8); __m128i word_64 = _mm_setzero_si128(); word_64 = _mm_cmpeq_epi16(word_64, word_64); word_64 = _mm_slli_epi16(word_64, 15); word_64 = _mm_srli_epi16(word_64, 9); for (int y = 0; y < height; y++) { __m128i y_limit_min = _mm_set1_epi32(-y * 128); __m128i y_limit_max = _mm_set1_epi32((height - y) * 128 - 129); // (height - y - 1) * 128 - 1 y_limit_min = _mm_packs_epi32(y_limit_min, y_limit_min); y_limit_max = _mm_packs_epi32(y_limit_max, y_limit_max); warp_edge_c<SMAGL>(srcp, edgep, dstp, src_stride, edge_stride, width, height, 0, y, depth_scalar); for (int x = 1; x < width_sse2 - 1; x += 8) warp_mmword_u8_sse2<SMAGL>(srcp, edgep, dstp, src_stride, edge_stride, height, x, y, depth, zero, x_limit_min, x_limit_max, y_limit_min, y_limit_max, word_64, word_127, word_128, word_255, one_stride); if (width + 2 > width_sse2) warp_mmword_u8_sse2<SMAGL>(srcp, edgep, dstp, src_stride, edge_stride, height, width - 9, y, depth, zero, x_limit_min, x_limit_max, y_limit_min, y_limit_max, word_64, word_127, word_128, word_255, one_stride); warp_edge_c<SMAGL>(srcp, edgep, dstp, src_stride, edge_stride, width, height, width - 1, y, depth_scalar); srcp += src_stride * SMAG; edgep += edge_stride; dstp += dst_stride; } }
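// The word_255 / word_127 / word_128 / word_64 constants above are built
// without memory loads: pcmpeqw yields all-ones, then logical shifts carve
// out the wanted value in every 16-bit lane. Two sketches of the idiom:
#include <emmintrin.h>

static __m128i make_word_255(void) {   // 0x00FF in every 16-bit lane
  __m128i v = _mm_setzero_si128();
  v = _mm_cmpeq_epi16(v, v);           // 0xFFFF
  return _mm_srli_epi16(v, 8);         // 0x00FF
}

static __m128i make_word_128(void) {   // 0x0080 in every 16-bit lane
  __m128i v = _mm_setzero_si128();
  v = _mm_cmpeq_epi16(v, v);           // 0xFFFF
  v = _mm_slli_epi16(v, 15);           // 0x8000
  return _mm_srli_epi16(v, 8);         // 0x0080
}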
int operator()(int** src, uchar* dst, int, int width) const { if( !checkHardwareSupport(CV_CPU_SSE2) ) return 0; int x = 0; const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; __m128i delta = _mm_set1_epi16(128); for( ; x <= width - 16; x += 16 ) { __m128i r0, r1, r2, r3, r4, t0, t1; r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)), _mm_load_si128((const __m128i*)(row0 + x + 4))); r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)), _mm_load_si128((const __m128i*)(row1 + x + 4))); r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)), _mm_load_si128((const __m128i*)(row2 + x + 4))); r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)), _mm_load_si128((const __m128i*)(row3 + x + 4))); r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)), _mm_load_si128((const __m128i*)(row4 + x + 4))); r0 = _mm_add_epi16(r0, r4); r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2); r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2)); t0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2)); r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x + 8)), _mm_load_si128((const __m128i*)(row0 + x + 12))); r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x + 8)), _mm_load_si128((const __m128i*)(row1 + x + 12))); r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x + 8)), _mm_load_si128((const __m128i*)(row2 + x + 12))); r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x + 8)), _mm_load_si128((const __m128i*)(row3 + x + 12))); r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x + 8)), _mm_load_si128((const __m128i*)(row4 + x + 12))); r0 = _mm_add_epi16(r0, r4); r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2); r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2)); t1 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2)); t0 = _mm_srli_epi16(_mm_add_epi16(t0, delta), 8); t1 = _mm_srli_epi16(_mm_add_epi16(t1, delta), 8); _mm_storeu_si128((__m128i*)(dst + x), _mm_packus_epi16(t0, t1)); } for( ; x <= width - 4; x += 4 ) { __m128i r0, r1, r2, r3, r4, z = _mm_setzero_si128(); r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)), z); r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)), z); r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)), z); r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)), z); r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)), z); r0 = _mm_add_epi16(r0, r4); r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2); r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2)); r0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2)); r0 = _mm_srli_epi16(_mm_add_epi16(r0, delta), 8); *(int*)(dst + x) = _mm_cvtsi128_si32(_mm_packus_epi16(r0, r0)); } return x; }
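// Scalar reference sketch of the vertical pass above: a 1-4-6-4-1 binomial
// filter with +128 rounding and >> 8, then an unsigned 8-bit clamp (done with
// _mm_packus_epi16 in the vector code). The five row values are assumed to be
// the already horizontally filtered sums, as in the pyramid code.
#include <stdint.h>

static uint8_t ref_pyr_down_vert(int r0, int r1, int r2, int r3, int r4) {
  int sum = r0 + 4 * r1 + 6 * r2 + 4 * r3 + r4;
  sum = (sum + 128) >> 8;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}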
int smith_waterman_sse2_word(const unsigned char *query_sequence,
                             unsigned short *query_profile_word,
                             const int query_length,
                             const unsigned char *db_sequence,
                             const int db_length,
                             unsigned short gap_open,
                             unsigned short gap_extend,
                             struct f_struct *f_str)
{
    int i, j, k;
    short score;
    int cmp;
    int iter = (query_length + 7) / 8;

    __m128i *p;
    __m128i *workspace = (__m128i *) f_str->workspace;

    __m128i E, F, H;

    __m128i v_maxscore;
    __m128i v_gapopen;
    __m128i v_gapextend;

    __m128i v_min;
    __m128i v_minimums;
    __m128i v_temp;

    __m128i *pHLoad, *pHStore;
    __m128i *pE;

    __m128i *pScore;

    /* Load gap opening penalty to all elements of a constant */
    v_gapopen = _mm_setzero_si128();  /* Apple Devel */
    v_gapopen = _mm_insert_epi16 (v_gapopen, gap_open, 0);
    v_gapopen = _mm_shufflelo_epi16 (v_gapopen, 0);
    v_gapopen = _mm_shuffle_epi32 (v_gapopen, 0);

    /* Load gap extension penalty to all elements of a constant */
    v_gapextend = _mm_setzero_si128();  /* Apple Devel */
    v_gapextend = _mm_insert_epi16 (v_gapextend, gap_extend, 0);
    v_gapextend = _mm_shufflelo_epi16 (v_gapextend, 0);
    v_gapextend = _mm_shuffle_epi32 (v_gapextend, 0);

    /* Load v_maxscore with "zero".  Since we are using signed math,
       we bias the maxscore to -32768 so we have the full range of
       the short. */
    v_maxscore = _mm_setzero_si128();  /* Apple Devel */
    v_maxscore = _mm_cmpeq_epi16 (v_maxscore, v_maxscore);
    v_maxscore = _mm_slli_epi16 (v_maxscore, 15);

    v_minimums = _mm_shuffle_epi32 (v_maxscore, 0);

    v_min = _mm_shuffle_epi32 (v_maxscore, 0);
    v_min = _mm_srli_si128 (v_min, 14);

    /* Zero out the storage vector */
    k = 2 * iter;

    p = workspace;
    for (i = 0; i < k; i++)
    {
        _mm_store_si128 (p++, v_maxscore);
    }

    pE = workspace;
    pHStore = pE + iter;
    pHLoad = pHStore + iter;

    for (i = 0; i < db_length; ++i)
    {
        /* fetch first data asap. */
        pScore = (__m128i *) query_profile_word + db_sequence[i] * iter;

        /* bias all elements in F to -32768 */
        F = _mm_setzero_si128();  /* Apple Devel */
        F = _mm_cmpeq_epi16 (F, F);
        F = _mm_slli_epi16 (F, 15);

        /* load the next h value */
        H = _mm_load_si128 (pHStore + iter - 1);
        H = _mm_slli_si128 (H, 2);
        H = _mm_or_si128 (H, v_min);

        p = pHLoad;
        pHLoad = pHStore;
        pHStore = p;

        for (j = 0; j < iter; j++)
        {
            /* load E values */
            E = _mm_load_si128 (pE + j);

            /* add score to H */
            H = _mm_adds_epi16 (H, *pScore++);

            /* update the highest score encountered so far */
            v_maxscore = _mm_max_epi16 (v_maxscore, H);

            /* get max from H, E and F */
            H = _mm_max_epi16 (H, E);
            H = _mm_max_epi16 (H, F);

            /* save H values */
            _mm_store_si128 (pHStore + j, H);

            /* subtract the gap open penalty from H */
            H = _mm_subs_epi16 (H, v_gapopen);

            /* update E value */
            E = _mm_subs_epi16 (E, v_gapextend);
            E = _mm_max_epi16 (E, H);

            /* update F value */
            F = _mm_subs_epi16 (F, v_gapextend);
            F = _mm_max_epi16 (F, H);

            /* save E values */
            _mm_store_si128 (pE + j, E);

            /* load the next h value */
            H = _mm_load_si128 (pHLoad + j);
        }

        /* reset pointers to the start of the saved data */
        j = 0;
        H = _mm_load_si128 (pHStore + j);

        /* the computed F value is for the given column.  Since we are
           at the end, we need to shift the F value over to the next
           column. */
        F = _mm_slli_si128 (F, 2);
        F = _mm_or_si128 (F, v_min);
        v_temp = _mm_subs_epi16 (H, v_gapopen);
        v_temp = _mm_cmpgt_epi16 (F, v_temp);
        cmp = _mm_movemask_epi8 (v_temp);

        /* lazy-F correction loop: keep propagating F while it can still
           improve any stored H value */
        while (cmp != 0x0000)
        {
            E = _mm_load_si128 (pE + j);

            H = _mm_max_epi16 (H, F);

            /* save H values */
            _mm_store_si128 (pHStore + j, H);

            /* update E in case the new H value would change it */
            H = _mm_subs_epi16 (H, v_gapopen);
            E = _mm_max_epi16 (E, H);
            _mm_store_si128 (pE + j, E);

            /* update F value */
            F = _mm_subs_epi16 (F, v_gapextend);

            j++;
            if (j >= iter)
            {
                j = 0;
                F = _mm_slli_si128 (F, 2);
                F = _mm_or_si128 (F, v_min);
            }
            H = _mm_load_si128 (pHStore + j);

            v_temp = _mm_subs_epi16 (H, v_gapopen);
            v_temp = _mm_cmpgt_epi16 (F, v_temp);
            cmp = _mm_movemask_epi8 (v_temp);
        }
    }

    /* find largest score in the v_maxscore vector */
    v_temp = _mm_srli_si128 (v_maxscore, 8);
    v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
    v_temp = _mm_srli_si128 (v_maxscore, 4);
    v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
    v_temp = _mm_srli_si128 (v_maxscore, 2);
    v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);

    /* extract the largest score */
    score = _mm_extract_epi16 (v_maxscore, 0);

    /* return largest score biased by 32768 */

    /* fix for Mac OSX clang 4.1 */
    /*
    #ifdef __clang__
      if (score < 0) score += 32768;
      return score;
    #else
    */
    return score + 32768;
    /* #endif */
}
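/* A minimal scalar sketch of the affine-gap (Gotoh) Smith-Waterman
 * recurrence that the striped kernel above vectorizes.  Illustrative only:
 * the function name, the score() callback and the unbiased int buffers are
 * assumptions, not part of the FASTA sources, and the -32768 bias used by
 * the SSE2 version is omitted.  Requires <stdlib.h> for calloc/free. */
static int sw_scalar_ref(int qlen, int dlen,
                         int (*score)(int q, int d),
                         int gap_open, int gap_extend)
{
    int best = 0;
    int *H = (int *) calloc((size_t) qlen + 1, sizeof(int)); /* H(i, j-1) */
    int *E = (int *) calloc((size_t) qlen + 1, sizeof(int)); /* E(i, j)   */

    for (int j = 1; j <= dlen; j++) {
        int Hdiag = 0, F = 0;
        for (int i = 1; i <= qlen; i++) {
            int h = Hdiag + score(i - 1, j - 1);  /* match/mismatch       */
            if (h < E[i]) h = E[i];               /* gap in the database  */
            if (h < F)    h = F;                  /* gap in the query     */
            if (h < 0)    h = 0;                  /* local alignment      */
            if (h > best) best = h;

            Hdiag = H[i];                         /* becomes H(i-1, j-1)  */
            H[i]  = h;                            /* store H(i, j)        */
            E[i] = (E[i] - gap_extend > h - gap_open) ? E[i] - gap_extend
                                                      : h - gap_open;
            F    = (F - gap_extend    > h - gap_open) ? F - gap_extend
                                                      : h - gap_open;
        }
    }
    free(H);
    free(E);
    return best;
}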
// Hadamard transform
// Returns the difference between the weighted sums of the absolute values of
// the transformed coefficients of inA and inB.
static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
                          const uint16_t* const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i three = _mm_set1_epi16(3);

  // Load, combine and transpose inputs.
  {
    const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);
    const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]);
    const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]);
    const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]);
    const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]);
    const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]);
    const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]);

    // Combine inA and inB (we'll do two transforms in parallel).
    const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
    const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
    const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
    const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
    // a00 b00 a01 b01 a02 b02 a03 b03   0 0 0 0 0 0 0 0
    // a10 b10 a11 b11 a12 b12 a13 b13   0 0 0 0 0 0 0 0
    // a20 b20 a21 b21 a22 b22 a23 b23   0 0 0 0 0 0 0 0
    // a30 b30 a31 b31 a32 b32 a33 b33   0 0 0 0 0 0 0 0

    // Transpose the two 4x4, discarding the filling zeroes.
    const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
    const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
    // a00 a20  b00 b20  a01 a21  b01 b21  a02 a22  b02 b22  a03 a23  b03 b23
    // a10 a30  b10 b30  a11 a31  b11 b31  a12 a32  b12 b32  a13 a33  b13 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
    // a00 a10 a20 a30  b00 b10 b20 b30  a01 a11 a21 a31  b01 b11 b21 b31
    // a02 a12 a22 a32  b02 b12 b22 b32  a03 a13 a23 a33  b03 b13 b23 b33

    // Convert to 16b.
    tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero);
    tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero);
    tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero);
    tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Horizontal pass and subsequent transpose.
  {
    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2);
    const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2);
    const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2);
    const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2);
    // b0_extra = (a0 != 0);
    const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16(a0, zero), one);
    const __m128i b0_base = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);
    const __m128i b0 = _mm_add_epi16(b0_base, b0_extra);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33

    // Transpose the two 4x4.
    const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30   a01 a11 a21 a31
    // b00 b10 b20 b30   b01 b11 b21 b31
    // a02 a12 a22 a32   a03 a13 a23 a33
    // b02 b12 b22 b32   b03 b13 b23 b33
    tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Vertical pass and difference of weighted sums.
  {
    // Load all inputs.
    // TODO(cduvivier): Make variable declarations and allocations aligned so
    //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
    const __m128i w_0 = _mm_loadu_si128((__m128i*)&w[0]);
    const __m128i w_8 = _mm_loadu_si128((__m128i*)&w[8]);

    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);

    // Separate the transforms of inA and inB.
    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);

    {
      // sign(b) = b >> 15  (0x0000 if positive, 0xffff if negative)
      const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15);
      const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15);
      const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15);
      const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15);

      // b = abs(b) = (b ^ sign) - sign
      A_b0 = _mm_xor_si128(A_b0, sign_A_b0);
      A_b2 = _mm_xor_si128(A_b2, sign_A_b2);
      B_b0 = _mm_xor_si128(B_b0, sign_B_b0);
      B_b2 = _mm_xor_si128(B_b2, sign_B_b2);
      A_b0 = _mm_sub_epi16(A_b0, sign_A_b0);
      A_b2 = _mm_sub_epi16(A_b2, sign_A_b2);
      B_b0 = _mm_sub_epi16(B_b0, sign_B_b0);
      B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
    }

    // b = abs(b) + 3
    A_b0 = _mm_add_epi16(A_b0, three);
    A_b2 = _mm_add_epi16(A_b2, three);
    B_b0 = _mm_add_epi16(B_b0, three);
    B_b2 = _mm_add_epi16(B_b2, three);

    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
    // b = (abs(b) + 3) >> 3
    A_b0 = _mm_srai_epi16(A_b0, 3);
    A_b2 = _mm_srai_epi16(A_b2, 3);
    B_b0 = _mm_srai_epi16(B_b0, 3);
    B_b2 = _mm_srai_epi16(B_b2, 3);

    // weighted sums
    A_b0 = _mm_madd_epi16(A_b0, w_0);
    A_b2 = _mm_madd_epi16(A_b2, w_8);
    B_b0 = _mm_madd_epi16(B_b0, w_0);
    B_b2 = _mm_madd_epi16(B_b2, w_8);
    A_b0 = _mm_add_epi32(A_b0, A_b2);
    B_b0 = _mm_add_epi32(B_b0, B_b2);

    // difference of weighted sums
    A_b0 = _mm_sub_epi32(A_b0, B_b0);
    _mm_storeu_si128((__m128i*)&sum[0], A_b0);
  }
  return sum[0] + sum[1] + sum[2] + sum[3];
}
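// A scalar sketch of what the SSE2 code above computes for a single 4x4
// block: a Walsh-Hadamard-style transform (rows pre-scaled by 4, with the
// "+ (a0 != 0)" tweak on b0), then a weighted sum of (abs(coeff) + 3) >> 3.
// The caller would take the difference of the sums for inA and inB, as the
// SSE2 version does.  Illustrative only; it mirrors the SSE2 arithmetic
// rather than any particular reference implementation, BPS is the row
// stride used above, and abs() needs <stdlib.h>.
static int TTransform_ref(const uint8_t* in, const uint16_t* w) {
  int tmp[16];
  int i, sum = 0;
  // horizontal pass (per row), pre-scaled by 4
  for (i = 0; i < 4; ++i, in += BPS) {
    const int a0 = (in[0] + in[2]) << 2;
    const int a1 = (in[1] + in[3]) << 2;
    const int a2 = (in[1] - in[3]) << 2;
    const int a3 = (in[0] - in[2]) << 2;
    tmp[0 + i * 4] = a0 + a1 + (a0 != 0);
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  // vertical pass (per column) and weighted sum of scaled magnitudes
  for (i = 0; i < 4; ++i) {
    const int a0 = tmp[0 + i] + tmp[8 + i];
    const int a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i];
    const int a3 = tmp[0 + i] - tmp[8 + i];
    const int b0 = a0 + a1;
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
    sum += w[0 + i]  * ((abs(b0) + 3) >> 3);
    sum += w[4 + i]  * ((abs(b1) + 3) >> 3);
    sum += w[8 + i]  * ((abs(b2) + 3) >> 3);
    sum += w[12 + i] * ((abs(b3) + 3) >> 3);
  }
  return sum;
}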
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
 * numbers. See the general code above.
 */
PRIM_STATIC pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
    const INT16 *pSrc[3],  int srcStep,
    INT16 *pDst[3],  int dstStep,
    const prim_size_t *roi)    /* region of interest */
{
    __m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
    __m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
    int srcbump, dstbump, yp, imax;

    if (((ULONG_PTR) (pSrc[0]) & 0x0f)
            || ((ULONG_PTR) (pSrc[1]) & 0x0f)
            || ((ULONG_PTR) (pSrc[2]) & 0x0f)
            || ((ULONG_PTR) (pDst[0]) & 0x0f)
            || ((ULONG_PTR) (pDst[1]) & 0x0f)
            || ((ULONG_PTR) (pDst[2]) & 0x0f)
            || (roi->width & 0x07)
            || (srcStep & 127)
            || (dstStep & 127))
    {
        /* We can't maintain 16-byte alignment. */
        return general_RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
            pDst, dstStep, roi);
    }

    min = _mm_set1_epi16(-128 << 5);
    max = _mm_set1_epi16(127 << 5);

    r_buf  = (__m128i*) (pSrc[0]);
    g_buf  = (__m128i*) (pSrc[1]);
    b_buf  = (__m128i*) (pSrc[2]);
    y_buf  = (__m128i*) (pDst[0]);
    cb_buf = (__m128i*) (pDst[1]);
    cr_buf = (__m128i*) (pDst[2]);

    y_r  = _mm_set1_epi16(9798);    /*  0.299000 << 15 */
    y_g  = _mm_set1_epi16(19235);   /*  0.587000 << 15 */
    y_b  = _mm_set1_epi16(3735);    /*  0.114000 << 15 */
    cb_r = _mm_set1_epi16(-5535);   /* -0.168935 << 15 */
    cb_g = _mm_set1_epi16(-10868);  /* -0.331665 << 15 */
    cb_b = _mm_set1_epi16(16403);   /*  0.500590 << 15 */
    cr_r = _mm_set1_epi16(16377);   /*  0.499813 << 15 */
    cr_g = _mm_set1_epi16(-13714);  /* -0.418531 << 15 */
    cr_b = _mm_set1_epi16(-2663);   /* -0.081282 << 15 */

    srcbump = srcStep / sizeof(__m128i);
    dstbump = dstStep / sizeof(__m128i);

#ifdef DO_PREFETCH
    /* Prefetch RGB's. */
    for (yp=0; yp<roi->height; yp++)
    {
        int i;
        for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
            i += (CACHE_LINE_BYTES / sizeof(__m128i)))
        {
            _mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
            _mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
            _mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
        }
        r_buf += srcbump;
        g_buf += srcbump;
        b_buf += srcbump;
    }
    r_buf = (__m128i*) (pSrc[0]);
    g_buf = (__m128i*) (pSrc[1]);
    b_buf = (__m128i*) (pSrc[2]);
#endif /* DO_PREFETCH */

    imax = roi->width * sizeof(INT16) / sizeof(__m128i);
    for (yp=0; yp<roi->height; ++yp)
    {
        int i;
        for (i=0; i<imax; i++)
        {
            /* In order to use SSE2 signed 16-bit integer multiplication we
             * need to convert the floating point factors to signed int
             * without losing information.  The result of this multiplication
             * is 32 bit and with SSE2 we get either the product's hi or lo
             * word.  Thus we will multiply the factors by the highest
             * possible 2^n and take the upper 16 bits of the signed 32-bit
             * result (_mm_mulhi_epi16).  Since the final result needs to
             * be scaled by << 5 and also in order to keep the precision
             * within the upper 16 bits we will also have to scale the RGB
             * values used in the multiplication by << 5+(16-n).
             */
            __m128i r, g, b, y, cb, cr;

            r = _mm_load_si128(r_buf+i);
            g = _mm_load_si128(g_buf+i);
            b = _mm_load_si128(b_buf+i);

            /* r<<6; g<<6; b<<6 */
            r = _mm_slli_epi16(r, 6);
            g = _mm_slli_epi16(g, 6);
            b = _mm_slli_epi16(b, 6);

            /* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
            y = _mm_mulhi_epi16(r, y_r);
            y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
            y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
            y = _mm_add_epi16(y, min);
            /* y_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
            _mm_between_epi16(y, min, max);
            _mm_store_si128(y_buf+i, y);

            /* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
            cb = _mm_mulhi_epi16(r, cb_r);
            cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
            cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
            /* cb_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
            _mm_between_epi16(cb, min, max);
            _mm_store_si128(cb_buf+i, cb);

            /* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
            cr = _mm_mulhi_epi16(r, cr_r);
            cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
            cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
            /* cr_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
            _mm_between_epi16(cr, min, max);
            _mm_store_si128(cr_buf+i, cr);
        }
        r_buf  += srcbump;
        g_buf  += srcbump;
        b_buf  += srcbump;
        y_buf  += dstbump;
        cb_buf += dstbump;
        cr_buf += dstbump;
    }

    return PRIMITIVES_SUCCESS;
}
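/* A scalar sketch of the per-component fixed-point math used above, for a
 * single 16-bit sample.  With n = 15 the factors are f << 15, the sample is
 * pre-shifted by 5 + (16 - n) = 6, and keeping the high 16 bits of the
 * 32-bit product (what _mm_mulhi_epi16 does) leaves a result scaled by
 * << 5.  The helper name is illustrative, not FreeRDP API. */
static INT16 mulhi_fixed(INT16 sample, INT16 factor_q15)
{
    const INT16 scaled = (INT16)(sample << 6);            /* r<<6, as above  */
    return (INT16)(((INT32) scaled * factor_q15) >> 16);  /* HIWORD(product) */
}

/* e.g. the luma term for one pixel, biased and clamped as in the loop:
 *   y = mulhi_fixed(r, 9798) + mulhi_fixed(g, 19235) + mulhi_fixed(b, 3735)
 *       + (-128 << 5), then clamped to [-128 << 5, 127 << 5].              */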