void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len) { int j; __m128i vecX, vecX0, vecX1, vecX2, vecX3; __m128i vecY0, vecY1, vecY2, vecY3; __m128i sum0, sum1, sum2, sum3, vecSum; __m128i initSum; celt_assert(len >= 3); sum0 = _mm_setzero_si128(); sum1 = _mm_setzero_si128(); sum2 = _mm_setzero_si128(); sum3 = _mm_setzero_si128(); for (j=0;j<(len-7);j+=8) { vecX = _mm_loadu_si128((__m128i *)(&x[j + 0])); vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0])); vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1])); vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2])); vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3])); sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0)); sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1)); sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2)); sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3)); } sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0)); sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E)); sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1)); sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E)); sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2)); sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E)); sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3)); sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E)); vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1), _mm_unpacklo_epi32(sum2, sum3)); for (;j<(len-3);j+=4) { vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); vecX0 = _mm_shuffle_epi32(vecX, 0x00); vecX1 = _mm_shuffle_epi32(vecX, 0x55); vecX2 = _mm_shuffle_epi32(vecX, 0xaa); vecX3 = _mm_shuffle_epi32(vecX, 0xff); vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]); vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]); sum0 = _mm_mullo_epi32(vecX0, vecY0); sum1 = _mm_mullo_epi32(vecX1, vecY1); sum2 = _mm_mullo_epi32(vecX2, vecY2); sum3 = _mm_mullo_epi32(vecX3, vecY3); sum0 = _mm_add_epi32(sum0, sum1); sum2 = _mm_add_epi32(sum2, sum3); vecSum = _mm_add_epi32(vecSum, sum0); vecSum = _mm_add_epi32(vecSum, sum2); } for (;j<len;j++) { vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); vecX0 = _mm_shuffle_epi32(vecX, 0x00); vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); sum0 = _mm_mullo_epi32(vecX0, vecY0); vecSum = _mm_add_epi32(vecSum, sum0); } initSum = _mm_loadu_si128((__m128i *)(&sum[0])); initSum = _mm_add_epi32(initSum, vecSum); _mm_storeu_si128((__m128i *)sum, initSum); }
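/*
 * Reference sketch (not part of the original source): a plain-C model of what
 * xcorr_kernel_sse4_1() above accumulates, assuming the fixed-point build
 * where opus_val16/opus_val32 are 16-/32-bit integers. The helper name
 * xcorr_kernel_scalar_ref is hypothetical and intended only for side-by-side
 * testing of the SIMD path.
 */
static void xcorr_kernel_scalar_ref(const short *x, const short *y, int sum[4], int len)
{
    /* sum[k] accumulates the correlation of x with y shifted by k samples,
       on top of whatever the caller already stored in sum[]. */
    for (int j = 0; j < len; j++) {
        sum[0] += (int)x[j] * y[j + 0];
        sum[1] += (int)x[j] * y[j + 1];
        sum[2] += (int)x[j] * y[j + 2];
        sum[3] += (int)x[j] * y[j + 3];
    }
}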
inline void Cryptor::expandKey192(const unsigned char *key, unsigned char *schedule) {
  __m128i *keySchedule = (__m128i*) schedule;

  // Save the first 128 bits of the key as the first round key.
  __m128i tmp = _mm_loadu_si128((__m128i*) key);
  if (!bigEndian) {
    reverse_m128i(tmp); // swap byte-order => big-endian.
  }
  keySchedule[0] = tmp;

  // The next 64 bits of the key form the low half of the second round key.
  // A 192-bit key is 24 bytes, so only 8 bytes remain past offset 16.
  unsigned char buf[16];
  memset(buf, 0, sizeof(buf));
  memcpy(buf, key + 16, 8);
  __m128i tmp3 = _mm_loadu_si128((__m128i*) buf);
  if (!bigEndian) {
    reverse_m128i(tmp3); // swap byte-order => big-endian.
  }
  keySchedule[1] = tmp3;

  __m128i tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x1);
  assistKey192(&tmp, &tmp2, &tmp3);
  keySchedule[1] = (__m128i) _mm_shuffle_pd((__m128d) keySchedule[1], (__m128d) tmp, 0);
  keySchedule[2] = (__m128i) _mm_shuffle_pd((__m128d) tmp, (__m128d) tmp3, 1);

  tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x2);
  assistKey192(&tmp, &tmp2, &tmp3);
  keySchedule[3] = tmp;
  keySchedule[4] = tmp3;

  tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x4);
  assistKey192(&tmp, &tmp2, &tmp3);
  keySchedule[4] = (__m128i) _mm_shuffle_pd((__m128d) keySchedule[4], (__m128d) tmp, 0);
  keySchedule[5] = (__m128i) _mm_shuffle_pd((__m128d) tmp, (__m128d) tmp3, 1);

  tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x8);
  assistKey192(&tmp, &tmp2, &tmp3);
  keySchedule[6] = tmp;
  keySchedule[7] = tmp3;

  tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10);
  assistKey192(&tmp, &tmp2, &tmp3);
  keySchedule[7] = (__m128i) _mm_shuffle_pd((__m128d) keySchedule[7], (__m128d) tmp, 0);
  keySchedule[8] = (__m128i) _mm_shuffle_pd((__m128d) tmp, (__m128d) tmp3, 1);

  tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20);
  assistKey192(&tmp, &tmp2, &tmp3);
  keySchedule[9] = tmp;
  keySchedule[10] = tmp3;

  tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
  assistKey192(&tmp, &tmp2, &tmp3);
  keySchedule[10] = (__m128i) _mm_shuffle_pd((__m128d) keySchedule[10], (__m128d) tmp, 0);
  keySchedule[11] = (__m128i) _mm_shuffle_pd((__m128d) tmp, (__m128d) tmp3, 1);

  tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x80);
  assistKey192(&tmp, &tmp2, &tmp3);
  keySchedule[12] = tmp;
  keySchedule[13] = tmp3;
}
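/*
 * assistKey192() is not shown in this listing. The sketch below follows the
 * standard KEY_192_ASSIST step from Intel's AES-NI white paper, which the
 * schedule layout above appears to mirror; treat it as an assumption about
 * the missing helper rather than its actual definition.
 */
static inline void assistKey192_sketch(__m128i *temp1, __m128i *temp2, __m128i *temp3)
{
    __m128i temp4;
    *temp2 = _mm_shuffle_epi32(*temp2, 0x55);   // broadcast the keygen-assist word
    temp4  = _mm_slli_si128(*temp1, 0x4);       // fold the previous round key into itself
    *temp1 = _mm_xor_si128(*temp1, temp4);
    temp4  = _mm_slli_si128(temp4, 0x4);
    *temp1 = _mm_xor_si128(*temp1, temp4);
    temp4  = _mm_slli_si128(temp4, 0x4);
    *temp1 = _mm_xor_si128(*temp1, temp4);
    *temp1 = _mm_xor_si128(*temp1, *temp2);
    *temp2 = _mm_shuffle_epi32(*temp1, 0xff);   // propagate the new high word
    temp4  = _mm_slli_si128(*temp3, 0x4);       // update the 64-bit tail of the key
    *temp3 = _mm_xor_si128(*temp3, temp4);
    *temp3 = _mm_xor_si128(*temp3, *temp2);
}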
void aom_filter_block1d8_h8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; __m128i addFilterReg64, filtersReg, minReg; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the first 16 bits (first and second byte) // across 128 bit register firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 128 bit register secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 128 bit register thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 128 bit register forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); filt1Reg = _mm_load_si128((__m128i const *)filt1_global); filt2Reg = _mm_load_si128((__m128i const *)filt2_global); filt3Reg = _mm_load_si128((__m128i const *)filt3_global); filt4Reg = _mm_load_si128((__m128i const *)filt4_global); for (i = 0; i < output_height; i++) { srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg); srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); // filter the source buffer srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg); srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); // add and saturate all the results together minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); // shift by 7 bit each 16 bits srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); // shrink to 8 bit each 16 bits srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); src_ptr += src_pixels_per_line; // save only 8 bytes _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); output_ptr += output_pitch; } }
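/*
 * Scalar model (not the library's own C fallback) of the per-pixel arithmetic
 * performed by the SSSE3 routine above: an 8-tap convolution around src[x],
 * a +64 rounding term, an arithmetic shift by 7, and a clamp to 8 bits. It
 * ignores the intermediate 16-bit saturation that the min/max ordering in the
 * intrinsics version is there to manage, so extreme filters may differ.
 * Function name is illustrative.
 */
static void filter_block1d8_h8_scalar_ref(const uint8_t *src_ptr, ptrdiff_t src_stride,
                                          uint8_t *dst, ptrdiff_t dst_stride,
                                          uint32_t height, const int16_t *filter)
{
    for (uint32_t i = 0; i < height; i++) {
        for (int x = 0; x < 8; x++) {
            int sum = 64;                                /* rounding (addFilterReg64) */
            for (int k = 0; k < 8; k++)
                sum += (int)filter[k] * src_ptr[x + k - 3];
            sum >>= 7;                                   /* _mm_srai_epi16(.., 7) */
            if (sum < 0)   sum = 0;                      /* _mm_packus_epi16 clamp */
            if (sum > 255) sum = 255;
            dst[x] = (uint8_t)sum;
        }
        src_ptr += src_stride;
        dst += dst_stride;
    }
}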
void aom_filter_block1d4_h8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i firstFilters, secondFilters, shuffle1, shuffle2; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; __m128i addFilterReg64, filtersReg, srcReg, minReg; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the first 16 bits in the filter into the first lane firstFilters = _mm_shufflelo_epi16(filtersReg, 0); // duplicate only the third 16 bit in the filter into the first lane secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); // duplicate only the seconds 16 bits in the filter into the second lane // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); // duplicate only the forth 16 bits in the filter into the second lane // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); // loading the local filters shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8); shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); for (i = 0; i < output_height; i++) { srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1); srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); // extract the higher half of the lane srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); // add and saturate all the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); // shift by 7 bit each 16 bits srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); // shrink to 8 bit each 16 bits srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); src_ptr += src_pixels_per_line; // save only 4 bytes *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1); output_ptr += output_pitch; } }
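/*
 * filt1_4_h8 and filt2_4_h8 are defined elsewhere in the library. The values
 * below are what the register layout documented above implies (k0/k1 and
 * k2/k3 pairs in the low and high halves of firstFilters, k4/k5 and k6/k7 in
 * secondFilters), offered as a sketch rather than a copy of the real tables.
 * Byte indices are relative to the `src_ptr - 3` load, and the real tables
 * are 16-byte aligned so that _mm_load_si128() is legal.
 */
static const uint8_t filt1_4_h8_sketch[16] = { 0, 1, 1, 2, 2, 3, 3, 4,
                                               2, 3, 3, 4, 4, 5, 5, 6 };
static const uint8_t filt2_4_h8_sketch[16] = { 4, 5, 5, 6, 6, 7, 7, 8,
                                               6, 7, 7, 8, 8, 9, 9, 10 };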
int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx) { if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) { uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12; const __m128i* xPtr = (const __m128i*) input; const __m128i* lutPtr = (const __m128i*) deinter; __m128i xVal, lutVal1, lutVal2; /* Simplify load if we do not need to wrap (ie high rates) */ if (in_len <= out_len) { for (int i=0;i<in_len/16;i++) { xVal = _mm_loadu_si128(xPtr); xPtr ++; lutVal1 = _mm_loadu_si128(lutPtr); lutPtr++; lutVal2 = _mm_loadu_si128(lutPtr); lutPtr ++; for (int j=0;j<8;j++) { int8_t x = (int8_t) _mm_extract_epi8(xVal, j); uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, j); output[l] += x; } for (int j=0;j<8;j++) { int8_t x = (int8_t) _mm_extract_epi8(xVal, j+8); uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, j); output[l] += x; } } for (int i=16*(in_len/16);i<in_len;i++) { output[deinter[i%out_len]] += input[i]; } } else { int intCnt = 16; int inputCnt = 0; int nwrapps = 0; while(inputCnt < in_len - 16) { xVal = _mm_loadu_si128(xPtr); xPtr ++; lutVal1 = _mm_loadu_si128(lutPtr); lutPtr++; lutVal2 = _mm_loadu_si128(lutPtr); lutPtr ++; for (int j=0;j<8;j++) { int8_t x = (int8_t) _mm_extract_epi8(xVal, j); uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, j); output[l] += x; } for (int j=0;j<8;j++) { int8_t x = (int8_t) _mm_extract_epi8(xVal, j+8); uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, j); output[l] += x; } intCnt += 16; inputCnt += 16; if (intCnt >= out_len && inputCnt < in_len - 16) { /* Copy last elements */ if ((out_len%16) == 12) { for (int j=(nwrapps+1)*out_len-12;j<(nwrapps+1)*out_len;j++) { output[deinter[j%out_len]] += input[j]; inputCnt++; } } else { for (int j=(nwrapps+1)*out_len-4;j<(nwrapps+1)*out_len;j++) { output[deinter[j%out_len]] += input[j]; inputCnt++; } } /* And wrap pointers */ nwrapps++; intCnt = 16; xPtr = (const __m128i*) &input[nwrapps*out_len]; lutPtr = (const __m128i*) deinter; } } for (int i=inputCnt;i<in_len;i++) { output[deinter[i%out_len]] += input[i]; } } return 0; } else { printf("Invalid inputs rv_idx=%d, cb_idx=%d\n", rv_idx, cb_idx); return SRSLTE_ERROR_INVALID_INPUTS; } }
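/*
 * Portability note: _mm_extract_epi8()/_mm_extract_epi16() formally take an
 * immediate (compile-time constant) lane index, so the extraction loops above
 * may not build on every compiler even though the trip count is fixed. A
 * hedged alternative with the same effect is to spill the vectors to small
 * stack arrays and index those; the helper below is illustrative only.
 */
static inline void rm_turbo_scatter16_sketch(__m128i xVal, __m128i lutVal1,
                                             __m128i lutVal2, int8_t *output)
{
    int8_t   x[16];
    uint16_t l[16];
    _mm_storeu_si128((__m128i *)x, xVal);
    _mm_storeu_si128((__m128i *)&l[0], lutVal1);   /* indices for bytes 0..7  */
    _mm_storeu_si128((__m128i *)&l[8], lutVal2);   /* indices for bytes 8..15 */
    for (int j = 0; j < 16; j++)
        output[l[j]] += x[j];
}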
inline bool compare_byIntSSE(const char * p1, const char * p2) { return 0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8( _mm_loadu_si128(reinterpret_cast<const __m128i *>(p1)), _mm_loadu_si128(reinterpret_cast<const __m128i *>(p2)))); }
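/*
 * Usage sketch (names are illustrative): compare_byIntSSE() reads exactly 16
 * bytes from each pointer, so both operands must be at least 16 bytes long;
 * it behaves like memcmp(p1, p2, 16) == 0.
 */
static size_t count_matching_records16(const char *records, size_t nrecords,
                                       const char key[16])
{
    size_t hits = 0;
    for (size_t i = 0; i < nrecords; i++)
        hits += compare_byIntSSE(records + 16 * i, key) ? 1 : 0;
    return hits;
}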
mlib_status mlib_VideoColorJFIFYCC2RGB444_S16_naligned( mlib_s16 *rgb, const mlib_s16 *y, const mlib_s16 *cb, const mlib_s16 *cr, mlib_s32 n) { /* 0 & 1.402*16384 */ const __m128i x_c1 = _mm_setr_epi16(0, 22970, 0, 22970, 0, 22970, 0, 22970); /* -0.34414*16384 & -0.71414*16384 */ const __m128i x_c2 = _mm_setr_epi16(-5638, -11700, -5638, -11700, -5638, -11700, -5638, -11700); /* 1.772*16384 & 0 */ const __m128i x_c3 = _mm_setr_epi16(29032, 0, 29032, 0, 29032, 0, 29032, 0); const __m128i x_coff = _mm_set1_epi16(2048); const __m128i x_cps1 = _mm_set1_epi32(0x8000); const __m128i x_cps2 = _mm_set1_epi16(0x8000); const __m128i x_zero = _mm_setzero_si128(); const __m128i x_mask1 = _mm_setr_epi32(0xffffffff, 0xffff, 0, 0); const __m128i x_mask2 = _mm_setr_epi32(0, 0xffff0000, 0xffffffff, 0); /* __m128i variables */ __m128i x_y, x_cb, x_cr, x_r, x_g, x_b, x_y1, x_y2; __m128i x_r1, x_r2, x_g1, x_g2, x_b1, x_b2, x_t1, x_t2; __m128i x_rgbl, x_rgbh, x_rgl, x_rgh, x_bbl, x_bbh; __m128i x_cbcr1, x_cbcr2; /* pointers */ __m128i *px_y, *px_cb, *px_cr; mlib_s16 *prgb; /* other var */ mlib_d64 fr, fg, fb, fy, fcb, fcr; mlib_s32 i; px_y = (__m128i *)y; px_cb = (__m128i *)cb; px_cr = (__m128i *)cr; prgb = rgb; i = 0; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (; i <= n - 16; i += 8) { x_y = _mm_loadu_si128(px_y); x_y1 = _mm_unpacklo_epi16(x_y, x_zero); x_y1 = _mm_slli_epi32(x_y1, 4); x_y2 = _mm_unpackhi_epi16(x_y, x_zero); x_y2 = _mm_slli_epi32(x_y2, 4); px_y++; x_cb = _mm_loadu_si128(px_cb); x_cb = _mm_sub_epi16(x_cb, x_coff); px_cb++; x_cr = _mm_loadu_si128(px_cr); x_cr = _mm_sub_epi16(x_cr, x_coff); px_cr++; x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr); x_cbcr2 = _mm_unpackhi_epi16(x_cb, x_cr); /* calc r/g/b */ x_t1 = _mm_madd_epi16(x_cbcr1, x_c1); x_t1 = _mm_srai_epi32(x_t1, 10); x_r1 = _mm_add_epi32(x_t1, x_y1); x_t1 = _mm_madd_epi16(x_cbcr1, x_c2); x_t1 = _mm_srai_epi32(x_t1, 10); x_g1 = _mm_add_epi32(x_t1, x_y1); x_t1 = _mm_madd_epi16(x_cbcr1, x_c3); x_t1 = _mm_srai_epi32(x_t1, 10); x_b1 = _mm_add_epi32(x_t1, x_y1); x_t2 = _mm_madd_epi16(x_cbcr2, x_c1); x_t2 = _mm_srai_epi32(x_t2, 10); x_r2 = _mm_add_epi32(x_t2, x_y2); x_t2 = _mm_madd_epi16(x_cbcr2, x_c2); x_t2 = _mm_srai_epi32(x_t2, 10); x_g2 = _mm_add_epi32(x_t2, x_y2); x_t2 = _mm_madd_epi16(x_cbcr2, x_c3); x_t2 = _mm_srai_epi32(x_t2, 10); x_b2 = _mm_add_epi32(x_t2, x_y2); /* signed pack & shift */ x_r1 = _mm_sub_epi32(x_r1, x_cps1); x_r2 = _mm_sub_epi32(x_r2, x_cps1); x_r = _mm_packs_epi32(x_r1, x_r2); x_r = _mm_add_epi16(x_r, x_cps2); x_r = _mm_srli_epi16(x_r, 4); x_g1 = _mm_sub_epi32(x_g1, x_cps1); x_g2 = _mm_sub_epi32(x_g2, x_cps1); x_g = _mm_packs_epi32(x_g1, x_g2); x_g = _mm_add_epi16(x_g, x_cps2); x_g = _mm_srli_epi16(x_g, 4); x_b1 = _mm_sub_epi32(x_b1, x_cps1); x_b2 = _mm_sub_epi32(x_b2, x_cps1); x_b = _mm_packs_epi32(x_b1, x_b2); x_b = _mm_add_epi16(x_b, x_cps2); x_b = _mm_srli_epi16(x_b, 4); /* create rgb sequences */ x_rgl = _mm_unpacklo_epi16(x_r, x_g); x_rgh = _mm_unpackhi_epi16(x_r, x_g); x_bbl = _mm_unpacklo_epi16(x_b, x_b); x_bbh = _mm_unpackhi_epi16(x_b, x_b); /* save */ x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl); PACK_RGB1(x_rgbl); x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl); PACK_RGB1(x_rgbh); x_rgbl = _mm_unpacklo_epi32(x_rgh, x_bbh); PACK_RGB1(x_rgbl); x_rgbh = _mm_unpackhi_epi32(x_rgh, x_bbh); PACK_RGB1(x_rgbh); } if (i <= (n - 8)) { x_y = _mm_loadu_si128(px_y); x_y1 = _mm_unpacklo_epi16(x_y, x_zero); x_y1 = _mm_slli_epi32(x_y1, 4); x_y2 = _mm_unpackhi_epi16(x_y, x_zero); x_y2 = _mm_slli_epi32(x_y2, 4); px_y++; 
x_cb = _mm_loadu_si128(px_cb); x_cb = _mm_sub_epi16(x_cb, x_coff); px_cb++; x_cr = _mm_loadu_si128(px_cr); x_cr = _mm_sub_epi16(x_cr, x_coff); px_cr++; x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr); x_cbcr2 = _mm_unpackhi_epi16(x_cb, x_cr); /* calc r/g/b */ x_t1 = _mm_madd_epi16(x_cbcr1, x_c1); x_t1 = _mm_srai_epi32(x_t1, 10); x_r1 = _mm_add_epi32(x_t1, x_y1); x_t1 = _mm_madd_epi16(x_cbcr1, x_c2); x_t1 = _mm_srai_epi32(x_t1, 10); x_g1 = _mm_add_epi32(x_t1, x_y1); x_t1 = _mm_madd_epi16(x_cbcr1, x_c3); x_t1 = _mm_srai_epi32(x_t1, 10); x_b1 = _mm_add_epi32(x_t1, x_y1); x_t2 = _mm_madd_epi16(x_cbcr2, x_c1); x_t2 = _mm_srai_epi32(x_t2, 10); x_r2 = _mm_add_epi32(x_t2, x_y2); x_t2 = _mm_madd_epi16(x_cbcr2, x_c2); x_t2 = _mm_srai_epi32(x_t2, 10); x_g2 = _mm_add_epi32(x_t2, x_y2); x_t2 = _mm_madd_epi16(x_cbcr2, x_c3); x_t2 = _mm_srai_epi32(x_t2, 10); x_b2 = _mm_add_epi32(x_t2, x_y2); /* signed pack & shift */ x_r1 = _mm_sub_epi32(x_r1, x_cps1); x_r2 = _mm_sub_epi32(x_r2, x_cps1); x_r = _mm_packs_epi32(x_r1, x_r2); x_r = _mm_add_epi16(x_r, x_cps2); x_r = _mm_srli_epi16(x_r, 4); x_g1 = _mm_sub_epi32(x_g1, x_cps1); x_g2 = _mm_sub_epi32(x_g2, x_cps1); x_g = _mm_packs_epi32(x_g1, x_g2); x_g = _mm_add_epi16(x_g, x_cps2); x_g = _mm_srli_epi16(x_g, 4); x_b1 = _mm_sub_epi32(x_b1, x_cps1); x_b2 = _mm_sub_epi32(x_b2, x_cps1); x_b = _mm_packs_epi32(x_b1, x_b2); x_b = _mm_add_epi16(x_b, x_cps2); x_b = _mm_srli_epi16(x_b, 4); /* create rgb sequences */ x_rgl = _mm_unpacklo_epi16(x_r, x_g); x_rgh = _mm_unpackhi_epi16(x_r, x_g); x_bbl = _mm_unpacklo_epi16(x_b, x_b); x_bbh = _mm_unpackhi_epi16(x_b, x_b); /* save */ x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl); PACK_RGB1(x_rgbl); x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl); PACK_RGB1(x_rgbh); x_rgbl = _mm_unpacklo_epi32(x_rgh, x_bbh); PACK_RGB1(x_rgbl); x_rgbh = _mm_unpackhi_epi32(x_rgh, x_bbh); PACK_RGB2(x_rgbh); i += 8; } if (i <= (n - 4)) { x_y = _mm_loadl_epi64(px_y); x_y1 = _mm_unpacklo_epi16(x_y, x_zero); x_y1 = _mm_slli_epi32(x_y1, 4); px_y = (__m128i *)(((__m64 *)px_y) + 1); x_cb = _mm_loadl_epi64(px_cb); x_cb = _mm_sub_epi16(x_cb, x_coff); px_cb = (__m128i *)(((__m64 *)px_cb) + 1); x_cr = _mm_loadl_epi64(px_cr); x_cr = _mm_sub_epi16(x_cr, x_coff); px_cr = (__m128i *)(((__m64 *)px_cr) + 1); x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr); /* calc r/g/b */ x_t1 = _mm_madd_epi16(x_cbcr1, x_c1); x_t1 = _mm_srai_epi32(x_t1, 10); x_r1 = _mm_add_epi32(x_t1, x_y1); x_t1 = _mm_madd_epi16(x_cbcr1, x_c2); x_t1 = _mm_srai_epi32(x_t1, 10); x_g1 = _mm_add_epi32(x_t1, x_y1); x_t1 = _mm_madd_epi16(x_cbcr1, x_c3); x_t1 = _mm_srai_epi32(x_t1, 10); x_b1 = _mm_add_epi32(x_t1, x_y1); /* signed pack & shift */ x_r1 = _mm_sub_epi32(x_r1, x_cps1); x_r = _mm_packs_epi32(x_r1, x_zero); x_r = _mm_add_epi16(x_r, x_cps2); x_r = _mm_srli_epi16(x_r, 4); x_g1 = _mm_sub_epi32(x_g1, x_cps1); x_g = _mm_packs_epi32(x_g1, x_zero); x_g = _mm_add_epi16(x_g, x_cps2); x_g = _mm_srli_epi16(x_g, 4); x_b1 = _mm_sub_epi32(x_b1, x_cps1); x_b = _mm_packs_epi32(x_b1, x_zero); x_b = _mm_add_epi16(x_b, x_cps2); x_b = _mm_srli_epi16(x_b, 4); /* create rgb sequences */ x_rgl = _mm_unpacklo_epi16(x_r, x_g); x_bbl = _mm_unpacklo_epi16(x_b, x_b); /* save */ x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl); PACK_RGB1(x_rgbl); x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl); PACK_RGB2(x_rgbh); i += 4; } /* pure C implementation */ for (; i < n; i++) { fy = y[i] * SCALE - SAT; fcb = (mlib_d64)((cb[i] - 2048) << 20); fcr = (mlib_d64)((cr[i] - 2048) << 20); fr = fy + 1.40200f * fcr; fg = fy - 0.34414f * fcb - 0.71414f * fcr; fb = fy + 1.77200f * 
fcb; rgb[3 * i] = CLAMP_U12(fr); rgb[3 * i + 1] = CLAMP_U12(fg); rgb[3 * i + 2] = CLAMP_U12(fb); } return (MLIB_SUCCESS); }
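/*
 * Scalar model (not mediaLib's own tail code, which relies on the SCALE, SAT
 * and CLAMP_U12 macros omitted from this listing): the conversion implemented
 * by the fixed-point SIMD path above, with Cb/Cr biased by 2048 and the
 * result clamped to the 12-bit range implied by CLAMP_U12. mlib_s16 is
 * assumed to be a 16-bit integer; rounding may differ slightly from the
 * Q14 fixed-point arithmetic used in the intrinsics.
 */
static void jfif_ycc2rgb444_s16_ref(int16_t *rgb, const int16_t *y,
                                    const int16_t *cb, const int16_t *cr, int n)
{
    for (int i = 0; i < n; i++) {
        double fy  = y[i];
        double fcb = cb[i] - 2048.0;
        double fcr = cr[i] - 2048.0;
        double fr = fy + 1.40200 * fcr;
        double fg = fy - 0.34414 * fcb - 0.71414 * fcr;
        double fb = fy + 1.77200 * fcb;
        rgb[3 * i + 0] = (int16_t)(fr < 0 ? 0 : fr > 4095 ? 4095 : fr);
        rgb[3 * i + 1] = (int16_t)(fg < 0 ? 0 : fg > 4095 ? 4095 : fg);
        rgb[3 * i + 2] = (int16_t)(fb < 0 ? 0 : fb > 4095 ? 4095 : fb);
    }
}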
void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps) { const uint32_t default_partition_samples = (residual_samples + predictor_order) >> max_partition_order; uint32_t partitions = 1u << max_partition_order; FLAC__ASSERT(default_partition_samples > predictor_order); /* first do max_partition_order */ { const uint32_t threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples); uint32_t partition, residual_sample, end = (uint32_t)(-(int32_t)predictor_order); if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) { for(partition = residual_sample = 0; partition < partitions; partition++) { __m256i sum256 = _mm256_setzero_si256(); __m128i sum128; end += default_partition_samples; for( ; (int)residual_sample < (int)end-7; residual_sample+=8) { __m256i res256 = _mm256_abs_epi32(_mm256_loadu_si256((const __m256i*)(residual+residual_sample))); sum256 = _mm256_add_epi32(sum256, res256); } sum128 = _mm_add_epi32(_mm256_extracti128_si256(sum256, 1), _mm256_castsi256_si128(sum256)); for( ; (int)residual_sample < (int)end-3; residual_sample+=4) { __m128i res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample))); sum128 = _mm_add_epi32(sum128, res128); } for( ; residual_sample < end; residual_sample++) { __m128i res128 = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample])); sum128 = _mm_add_epi32(sum128, res128); } sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_SHUFFLE(1,0,3,2))); sum128 = _mm_add_epi32(sum128, _mm_shufflelo_epi16(sum128, _MM_SHUFFLE(1,0,3,2))); abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(sum128); /* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */ #if (defined _MSC_VER) && (defined FLAC__CPU_X86_64) abs_residual_partition_sums[partition] &= 0xFFFFFFFF; /**/ #endif } } else { /* have to pessimistically use 64 bits for accumulator */ for(partition = residual_sample = 0; partition < partitions; partition++) { __m256i sum256 = _mm256_setzero_si256(); __m128i sum128; end += default_partition_samples; for( ; (int)residual_sample < (int)end-3; residual_sample+=4) { __m128i res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample))); __m256i res256 = _mm256_cvtepu32_epi64(res128); sum256 = _mm256_add_epi64(sum256, res256); } sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 1), _mm256_castsi256_si128(sum256)); for( ; (int)residual_sample < (int)end-1; residual_sample+=2) { __m128i res128 = _mm_abs_epi32(_mm_loadl_epi64((const __m128i*)(residual+residual_sample))); res128 = _mm_cvtepu32_epi64(res128); sum128 = _mm_add_epi64(sum128, res128); } for( ; residual_sample < end; residual_sample++) { __m128i res128 = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample])); sum128 = _mm_add_epi64(sum128, res128); } sum128 = _mm_add_epi64(sum128, _mm_srli_si128(sum128, 8)); _mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), sum128); } } } /* now merge partitions for lower orders */ { uint32_t from_partition = 0, to_partition = partitions; int partition_order; for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) { uint32_t i; partitions >>= 1; for(i = 0; i < partitions; i++) { abs_residual_partition_sums[to_partition++] = abs_residual_partition_sums[from_partition ] + 
abs_residual_partition_sums[from_partition+1]; from_partition += 2; } } } _mm256_zeroupper(); }
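/*
 * Reference sketch (mirrors the intrinsics above, not FLAC's portable C
 * implementation): at max_partition_order each partition sum is simply the
 * sum of |residual| over that partition, with the first partition shortened
 * by predictor_order samples. Useful for checking the AVX2 path; the function
 * name is hypothetical.
 */
static void precompute_partition_sums_ref(const FLAC__int32 *residual,
                                          FLAC__uint64 *abs_residual_partition_sums,
                                          uint32_t residual_samples,
                                          uint32_t predictor_order,
                                          uint32_t max_partition_order)
{
    const uint32_t partitions = 1u << max_partition_order;
    const uint32_t default_partition_samples =
        (residual_samples + predictor_order) >> max_partition_order;
    uint32_t sample = 0;
    for (uint32_t p = 0; p < partitions; p++) {
        const uint32_t count =
            default_partition_samples - (p == 0 ? predictor_order : 0);
        FLAC__uint64 acc = 0;
        for (uint32_t k = 0; k < count; k++, sample++) {
            const int64_t v = residual[sample];
            acc += (FLAC__uint64)(v < 0 ? -v : v);
        }
        abs_residual_partition_sums[p] = acc;
    }
}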
uint64_t siphash(const unsigned char key[16], const unsigned char *m, size_t len) { xmmi k,v02,v20,v13,v11,v33,mi; uint64_t last7; uint32_t lo, hi; size_t i, blocks; k = _mm_loadu_si128((xmmi *)(key + 0)); v02 = siphash_init[0].v; v13 = siphash_init[1].v; v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k)); v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k)); last7 = (uint64_t)(len & 0xff) << 56; #define sipcompress() \ v11 = v13; \ v33 = v13; \ v11 = _mm_or_si128(_mm_slli_epi64(v11, 13), _mm_srli_epi64(v11, 64-13)); \ v02 = _mm_add_epi64(v02, v13); \ v33 = _mm_shuffle_epi8(v33, siphash_rot16v3.v); \ v13 = _mm_unpacklo_epi64(v11, v33); \ v13 = _mm_xor_si128(v13, v02); \ v20 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \ v11 = v13; \ v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \ v11 = _mm_or_si128(_mm_slli_epi64(v11, 17), _mm_srli_epi64(v11, 64-17)); \ v20 = _mm_add_epi64(v20, v13); \ v33 = _mm_or_si128(_mm_slli_epi64(v33, 21), _mm_srli_epi64(v33, 64-21)); \ v13 = _mm_unpacklo_epi64(v11, v33); \ v13 = _mm_unpacklo_epi64(v11, v33); \ v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2)); \ v13 = _mm_xor_si128(v13, v20); for (i = 0, blocks = (len & ~7); i < blocks; i += 8) { mi = _mm_loadl_epi64((xmmi *)(m + i)); v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8)); sipcompress() sipcompress() v02 = _mm_xor_si128(v02, mi); } switch (len - blocks) { case 7: last7 |= (uint64_t)m[i + 6] << 48; case 6: last7 |= (uint64_t)m[i + 5] << 40; case 5: last7 |= (uint64_t)m[i + 4] << 32; case 4: last7 |= (uint64_t)m[i + 3] << 24; case 3: last7 |= (uint64_t)m[i + 2] << 16; case 2: last7 |= (uint64_t)m[i + 1] << 8; case 1: last7 |= (uint64_t)m[i + 0] ; case 0: default:; }; mi = _mm_unpacklo_epi32(_mm_cvtsi32_si128((uint32_t)last7),_mm_cvtsi32_si128((uint32_t)(last7 >> 32))); v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8)); sipcompress() sipcompress() v02 = _mm_xor_si128(v02, mi); v02 = _mm_xor_si128(v02, siphash_final.v); sipcompress() sipcompress() sipcompress() sipcompress() v02 = _mm_xor_si128(v02, v13); v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2))); lo = _mm_cvtsi128_si32(v02); hi = _mm_cvtsi128_si32(_mm_srli_si128(v02, 4)); return ((uint64_t)hi << 32) | lo; }
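/*
 * Minimal usage sketch (buffer contents are illustrative; needs <stdio.h> and
 * <string.h>): hash a short message under a 128-bit key. The two compression
 * rounds per block and four finalization rounds above correspond to the usual
 * SipHash-2-4 parameters.
 */
static void siphash_demo(void)
{
    unsigned char key[16];
    for (int i = 0; i < 16; i++)
        key[i] = (unsigned char)i;
    const char *msg = "hello, world";
    uint64_t h = siphash(key, (const unsigned char *)msg, strlen(msg));
    printf("siphash = %016llx\n", (unsigned long long)h);
}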
int main(int argc, char **argv) {
  struct timespec t1, t2;
  int c, d, k;
  int size = MEDIUM;      /* default size if no -s option is given */
  int opt, i;
  char *fname = NULL;

  while ((opt = getopt(argc, argv, "f:s:")) != -1) {
    switch (opt) {
    case 's': size = atoi(optarg); break;
    case 'f': fname = optarg;      break;
    default:  size = MEDIUM;       break;
    }
  }

  FILE *fp = fname ? fopen(fname, "a") : NULL;
  if (fp == NULL) {
    fprintf(stderr, "usage: %s -f <outfile> [-s <size>]\n", argv[0]);
    return 1;
  }

  int edge;
  int *first;
  posix_memalign((void**)&first, 16, sizeof(int)*size*size);    // use posix_memalign to get 16-byte alignment
  int *multiply;
  posix_memalign((void**)&multiply, 16, sizeof(int)*size*size);
  __m128i m1, m2, m3;

  for (c = 0; c < size; c++) {
    for (d = 0; d < size; d++) {
      first[c*size+d] = ((c+d) % 2) - 1;
      multiply[c*size+d] = 0;   /* braces are required so this runs for every element */
    }
  }

  printf("multiplying the %d-size matrices\n You should try to time this part.\n", size);
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t1);

  for (c = 0; c < size; c++) {
    for (k = 0; k < size; k++) {
      m2 = _mm_set1_epi32(first[c*size+k]);                           // first[c][k]
      for (d = 0; d < size; d += 4) {
        edge = size - d;
        if (edge < 4) {                                               // account for non-div-by-4 matrices
          for (i = d; i < size; i++)
            multiply[c*size+i] += first[c*size+k] * first[k*size+i];
        } else {
          m1 = _mm_loadu_si128((const __m128i *)&first[k*size+d]);    // first[k][d]
          m1 = _mm_mullo_epi32(m1, m2);                               // first[k][d] * first[c][k]
          m3 = _mm_loadu_si128((const __m128i *)&multiply[c*size+d]); // old values of multiply[c][d]
          m1 = _mm_add_epi32(m3, m1);                                 // accumulate (+=)
          _mm_storeu_si128((__m128i *)&multiply[c*size+d], m1);
        }
      }
    }
  }

  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t2);
  double nanos = diff(t1, t2).tv_nsec * 1e-9;
  double secs  = diff(t1, t2).tv_sec;
  double dif = secs + nanos;
  fprintf(fp, "%.10f\n", dif);
  fclose(fp);

  printf("test first %d\n", first[size]);
  printf("test mult %d\n", multiply[size]);

  free(first);      /* memory from posix_memalign is released with free() */
  free(multiply);
  return 0;
}
template<int pixelFormat> void imageFromPixels(vl::Image & image, char unsigned const * rgb, int rowStride) { vl::ImageShape const & shape = image.getShape() ; int blockSizeX ; int blockSizeY ; int pixelStride ; int imagePlaneStride = (int)shape.width * (int)shape.height ; __m128i shuffleRgb ; __m128i const shuffleL = _mm_set_epi8(0xff, 0xff, 0xff, 3, 0xff, 0xff, 0xff, 2, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff, 0) ; __m128i const mask = _mm_set_epi32(0xff, 0xff, 0xff, 0xff) ; switch (pixelFormat) { case pixelFormatL: pixelStride = 1 ; blockSizeX = 16 ; blockSizeY = 4 ; break ; case pixelFormatBGR: case pixelFormatRGB: pixelStride = 3 ; blockSizeX = 4 ; blockSizeY = 4 ; assert(shape.depth == 3) ; break ; case pixelFormatRGBA: case pixelFormatBGRA: case pixelFormatBGRAasL: pixelStride = 4 ; blockSizeX = 4 ; blockSizeY = 4 ; assert(shape.depth == 3) ; break ; default: assert(false) ; } switch (pixelFormat) { case pixelFormatL: break ; case pixelFormatRGB: shuffleRgb = _mm_set_epi8(0xff, 11, 10, 9, 0xff, 8, 7, 6, 0xff, 5, 4, 3, 0xff, 2, 1, 0) ; break ; case pixelFormatRGBA: shuffleRgb = _mm_set_epi8(0xff, 14, 13, 12, 0xff, 10, 9, 8, 0xff, 6, 5, 4, 0xff, 2, 1, 0) ; break ; case pixelFormatBGR: shuffleRgb = _mm_set_epi8(0xff, 9, 10, 11, 0xff, 6, 7, 8, 0xff, 3, 4, 4, 0xff, 0, 1, 2) ; break ; case pixelFormatBGRA: shuffleRgb = _mm_set_epi8(0xff, 12, 13, 14, 0xff, 8, 9, 10, 0xff, 4, 5, 6, 0xff, 0, 1, 2) ; break ; case pixelFormatBGRAasL: shuffleRgb = _mm_set_epi8(0xff, 0xff, 0xff, 12, 0xff, 0xff, 0xff, 8, 0xff, 0xff, 0xff, 4, 0xff, 0xff, 0xff, 0) ; break ; } // we pull out these values as otherwise the compiler // will assume that the reference &image can be aliased // and recompute silly multiplications in the inner loop float * const __restrict imageMemory = image.getMemory() ; int const imageHeight = (int)shape.height ; int const imageWidth = (int)shape.width ; for (int x = 0 ; x < imageWidth ; x += blockSizeX) { int y = 0 ; float * __restrict imageMemoryX = imageMemory + x * imageHeight ; int bsx = (std::min)(imageWidth - x, blockSizeX) ; if (bsx < blockSizeX) goto boundary ; for ( ; y < imageHeight - blockSizeY + 1 ; y += blockSizeY) { char unsigned const * __restrict pixel = rgb + y * rowStride + x * pixelStride ; float * __restrict r = imageMemoryX + y ; __m128i p0, p1, p2, p3, T0, T1, T2, T3 ; /* convert a blockSizeX x blockSizeY block in the input image */ switch (pixelFormat) { case pixelFormatRGB : case pixelFormatRGBA : case pixelFormatBGR : case pixelFormatBGRA : case pixelFormatBGRAasL : // load 4x4 RGB pixels p0 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ; p1 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ; p2 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ; p3 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ; // transpose pixels as 32-bit integers (see also below) T0 = _mm_unpacklo_epi32(p0, p1); T1 = _mm_unpacklo_epi32(p2, p3); T2 = _mm_unpackhi_epi32(p0, p1); T3 = _mm_unpackhi_epi32(p2, p3); p0 = _mm_unpacklo_epi64(T0, T1); p1 = _mm_unpackhi_epi64(T0, T1); p2 = _mm_unpacklo_epi64(T2, T3); p3 = _mm_unpackhi_epi64(T2, T3); // store r _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p0, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p1, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p2, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p3, mask))) 
; if (pixelFormat == pixelFormatBGRAasL) break ; // store g r += (imageWidth - 3) * imageHeight ; p0 = _mm_srli_epi32 (p0, 8) ; p1 = _mm_srli_epi32 (p1, 8) ; p2 = _mm_srli_epi32 (p2, 8) ; p3 = _mm_srli_epi32 (p3, 8) ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p0, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p1, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p2, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p3, mask))) ; // store b r += (imageWidth - 3) * imageHeight ; p0 = _mm_srli_epi32 (p0, 8) ; p1 = _mm_srli_epi32 (p1, 8) ; p2 = _mm_srli_epi32 (p2, 8) ; p3 = _mm_srli_epi32 (p3, 8) ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p0, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p1, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p2, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p3, mask))) ; break ; case pixelFormatL: // load 4x16 L pixels p0 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ; p1 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ; p2 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ; p3 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ; /* Pixels are collected in little-endian order: the first pixel is at the `right' (least significant byte of p0: p[0] = a, p[1] = b, ... p0: [ ... | ... | ... | d c b a ] p1: [ ... | ... | ... | h g f e ] p2: [ ... | ... | ... | l k j i ] p3: [ ... | ... | ... | p o n m ] The goal is to transpose four 4x4 subblocks in the 4 x 16 pixel array. The first step interlaves individual pixels in p0 and p1: T0: [ ... | ... | h d g c | f b e a ] T1: [ ... | ... | p l o k | n j m i ] T2: [ ... | ... | ... | ... ] T3: [ ... | ... | ... | ... ] The second step interleaves groups of two pixels: p0: [pl hd | ok gc | nj fb | mi ea] (pixels in the rightmost 4x4 subblock) p1: ... p2: ... p3: ... The third step interlevaes groups of four pixels: T0: [ ... | njfb | ... | miea ] T1: ... T2: ... T3: ... The last step interleaves groups of eight pixels: p0: [ ... | ... | ... | miea ] p1: [ ... | ... | ... | njfb ] p2: [ ... | ... | ... | okgc ] p3: [ ... | ... | ... 
| dklp ] */ T0 = _mm_unpacklo_epi8(p0, p1); T1 = _mm_unpacklo_epi8(p2, p3); T2 = _mm_unpackhi_epi8(p0, p1); T3 = _mm_unpackhi_epi8(p2, p3); p0 = _mm_unpacklo_epi16(T0, T1); p1 = _mm_unpackhi_epi16(T0, T1); p2 = _mm_unpacklo_epi16(T2, T3); p3 = _mm_unpackhi_epi16(T2, T3); T0 = _mm_unpacklo_epi32(p0, p1); T1 = _mm_unpacklo_epi32(p2, p3); T2 = _mm_unpackhi_epi32(p0, p1); T3 = _mm_unpackhi_epi32(p2, p3); p0 = _mm_unpacklo_epi64(T0, T1); p1 = _mm_unpackhi_epi64(T0, T1); p2 = _mm_unpacklo_epi64(T2, T3); p3 = _mm_unpackhi_epi64(T2, T3); // store four 4x4 subblock for (int i = 0 ; i < 4 ; ++i) { _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p0, shuffleL))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p1, shuffleL))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p2, shuffleL))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p3, shuffleL))) ; r += imageHeight ; p0 = _mm_srli_si128 (p0, 4) ; p1 = _mm_srli_si128 (p1, 4) ; p2 = _mm_srli_si128 (p2, 4) ; p3 = _mm_srli_si128 (p3, 4) ; } break ; } } /* next y */ boundary: /* special case if there is not a full 4x4 block to process */ for ( ; y < imageHeight ; y += blockSizeY) { int bsy = (std::min)(imageHeight - y, blockSizeY) ; float * __restrict r ; float * rend ; for (int dx = 0 ; dx < bsx ; ++dx) { char unsigned const * __restrict pixel = rgb + y * rowStride + (x + dx) * pixelStride ; r = imageMemoryX + y + dx * imageHeight ; rend = r + bsy ; while (r != rend) { switch (pixelFormat) { case pixelFormatRGBA: case pixelFormatRGB: r[0 * imagePlaneStride] = (float) pixel[0] ; r[1 * imagePlaneStride] = (float) pixel[1] ; r[2 * imagePlaneStride] = (float) pixel[2] ; break ; case pixelFormatBGR: case pixelFormatBGRA: r[2 * imagePlaneStride] = (float) pixel[0] ; r[1 * imagePlaneStride] = (float) pixel[1] ; r[0 * imagePlaneStride] = (float) pixel[2] ; break; case pixelFormatBGRAasL: case pixelFormatL: r[0] = (float) pixel[0] ; break ; } r += 1 ; pixel += rowStride ; } } } } }
/* Encryption key setup */ static void aes_key_setup_enc(__m128i rk[], const u8* cipherKey, int keylen) { switch (keylen) { case 16: { /* 128 bit key setup */ rk[0] = _mm_loadu_si128((const __m128i*) cipherKey); rk[1] = KEYEXP128(rk[0], 0x01); rk[2] = KEYEXP128(rk[1], 0x02); rk[3] = KEYEXP128(rk[2], 0x04); rk[4] = KEYEXP128(rk[3], 0x08); rk[5] = KEYEXP128(rk[4], 0x10); rk[6] = KEYEXP128(rk[5], 0x20); rk[7] = KEYEXP128(rk[6], 0x40); rk[8] = KEYEXP128(rk[7], 0x80); rk[9] = KEYEXP128(rk[8], 0x1B); rk[10] = KEYEXP128(rk[9], 0x36); break; } case 24: { /* 192 bit key setup */ __m128i temp[2]; rk[0] = _mm_loadu_si128((const __m128i*) cipherKey); rk[1] = _mm_loadu_si128((const __m128i*) (cipherKey+16)); temp[0] = KEYEXP192(rk[0], rk[1], 0x01); temp[1] = KEYEXP192_2(temp[0], rk[1]); rk[1] = (__m128i)_mm_shuffle_pd((__m128d)rk[1], (__m128d)temp[0], 0); rk[2] = (__m128i)_mm_shuffle_pd((__m128d)temp[0], (__m128d)temp[1], 1); rk[3] = KEYEXP192(temp[0], temp[1], 0x02); rk[4] = KEYEXP192_2(rk[3], temp[1]); temp[0] = KEYEXP192(rk[3], rk[4], 0x04); temp[1] = KEYEXP192_2(temp[0], rk[4]); rk[4] = (__m128i)_mm_shuffle_pd((__m128d)rk[4], (__m128d)temp[0], 0); rk[5] = (__m128i)_mm_shuffle_pd((__m128d)temp[0], (__m128d)temp[1], 1); rk[6] = KEYEXP192(temp[0], temp[1], 0x08); rk[7] = KEYEXP192_2(rk[6], temp[1]); temp[0] = KEYEXP192(rk[6], rk[7], 0x10); temp[1] = KEYEXP192_2(temp[0], rk[7]); rk[7] = (__m128i)_mm_shuffle_pd((__m128d)rk[7], (__m128d)temp[0], 0); rk[8] = (__m128i)_mm_shuffle_pd((__m128d)temp[0], (__m128d)temp[1], 1); rk[9] = KEYEXP192(temp[0], temp[1], 0x20); rk[10] = KEYEXP192_2(rk[9], temp[1]); temp[0] = KEYEXP192(rk[9], rk[10], 0x40); temp[1] = KEYEXP192_2(temp[0], rk[10]); rk[10] = (__m128i)_mm_shuffle_pd((__m128d)rk[10], (__m128d) temp[0], 0); rk[11] = (__m128i)_mm_shuffle_pd((__m128d)temp[0],(__m128d) temp[1], 1); rk[12] = KEYEXP192(temp[0], temp[1], 0x80); break; } case 32: { /* 256 bit key setup */ rk[0] = _mm_loadu_si128((const __m128i*) cipherKey); rk[1] = _mm_loadu_si128((const __m128i*) (cipherKey+16)); rk[2] = KEYEXP256(rk[0], rk[1], 0x01); rk[3] = KEYEXP256_2(rk[1], rk[2]); rk[4] = KEYEXP256(rk[2], rk[3], 0x02); rk[5] = KEYEXP256_2(rk[3], rk[4]); rk[6] = KEYEXP256(rk[4], rk[5], 0x04); rk[7] = KEYEXP256_2(rk[5], rk[6]); rk[8] = KEYEXP256(rk[6], rk[7], 0x08); rk[9] = KEYEXP256_2(rk[7], rk[8]); rk[10] = KEYEXP256(rk[8], rk[9], 0x10); rk[11] = KEYEXP256_2(rk[9], rk[10]); rk[12] = KEYEXP256(rk[10], rk[11], 0x20); rk[13] = KEYEXP256_2(rk[11], rk[12]); rk[14] = KEYEXP256(rk[12], rk[13], 0x40); break; } } }
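/*
 * KEYEXP128 (and its 192/256-bit cousins) are macros defined elsewhere in the
 * file. A plausible definition of the 128-bit step, following the usual
 * AES-NI shift-and-xor folding plus broadcast of the keygenassist word; the
 * names below are assumptions, not the file's actual definitions.
 */
static inline __m128i key_exp_128_sketch(__m128i key, __m128i keygened)
{
    keygened = _mm_shuffle_epi32(keygened, _MM_SHUFFLE(3, 3, 3, 3)); /* broadcast SubWord(RotWord)^rcon */
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    return _mm_xor_si128(key, keygened);
}
#define KEYEXP128_SKETCH(K, RCON) \
    key_exp_128_sketch((K), _mm_aeskeygenassist_si128((K), (RCON)))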
__m256i test_mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i const* __A) { // CHECK-LABEL: @test_mm256_maskz_broadcast_i64x2 // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1> // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_broadcast_i64x2(__M, _mm_loadu_si128(__A)); }
/* * memmove_nodrain_movnt -- (internal) memmove to pmem without hw drain, movnt */ static void * memmove_nodrain_movnt(void *pmemdest, const void *src, size_t len) { LOG(15, "pmemdest %p src %p len %zu", pmemdest, src, len); __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; size_t i; __m128i *d; __m128i *s; void *dest1 = pmemdest; size_t cnt; if (len == 0 || src == pmemdest) return pmemdest; if (len < Movnt_threshold) { memmove(pmemdest, src, len); pmem_flush(pmemdest, len); return pmemdest; } if ((uintptr_t)dest1 - (uintptr_t)src >= len) { /* * Copy the range in the forward direction. * * This is the most common, most optimized case, used unless * the overlap specifically prevents it. */ /* copy up to FLUSH_ALIGN boundary */ cnt = (uint64_t)dest1 & ALIGN_MASK; if (cnt > 0) { cnt = FLUSH_ALIGN - cnt; /* never try to copy more the len bytes */ if (cnt > len) cnt = len; uint8_t *d8 = (uint8_t *)dest1; const uint8_t *s8 = (uint8_t *)src; for (i = 0; i < cnt; i++) { *d8 = *s8; d8++; s8++; } pmem_flush(dest1, cnt); dest1 = (char *)dest1 + cnt; src = (char *)src + cnt; len -= cnt; } d = (__m128i *)dest1; s = (__m128i *)src; cnt = len >> CHUNK_SHIFT; for (i = 0; i < cnt; i++) { xmm0 = _mm_loadu_si128(s); xmm1 = _mm_loadu_si128(s + 1); xmm2 = _mm_loadu_si128(s + 2); xmm3 = _mm_loadu_si128(s + 3); xmm4 = _mm_loadu_si128(s + 4); xmm5 = _mm_loadu_si128(s + 5); xmm6 = _mm_loadu_si128(s + 6); xmm7 = _mm_loadu_si128(s + 7); s += 8; _mm_stream_si128(d, xmm0); _mm_stream_si128(d + 1, xmm1); _mm_stream_si128(d + 2, xmm2); _mm_stream_si128(d + 3, xmm3); _mm_stream_si128(d + 4, xmm4); _mm_stream_si128(d + 5, xmm5); _mm_stream_si128(d + 6, xmm6); _mm_stream_si128(d + 7, xmm7); VALGRIND_DO_FLUSH(d, 8 * sizeof (*d)); d += 8; } /* copy the tail (<128 bytes) in 16 bytes chunks */ len &= CHUNK_MASK; if (len != 0) { cnt = len >> MOVNT_SHIFT; for (i = 0; i < cnt; i++) { xmm0 = _mm_loadu_si128(s); _mm_stream_si128(d, xmm0); VALGRIND_DO_FLUSH(d, sizeof (*d)); s++; d++; } }
_mm256_loadu2_m128i(const __m128i* const hiaddr, const __m128i* const loaddr) { return _mm256_inserti128_si256( _mm256_castsi128_si256(_mm_loadu_si128(loaddr)), _mm_loadu_si128(hiaddr), 1); }
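/*
 * Usage sketch (row pointers are illustrative): gather two unaligned 16-byte
 * rows into one 256-bit register with the helper above. The first argument
 * supplies the high 128 bits and the second the low 128 bits, matching the
 * _mm256_loadu2_m128i intrinsic shipped by newer compilers.
 */
static __m256i load_two_rows_demo(const uint8_t *row_lo, const uint8_t *row_hi)
{
    return _mm256_loadu2_m128i((const __m128i *)row_hi,   /* -> bits 255..128 */
                               (const __m128i *)row_lo);  /* -> bits 127..0   */
}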
void* memccpy(void *dst, void *src, int c, size_t len) {
  /* Note: POSIX memccpy() returns a pointer one past the copied `c` byte;
     this implementation consistently returns a pointer to the matched byte
     itself in every branch. */
  uint8_t* a = dst;
  uint8_t* b = src;
  uint8_t endchar = c & 0xff;

  if(!len)
    return NULL;

  int aligned_a = 0, aligned_b = 0;
  int i = 0;

  aligned_a = ((uintptr_t)a & (sizeof(__m128i) - 1));
  aligned_b = ((uintptr_t)b & (sizeof(__m128i) - 1));

  /* Not aligned */
  if(aligned_a != aligned_b) {
    while(len) {
      if(b[i] == endchar) {
        a[i] = b[i];
        return a + i;
      }
      a[i] = b[i];
      i++;
      len--;
    }
    return NULL;
  }

  /* aligned */
  if(aligned_a) {
    while(len && ((uintptr_t) &a[i] & (sizeof(__m128i)-1))) {
      if(b[i] == endchar) {
        a[i] = b[i];
        return a + i;
      }
      a[i] = b[i];
      i++;
      len--;
    }
  }

  if(len >= 16) {
    uint32_t buf_32 = endchar;
    buf_32 |= (buf_32 << 8);
    buf_32 |= (buf_32 << 16);

    __m128i r1 = _mm_set_epi32(buf_32, buf_32, buf_32, buf_32);

    while(len >= 16) {
      __m128i y = _mm_loadu_si128((__m128i*)&(b[i])); /* 16 bytes */
      __m128i cmp = _mm_cmpeq_epi8(y, r1);
      uint16_t result = (uint16_t)_mm_movemask_epi8(cmp);
      if(result != 0x0) {
        /* endchar is somewhere in this block: copy byte-wise up to it */
        while(1) {
          if(result & 0x1) {
            a[i] = b[i];
            return a + i;
          }
          a[i] = b[i];
          result = result >> 1;
          i++;
        }
      }
      _mm_store_si128((__m128i*)&a[i], y);
      i += 16;
      len -= 16;
    }
  }

  /* Scalar tail: the original listing breaks off here; copy any remaining
     bytes one at a time, still honoring the terminating character. */
  while(len) {
    if(b[i] == endchar) {
      a[i] = b[i];
      return a + i;
    }
    a[i] = b[i];
    i++;
    len--;
  }
  return NULL;
}
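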
mlib_status __mlib_ImageColorOrderedDitherMxN( mlib_image *dst, const mlib_image *src, const mlib_s32 **dmask, mlib_s32 m, mlib_s32 n, mlib_s32 scale, const void *colormap) { mlib_type stype, dtype; const mlib_s32 *dmask0, *dmask1, *dmask2, *dmask3; mlib_u8 *sl, *dl; mlib_s32 nchan, dchan, sll, dll, sw, sh, dw, dh, num_blk; mlib_s32 off, mstep, line_size, kern_size, dsize, i, j, k, fun_ind; __m128i *pbuff, *pb; mlib_s32 *p_dim; mlib_u8 *kern, *pkern; __m128i *dkern; __m128i ss, d0, d1; __m128i k0, k1; mlib_s32 step0, step1, step2, step3; mlib_d64 srange, dscale, dscale0, dscale1, dscale2, dscale3; mlib_s32 half_step0, half_step1, half_step2, half_step3; mlib_s32 v0, v1, v2, v3; __m128i _s_zero = _mm_xor_si128(_s_zero, _s_zero); line_func_type line_func; MLIB_IMAGE_CHECK(src); MLIB_IMAGE_CHECK(dst); MLIB_IMAGE_SIZE_EQUAL(src, dst); MLIB_IMAGE_HAVE_CHAN(dst, 1); MLIB_IMAGE_AND_COLORMAP_ARE_COMPAT(src, colormap); MLIB_IMAGE_GET_ALL_PARAMS(dst, dtype, dchan, dw, dh, dll, dl); MLIB_IMAGE_GET_ALL_PARAMS(src, stype, nchan, sw, sh, sll, sl); if (stype == MLIB_BYTE && nchan == 1 && dtype == MLIB_BIT) { return mlib_ImageColorOrderedDitherBit_MxN(dst, src, dmask, m, n, scale, colormap); } if (!(stype == MLIB_BYTE || stype == MLIB_SHORT)) { return (MLIB_FAILURE); } if (!(dtype == MLIB_BYTE || dtype == MLIB_SHORT)) { return (MLIB_FAILURE); } if (!(nchan >= 3 && nchan <= 4)) { return (MLIB_FAILURE); } if (dmask == NULL || colormap == NULL) { return (MLIB_NULLPOINTER); } if (scale <= 0) { return (MLIB_OUTOFRANGE); } fun_ind = nchan - 3; if (dtype == MLIB_SHORT) fun_ind += 2; if (stype == MLIB_SHORT) fun_ind += 4; line_func = line_func_arr[fun_ind]; num_blk = (sw + (m - 1)) / m; mstep = m * nchan; GET_STEPS; if (stype == MLIB_BYTE) { FILL_KERN(mlib_s16); dsize = (nchan * sw + 15) / 16; } else { FILL_KERN(mlib_s32); dsize = (nchan * sw + 7) / 8; } pbuff = __mlib_malloc(dsize * sizeof (__m128i)); if (pbuff == NULL) { __mlib_free(kern); return (MLIB_FAILURE); } pkern = kern; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif for (j = 0; j < sh; j++) { dkern = (__m128i *)pkern; __m128i *sp = (__m128i *)sl; pb = pbuff; if (stype == MLIB_BYTE) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif for (i = 0; i < dsize; i++) { ss = _mm_loadu_si128(sp); sp++; k0 = _mm_loadu_si128(dkern); dkern++; k1 = _mm_loadu_si128(dkern); dkern++; d0 = _mm_unpacklo_epi8(ss, _s_zero); d1 = _mm_unpackhi_epi8(ss, _s_zero); d0 = _mm_add_epi16(d0, k0); d1 = _mm_add_epi16(d1, k1); d1 = _mm_packus_epi16(d0, d1); _mm_storeu_si128(pb, d1); pb++; } } else { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif for (i = 0; i < dsize; i++) { ss = _mm_loadu_si128(sp); sp++; k0 = _mm_loadu_si128(dkern); dkern++; k1 = _mm_loadu_si128(dkern); dkern++; d0 = _mm_srai_epi32( _mm_unpacklo_epi16(_s_zero, ss), 16); d1 = _mm_srai_epi32( _mm_unpackhi_epi16(_s_zero, ss), 16); d0 = _mm_add_epi32(d0, k0); d1 = _mm_add_epi32(d1, k1); d1 = _mm_packs_epi32(d0, d1); _mm_storeu_si128(pb, d1); pb++; } } pkern += line_size; if (pkern >= kern + kern_size) pkern = kern; line_func(pbuff, dl, sw, colormap); sl += sll; dl += dll; } __mlib_free(pbuff); __mlib_free(kern); return (MLIB_SUCCESS); }
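/*
 * Aside on the zero register used above: initializing a vector by XOR-ing it
 * with itself reads an indeterminate value in C (it happens to produce zero
 * in practice); _mm_setzero_si128() is the idiomatic and equally cheap way to
 * obtain an all-zero __m128i.
 */
static inline __m128i zero_vector_sketch(void)
{
    return _mm_setzero_si128();   /* typically compiles to pxor xmm, xmm */
}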
mlib_status __mlib_VideoP64Decimate_U8_U8( mlib_u8 *dst, const mlib_u8 *src, mlib_s32 width, mlib_s32 height, mlib_s32 dst_stride, mlib_s32 src_stride) { mlib_s32 x, y; const mlib_u8 *sd1, *sd2; mlib_u8 *dd; mlib_u32 src_stride2; sd1 = src; sd2 = src + src_stride; src_stride2 = 2 * src_stride; dd = dst; mlib_s32 dw = width & 0xF; __m128i txmm0, txmm1, txmm2, txmm3, txmm4, txmm5, txmm6, txmm7; txmm7 = _mm_set1_epi16(0xff); for (y = 0; y < height; y++) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (x = 0; x < width - dw; x += 16) { txmm0 = _mm_loadu_si128((__m128i *)&sd1[2*x]); txmm1 = _mm_loadu_si128((__m128i *)&sd2[2*x]); txmm2 = _mm_srli_si128(txmm0, 1); txmm3 = _mm_srli_si128(txmm1, 1); txmm4 = _mm_avg_epu8(txmm0, txmm2); txmm5 = _mm_avg_epu8(txmm1, txmm3); txmm6 = _mm_avg_epu8(txmm5, txmm4); txmm6 = _mm_and_si128(txmm6, txmm7); txmm0 = _mm_loadu_si128((__m128i *)&sd1[2 * x + 16]); txmm1 = _mm_loadu_si128((__m128i *)&sd2[2 * x + 16]); txmm2 = _mm_srli_si128(txmm0, 1); txmm3 = _mm_srli_si128(txmm1, 1); txmm4 = _mm_avg_epu8(txmm0, txmm2); txmm5 = _mm_avg_epu8(txmm1, txmm3); txmm5 = _mm_avg_epu8(txmm5, txmm4); txmm5 = _mm_and_si128(txmm5, txmm7); txmm1 = _mm_packus_epi16(txmm6, txmm5); _mm_storeu_si128((__m128i *)&dd[x], txmm1); } #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (; x < width; x++) { dd[x] = (sd1[x * 2] + sd1[x * 2 + 1] + sd2[x * 2] + sd2[x * 2 + 1] + 2) >> 2; } sd1 += src_stride2; sd2 += src_stride2; dd += dst_stride; } return (MLIB_SUCCESS); }
// This is ready to be ported to AVX2, by adding 8 permute instructions (see also XXX below) inline void _assembler_kernel(__m128i &x0, __m128i &x1, __m128i &x2, __m128i &x3, __m128i &x4, __m128i &x5, __m128i &x6, __m128i &x7, const __m128i *src) { __m128i a0 = _mm_loadu_si128(src); __m128i a1 = _mm_loadu_si128(src+1); __m128i a2 = _mm_loadu_si128(src+2); __m128i a3 = _mm_loadu_si128(src+3); __m128i a4 = _mm_loadu_si128(src+4); __m128i a5 = _mm_loadu_si128(src+5); __m128i a6 = _mm_loadu_si128(src+6); __m128i a7 = _mm_loadu_si128(src+7); static const __m128i ctl0 = _mm_set_epi8(15,7,14,6,13,5,12,4,11,3,10,2,9,1,8,0); static const __m128i ctl1 = _mm_set_epi8(14,6,15,7,12,4,13,5,10,2,11,3,8,0,9,1); // Note: _mm_shuffle_epi8 is expensive, so we use 8 calls, which is the minimum possible a0 = _mm_shuffle_epi8(a0, ctl0); a1 = _mm_shuffle_epi8(a1, ctl1); a2 = _mm_shuffle_epi8(a2, ctl0); a3 = _mm_shuffle_epi8(a3, ctl1); a4 = _mm_shuffle_epi8(a4, ctl0); a5 = _mm_shuffle_epi8(a5, ctl1); a6 = _mm_shuffle_epi8(a6, ctl0); a7 = _mm_shuffle_epi8(a7, ctl1); __m128i b0 = _mm_blend_epi16(a0, a1, 0xaa); // (10101010)_2 __m128i b1 = _mm_blend_epi16(a1, a0, 0xaa); __m128i b2 = _mm_blend_epi16(a2, a3, 0xaa); __m128i b3 = _mm_blend_epi16(a3, a2, 0xaa); __m128i b4 = _mm_blend_epi16(a4, a5, 0xaa); __m128i b5 = _mm_blend_epi16(a5, a4, 0xaa); __m128i b6 = _mm_blend_epi16(a6, a7, 0xaa); __m128i b7 = _mm_blend_epi16(a7, a6, 0xaa); b1 = _mm_shufflelo_epi16(_mm_shufflehi_epi16(b1, 0xb1), 0xb1); b5 = _mm_shufflelo_epi16(_mm_shufflehi_epi16(b5, 0xb1), 0xb1); b2 = _mm_shuffle_epi32(b2, 0xb1); // (2301)_4 b6 = _mm_shuffle_epi32(b6, 0xb1); // (2301)_4 b3 = _mm_shufflelo_epi16(_mm_shufflehi_epi16(b3, 0x1b), 0x1b); // (0123)_4 b7 = _mm_shufflelo_epi16(_mm_shufflehi_epi16(b7, 0x1b), 0x1b); // XXX when switching to AVX2, replace blend_epi16(0xcc) -> blend_epi32(0xa) for a small performance boost a0 = _mm_blend_epi16(b0, b2, 0xcc); // (11001100)_2 a2 = _mm_blend_epi16(b2, b0, 0xcc); a1 = _mm_blend_epi16(b1, b3, 0xcc); a3 = _mm_blend_epi16(b3, b1, 0xcc); a4 = _mm_blend_epi16(b4, b6, 0xcc); a6 = _mm_blend_epi16(b6, b4, 0xcc); a5 = _mm_blend_epi16(b5, b7, 0xcc); a7 = _mm_blend_epi16(b7, b5, 0xcc); a2 = _mm_shuffle_epi32(a2, 0xb1); // (2301)_4 a3 = _mm_shuffle_epi32(a3, 0xb1); // (2301)_4 a4 = _mm_shuffle_epi32(a4, 0x4e); // (1032)_4 a5 = _mm_shuffle_epi32(a5, 0x4e); // (1032)_4 a6 = _mm_shuffle_epi32(a6, 0x1b); // (0123)_4 a7 = _mm_shuffle_epi32(a7, 0x1b); // (0123)_4 // XXX when switching to AVX2, replace blend_epi16(0xf0) -> blend_epi32(0xc) for a small performance boost b0 = _mm_blend_epi16(a0, a4, 0xf0); // (11110000)_2 b4 = _mm_blend_epi16(a4, a0, 0xf0); // (11110000)_2 b1 = _mm_blend_epi16(a1, a5, 0xf0); // (11110000)_2 b5 = _mm_blend_epi16(a5, a1, 0xf0); // (11110000)_2 b2 = _mm_blend_epi16(a2, a6, 0xf0); // (11110000)_2 b6 = _mm_blend_epi16(a6, a2, 0xf0); // (11110000)_2 b3 = _mm_blend_epi16(a3, a7, 0xf0); // (11110000)_2 b7 = _mm_blend_epi16(a7, a3, 0xf0); // (11110000)_2 b4 = _mm_shuffle_epi32(b4, 0x4e); // (1032)_4 b5 = _mm_shuffle_epi32(b5, 0x4e); // (1032)_4 b6 = _mm_shuffle_epi32(b6, 0x4e); // (1032)_4 b7 = _mm_shuffle_epi32(b7, 0x4e); // (1032)_4 x0 = b0; x1 = b1; x2 = b2; x3 = b3; x4 = b4; x5 = b5; x6 = b6; x7 = b7; }
EB_ERRORTYPE GatherSaoStatisticsLcu_OnlyEo_90_45_135_16bit_SSE2_INTRIN( EB_U16 *inputSamplePtr, // input parameter, source Picture Ptr EB_U32 inputStride, // input parameter, source stride EB_U16 *reconSamplePtr, // input parameter, deblocked Picture Ptr EB_U32 reconStride, // input parameter, deblocked stride EB_U32 lcuWidth, // input parameter, LCU width EB_U32 lcuHeight, // input parameter, LCU height EB_S32 eoDiff[SAO_EO_TYPES][SAO_EO_CATEGORIES + 1], // output parameter, used to store Edge Offset diff, eoDiff[SAO_EO_TYPES] [SAO_EO_CATEGORIES] EB_U16 eoCount[SAO_EO_TYPES][SAO_EO_CATEGORIES + 1]) // output parameter, used to store Edge Offset count, eoCount[SAO_EO_TYPES] [SAO_EO_CATEGORIES] // output parameter, used to store Edge Offset count, eoCount[SAO_EO_TYPES] [SAO_EO_CATEGORIES] { #define boShift 5 EB_ERRORTYPE return_error = EB_ErrorNone; EB_U64 count_x, count_y; EB_S32 diff; __m128i xmm0, xmm_1, xmm_N1, xmm_N3, xmm_N4, xmm_skip_mask, xmm9, xmm10, xmm11, xmm12, xmm13, xmm15; __m128i xmm_temp_input1, xmm_temp_input2, xmm_temp_recon1, xmm_temp_recon2, xmm_diff1, xmm_diff2; __m128i xmm_sign_1, xmm_sign_1a, xmm_sign_1b, xmm_sign_2a, xmm_sign_2b, xmm_sign_2, xmm_eoIndex; xmm0 = _mm_setzero_si128(); xmm12 = _mm_setzero_si128(); xmm15 = _mm_set1_epi16(0x0001); xmm_N1 = _mm_set1_epi8((signed char)0xFF); xmm_N3 = _mm_set1_epi8((signed char)0xFD); xmm_N4 = _mm_set1_epi8((signed char)0xFC); xmm_1 = _mm_sub_epi8(xmm0, xmm_N1); // Initialize SAO Arrays EB_ALIGN(16) EB_S8 rTemp[512] = { 0 }; EB_U64 reconStrideTemp; lcuHeight -= 2; inputSamplePtr += inputStride + 1; reconSamplePtr++; if (lcuWidth == 16) { xmm_skip_mask = _mm_srli_si128(xmm_N1, 2); for (count_y = 0; count_y < lcuHeight; ++count_y) { xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride)); xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr)); xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2); xmm_diff2 = _mm_slli_si128(xmm_diff2, 4); //skip last 2 samples xmm_diff2 = _mm_srli_si128(xmm_diff2, 4); //skip last 2 samples // EO-90 MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr+2*reconStride) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) // EO-135 MACRO_CALC_EO_INDEX(reconSamplePtr-1, reconSamplePtr+2*reconStride+1) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) // EO-45 MACRO_CALC_EO_INDEX(reconSamplePtr+1, reconSamplePtr+2*reconStride-1) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) inputSamplePtr += inputStride; reconSamplePtr += reconStride; } lcuWidth = 2; } else if (lcuWidth == 28) { xmm_skip_mask = _mm_srli_si128(xmm_N1, 6); for (count_y = 0; count_y < lcuHeight; ++count_y) { //----------- 0-15 ----------- xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride)); xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr)); xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2); // EO-90 
MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr+2*reconStride) MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) // EO-135 MACRO_CALC_EO_INDEX(reconSamplePtr-1, reconSamplePtr+2*reconStride+1) MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) // EO-45 MACRO_CALC_EO_INDEX(reconSamplePtr+1, reconSamplePtr+2*reconStride-1) MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) //----------- 16-25 ----------- xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 16)); xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 24)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 16)); xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 24)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2); xmm_diff2 = _mm_slli_si128(xmm_diff2, 12); //skip last 6 samples xmm_diff2 = _mm_srli_si128(xmm_diff2, 12); //skip last 6 samples // EO-90 MACRO_CALC_EO_INDEX(reconSamplePtr+16, reconSamplePtr+2*reconStride+16) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 6 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) // EO-135 MACRO_CALC_EO_INDEX(reconSamplePtr+15, reconSamplePtr+2*reconStride+17) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 6 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) // EO-45 MACRO_CALC_EO_INDEX(reconSamplePtr+17, reconSamplePtr+2*reconStride+15) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 6 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) inputSamplePtr += inputStride; reconSamplePtr += reconStride; } lcuWidth = 6; } else if (lcuWidth == 56) { xmm_skip_mask = _mm_srli_si128(xmm_N1, 10); lcuWidth -= 8; inputStride -= lcuWidth; reconStrideTemp = reconStride - lcuWidth; for (count_y = 0; count_y < lcuHeight; ++count_y) { for (count_x = 0; count_x < lcuWidth; count_x += 16) { //----------- 0-15 ----------- xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride)); xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr)); xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2); // EO-90 MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr + 2 * reconStride) MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) // EO-135 MACRO_CALC_EO_INDEX(reconSamplePtr - 1, reconSamplePtr + 2 * reconStride + 1) MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) // EO-45 MACRO_CALC_EO_INDEX(reconSamplePtr + 1, reconSamplePtr + 2 * reconStride - 1) MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) inputSamplePtr += 16; reconSamplePtr += 16; } //----------- 48-53 ----------- xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff1 = _mm_slli_si128(xmm_diff1, 4); //skip last 10 samples xmm_diff1 = _mm_srli_si128(xmm_diff1, 4); //skip last 10 samples // EO-90 MACRO_CALC_EO_INDEX_HALF(reconSamplePtr, reconSamplePtr+2*reconStride) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 10 samples MACRO_GATHER_EO_HALF(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) // EO-135 MACRO_CALC_EO_INDEX_HALF(reconSamplePtr-1, reconSamplePtr+2*reconStride+1) xmm_eoIndex = 
_mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 10 samples MACRO_GATHER_EO_HALF(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) // EO-45 MACRO_CALC_EO_INDEX_HALF(reconSamplePtr+1, reconSamplePtr+2*reconStride-1) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 10 samples MACRO_GATHER_EO_HALF(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) inputSamplePtr += inputStride; reconSamplePtr += reconStrideTemp; } lcuWidth = 10; } else { lcuWidth -= 16; inputStride -= lcuWidth; reconStrideTemp = reconStride - lcuWidth; xmm_skip_mask = _mm_srli_si128(xmm_N1, 2); for (count_y = 0; count_y < lcuHeight; ++count_y) { for (count_x = 0; count_x < lcuWidth; count_x += 16) { //----------- 0-15 ----------- xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride)); xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr)); xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2); //EO-90 MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr + 2 * reconStride) MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) //EO-135 MACRO_CALC_EO_INDEX(reconSamplePtr - 1, reconSamplePtr + 2 * reconStride + 1) MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) //EO-45 MACRO_CALC_EO_INDEX(reconSamplePtr + 1, reconSamplePtr + 2 * reconStride - 1) MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) inputSamplePtr += 16; reconSamplePtr += 16; } //----------- 48-61 ----------- xmm_temp_recon1 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride)); xmm_temp_recon2 = _mm_loadu_si128((__m128i *)(reconSamplePtr + reconStride + 8)); xmm_temp_input1 = _mm_loadu_si128((__m128i *)(inputSamplePtr)); xmm_temp_input2 = _mm_loadu_si128((__m128i *)(inputSamplePtr + 8)); xmm_diff1 = _mm_sub_epi16(xmm_temp_input1, xmm_temp_recon1); xmm_diff2 = _mm_sub_epi16(xmm_temp_input2, xmm_temp_recon2); xmm_diff2 = _mm_slli_si128(xmm_diff2, 4); //skip last 2 samples xmm_diff2 = _mm_srli_si128(xmm_diff2, 4); //skip last 2 samples // EO-90 MACRO_CALC_EO_INDEX(reconSamplePtr, reconSamplePtr+2*reconStride) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1) // EO-135 MACRO_CALC_EO_INDEX(reconSamplePtr-1, reconSamplePtr+2*reconStride+1) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2) // EO-45 MACRO_CALC_EO_INDEX(reconSamplePtr+1, reconSamplePtr+2*reconStride-1) xmm_eoIndex = _mm_and_si128(xmm_eoIndex, xmm_skip_mask); // skip last 2 samples MACRO_GATHER_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3) inputSamplePtr += inputStride; reconSamplePtr += reconStrideTemp; } lcuWidth = 2; } lcuWidth = (EB_U16)lcuWidth * (EB_U16)lcuHeight; MACRO_SAVE_EO(OFFSET_EO_DIFF_1, OFFSET_EO_COUNT_1, 1) MACRO_SAVE_EO(OFFSET_EO_DIFF_2, OFFSET_EO_COUNT_2, 2) MACRO_SAVE_EO(OFFSET_EO_DIFF_3, OFFSET_EO_COUNT_3, 3) return return_error; }
static FORCE_INLINE void blur_r6_v_middle_sse2(const PixelType *srcp, PixelType *dstp, int stride) { __m128i m6 = _mm_loadu_si128((const __m128i *)(srcp - stride * 6)); __m128i m5 = _mm_loadu_si128((const __m128i *)(srcp - stride * 5)); __m128i m4 = _mm_loadu_si128((const __m128i *)(srcp - stride * 4)); __m128i m3 = _mm_loadu_si128((const __m128i *)(srcp - stride * 3)); __m128i m2 = _mm_loadu_si128((const __m128i *)(srcp - stride * 2)); __m128i m1 = _mm_loadu_si128((const __m128i *)(srcp - stride)); __m128i l0 = _mm_loadu_si128((const __m128i *)(srcp)); __m128i l1 = _mm_loadu_si128((const __m128i *)(srcp + stride)); __m128i l2 = _mm_loadu_si128((const __m128i *)(srcp + stride * 2)); __m128i l3 = _mm_loadu_si128((const __m128i *)(srcp + stride * 3)); __m128i l4 = _mm_loadu_si128((const __m128i *)(srcp + stride * 4)); __m128i l5 = _mm_loadu_si128((const __m128i *)(srcp + stride * 5)); __m128i l6 = _mm_loadu_si128((const __m128i *)(srcp + stride * 6)); __m128i avg11 = mm_avg_epu<PixelType>(m1, l1); __m128i avg22 = mm_avg_epu<PixelType>(m2, l2); __m128i avg33 = mm_avg_epu<PixelType>(m3, l3); __m128i avg44 = mm_avg_epu<PixelType>(m4, l4); __m128i avg55 = mm_avg_epu<PixelType>(m5, l5); __m128i avg66 = mm_avg_epu<PixelType>(m6, l6); __m128i avg12 = mm_avg_epu<PixelType>(avg11, avg22); __m128i avg34 = mm_avg_epu<PixelType>(avg33, avg44); __m128i avg56 = mm_avg_epu<PixelType>(avg55, avg66); __m128i avg012 = mm_avg_epu<PixelType>(l0, avg12); __m128i avg3456 = mm_avg_epu<PixelType>(avg34, avg56); __m128i avg0123456 = mm_avg_epu<PixelType>(avg012, avg3456); __m128i avg = mm_avg_epu<PixelType>(avg012, avg0123456); _mm_storeu_si128((__m128i *)(dstp), avg); }
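/*
 * Scalar sketch of the averaging cascade above: an approximate radius-6
 * vertical blur built purely from rounding pair averages, shown for the
 * 8-bit case.  avg_round() mirrors the (a + b + 1) >> 1 semantics of
 * _mm_avg_epu8 / _mm_avg_epu16 that mm_avg_epu<PixelType> is assumed to wrap.
 */
static inline unsigned avg_round(unsigned a, unsigned b)
{
    return (a + b + 1) >> 1;
}

static unsigned blur_r6_v_middle_scalar(const unsigned char *srcp, int stride)
{
    unsigned avg12   = avg_round(avg_round(srcp[-1 * stride], srcp[1 * stride]),
                                 avg_round(srcp[-2 * stride], srcp[2 * stride]));
    unsigned avg34   = avg_round(avg_round(srcp[-3 * stride], srcp[3 * stride]),
                                 avg_round(srcp[-4 * stride], srcp[4 * stride]));
    unsigned avg56   = avg_round(avg_round(srcp[-5 * stride], srcp[5 * stride]),
                                 avg_round(srcp[-6 * stride], srcp[6 * stride]));
    unsigned avg012  = avg_round(srcp[0], avg12);
    unsigned avg3456 = avg_round(avg34, avg56);
    return avg_round(avg012, avg_round(avg012, avg3456));
}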
inline bool memequal_sse41_wide(const char * p1, const char * p2, size_t size) { __m128i zero16 = _mm_setzero_si128(); // const char * p1_end = p1 + size; while (size >= 64) { if (_mm_testc_si128( zero16, _mm_xor_si128( _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[0]), _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[0]))) && _mm_testc_si128( zero16, _mm_xor_si128( _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[1]), _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[1]))) && _mm_testc_si128( zero16, _mm_xor_si128( _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[2]), _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[2]))) && _mm_testc_si128( zero16, _mm_xor_si128( _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[3]), _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[3])))) { p1 += 64; p2 += 64; size -= 64; } else return false; } switch ((size % 64) / 16) { case 3: if (!_mm_testc_si128( zero16, _mm_xor_si128( _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[2]), _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[2])))) return false; case 2: if (!_mm_testc_si128( zero16, _mm_xor_si128( _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[1]), _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[1])))) return false; case 1: if (!_mm_testc_si128( zero16, _mm_xor_si128( _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[0]), _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[0])))) return false; } p1 += (size % 64) / 16 * 16; p2 += (size % 64) / 16 * 16; /* if (size >= 32) { if (_mm_testc_si128( zero16, _mm_xor_si128( _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[0]), _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[0]))) & _mm_testc_si128( zero16, _mm_xor_si128( _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[1]), _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[1])))) { p1 += 32; p2 += 32; size -= 32; } else return false; } if (size >= 16) { if (_mm_testc_si128( zero16, _mm_xor_si128( _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p1)[0]), _mm_loadu_si128(&reinterpret_cast<const __m128i *>(p2)[0])))) { p1 += 16; p2 += 16; size -= 16; } else return false; }*/ switch (size % 16) { case 15: if (p1[14] != p2[14]) return false; case 14: if (p1[13] != p2[13]) return false; case 13: if (p1[12] != p2[12]) return false; case 12: if (reinterpret_cast<const UInt32 *>(p1)[2] == reinterpret_cast<const UInt32 *>(p2)[2]) goto l8; else return false; case 11: if (p1[10] != p2[10]) return false; case 10: if (p1[9] != p2[9]) return false; case 9: if (p1[8] != p2[8]) return false; l8: case 8: return reinterpret_cast<const UInt64 *>(p1)[0] == reinterpret_cast<const UInt64 *>(p2)[0]; case 7: if (p1[6] != p2[6]) return false; case 6: if (p1[5] != p2[5]) return false; case 5: if (p1[4] != p2[4]) return false; case 4: return reinterpret_cast<const UInt32 *>(p1)[0] == reinterpret_cast<const UInt32 *>(p2)[0]; case 3: if (p1[2] != p2[2]) return false; case 2: return reinterpret_cast<const UInt16 *>(p1)[0] == reinterpret_cast<const UInt16 *>(p2)[0]; case 1: if (p1[0] != p2[0]) return false; case 0: break; } return true; }
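/*
 * The _mm_testc_si128(zero16, x) pattern above relies on PTEST semantics:
 * _mm_testc_si128(a, b) returns 1 iff (b & ~a) == 0, so with a == 0 it simply
 * tests whether b is all zero.  A minimal 16-byte equality helper built on
 * the same idea (illustrative, not part of the original source; needs
 * <smmintrin.h>):
 */
static inline bool equal16(const char * a, const char * b)
{
    const __m128i diff = _mm_xor_si128(
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(a)),
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(b)));
    return _mm_testc_si128(_mm_setzero_si128(), diff);
}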
static void warp_u8_sse2(const uint8_t *srcp, const uint8_t *edgep, uint8_t *dstp, int src_stride, int edge_stride, int dst_stride, int width, int height, int depth_scalar) { int SMAG = 1 << SMAGL; __m128i depth = _mm_set1_epi32(depth_scalar << 8); depth = _mm_packs_epi32(depth, depth); const int16_t x_limit_min_array[8] = { (int16_t)(0 * SMAG), (int16_t)(-1 * SMAG), (int16_t)(-2 * SMAG), (int16_t)(-3 * SMAG), (int16_t)(-4 * SMAG), (int16_t)(-5 * SMAG), (int16_t)(-6 * SMAG), (int16_t)(-7 * SMAG) }; const int16_t x_limit_max_array[8] = { (int16_t)((width - 1) * SMAG), (int16_t)((width - 2) * SMAG), (int16_t)((width - 3) * SMAG), (int16_t)((width - 4) * SMAG), (int16_t)((width - 5) * SMAG), (int16_t)((width - 6) * SMAG), (int16_t)((width - 7) * SMAG), (int16_t)((width - 8) * SMAG) }; __m128i x_limit_min = _mm_loadu_si128((const __m128i *)x_limit_min_array); __m128i x_limit_max = _mm_loadu_si128((const __m128i *)x_limit_max_array); int width_sse2 = (width & ~7) + 2; if (width_sse2 > dst_stride) width_sse2 -= 8; __m128i zero = _mm_setzero_si128(); __m128i word_255 = _mm_setzero_si128(); word_255 = _mm_cmpeq_epi16(word_255, word_255); word_255 = _mm_srli_epi16(word_255, 8); __m128i word_127 = _mm_setzero_si128(); word_127 = _mm_cmpeq_epi16(word_127, word_127); word_127 = _mm_srli_epi16(word_127, 9); __m128i word_1 = _mm_setzero_si128(); word_1 = _mm_cmpeq_epi16(word_1, word_1); word_1 = _mm_srli_epi16(word_1, 15); __m128i one_stride = _mm_unpacklo_epi16(_mm_set1_epi16(src_stride), word_1); __m128i word_128 = _mm_setzero_si128(); word_128 = _mm_cmpeq_epi16(word_128, word_128); word_128 = _mm_slli_epi16(word_128, 15); word_128 = _mm_srli_epi16(word_128, 8); __m128i word_64 = _mm_setzero_si128(); word_64 = _mm_cmpeq_epi16(word_64, word_64); word_64 = _mm_slli_epi16(word_64, 15); word_64 = _mm_srli_epi16(word_64, 9); for (int y = 0; y < height; y++) { __m128i y_limit_min = _mm_set1_epi32(-y * 128); __m128i y_limit_max = _mm_set1_epi32((height - y) * 128 - 129); // (height - y - 1) * 128 - 1 y_limit_min = _mm_packs_epi32(y_limit_min, y_limit_min); y_limit_max = _mm_packs_epi32(y_limit_max, y_limit_max); warp_edge_c<SMAGL>(srcp, edgep, dstp, src_stride, edge_stride, width, height, 0, y, depth_scalar); for (int x = 1; x < width_sse2 - 1; x += 8) warp_mmword_u8_sse2<SMAGL>(srcp, edgep, dstp, src_stride, edge_stride, height, x, y, depth, zero, x_limit_min, x_limit_max, y_limit_min, y_limit_max, word_64, word_127, word_128, word_255, one_stride); if (width + 2 > width_sse2) warp_mmword_u8_sse2<SMAGL>(srcp, edgep, dstp, src_stride, edge_stride, height, width - 9, y, depth, zero, x_limit_min, x_limit_max, y_limit_min, y_limit_max, word_64, word_127, word_128, word_255, one_stride); warp_edge_c<SMAGL>(srcp, edgep, dstp, src_stride, edge_stride, width, height, width - 1, y, depth_scalar); srcp += src_stride * SMAG; edgep += edge_stride; dstp += dst_stride; } }
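/*
 * Note: SMAGL is assumed to be a compile-time (template) parameter declared
 * outside this excerpt.  The word_* registers above are materialised without
 * memory loads: pcmpeq produces all-ones per 16-bit lane (0xFFFF) and logical
 * shifts carve out the wanted constant.  Scalar equivalent of a single lane
 * (a sketch; shifts stay inside a 16-bit lane just like _mm_slli_epi16 /
 * _mm_srli_epi16):
 *
 *   word_255: 0xFFFF >> 8          -> 0x00FF (255)
 *   word_127: 0xFFFF >> 9          -> 0x007F (127)
 *   word_1:   0xFFFF >> 15         -> 0x0001 (1)
 *   word_128: (0xFFFF << 15) >> 8  -> 0x0080 (128)
 *   word_64:  (0xFFFF << 15) >> 9  -> 0x0040 (64)
 */
static inline unsigned short lane_const(int shl, int shr)
{
    unsigned short v = (unsigned short)(0xFFFFu << shl); /* truncate to 16 bits */
    return (unsigned short)(v >> shr);
}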
void aom_filter_block1d8_v8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i addFilterReg64, filtersReg, minReg; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; __m128i srcReg8; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the first 16 bits in the filter firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); // duplicate only the second 16 bits in the filter secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); // duplicate only the third 16 bits in the filter thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); // duplicate only the forth 16 bits in the filter forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); // load the first 7 rows of 8 bytes srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); for (i = 0; i < output_height; i++) { // load the last 8 bytes srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); // merge the result together srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); // merge the result together srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); // add and saturate the results together minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); // shift by 7 bit each 16 bit srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); // shrink to 8 bit each 16 bits srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); src_ptr += src_pitch; // shift down a row srcReg1 = srcReg2; srcReg2 = srcReg3; srcReg3 = srcReg4; srcReg4 = srcReg5; srcReg5 = srcReg6; srcReg6 = srcReg7; srcReg7 = srcReg8; // save only 8 bytes convolve result _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); output_ptr += out_pitch; } }
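/*
 * Scalar reference for one output pixel of the 8-tap vertical filter above
 * (a sketch): plain 32-bit accumulation followed by the same +64 / >>7
 * rounding and unsigned 8-bit saturation.  The SSSE3 path saturates
 * intermediate 16-bit sums (hence the min/max ordering trick), so extreme
 * filter/input combinations can differ slightly from this reference.
 */
static unsigned char filter8_v_scalar(const unsigned char *src, int pitch,
                                      const short *filter)
{
    int k, sum = 0;
    for (k = 0; k < 8; k++)
        sum += (int)src[k * pitch] * filter[k];
    sum = (sum + 64) >> 7;
    if (sum < 0)   sum = 0;
    if (sum > 255) sum = 255;
    return (unsigned char)sum;
}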
static void thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) { int i, j; Size roi = _src.size(); roi.width *= _src.channels(); const short* src = (const short*)_src.data; short* dst = (short*)_dst.data; size_t src_step = _src.step/sizeof(src[0]); size_t dst_step = _dst.step/sizeof(dst[0]); #if CV_SSE2 volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE); #endif if( _src.isContinuous() && _dst.isContinuous() ) { roi.width *= roi.height; roi.height = 1; src_step = dst_step = roi.width; } #ifdef HAVE_TEGRA_OPTIMIZATION if (tegra::thresh_16s(_src, _dst, roi.width, roi.height, thresh, maxval, type)) return; #endif #if defined(HAVE_IPP) IppiSize sz = { roi.width, roi.height }; CV_SUPPRESS_DEPRECATED_START switch( type ) { case THRESH_TRUNC: #ifndef HAVE_IPP_ICV_ONLY if (_src.data == _dst.data && ippiThreshold_GT_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0) return; #endif if (ippiThreshold_GT_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0) return; setIppErrorStatus(); break; case THRESH_TOZERO: #ifndef HAVE_IPP_ICV_ONLY if (_src.data == _dst.data && ippiThreshold_LTVal_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh + 1, 0) >= 0) return; #endif if (ippiThreshold_LTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+1, 0) >= 0) return; setIppErrorStatus(); break; case THRESH_TOZERO_INV: #ifndef HAVE_IPP_ICV_ONLY if (_src.data == _dst.data && ippiThreshold_GTVal_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0) return; #endif if (ippiThreshold_GTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0) return; setIppErrorStatus(); break; } CV_SUPPRESS_DEPRECATED_END #endif switch( type ) { case THRESH_BINARY: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_cmpgt_epi16( v0, thresh8 ); v1 = _mm_cmpgt_epi16( v1, thresh8 ); v0 = _mm_and_si128( v0, maxval8 ); v1 = _mm_and_si128( v1, maxval8 ); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #endif for( ; j < roi.width; j++ ) dst[j] = src[j] > thresh ? maxval : 0; } break; case THRESH_BINARY_INV: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_cmpgt_epi16( v0, thresh8 ); v1 = _mm_cmpgt_epi16( v1, thresh8 ); v0 = _mm_andnot_si128( v0, maxval8 ); v1 = _mm_andnot_si128( v1, maxval8 ); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #endif for( ; j < roi.width; j++ ) dst[j] = src[j] <= thresh ? 
maxval : 0; } break; case THRESH_TRUNC: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_min_epi16( v0, thresh8 ); v1 = _mm_min_epi16( v1, thresh8 ); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #endif for( ; j < roi.width; j++ ) dst[j] = std::min(src[j], thresh); } break; case THRESH_TOZERO: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_and_si128(v0, _mm_cmpgt_epi16(v0, thresh8)); v1 = _mm_and_si128(v1, _mm_cmpgt_epi16(v1, thresh8)); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #endif for( ; j < roi.width; j++ ) { short v = src[j]; dst[j] = v > thresh ? v : 0; } } break; case THRESH_TOZERO_INV: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_andnot_si128(_mm_cmpgt_epi16(v0, thresh8), v0); v1 = _mm_andnot_si128(_mm_cmpgt_epi16(v1, thresh8), v1); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #endif for( ; j < roi.width; j++ ) { short v = src[j]; dst[j] = v <= thresh ? v : 0; } } break; default: return CV_Error( CV_StsBadArg, "" ); } }
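/*
 * Scalar reference for the five threshold modes handled above, matching the
 * per-pixel tail loops of thresh_16s (a minimal sketch; thresh_one_16s is an
 * illustrative helper, not an OpenCV API, and assumes the THRESH_* constants
 * and <algorithm> already available in the surrounding file):
 */
static inline short thresh_one_16s(short v, short thresh, short maxval, int type)
{
    switch (type)
    {
    case THRESH_BINARY:     return v >  thresh ? maxval : (short)0;
    case THRESH_BINARY_INV: return v <= thresh ? maxval : (short)0;
    case THRESH_TRUNC:      return std::min(v, thresh);
    case THRESH_TOZERO:     return v >  thresh ? v : (short)0;
    case THRESH_TOZERO_INV: return v <= thresh ? v : (short)0;
    default:                return v;
    }
}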
static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *dst, const int16_t *filter, int w) { const __m128i k_256 = _mm_set1_epi16(1 << 8); const __m128i f_values = _mm_load_si128((const __m128i *)filter); // pack and duplicate the filter values const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); int i; for (i = 0; i < w; i += 16) { const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); const __m128i C = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); const __m128i D = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); const __m128i E = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); const __m128i F = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); const __m128i G = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); const __m128i H = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the result together const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B); const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H); const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B); const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H); // multiply 2 adjacent elements with the filter and add the result const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0); const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6); const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0); const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6); // add and saturate the results together const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo); const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi); // merge the result together const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D); const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D); // multiply 2 adjacent elements with the filter and add the result const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2); const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2); // merge the result together const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F); const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F); // multiply 2 adjacent elements with the filter and add the result const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4); const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4); // add and saturate the results together __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo)); __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi)); // add and saturate the results together temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo)); temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi)); // round and shift by 7 bit each 16 bit temp_lo = _mm_mulhrs_epi16(temp_lo, k_256); temp_hi = _mm_mulhrs_epi16(temp_hi, k_256); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result temp_hi = _mm_packus_epi16(temp_lo, temp_hi); src_ptr += 16; // save 16 bytes convolve result _mm_store_si128((__m128i *)&dst[i], temp_hi); } }
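/*
 * Note on the rounding step above: with k_256 = 1 << 8,
 * _mm_mulhrs_epi16(x, k_256) computes ((x * 256 + (1 << 14)) >> 15), which is
 * the same rounding as the add-64-then-arithmetic-shift-by-7 pattern used in
 * aom_filter_block1d8_v8_intrin_ssse3 earlier.  Scalar form (a sketch):
 */
static inline short round_shift7(short x)
{
    return (short)((x * 256 + (1 << 14)) >> 15); /* identical to (x + 64) >> 7 */
}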
mlib_status __mlib_ImageBlend_SC_ONE( mlib_image *dst, const mlib_image *src1, const mlib_image *src2, mlib_s32 cmask) { BLEND_VALIDATE; dst_width *= channels; int k; __m128i *px, *py, *pz; __m128i dx, dy; /* upper - 1 lower - 0 */ __m128i dx_1, dx_0, dy_1, dy_0, dz_1, dz_0; __m128i dall_zero; dall_zero = _mm_setzero_si128(); if (0 == (((((mlib_addr) psrc1 | (mlib_addr)psrc2 | (mlib_addr)pdst)) & 0xf)) && (0 == (((src1_stride | src2_stride | dst_stride) & 0xf) || (1 == dst_height)))) { for (j = 0; j < dst_height; j++) { px = (__m128i *)psrc1; py = (__m128i *)psrc2; pz = (__m128i *)pdst; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= dst_width - 16; i += 16) { dx = _mm_load_si128(px); dy = _mm_load_si128(py); UNPACK_UNSIGN_BYTE; PROCESS_DATA(dx_1, dy_1, dz_1); PROCESS_DATA(dx_0, dy_0, dz_0); dz_0 = _mm_packus_epi16(dz_0, dz_1); _mm_store_si128(pz, dz_0); px++; py++; pz++; } #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ DO_REST; psrc1 += src1_stride; psrc2 += src2_stride; pdst += dst_stride; } } else { for (j = 0; j < dst_height; j++) { px = (__m128i *)psrc1; py = (__m128i *)psrc2; pz = (__m128i *)pdst; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= dst_width - 16; i += 16) { dx = _mm_loadu_si128(px); dy = _mm_loadu_si128(py); UNPACK_UNSIGN_BYTE; PROCESS_DATA(dx_1, dy_1, dz_1); PROCESS_DATA(dx_0, dy_0, dz_0); dz_0 = _mm_packus_epi16(dz_0, dz_1); _mm_storeu_si128(pz, dz_0); px++; py++; pz++; } #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ DO_REST; psrc1 += src1_stride; psrc2 += src2_stride; pdst += dst_stride; } } return (MLIB_SUCCESS); }
void ConvertVideoFrame420ToRGB( const th_info *tinfo, const th_ycbcr_buffer ycbcr, unsigned char* pixels ) { // some constant definitions const float single_yoffset = 16.0f; const float single_yexcursion = 219.0f; const float single_cboffset = 128.0f; const float single_cbexcursion = 224.0f; const float single_croffset = 128.0f; const float single_crexcursion = 224.0f; const float kr = 0.299f; const float kb = 0.114f; if (pixels) { const th_img_plane yplane = ycbcr[0]; const th_img_plane cbplane = ycbcr[1]; const th_img_plane crplane = ycbcr[2]; const int width = tinfo->pic_width; const int height = tinfo->pic_height; const int wh = width*height; assert(wh == yplane.width*yplane.height); assert(width % 16 == 0); assert(cbplane.width * 2 == yplane.width); assert(crplane.width * 2 == yplane.width); const unsigned char* ydata = yplane.data; const unsigned char* cbdata = cbplane.data; const unsigned char* crdata = crplane.data; const int ystride = yplane.stride; const int cbstride = cbplane.stride; const int crstride = crplane.stride; const __m128 yoffset = _mm_set_ps1(-single_yoffset); const __m128 yexcursion = _mm_set_ps1(1.0f / single_yexcursion); const __m128 cboffset = _mm_set_ps1(-single_cboffset); const __m128 cbexcursion = _mm_set_ps1(1.0f / single_cbexcursion); const __m128 croffset = _mm_set_ps1(-single_croffset); const __m128 crexcursion = _mm_set_ps1(1.0f / single_crexcursion); const __m128 fr = _mm_set_ps1(255.0f * 2 * (1 - kr)); const __m128 fb = _mm_set_ps1(255.0f * 2 * (1 - kb)); const __m128 f1 = _mm_set_ps1(255.0f * (2 * (1 - kb) * kb / (1 - kb - kr))); const __m128 f2 = _mm_set_ps1(255.0f * (2 * (1 - kr) * kr / (1 - kb - kr))); const __m128 c255 = _mm_set_ps1(255.0f); for(int h = 0; h < height; ++h) { for(int w = 0; w < width; w += 16) { const __m128i yIn = _mm_loadu_si128((const __m128i*)(ydata + h*ystride + w)); // assumption is that there is only one pixel in the cb/cr plane per 4 pixels (2x2) in the y plane const __m128i cbIn = _mm_loadu_si128((const __m128i*)(cbdata + h/2*cbstride + w/2)); const __m128i crIn = _mm_loadu_si128((const __m128i*)(crdata + h/2*crstride + w/2)); // yIn ep8 -> ps const __m128i yInlo = _mm_unpacklo_epi8((yIn), _mm_setzero_si128()); const __m128i yInHi = _mm_unpackhi_epi8((yIn), _mm_setzero_si128()); const __m128i yIn1 = _mm_unpacklo_epi16(yInlo, _mm_setzero_si128()); const __m128i yIn4 = _mm_unpackhi_epi16(yInlo, _mm_setzero_si128()); const __m128i yIn8 = _mm_unpacklo_epi16(yInHi, _mm_setzero_si128()); const __m128i yIn12 = _mm_unpackhi_epi16(yInHi, _mm_setzero_si128()); const __m128 yIn1ps = _mm_cvtepi32_ps(yIn1); const __m128 yIn2ps = _mm_cvtepi32_ps(yIn4); const __m128 yIn3ps = _mm_cvtepi32_ps(yIn8); const __m128 yIn4ps = _mm_cvtepi32_ps(yIn12); // cbIn ep8 -> ps const __m128i cbInExp = _mm_unpacklo_epi8(cbIn, cbIn); const __m128i cbInlo = _mm_unpacklo_epi8(cbInExp, _mm_setzero_si128()); const __m128i cbInHi = _mm_unpackhi_epi8(cbInExp, _mm_setzero_si128()); const __m128i cbIn1 = _mm_unpacklo_epi16(cbInlo, _mm_setzero_si128()); const __m128i cbIn4 = _mm_unpackhi_epi16(cbInlo, _mm_setzero_si128()); const __m128i cbIn8 = _mm_unpacklo_epi16(cbInHi, _mm_setzero_si128()); const __m128i cbIn12 = _mm_unpackhi_epi16(cbInHi, _mm_setzero_si128()); const __m128 cbIn1ps = _mm_cvtepi32_ps(cbIn1); const __m128 cbIn2ps = _mm_cvtepi32_ps(cbIn4); const __m128 cbIn3ps = _mm_cvtepi32_ps(cbIn8); const __m128 cbIn4ps = _mm_cvtepi32_ps(cbIn12); // crIn ep8 -> ps const __m128i crInExp = _mm_unpacklo_epi8(crIn, crIn); const __m128i crInlo = 
_mm_unpacklo_epi8(crInExp, _mm_setzero_si128()); const __m128i crInHi = _mm_unpackhi_epi8(crInExp, _mm_setzero_si128()); const __m128i crIn1 = _mm_unpacklo_epi16(crInlo, _mm_setzero_si128()); const __m128i crIn4 = _mm_unpackhi_epi16(crInlo, _mm_setzero_si128()); const __m128i crIn8 = _mm_unpacklo_epi16(crInHi, _mm_setzero_si128()); const __m128i crIn12 = _mm_unpackhi_epi16(crInHi, _mm_setzero_si128()); const __m128 crIn1ps = _mm_cvtepi32_ps(crIn1); const __m128 crIn2ps = _mm_cvtepi32_ps(crIn4); const __m128 crIn3ps = _mm_cvtepi32_ps(crIn8); const __m128 crIn4ps = _mm_cvtepi32_ps(crIn12); // map [0..255] to [-1/2..+1/2] resp. [0..1] const __m128 yOut1ps = _mm_mul_ps(_mm_add_ps(yIn1ps, yoffset), yexcursion); const __m128 yOut2ps = _mm_mul_ps(_mm_add_ps(yIn2ps, yoffset), yexcursion); const __m128 yOut3ps = _mm_mul_ps(_mm_add_ps(yIn3ps, yoffset), yexcursion); const __m128 yOut4ps = _mm_mul_ps(_mm_add_ps(yIn4ps, yoffset), yexcursion); const __m128 cbOut1ps = _mm_mul_ps(_mm_add_ps(cbIn1ps, cboffset), cbexcursion); const __m128 cbOut2ps = _mm_mul_ps(_mm_add_ps(cbIn2ps, cboffset), cbexcursion); const __m128 cbOut3ps = _mm_mul_ps(_mm_add_ps(cbIn3ps, cboffset), cbexcursion); const __m128 cbOut4ps = _mm_mul_ps(_mm_add_ps(cbIn4ps, cboffset), cbexcursion); const __m128 crOut1ps = _mm_mul_ps(_mm_add_ps(crIn1ps, croffset), crexcursion); const __m128 crOut2ps = _mm_mul_ps(_mm_add_ps(crIn2ps, croffset), crexcursion); const __m128 crOut3ps = _mm_mul_ps(_mm_add_ps(crIn3ps, croffset), crexcursion); const __m128 crOut4ps = _mm_mul_ps(_mm_add_ps(crIn4ps, croffset), crexcursion); // do the actual conversion math (on range 0..255/-127..127 instead or 0..1/-1/2..+1/2 const __m128 y1_255 = _mm_mul_ps(c255, yOut1ps); const __m128 y2_255 = _mm_mul_ps(c255, yOut2ps); const __m128 y3_255 = _mm_mul_ps(c255, yOut3ps); const __m128 y4_255 = _mm_mul_ps(c255, yOut4ps); const __m128 r1_1 = _mm_add_ps(y1_255, _mm_mul_ps(fr, crOut1ps)); const __m128 r2_1 = _mm_add_ps(y2_255, _mm_mul_ps(fr, crOut2ps)); const __m128 r3_1 = _mm_add_ps(y3_255, _mm_mul_ps(fr, crOut3ps)); const __m128 r4_1 = _mm_add_ps(y4_255, _mm_mul_ps(fr, crOut4ps)); const __m128 g1_1 = _mm_sub_ps(_mm_sub_ps(y1_255, _mm_mul_ps(f1, cbOut1ps)), _mm_mul_ps(f2, crOut1ps)); const __m128 g2_1 = _mm_sub_ps(_mm_sub_ps(y2_255, _mm_mul_ps(f1, cbOut2ps)), _mm_mul_ps(f2, crOut2ps)); const __m128 g3_1 = _mm_sub_ps(_mm_sub_ps(y3_255, _mm_mul_ps(f1, cbOut3ps)), _mm_mul_ps(f2, crOut3ps)); const __m128 g4_1 = _mm_sub_ps(_mm_sub_ps(y4_255, _mm_mul_ps(f1, cbOut4ps)), _mm_mul_ps(f2, crOut4ps)); const __m128 b1_1 = _mm_add_ps(y1_255, _mm_mul_ps(fb, cbOut1ps)); const __m128 b2_1 = _mm_add_ps(y2_255, _mm_mul_ps(fb, cbOut2ps)); const __m128 b3_1 = _mm_add_ps(y3_255, _mm_mul_ps(fb, cbOut3ps)); const __m128 b4_1 = _mm_add_ps(y4_255, _mm_mul_ps(fb, cbOut4ps)); // clip to 255 const __m128 r1 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, r1_1)); const __m128 r2 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, r2_1)); const __m128 r3 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, r3_1)); const __m128 r4 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, r4_1)); const __m128 g1 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, g1_1)); const __m128 g2 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, g2_1)); const __m128 g3 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, g3_1)); const __m128 g4 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, g4_1)); const __m128 b1 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, b1_1)); const __m128 b2 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, b2_1)); const 
__m128 b3 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, b3_1)); const __m128 b4 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, b4_1)); // multiplex rgb channels #define rgb_multiplex(no) \ const __m128 rgb##no##_1 = _mm_shuffle_ps( \ _mm_shuffle_ps(b##no##, g##no##, _MM_SHUFFLE(0, 0, 0, 0)), \ _mm_shuffle_ps(r##no##, b##no##, _MM_SHUFFLE(1, 1, 0, 0)), \ _MM_SHUFFLE(2, 0, 2, 0)); \ const __m128 rgb##no##_2 = _mm_shuffle_ps( \ _mm_shuffle_ps(g##no##, r##no##, _MM_SHUFFLE(1, 1, 1, 1)), \ _mm_shuffle_ps(b##no##, g##no##, _MM_SHUFFLE(2, 2, 2, 2)), \ _MM_SHUFFLE(2, 0, 2, 0)); \ const __m128 rgb##no##_3 = _mm_shuffle_ps( \ _mm_shuffle_ps(r##no##, b##no##, _MM_SHUFFLE(3, 3, 2, 2)), \ _mm_shuffle_ps(g##no##, r##no##, _MM_SHUFFLE(3, 3, 3, 3)), \ _MM_SHUFFLE(2, 0, 2, 0)); rgb_multiplex(1); rgb_multiplex(2); rgb_multiplex(3); rgb_multiplex(4); #undef rgb_multiplex // pack 32bit -> 8bit const __m128i pack1l = _mm_packs_epi32(_mm_cvtps_epi32(rgb1_1), _mm_cvtps_epi32(rgb1_2)); const __m128i pack1h = _mm_packs_epi32(_mm_cvtps_epi32(rgb1_3), _mm_cvtps_epi32(rgb2_1)); const __m128i pack1 = _mm_packus_epi16(pack1l, pack1h); const __m128i pack2l = _mm_packs_epi32(_mm_cvtps_epi32(rgb2_2), _mm_cvtps_epi32(rgb2_3)); const __m128i pack2h = _mm_packs_epi32(_mm_cvtps_epi32(rgb3_1), _mm_cvtps_epi32(rgb3_2)); const __m128i pack2 = _mm_packus_epi16(pack2l, pack2h); const __m128i pack3l = _mm_packs_epi32(_mm_cvtps_epi32(rgb3_3), _mm_cvtps_epi32(rgb4_1)); const __m128i pack3h = _mm_packs_epi32(_mm_cvtps_epi32(rgb4_2), _mm_cvtps_epi32(rgb4_3)); const __m128i pack3 = _mm_packus_epi16(pack3l, pack3h); // and finally store in output _mm_storeu_si128((__m128i*)(pixels + ((wh-width)*3) - h*width*3 + w*3 + 0*16), pack1); _mm_storeu_si128((__m128i*)(pixels + ((wh-width)*3) - h*width*3 + w*3 + 1*16), pack2); _mm_storeu_si128((__m128i*)(pixels + ((wh-width)*3) - h*width*3 + w*3 + 2*16), pack3); } } } // if }
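/*
 * Scalar sketch of the per-pixel math vectorised above: video-range BT.601
 * YCbCr (offset 16/128, excursion 219/224, kr = 0.299, kb = 0.114) to
 * full-range RGB, clipped to [0, 255].  The SIMD path rounds via
 * _mm_cvtps_epi32 while this sketch truncates, so results may differ by one.
 */
static unsigned char clip255(float v)
{
    return (unsigned char)(v < 0.0f ? 0.0f : (v > 255.0f ? 255.0f : v));
}

static void ycbcr601_to_rgb(unsigned char y, unsigned char cb, unsigned char cr,
                            unsigned char *r, unsigned char *g, unsigned char *b)
{
    const float kr = 0.299f, kb = 0.114f;
    const float yn = (y  -  16.0f) / 219.0f;   /* [0..1]       */
    const float pb = (cb - 128.0f) / 224.0f;   /* [-1/2..+1/2] */
    const float pr = (cr - 128.0f) / 224.0f;
    *r = clip255(255.0f * (yn + 2.0f * (1.0f - kr) * pr));
    *g = clip255(255.0f * (yn - (2.0f * (1.0f - kb) * kb / (1.0f - kb - kr)) * pb
                              - (2.0f * (1.0f - kr) * kr / (1.0f - kb - kr)) * pr));
    *b = clip255(255.0f * (yn + 2.0f * (1.0f - kb) * pb));
}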
void Compress(hashState *ctx, const unsigned char *pmsg, unsigned int uBlockCount) { unsigned int r, b, i, j; __m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp; __m128i _state[4][4], _state2[4][4], _statebackup[4][4]; for(i = 0; i < 4; i++) for(j = 0; j < ctx->uHashSize / 256; j++) _state[i][j] = ctx->state[i][j]; #ifndef AES_NI // transform cv for(i = 0; i < 4; i++) for(j = 0; j < ctx->uHashSize / 256; j++) { TRANSFORM(_state[i][j], _k_ipt, t1, t2); } #endif for(b = 0; b < uBlockCount; b++) { ctx->k = _mm_add_epi64(ctx->k, ctx->const1536); // load message for(j = ctx->uHashSize / 256; j < 4; j++) { for(i = 0; i < 4; i++) { _state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i); #ifndef AES_NI // transform message TRANSFORM(_state[i][j], _k_ipt, t1, t2); #endif } } // save state SAVESTATE(_statebackup, _state); k1 = ctx->k; #ifdef AES_NI for(r = 0; r < ctx->uRounds / 2; r++) { ECHO_ROUND_UNROLL2; } #else for(r = 0; r < ctx->uRounds / 2; r++) { _state2[0][0] = M128(zero); _state2[1][0] = M128(zero); _state2[2][0] = M128(zero); _state2[3][0] = M128(zero); _state2[0][1] = M128(zero); _state2[1][1] = M128(zero); _state2[2][1] = M128(zero); _state2[3][1] = M128(zero); _state2[0][2] = M128(zero); _state2[1][2] = M128(zero); _state2[2][2] = M128(zero); _state2[3][2] = M128(zero); _state2[0][3] = M128(zero); _state2[1][3] = M128(zero); _state2[2][3] = M128(zero); _state2[3][3] = M128(zero); ECHO_SUB_AND_MIX(_state, 0, 0, _state2, 0, 0, 1, 2, 3); ECHO_SUB_AND_MIX(_state, 1, 0, _state2, 3, 1, 2, 3, 0); ECHO_SUB_AND_MIX(_state, 2, 0, _state2, 2, 2, 3, 0, 1); ECHO_SUB_AND_MIX(_state, 3, 0, _state2, 1, 3, 0, 1, 2); ECHO_SUB_AND_MIX(_state, 0, 1, _state2, 1, 0, 1, 2, 3); ECHO_SUB_AND_MIX(_state, 1, 1, _state2, 0, 1, 2, 3, 0); ECHO_SUB_AND_MIX(_state, 2, 1, _state2, 3, 2, 3, 0, 1); ECHO_SUB_AND_MIX(_state, 3, 1, _state2, 2, 3, 0, 1, 2); ECHO_SUB_AND_MIX(_state, 0, 2, _state2, 2, 0, 1, 2, 3); ECHO_SUB_AND_MIX(_state, 1, 2, _state2, 1, 1, 2, 3, 0); ECHO_SUB_AND_MIX(_state, 2, 2, _state2, 0, 2, 3, 0, 1); ECHO_SUB_AND_MIX(_state, 3, 2, _state2, 3, 3, 0, 1, 2); ECHO_SUB_AND_MIX(_state, 0, 3, _state2, 3, 0, 1, 2, 3); ECHO_SUB_AND_MIX(_state, 1, 3, _state2, 2, 1, 2, 3, 0); ECHO_SUB_AND_MIX(_state, 2, 3, _state2, 1, 2, 3, 0, 1); ECHO_SUB_AND_MIX(_state, 3, 3, _state2, 0, 3, 0, 1, 2); _state[0][0] = M128(zero); _state[1][0] = M128(zero); _state[2][0] = M128(zero); _state[3][0] = M128(zero); _state[0][1] = M128(zero); _state[1][1] = M128(zero); _state[2][1] = M128(zero); _state[3][1] = M128(zero); _state[0][2] = M128(zero); _state[1][2] = M128(zero); _state[2][2] = M128(zero); _state[3][2] = M128(zero); _state[0][3] = M128(zero); _state[1][3] = M128(zero); _state[2][3] = M128(zero); _state[3][3] = M128(zero); ECHO_SUB_AND_MIX(_state2, 0, 0, _state, 0, 0, 1, 2, 3); ECHO_SUB_AND_MIX(_state2, 1, 0, _state, 3, 1, 2, 3, 0); ECHO_SUB_AND_MIX(_state2, 2, 0, _state, 2, 2, 3, 0, 1); ECHO_SUB_AND_MIX(_state2, 3, 0, _state, 1, 3, 0, 1, 2); ECHO_SUB_AND_MIX(_state2, 0, 1, _state, 1, 0, 1, 2, 3); ECHO_SUB_AND_MIX(_state2, 1, 1, _state, 0, 1, 2, 3, 0); ECHO_SUB_AND_MIX(_state2, 2, 1, _state, 3, 2, 3, 0, 1); ECHO_SUB_AND_MIX(_state2, 3, 1, _state, 2, 3, 0, 1, 2); ECHO_SUB_AND_MIX(_state2, 0, 2, _state, 2, 0, 1, 2, 3); ECHO_SUB_AND_MIX(_state2, 1, 2, _state, 1, 1, 2, 3, 0); ECHO_SUB_AND_MIX(_state2, 2, 2, _state, 0, 2, 3, 0, 1); ECHO_SUB_AND_MIX(_state2, 3, 2, _state, 3, 3, 0, 1, 2); ECHO_SUB_AND_MIX(_state2, 0, 3, _state, 3, 0, 1, 2, 3); ECHO_SUB_AND_MIX(_state2, 1, 3, _state, 2, 1, 2, 3, 0); 
ECHO_SUB_AND_MIX(_state2, 2, 3, _state, 1, 2, 3, 0, 1); ECHO_SUB_AND_MIX(_state2, 3, 3, _state, 0, 3, 0, 1, 2); } #endif if(ctx->uHashSize == 256) { for(i = 0; i < 4; i++) { _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]); _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]); _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]); _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]); _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]); _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]); _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]); } } else { for(i = 0; i < 4; i++) { _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]); _state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]); _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]); _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]); _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]); _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]); } } pmsg += ctx->uBlockLength; } #ifndef AES_NI // transform state for(i = 0; i < 4; i++) for(j = 0; j < 4; j++) { TRANSFORM(_state[i][j], _k_opt, t1, t2); } #endif SAVESTATE(ctx->state, _state); }
opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y, int N) { opus_int i, dataSize16; opus_int32 sum; __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; __m128i inVec1_3210, inVec2_3210; sum = 0; dataSize16 = N & ~15; acc1 = _mm_setzero_si128(); acc2 = _mm_setzero_si128(); for (i=0;i<dataSize16;i+=16) { inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); acc1 = _mm_add_epi32(acc1, inVec1_76543210); acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); } acc1 = _mm_add_epi32(acc1, acc2); if (N - i >= 8) { inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); acc1 = _mm_add_epi32(acc1, inVec1_76543210); i += 8; } if (N - i >= 4) { inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]); inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]); inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210); acc1 = _mm_add_epi32(acc1, inVec1_3210); i += 4; } acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1)); acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E)); sum += _mm_cvtsi128_si32(acc1); for (;i<N;i++) { sum = silk_SMLABB(sum, x[i], y[i]); } return sum; }
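/*
 * Scalar reference for the same inner product (a sketch; assumes the
 * fixed-point build where opus_val16 is 16-bit and opus_val32 is 32-bit, and
 * that silk_SMLABB(sum, a, b) is the usual sum + (a16 * b16)
 * multiply-accumulate):
 */
static opus_val32 celt_inner_prod_scalar(const opus_val16 *x,
                                         const opus_val16 *y, int N)
{
    int i;
    opus_val32 sum = 0;
    for (i = 0; i < N; i++)
        sum += (opus_val32)x[i] * (opus_val32)y[i];
    return sum;
}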