// c = a - b
FORCE_INLINE int __ext_v_sub_complex16(struct complex16* c, int len, struct complex16* a, int __unused_2, struct complex16* b, int __unused_1)
{
	const int wlen = 4; // sizeof(vcs) / sizeof(complex16);
	__m128i* As = (__m128i*) a;
	__m128i* Bs = (__m128i*) b;
	__m128i* Cs = (__m128i*) c;
	for (int i = 0; i < len / wlen; i++) {
		__m128i ma = _mm_loadu_si128(&As[i]);
		__m128i mb = _mm_loadu_si128(&Bs[i]);
		_mm_storeu_si128(&Cs[i], _mm_sub_epi16(ma, mb));
	}
	for (int i = (len / wlen) * wlen; i < len; i++) {
		c[i].re = a[i].re - b[i].re;
		c[i].im = a[i].im - b[i].im;
	}
	return 0;
}
//FINL
int __ext_v_shift_right_complex32(struct complex32* z, int __unused_3, struct complex32* x, int len, int shift)
{
	const int wlen = 2; // sizeof(vci) / sizeof(complex32);
	for (int i = 0; i < len / wlen; i++) {
		/* vci *xi = (vci *)(x + wlen*i);
		   vci output = (shift_right(*xi, shift));
		   memcpy((void *)(z + wlen*i), (void *)(&output), sizeof(vci)); */
		__m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i));
		_mm_storeu_si128((__m128i *)(z + wlen*i), _mm_srai_epi32(mx, shift));
	}
	for (int i = (len / wlen) * wlen; i < len; i++) {
		z[i].re = x[i].re >> shift;
		z[i].im = x[i].im >> shift;
	}
	return 0;
}
FORCE_INLINE int __ext_v_add_complex32(struct complex32* c, int len, struct complex32* a, int __unused_2, struct complex32* b, int __unused_1)
{
	const int wlen = 2; // sizeof(vci) / sizeof(complex32);
	__m128i* As = (__m128i*) a;
	__m128i* Bs = (__m128i*) b;
	__m128i* Cs = (__m128i*) c;
	for (int i = 0; i < len / wlen; i++) {
		__m128i ma = _mm_loadu_si128(&As[i]);
		__m128i mb = _mm_loadu_si128(&Bs[i]);
		_mm_storeu_si128(&Cs[i], _mm_add_epi32(ma, mb));
	}
	for (int i = (len / wlen) * wlen; i < len; i++) {
		c[i].re = a[i].re + b[i].re;
		c[i].im = a[i].im + b[i].im;
	}
	return 0;
}
static void ConvertBGRAToRGB565_SSE2(const uint32_t* src, int num_pixels, uint8_t* dst) {
  const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
  const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
  const __m128i mask_0x07 = _mm_set1_epi8(0x07);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);      // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);      // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);      // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);      // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);     // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);     // r0...r7 | b0...b7
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8);    // -r0..-r7|-b0..-b7
    const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);
    const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07);  // g0-...g7-|xx (3b)
    const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);
    const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0);  // -g0...-g7|xx (3b)
    const __m128i b0 = _mm_srli_si128(rb1, 8);               // -b0...-b7|0
    const __m128i rg1 = _mm_or_si128(rb1, g_lo2);            // gr0...gr7|xx
    const __m128i b1 = _mm_srli_epi16(b0, 3);
    const __m128i gb1 = _mm_or_si128(b1, g_hi2);             // bg0...bg7|xx
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1);        // rggb0...rggb7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1);        // bgrb0...bgrb7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}
// Multiply the first source vector by the conjugate of the second source vector
// ie. re + j * im = a * conj(b)
// Return by reference for performance
FORCE_INLINE int __ext_v_conj_mul_complex16(struct complex16* out, int lenout, struct complex16* x, int len1, struct complex16* y, int len2, int shift)
{
	const unum8 wlen = 4; // sizeof(vcs) / sizeof(complex16);
	const __m128i xmm6 = _mm_set1_epi32(0x0000FFFF);
	const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000);
	const __m128i xmm4 = _mm_set1_epi32(0x00010000);
	__m128i* Xs = (__m128i*) x;
	__m128i* Ys = (__m128i*) y;
	__m128i* Outs = (__m128i*) out;
	for (int i = 0; i < len1 / wlen; i++) {
		__m128i mx = _mm_loadu_si128(&Xs[i]);
		__m128i my = _mm_loadu_si128(&Ys[i]);
		// Negate the imaginary halves of y (two's complement via xor + add) and swap
		// re/im within each complex16, so that _mm_madd_epi16 yields the imaginary part.
		__m128i ms2 = _mm_xor_si128(my, xmm5);
		ms2 = _mm_add_epi32(ms2, xmm4);
		ms2 = _mm_shufflehi_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));
		ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));
		__m128i mre = _mm_srai_epi32(_mm_madd_epi16(my, mx), shift);
		__m128i mim = _mm_srai_epi32(_mm_madd_epi16(ms2, mx), shift);
		mre = _mm_and_si128(mre, xmm6);
		mim = _mm_and_si128(mim, xmm6);
		mim = _mm_slli_epi32(mim, 0x10);
		_mm_storeu_si128(&Outs[i], _mm_or_si128(mre, mim));
	}
	for (int i = (len1 / wlen) * wlen; i < len1; i++) {
		out[i].re = (x[i].re * y[i].re + x[i].im * y[i].im) >> shift;
		out[i].im = (x[i].im * y[i].re - x[i].re * y[i].im) >> shift;
	}
	return 0;
}
static void highbd_filter_horiz(const uint16_t *src, int src_stride, __m128i *f, int tapsNum, uint32_t *buf) {
  __m128i u[8], v[6];

  assert(tapsNum == 10 || tapsNum == 12);
  if (tapsNum == 10) {
    src -= 1;
  }

  u[0] = _mm_loadu_si128((__m128i const *)src);
  u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
  u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
  u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));

  u[4] = _mm_loadu_si128((__m128i const *)(src + 8));
  u[5] = _mm_loadu_si128((__m128i const *)(src + src_stride + 8));
  u[6] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride + 8));
  u[7] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride + 8));

  transpose_pair(u, v);

  u[0] = _mm_madd_epi16(v[0], f[0]);
  u[1] = _mm_madd_epi16(v[1], f[1]);
  u[2] = _mm_madd_epi16(v[2], f[2]);
  u[3] = _mm_madd_epi16(v[3], f[3]);
  u[4] = _mm_madd_epi16(v[4], f[4]);
  u[5] = _mm_madd_epi16(v[5], f[5]);

  u[6] = _mm_min_epi32(u[2], u[3]);
  u[7] = _mm_max_epi32(u[2], u[3]);

  u[0] = _mm_add_epi32(u[0], u[1]);
  u[0] = _mm_add_epi32(u[0], u[5]);
  u[0] = _mm_add_epi32(u[0], u[4]);
  u[0] = _mm_add_epi32(u[0], u[6]);
  u[0] = _mm_add_epi32(u[0], u[7]);

  _mm_storeu_si128((__m128i *)buf, u[0]);
}
// TODO: These should eat clock cycles.
static void gdsp_ddma_in(u16 dsp_addr, u32 addr, u32 size)
{
	u8* dst = ((u8*)g_dsp.dram);

#if _M_SSE >= 0x301
	if (cpu_info.bSSSE3 && !(size % 16))
	{
		for (u32 i = 0; i < size; i += 16)
		{
			_mm_storeu_si128((__m128i *)&dst[dsp_addr + i],
			                 _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF]), s_mask));
		}
	}
	else
#endif
	{
		for (u32 i = 0; i < size; i += 2)
		{
			*(u16*)&dst[dsp_addr + i] = Common::swap16(*(const u16*)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF]);
		}
	}

	INFO_LOG(DSPLLE, "*** ddma_in RAM (0x%08x) -> DRAM_DSP (0x%04x) : size (0x%08x)", addr, dsp_addr / 2, size);
}
static void AESNI_CBC_decrypt(const unsigned char *in, unsigned char *out, unsigned char ivec[16], unsigned long length, unsigned char *key, int number_of_rounds)
{
    __m128i data, feedback, last_in;
    int i, j;

    if (length % 16)
        length = length / 16 + 1;
    else
        length /= 16;

    feedback = _mm_loadu_si128((__m128i*)ivec);
    for (i = 0; i < length; i++) {
        last_in = _mm_loadu_si128(&((__m128i*)in)[i]);
        data = _mm_xor_si128(last_in, ((__m128i*)key)[0]);
        for (j = 1; j < number_of_rounds; j++) {
            data = _mm_aesdec_si128(data, ((__m128i*)key)[j]);
        }
        data = _mm_aesdeclast_si128(data, ((__m128i*)key)[j]);
        data = _mm_xor_si128(data, feedback);
        _mm_storeu_si128(&((__m128i*)out)[i], data);
        feedback = last_in;
    }
}
static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
  int pa_minus_pb;
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_cvtsi32_si128(a);
  const __m128i B0 = _mm_cvtsi32_si128(b);
  const __m128i C0 = _mm_cvtsi32_si128(c);
  const __m128i AC0 = _mm_subs_epu8(A0, C0);
  const __m128i CA0 = _mm_subs_epu8(C0, A0);
  const __m128i BC0 = _mm_subs_epu8(B0, C0);
  const __m128i CB0 = _mm_subs_epu8(C0, B0);
  const __m128i AC = _mm_or_si128(AC0, CA0);
  const __m128i BC = _mm_or_si128(BC0, CB0);
  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
  const __m128i diff = _mm_sub_epi16(pb, pa);
  {
    int16_t out[8];
    _mm_storeu_si128((__m128i*)out, diff);
    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
  }
  return (pa_minus_pb <= 0) ? a : b;
}
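// For reference, a scalar sketch of the same selection rule used above: pick a
// when the summed per-channel distance |a - c| is no larger than |b - c|.
// The helper name Select_scalar_sketch is illustrative only, not libwebp's own API.
static uint32_t Select_scalar_sketch(uint32_t a, uint32_t b, uint32_t c) {
  int pa_minus_pb = 0;
  int ch;
  for (ch = 0; ch < 4; ch++) {
    const int av = (int)((a >> (8 * ch)) & 0xff);
    const int bv = (int)((b >> (8 * ch)) & 0xff);
    const int cv = (int)((c >> (8 * ch)) & 0xff);
    const int pa = (av > cv) ? (av - cv) : (cv - av);  // |a - c| for this channel
    const int pb = (bv > cv) ? (bv - cv) : (cv - bv);  // |b - c| for this channel
    pa_minus_pb += pb - pa;
  }
  return (pa_minus_pb <= 0) ? a : b;
}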
//FINL
int __ext_v_shift_right_int16(int16* z, int __unused_3, int16* x, int len, int shift)
{
	const int wlen = 8; // sizeof(vs) / sizeof(int16);
	for (int i = 0; i < len / wlen; i++) {
		/* vs *xi = (vs *)(x + wlen*i);
		   vs output = (shift_right(*xi, shift));
		   memcpy((void *)(z + wlen*i), (void *)(&output), sizeof(vs)); */
		__m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i));
		_mm_storeu_si128((__m128i *)(z + wlen*i), _mm_srai_epi16(mx, shift));
	}
	for (int i = (len / wlen) * wlen; i < len; i++) {
		z[i] = x[i] >> shift;
	}
	return 0;
}
/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
static void shuffle8_sse2(uint8_t* const dest, const uint8_t* const src,
                          const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 8;
  size_t j;
  int k, l;
  uint8_t* dest_for_jth_element;
  __m128i xmm0[8], xmm1[8];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
    /* Fetch 16 elements (128 bytes) then transpose bytes. */
    for (k = 0; k < 8; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i))));
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm1[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
    }
    /* Transpose words */
    for (k = 0, l = 0; k < 4; k++, l += 2) {
      xmm0[k * 2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l + 1]);
      xmm0[k * 2 + 1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l + 1]);
    }
    /* Transpose double words */
    for (k = 0, l = 0; k < 4; k++, l++) {
      if (k == 2) l += 2;
      xmm1[k * 2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l + 2]);
      xmm1[k * 2 + 1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l + 2]);
    }
    /* Transpose quad words */
    for (k = 0; k < 4; k++) {
      xmm0[k * 2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k + 4]);
      xmm0[k * 2 + 1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k + 4]);
    }
    /* Store the result vectors */
    dest_for_jth_element = dest + j;
    for (k = 0; k < 8; k++) {
      _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]);
    }
  }
}
void __ext_v_andnot(unsigned char *output, int outlen, unsigned char *input1, int inlen1, unsigned char *input2, int inlen2)
{
	int cnt = 0;
	int bytelen1 = inlen1 / 8 + ((inlen1 % 8) > 0);

	while (cnt + 16 <= bytelen1) {
		__m128i mi1 = _mm_loadu_si128((__m128i *) (input1 + cnt));
		__m128i mi2 = _mm_loadu_si128((__m128i *) (input2 + cnt));
		_mm_storeu_si128((__m128i *) (output + cnt), _mm_andnot_si128(mi1, mi2));
		cnt += 16;
	}
	while (cnt < bytelen1) {
		output[cnt] = (~input1[cnt]) & input2[cnt];
		cnt++;
	}
	outlen = inlen1;
}
/* The GCM counter. Counts on the last 32 bits, ignoring carry. */
static inline void _nc_count_16_be_4 (uint64_t *init, uint64_t *dst, size_t blocks) {
#if defined (__nc_SSE__)
  __m128i ctr,
          c1   = _mm_set_epi32 (1, 0, 0, 0),
          mask = _mm_set_epi64x (0x0c0d0e0f0b0a0908, 0x0706050403020100);
  ctr = _mm_shuffle_epi8 (_mm_loadu_si128 ((__m128i *) init), mask);
  for (; blocks --; dst += 2) {
    _mm_storeu_si128 ((__m128i *) dst, _mm_shuffle_epi8 (ctr, mask));
    ctr = _mm_add_epi32 (ctr, c1);
  }
#else
  uint64_t qw1 = init[0];
  uint32_t dw3 = ((uint32_t*) init)[2],
           dw4 = be32toh (((uint32_t*) init)[3]);
  for (; blocks --; dst += 2) {
    dst[0] = qw1;
    ((uint32_t*) dst)[2] = dw3;
    ((uint32_t*) dst)[3] = htobe32 (dw4 ++);
  }
#endif
}
void InvShiftRows_sse(BYTE state[][4])
{
	__m128i stateSse = _mm_set_epi8(state[3][3], state[3][2], state[3][1], state[3][0],
	                                state[2][3], state[2][2], state[2][1], state[2][0],
	                                state[1][3], state[1][2], state[1][1], state[1][0],
	                                state[0][3], state[0][2], state[0][1], state[0][0]);
	__m128i shuffleVar = _mm_set_epi8(12, 15, 14, 13, 9, 8, 11, 10, 6, 5, 4, 7, 3, 2, 1, 0);
	__m128i shuffledState = _mm_shuffle_epi8(stateSse, shuffleVar);
	// Cast to __m128i* so the unaligned store writes the shuffled state back in place.
	_mm_storeu_si128((__m128i *)state, shuffledState);
}
void aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len, const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN])
{
	__m128i tot;
	__m128i tout[8];
	const struct blocks8 *blks;
	struct blocks8 *top;
	size_t i, cnt;

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blks = (const struct blocks8 *)from;
		top = (struct blocks8 *)to;
		aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
		    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
		    blks->blk[6], blks->blk[7], tout);
		top->blk[0] = tout[0];
		top->blk[1] = tout[1];
		top->blk[2] = tout[2];
		top->blk[3] = tout[3];
		top->blk[4] = tout[4];
		top->blk[5] = tout[5];
		top->blk[6] = tout[6];
		top->blk[7] = tout[7];
		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tot = aesni_dec(rounds - 1, key_schedule,
		    _mm_loadu_si128((const __m128i *)from));
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}
void png_read_filter_row_sub3_sse(png_row_infop row_info, png_bytep row, png_const_bytep prev_row)
{
   png_size_t i;
   png_bytep rp = row;
   __m128i racc = _mm_setzero_si128();

   PNG_UNUSED(prev_row)

   __m128i nrb = _mm_load_si128((__m128i*)(rp));

   for (i = 0; i < row_info->rowbytes; i += 15, rp += 15)
   {
      __m128i rb = nrb;
#ifndef __SSSE3__
      nrb = _mm_loadu_si128((__m128i*)(rp + 15));
      racc = _mm_srli_si128(_mm_slli_si128(racc, 1), 13);
      racc = _mm_or_si128(racc, _mm_slli_si128(rb, 3));
#else
      nrb = _mm_lddqu_si128((__m128i*)(rp + 15));
      racc = _mm_alignr_epi8(rb, _mm_slli_si128(racc, 1), 13);
#endif

      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 3);
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 3);
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 3);
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 3);
      rb = _mm_add_epi8(rb, racc);

      racc = rb;

      _mm_storeu_si128((__m128i*)rp, rb);
   }
}
static WEBP_INLINE void TransformColorInverse(const VP8LMultipliers* const m, uint32_t* argb_data, int num_pixels) {
  const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_);  // multipliers
  const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
  const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
    const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00);  // masks
    const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
    const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
    const __m128i lower_8bit_mask = _mm_set1_epi32(0x000000ff);
    const __m128i ag = _mm_and_si128(in, alpha_green_mask);      // alpha, green
    const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
    const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
    const __m128i b = in;
    const __m128i r_delta = ColorTransformDelta(g_to_r, g);      // red
    const __m128i r_new = _mm_and_si128(_mm_add_epi32(r, r_delta), lower_8bit_mask);
    const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);
    const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g);    // blue
    const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r_new);
    const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
    const __m128i b_new = _mm_and_si128(_mm_add_epi32(b, b_delta), lower_8bit_mask);
    const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // Fall-back to C-version for left-overs.
  VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
}
// useShift is assumed to come from a compile-time flag (e.g. a bool template
// parameter) selecting whether volShift is applied.
template <bool useShift>
inline void ClampBufferToS16(s16 *out, const s32 *in, size_t size, s8 volShift) {
#ifdef _M_SSE
	// Size will always be 16-byte aligned as the hwBlockSize is.
	while (size >= 8) {
		__m128i in1 = _mm_loadu_si128((__m128i *)in);
		__m128i in2 = _mm_loadu_si128((__m128i *)(in + 4));
		__m128i packed = _mm_packs_epi32(in1, in2);
		if (useShift) {
			packed = _mm_srai_epi16(packed, volShift);
		}
		_mm_storeu_si128((__m128i *)out, packed);
		out += 8;
		in += 8;
		size -= 8;
	}
#elif PPSSPP_ARCH(ARM_NEON)
	int16x4_t signedVolShift = vdup_n_s16(-volShift);  // Can only dynamic-shift right, but by a signed integer
	while (size >= 8) {
		int32x4_t in1 = vld1q_s32(in);
		int32x4_t in2 = vld1q_s32(in + 4);
		int16x4_t packed1 = vqmovn_s32(in1);
		int16x4_t packed2 = vqmovn_s32(in2);
		if (useShift) {
			packed1 = vshl_s16(packed1, signedVolShift);
			packed2 = vshl_s16(packed2, signedVolShift);
		}
		vst1_s16(out, packed1);
		vst1_s16(out + 4, packed2);
		out += 8;
		in += 8;
		size -= 8;
	}
#endif
	// This does the remainder if SIMD was used, otherwise it does it all.
	for (size_t i = 0; i < size; i++) {
		out[i] = clamp_s16(useShift ? (in[i] >> volShift) : in[i]);
	}
}
//FINL
int __ext_v_shift_right_complex16(struct complex16* z, int __unused_3, struct complex16* x, int len, int shift)
{
	const int wlen = 4; // sizeof(vcs) / sizeof(complex16);
	for (int i = 0; i < len / wlen; i++) {
		//vcs *xi = (vcs *)(x + wlen*i);
		//vcs output = (shift_right(*xi, shift));
		//memcpy((void *)(z + wlen*i), (void *)(&output), sizeof(vcs));
		__m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i));
		_mm_storeu_si128((__m128i *)(z + wlen*i), _mm_srai_epi16(mx, shift));
	}
	for (int i = (len / wlen) * wlen; i < len; i++) {
		z[i].re = x[i].re >> shift;
		z[i].im = x[i].im >> shift;
	}
	return 0;
}
// This function is called in code/WiFi/receiver/downSample.blk
//FINL
int __ext_interleave_loww(struct complex16* x, int __unused_5, struct complex16* y, int __unused_4, struct complex16* z, int __unused_3)
{
	assert(__unused_4 == 4);
	assert(__unused_3 == 4);
	assert(__unused_5 == 4);

	/* vcs t1 = *( (vcs*)x );
	   vcs t2 = *( (vcs*)y );
	   vcs *po = (vcs *)z;
	   *po = (vcs)(interleave_low ((vcui&)t1, (vcui&)t2)); */
	__m128i mx = _mm_loadu_si128((__m128i *)x);
	__m128i my = _mm_loadu_si128((__m128i *)y);
	_mm_storeu_si128((__m128i *)z, _mm_unpacklo_epi64(mx, my));
	return 0;
}
// PixelType is assumed to be a template parameter (uint8_t or uint16_t) of this helper.
template <typename PixelType>
static FORCE_INLINE void blur_r6_h_right_sse2(const PixelType *srcp, PixelType *dstp) {
    __m128i avg12 = mm_avg_epu<PixelType>(_mm_loadu_si128((const __m128i *)(srcp - 1)),
                                          _mm_loadu_si128((const __m128i *)(srcp - 2)));
    __m128i avg34 = mm_avg_epu<PixelType>(_mm_loadu_si128((const __m128i *)(srcp - 3)),
                                          _mm_loadu_si128((const __m128i *)(srcp - 4)));
    __m128i avg56 = mm_avg_epu<PixelType>(_mm_loadu_si128((const __m128i *)(srcp - 5)),
                                          _mm_loadu_si128((const __m128i *)(srcp - 6)));
    __m128i avg012 = mm_avg_epu<PixelType>(_mm_loadu_si128((const __m128i *)(srcp)), avg12);
    __m128i avg3456 = mm_avg_epu<PixelType>(avg34, avg56);
    __m128i avg0123456 = mm_avg_epu<PixelType>(avg012, avg3456);
    __m128i avg = mm_avg_epu<PixelType>(avg012, avg0123456);

    // This is the right edge. Only the highest six pixels are needed.
    if (sizeof(PixelType) == 1) {
        int extra_bytes = *(int16_t *)(dstp + 8);
        avg = _mm_insert_epi16(avg, extra_bytes, 4);
        _mm_storeh_pi((__m64 *)(dstp + 8), _mm_castsi128_ps(avg));
    } else {
        int extra_bytes = dstp[0];
        avg = _mm_insert_epi16(avg, extra_bytes, 0);
        extra_bytes = dstp[1];
        avg = _mm_insert_epi16(avg, extra_bytes, 1);
        _mm_storeu_si128((__m128i *)(dstp), avg);
    }
}
int aesni_xcryptecb( aes_context *ctx, int mode, const unsigned char input[16], unsigned char output[16] )
{
    __m128i block;
    const __m128i *subkeys = (__m128i *) ctx->rk;
    const int rounds = ctx->nr;
    int i;

    /* This could be faster if more data was provided at once. */
    block = _mm_loadu_si128( (__m128i *) input );
    block = _mm_xor_si128( block, subkeys[0] );

    if( mode == AES_ENCRYPT )
    {
        for( i = 1; i < rounds - 1; i += 2 )
        {
            block = _mm_aesenc_si128( block, subkeys[i] );
            block = _mm_aesenc_si128( block, subkeys[i + 1] );
        }
        block = _mm_aesenc_si128( block, subkeys[rounds - 1] );
        block = _mm_aesenclast_si128( block, subkeys[rounds] );
    }
    else
    {
        for( i = 1; i < rounds - 1; i += 2 )
        {
            block = _mm_aesdec_si128( block, subkeys[i] );
            block = _mm_aesdec_si128( block, subkeys[i + 1] );
        }
        block = _mm_aesdec_si128( block, subkeys[rounds - 1] );
        block = _mm_aesdeclast_si128( block, subkeys[rounds] );
    }

    _mm_storeu_si128( (__m128i *) output, block );

    return( 0 );
}
/* Modified from volk_32f_s32f_convert_16i_a_simd2. Removed clipping */
void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, uint32_t len) {
#ifdef LV_HAVE_SSE
  unsigned int number = 0;
  const unsigned int eighthPoints = len / 8;

  const float* inputVectorPtr = (const float*)x;
  int16_t* outputVectorPtr = z;

  __m128 vScalar = _mm_set_ps1(scale);
  __m128 inputVal1, inputVal2;
  __m128i intInputVal1, intInputVal2;
  __m128 ret1, ret2;

  for(; number < eighthPoints; number++){
    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;

    ret1 = _mm_mul_ps(inputVal1, vScalar);
    ret2 = _mm_mul_ps(inputVal2, vScalar);

    intInputVal1 = _mm_cvtps_epi32(ret1);
    intInputVal2 = _mm_cvtps_epi32(ret2);

    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);

    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
    outputVectorPtr += 8;
  }

  number = eighthPoints * 8;
  for(; number < len; number++){
    z[number] = (int16_t) (x[number] * scale);
  }
#endif
}
static unsigned reg_sad_sse2(const pixel * const data1, const pixel * const data2,
                             const int width, const int height,
                             const unsigned stride1, const unsigned stride2)
{
  int y, x;
  unsigned sad = 0;
  __m128i sse_inc = _mm_setzero_si128 ();
  long long int sse_inc_array[2];

  for (y = 0; y < height; ++y) {
    for (x = 0; x <= width-16; x+=16) {
      const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
      const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
      sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a,b));
    }
    for (; x < width; ++x) {
      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
    }
  }
  _mm_storeu_si128((__m128i*) sse_inc_array, sse_inc);
  sad += sse_inc_array[0] + sse_inc_array[1];

  return sad;
}
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
static void shuffle4_sse2(uint8_t* const dest, const uint8_t* const src,
                          const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 4;
  size_t i;
  int j;
  uint8_t* dest_for_ith_element;
  __m128i xmm0[4], xmm1[4];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
    /* Fetch 16 elements (64 bytes) then transpose bytes and words. */
    for (j = 0; j < 4; j++) {
      xmm0[j] = _mm_loadu_si128((__m128i*)(src + (i * bytesoftype) + (j * sizeof(__m128i))));
      xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0xd8);
      xmm0[j] = _mm_shuffle_epi32(xmm0[j], 0x8d);
      xmm0[j] = _mm_unpacklo_epi8(xmm1[j], xmm0[j]);
      xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0x04e);
      xmm0[j] = _mm_unpacklo_epi16(xmm0[j], xmm1[j]);
    }
    /* Transpose double words */
    for (j = 0; j < 2; j++) {
      xmm1[j * 2] = _mm_unpacklo_epi32(xmm0[j * 2], xmm0[j * 2 + 1]);
      xmm1[j * 2 + 1] = _mm_unpackhi_epi32(xmm0[j * 2], xmm0[j * 2 + 1]);
    }
    /* Transpose quad words */
    for (j = 0; j < 2; j++) {
      xmm0[j * 2] = _mm_unpacklo_epi64(xmm1[j], xmm1[j + 2]);
      xmm0[j * 2 + 1] = _mm_unpackhi_epi64(xmm1[j], xmm1[j + 2]);
    }
    /* Store the result vectors */
    dest_for_ith_element = dest + i;
    for (j = 0; j < 4; j++) {
      _mm_storeu_si128((__m128i*)(dest_for_ith_element + (j * total_elements)), xmm0[j]);
    }
  }
}
// Predictor1: left.
static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) {
  int i;
  __m128i prev = _mm_set1_epi32(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // a | b | c | d
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    // 0 | a | b | c
    const __m128i shift0 = _mm_slli_si128(src, 4);
    // a | a + b | b + c | c + d
    const __m128i sum0 = _mm_add_epi8(src, shift0);
    // 0 | 0 | a | a + b
    const __m128i shift1 = _mm_slli_si128(sum0, 8);
    // a | a + b | a + b + c | a + b + c + d
    const __m128i sum1 = _mm_add_epi8(sum0, shift1);
    const __m128i res = _mm_add_epi8(sum1, prev);
    _mm_storeu_si128((__m128i*)&out[i], res);
    // replicate prev output on the four lanes
    prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6));
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
  }
}
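// A minimal scalar sketch of the same "add left predictor" step, assuming 8-bit
// channels packed into uint32_t with modulo-256 addition per channel. The helper
// name PredictorAdd1_scalar_sketch is illustrative only, not libwebp's own API.
static void PredictorAdd1_scalar_sketch(const uint32_t* in, int num_pixels, uint32_t* out) {
  int i;
  for (i = 0; i < num_pixels; ++i) {
    const uint32_t left = out[i - 1];  // out[-1] is the pixel preceding the output row
    uint32_t res = 0;
    int ch;
    for (ch = 0; ch < 4; ++ch) {
      const uint32_t v = (((in[i] >> (8 * ch)) & 0xff) + ((left >> (8 * ch)) & 0xff)) & 0xff;
      res |= v << (8 * ch);
    }
    out[i] = res;
  }
}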
//vz optimized template specialization
template<> void cvtScale_<short, short, float>( const short* src, size_t sstep,
                                                short* dst, size_t dstep, Size size,
                                                float scale, float shift )
{
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);

    for( ; size.height--; src += sstep, dst += dstep )
    {
        int x = 0;
        #if CV_SSE2
        if(USE_SSE2)
        {
            __m128 scale128 = _mm_set1_ps (scale);
            __m128 shift128 = _mm_set1_ps (shift);
            for(; x <= size.width - 8; x += 8 )
            {
                __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
                __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
                __m128 rf0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
                __m128 rf1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
                rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
                rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
                r0 = _mm_cvtps_epi32(rf0);
                r1 = _mm_cvtps_epi32(rf1);
                r0 = _mm_packs_epi32(r0, r1);
                _mm_storeu_si128((__m128i*)(dst + x), r0);
            }
        }
        #endif

        for(; x < size.width; x++ )
            dst[x] = saturate_cast<short>(src[x]*scale + shift);
    }
}
static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i pa, pb;
    GetSumAbsDiff32_SSE2(&T, &TL, &pa);   // pa = sum |T-TL|
    GetSumAbsDiff32_SSE2(&L, &TL, &pb);   // pb = sum |L-TL|
    {
      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
      const __m128i A = _mm_and_si128(mask, L);
      const __m128i B = _mm_andnot_si128(mask, T);
      const __m128i pred = _mm_or_si128(A, B);    // pred = (L > T)? L : T
      const __m128i res = _mm_sub_epi8(src, pred);
      _mm_storeu_si128((__m128i*)&out[i], res);
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
static inline void xor_into (uint8_t *src, uint8_t *dst, size_t n) {
#if defined (__nc_SSE2__)
  while (n >= 16) {
    _mm_storeu_si128 (
        (__m128i*) dst,
        _mm_xor_si128 (
            _mm_loadu_si128 ((__m128i*) src),
            _mm_loadu_si128 ((__m128i*) dst)));
    src += 16;
    dst += 16;
    n   -= 16;
  }
#endif
  while (n >= u_long_s) {
    *((u_long *) dst) ^= *((u_long *) src);
    src += u_long_s;
    dst += u_long_s;
    n   -= u_long_s;
  }
  while (n-- > 0) {
    *dst = *(src ++) ^ *dst;
    dst++;
  }
}
int64_t get_sum_vectorised (int64_t * vector)
{
	__m128i sum = _mm_setzero_si128();
	int64_t actualSum = 0;

	// An __m128i holds two 64-bit lanes, so advance two elements per iteration.
	for (int64_t i = 0; i < g_length/2*2; i += 2) {
		__m128i temp = _mm_loadu_si128((__m128i *)(vector + i));
		sum = _mm_add_epi64(sum, temp);
	}

	int64_t A[2] = {0, 0};
	_mm_storeu_si128((__m128i *)A, sum);
	actualSum += A[0] + A[1];

	// Handle the leftover element when g_length is odd.
	for (int64_t i = g_length/2*2; i < g_length; i++) {
		actualSum += vector[i];
	}

	return actualSum;
}