void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
  const __m128i IXABCDEF =
      _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14);
  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF);
  const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG);
  const __m128i XIJKLMNO =
      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
  const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2);
  __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0);
  __m128i rowa = avg2;
  __m128i rowb = avg3;
  int i;
  (void)bd;
  for (i = 0; i < 8; i += 2) {
    _mm_store_si128((__m128i *)dst, rowa);
    dst += stride;
    _mm_store_si128((__m128i *)dst, rowb);
    dst += stride;
    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
    rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14);
  }
}
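/* The libvpx-style high-bitdepth predictors in this collection (d117_8x8
   above, d153_16x16 and d135_32x32 below) call avg3_epu16, rotr_epu16 and a
   rotate_right_epu16 shuffle table that are not reproduced here. The sketch
   below only illustrates what those helpers are assumed to do -- a rounded
   (x + 2y + z + 2) >> 2 average and a one-lane rotation of eight uint16
   values -- and is not necessarily the exact upstream definition. */
#include <stdint.h>
#include <tmmintrin.h>

/* Must stay 16-byte aligned: the predictors fetch it with _mm_load_si128. */
static const uint8_t rotate_right_epu16[16] __attribute__((aligned(16))) = {
  2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1
};

static __m128i avg3_epu16(const __m128i *x, const __m128i *y,
                          const __m128i *z) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i a = _mm_avg_epu16(*x, *z);
  const __m128i b =
      _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
  return _mm_avg_epu16(b, *y); /* (x + 2*y + z + 2) >> 2 per 16-bit lane */
}

static __m128i rotr_epu16(__m128i *x, const __m128i *rotrw) {
  *x = _mm_shuffle_epi8(*x, *rotrw); /* rotate the eight lanes right by one */
  return *x;
}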
void __stdcall planar_shader_to_rgb32_3_f16c(uint8_t** dstp, const uint8_t** srcp,
    const int dpitch, const int spitch, const int width, const int height,
    void* _buff) noexcept
{
    const uint8_t* sr = srcp[0];
    const uint8_t* sg = srcp[1];
    const uint8_t* sb = srcp[2];
    uint8_t* d = dstp[0] + (height - 1) * dpitch;

    float* bb = reinterpret_cast<float*>(_buff);
    float* bg = bb + ((width + 7) & ~7); // must be 32-byte aligned
    float* br = bg + ((width + 7) & ~7); // must be 32-byte aligned

    const __m128 coef = _mm_set1_ps(255.0f);
    const __m128i zero = _mm_setzero_si128();

    for (int y = 0; y < height; ++y) {
        convert_half_to_float(br, sr, width);
        convert_half_to_float(bg, sg, width);
        convert_half_to_float(bb, sb, width);
        for (int x = 0; x < width; x += 4) {
            __m128i b = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(bb + x)));
            __m128i g = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(bg + x)));
            __m128i r = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(br + x)));
            __m128i bgra = _mm_or_si128(b, _mm_slli_si128(g, 1));
            bgra = _mm_or_si128(bgra, _mm_slli_si128(r, 2));
            _mm_stream_si128(reinterpret_cast<__m128i*>(d + x * 4), bgra);
        }
        sr += spitch;
        sg += spitch;
        sb += spitch;
        d -= dpitch;
    }
}
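/* Hypothetical sketch of the convert_half_to_float helper assumed above:
   widen half-precision samples to float with F16C. The scratch buffers are
   padded to a multiple of 8 floats by the caller, so the loop may run past
   the visible width without special tail handling. Name and signature follow
   the call sites above; the real routine may differ. */
#include <immintrin.h>
#include <cstdint>

static void convert_half_to_float(float* dst, const uint8_t* src, int width) noexcept
{
    const uint16_t* s = reinterpret_cast<const uint16_t*>(src);
    for (int x = 0; x < width; x += 8) {
        __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + x));
        _mm256_storeu_ps(dst + x, _mm256_cvtph_ps(h)); // 8 halves -> 8 floats
    }
}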
__m128i aes_schedule_round(__m128i* rcon, __m128i input1, __m128i input2)
{
    if(rcon)
    {
        input2 = _mm_xor_si128(_mm_alignr_epi8(_mm_setzero_si128(), *rcon, 15), input2);
        *rcon = _mm_alignr_epi8(*rcon, *rcon, 15); // next rcon

        input1 = _mm_shuffle_epi32(input1, 0xFF); // rotate
        input1 = _mm_alignr_epi8(input1, input1, 1);
    }

    __m128i smeared = _mm_xor_si128(input2, _mm_slli_si128(input2, 4));
    smeared = mm_xor3(smeared, _mm_slli_si128(smeared, 8), _mm_set1_epi8(0x5B));

    __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input1), 4);
    input1 = _mm_and_si128(low_nibs, input1);

    __m128i t2 = _mm_shuffle_epi8(k_inv2, input1);
    input1 = _mm_xor_si128(input1, t);

    __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
    __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, input1));
    __m128i t5 = _mm_xor_si128(input1, _mm_shuffle_epi8(k_inv1, t3));
    __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

    return mm_xor3(_mm_shuffle_epi8(sb1u, t5), _mm_shuffle_epi8(sb1t, t6), smeared);
}
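/* aes_schedule_round() above leans on helpers from the vector-permute (SSSE3)
   AES construction: mm_xor3 plus the table constants low_nibs, k_inv1, k_inv2,
   sb1u and sb1t. The constants are not reproduced here; mm_xor3 is assumed to
   be nothing more than a three-way XOR, e.g.: */
#include <emmintrin.h>

static inline __m128i mm_xor3(__m128i a, __m128i b, __m128i c)
{
    return _mm_xor_si128(a, _mm_xor_si128(b, c));
}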
inline void Cryptor::assistKey192(__m128i *tmp, __m128i *tmp2, __m128i *tmp3)
{
  // Duplicate the 2nd 32-bit part 4 times:
  // [1, 2, 3, 4] -> [2, 2, 2, 2]
  __m128i tmp4;
  *tmp2 = _mm_shuffle_epi32(*tmp2, SHUFFLE4_32(1, 1, 1, 1));

  tmp4 = _mm_slli_si128(*tmp, 0x4);
  *tmp = _mm_xor_si128(*tmp, tmp4);

  tmp4 = _mm_slli_si128(tmp4, 0x4);
  *tmp = _mm_xor_si128(*tmp, tmp4);

  tmp4 = _mm_slli_si128(tmp4, 0x4);
  *tmp = _mm_xor_si128(*tmp, tmp4);

  *tmp = _mm_xor_si128(*tmp, *tmp2);

  // Duplicate the 4th 32-bit part 4 times.
  *tmp2 = _mm_shuffle_epi32(*tmp, SHUFFLE4_32(3, 3, 3, 3));

  tmp4 = _mm_slli_si128(*tmp3, 0x4);
  *tmp3 = _mm_xor_si128(*tmp3, tmp4);
  *tmp3 = _mm_xor_si128(*tmp3, *tmp2);
}
void png_read_filter_row_sub4_sse(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   png_size_t i;
   __m128i racc = _mm_setzero_si128();
   __m128i* rp = (__m128i*)(row);

   PNG_UNUSED(prev_row)

   for (i = (row_info->rowbytes + 15) >> 4; i > 0; i--)
   {
      __m128i rb = _mm_load_si128(rp);
#ifndef __SSSE3__
      racc = _mm_srli_si128(racc, 12);
      racc = _mm_or_si128(racc, _mm_slli_si128(rb, 4));
#else
      racc = _mm_alignr_epi8(rb, racc, 12);
#endif
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 4);
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 4);
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 4);
      rb = _mm_add_epi8(rb, racc);
      racc = rb;
      _mm_store_si128(rp++, rb);
   }
}
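/* Scalar reference for the PNG Sub filter on 4-byte pixels -- what the vector
   loop above reconstructs 16 bytes at a time: each byte gets the already
   reconstructed byte one pixel (4 bytes) to its left added back in, mod 256.
   Illustrative only. */
#include <stddef.h>

static void png_sub4_scalar(unsigned char* row, size_t rowbytes)
{
   size_t i;
   for (i = 4; i < rowbytes; i++)
      row[i] = (unsigned char)(row[i] + row[i - 4]);
}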
void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
                           uint32_t out[], bool last)
{
   __m128i key1 = *K1;
   __m128i key2 = *K2;
   key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));

   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, key2_with_rcon);

   *K1 = key1;
   _mm_storeu_si128(reinterpret_cast<__m128i*>(out), key1);

   if(last)
      return;

   key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
   key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3,3,3,3)));

   *K2 = key2;
   out[4] = _mm_cvtsi128_si32(key2);
   out[5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
}
// Special case for left-based prediction (when preds==dst-1 or preds==src-1).
static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length,
                            int inverse) {
  int i;
  if (length <= 0) return;
  if (inverse) {
    const int max_pos = length & ~7;
    __m128i last = _mm_set_epi32(0, 0, 0, dst[-1]);
    for (i = 0; i < max_pos; i += 8) {
      const __m128i A0 = _mm_loadl_epi64((const __m128i*)(src + i));
      const __m128i A1 = _mm_add_epi8(A0, last);
      const __m128i A2 = _mm_slli_si128(A1, 1);
      const __m128i A3 = _mm_add_epi8(A1, A2);
      const __m128i A4 = _mm_slli_si128(A3, 2);
      const __m128i A5 = _mm_add_epi8(A3, A4);
      const __m128i A6 = _mm_slli_si128(A5, 4);
      const __m128i A7 = _mm_add_epi8(A5, A6);
      _mm_storel_epi64((__m128i*)(dst + i), A7);
      last = _mm_srli_epi64(A7, 56);
    }
    for (; i < length; ++i) dst[i] = src[i] + dst[i - 1];
  } else {
    const int max_pos = length & ~31;
    for (i = 0; i < max_pos; i += 32) {
      const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
      const __m128i B0 = _mm_loadu_si128((const __m128i*)(src + i + 0 - 1));
      const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
      const __m128i B1 = _mm_loadu_si128((const __m128i*)(src + i + 16 - 1));
      const __m128i C0 = _mm_sub_epi8(A0, B0);
      const __m128i C1 = _mm_sub_epi8(A1, B1);
      _mm_storeu_si128((__m128i*)(dst + i + 0), C0);
      _mm_storeu_si128((__m128i*)(dst + i + 16), C1);
    }
    for (; i < length; ++i) dst[i] = src[i] - src[i - 1];
  }
}
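/* Scalar reference for the 'inverse' branch above: a running byte-wise prefix
   sum seeded from dst[-1]. The SIMD path produces the same eight outputs per
   iteration via a shift-and-add ladder (byte shifts of 1, 2 and 4) and then
   carries the last byte into the next block. Illustrative only. */
#include <stdint.h>

static void PredictLineLeft_inverse_scalar(const uint8_t* src, uint8_t* dst,
                                           int length) {
  int i;
  for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] + dst[i - 1]);
}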
/* same as mul5, but assumes {d,2} contains a[4]*b[4] */
GF2X_STORAGE_CLASS_mul5
void GF2X_FUNC(mul9k3_mul5b)(unsigned long *c, const unsigned long *a,
                             const unsigned long *b, const unsigned long *d)
{
    /* Montgomery formulae with 13 multiplications */
    unsigned long ta[3], tb[3], pa[8], pb[8];
    __v2di p0, p2, p4, p6, p8, p10, p12, p14, p16, p18, p20, p22, p24;
    __v2di t0, t2, t4, t6, t8, t10, t12;

    ta[0] = a[0] ^ a[4];        tb[0] = b[0] ^ b[4];
    ta[1] = a[1] ^ a[2];        tb[1] = b[1] ^ b[2];
    ta[2] = a[3] ^ ta[0];       tb[2] = b[3] ^ tb[0];
    pa[0] = ta[1] ^ ta[2];      pb[0] = tb[1] ^ tb[2];
    pa[1] = a[2] ^ ta[2];       pb[1] = b[2] ^ tb[2];
    pa[2] = ta[0] ^ ta[1];      pb[2] = tb[0] ^ tb[1];
    pa[3] = a[1] ^ ta[2];       pb[3] = b[1] ^ tb[2];
    pa[4] = a[0] ^ a[2] ^ a[3]; pb[4] = b[0] ^ b[2] ^ b[3];
    pa[5] = a[4] ^ ta[1];       pb[5] = b[4] ^ tb[1];
    pa[6] = a[3] ^ a[4];        pb[6] = b[3] ^ b[4];
    pa[7] = a[0] ^ a[1];        pb[7] = b[0] ^ b[1];

    p0  = GF2X_FUNC(mul9k3_mul1)(pa[0], pb[0]);
    p2  = GF2X_FUNC(mul9k3_mul1)(pa[1], pb[1]);
    p4  = GF2X_FUNC(mul9k3_mul1)(pa[2], pb[2]);
    p6  = GF2X_FUNC(mul9k3_mul1)(pa[3], pb[3]);
    p8  = GF2X_FUNC(mul9k3_mul1)(pa[4], pb[4]);
    p10 = GF2X_FUNC(mul9k3_mul1)(pa[5], pb[5]);
    p12 = GF2X_FUNC(mul9k3_mul1)(pa[6], pb[6]);
    p14 = GF2X_FUNC(mul9k3_mul1)(pa[7], pb[7]);
    p16 = GF2X_FUNC(mul9k3_mul1)(ta[0], tb[0]);
    /* p18 = GF2X_FUNC(mul9k3_mul1)(a[4], b[4]); */
    p18 = _mm_loadu_si128((__v2di *) d);
    p20 = GF2X_FUNC(mul9k3_mul1)(a[3], b[3]);
    p22 = GF2X_FUNC(mul9k3_mul1)(a[1], b[1]);
    p24 = GF2X_FUNC(mul9k3_mul1)(a[0], b[0]);

    t0  = p14 ^ p24;
    t2  = p12 ^ p18;
    t4  = p2 ^ p16;
    t6  = p0 ^ p6;
    t8  = p4 ^ p16;
    t10 = p10 ^ t0;
    t12 = p8 ^ t2;

    __v2di ce0 = p24;
    __v2di ce2 = p18 ^ t8 ^ t10;
    __v2di ce4 = p0 ^ p20 ^ p22 ^ t10 ^ t12;
    __v2di ce6 = p24 ^ t4 ^ t12;
    __v2di ce8 = p18;

    __v2di co1 = p22 ^ t0;
    __v2di co3 = t2 ^ t4 ^ t6;
    __v2di co5 = t0 ^ t6 ^ t8;
    __v2di co7 = p20 ^ t2;

    _mm_storeu_si128((__v2di*)(c),     ce0 ^ _mm_slli_si128(co1, 8));
    _mm_storeu_si128((__v2di*)(c + 2), ce2 ^ _mm_srli_si128(co1, 8) ^ _mm_slli_si128(co3, 8));
    _mm_storeu_si128((__v2di*)(c + 4), ce4 ^ _mm_srli_si128(co3, 8) ^ _mm_slli_si128(co5, 8));
    _mm_storeu_si128((__v2di*)(c + 6), ce6 ^ _mm_srli_si128(co5, 8) ^ _mm_slli_si128(co7, 8));
    _mm_storeu_si128((__v2di*)(c + 8), ce8 ^ _mm_srli_si128(co7, 8));
}
GF2X_STORAGE_CLASS_mul5
void gf2x_mul5(unsigned long *c, const unsigned long *a, const unsigned long *b)
{
    /* Montgomery formulae with 13 multiplications, see "Five, Six, and
       Seven-Term Karatsuba-Like Formulae", IEEE Transactions on Computers,
       volume 54, number 3, p. 362-369, 2005 */
    unsigned long ta[3], tb[3], pa[8], pb[8];
    __v2di p0, p2, p4, p6, p8, p10, p12, p14, p16, p18, p20, p22, p24;
    __v2di t0, t2, t4, t6, t8, t10, t12;

    ta[0] = a[0] ^ a[4];        tb[0] = b[0] ^ b[4];
    ta[1] = a[1] ^ a[2];        tb[1] = b[1] ^ b[2];
    ta[2] = a[3] ^ ta[0];       tb[2] = b[3] ^ tb[0];
    pa[0] = ta[1] ^ ta[2];      pb[0] = tb[1] ^ tb[2];
    pa[1] = a[2] ^ ta[2];       pb[1] = b[2] ^ tb[2];
    pa[2] = ta[0] ^ ta[1];      pb[2] = tb[0] ^ tb[1];
    pa[3] = a[1] ^ ta[2];       pb[3] = b[1] ^ tb[2];
    pa[4] = a[0] ^ a[2] ^ a[3]; pb[4] = b[0] ^ b[2] ^ b[3];
    pa[5] = a[4] ^ ta[1];       pb[5] = b[4] ^ tb[1];
    pa[6] = a[3] ^ a[4];        pb[6] = b[3] ^ b[4];
    pa[7] = a[0] ^ a[1];        pb[7] = b[0] ^ b[1];

    p0  = GF2X_FUNC(mul5clk_c_mul1)(pa[0], pb[0]);
    p2  = GF2X_FUNC(mul5clk_c_mul1)(pa[1], pb[1]);
    p4  = GF2X_FUNC(mul5clk_c_mul1)(pa[2], pb[2]);
    p6  = GF2X_FUNC(mul5clk_c_mul1)(pa[3], pb[3]);
    p8  = GF2X_FUNC(mul5clk_c_mul1)(pa[4], pb[4]);
    p10 = GF2X_FUNC(mul5clk_c_mul1)(pa[5], pb[5]);
    p12 = GF2X_FUNC(mul5clk_c_mul1)(pa[6], pb[6]);
    p14 = GF2X_FUNC(mul5clk_c_mul1)(pa[7], pb[7]);
    p16 = GF2X_FUNC(mul5clk_c_mul1)(ta[0], tb[0]);
    p18 = GF2X_FUNC(mul5clk_c_mul1)(a[4], b[4]);
    p20 = GF2X_FUNC(mul5clk_c_mul1)(a[3], b[3]);
    p22 = GF2X_FUNC(mul5clk_c_mul1)(a[1], b[1]);
    p24 = GF2X_FUNC(mul5clk_c_mul1)(a[0], b[0]);

    t0  = p14 ^ p24;
    t2  = p12 ^ p18;
    t4  = p2 ^ p16;
    t6  = p0 ^ p6;
    t8  = p4 ^ p16;
    t10 = p10 ^ t0;
    t12 = p8 ^ t2;

    __v2di ce0 = p24;
    __v2di ce2 = p18 ^ t8 ^ t10;
    __v2di ce4 = p0 ^ p20 ^ p22 ^ t10 ^ t12;
    __v2di ce6 = p24 ^ t4 ^ t12;
    __v2di ce8 = p18;

    __v2di co1 = p22 ^ t0;
    __v2di co3 = t2 ^ t4 ^ t6;
    __v2di co5 = t0 ^ t6 ^ t8;
    __v2di co7 = p20 ^ t2;

    _mm_storeu_si128((__v2di*)(c),     ce0 ^ _mm_slli_si128(co1, 8));
    _mm_storeu_si128((__v2di*)(c + 2), ce2 ^ _mm_srli_si128(co1, 8) ^ _mm_slli_si128(co3, 8));
    _mm_storeu_si128((__v2di*)(c + 4), ce4 ^ _mm_srli_si128(co3, 8) ^ _mm_slli_si128(co5, 8));
    _mm_storeu_si128((__v2di*)(c + 6), ce6 ^ _mm_srli_si128(co5, 8) ^ _mm_slli_si128(co7, 8));
    _mm_storeu_si128((__v2di*)(c + 8), ce8 ^ _mm_srli_si128(co7, 8));
}
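/* Both 5-word products above are built from a mul1 primitive (mul9k3_mul1 /
   mul5clk_c_mul1) that is assumed to be a single 64x64-bit carryless multiply.
   With PCLMUL it could look like the sketch below; the name is hypothetical,
   and GCC's __v2di is layout-compatible with the returned __m128i. */
#include <wmmintrin.h>

static inline __m128i mul1_pclmul(unsigned long a, unsigned long b)
{
    return _mm_clmulepi64_si128(_mm_cvtsi64_si128((long long)a),
                                _mm_cvtsi64_si128((long long)b), 0x00);
}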
void Polyval_Horner(unsigned char* TAG, unsigned char* pH, unsigned char* inp,
                    int length)
{
   __m128i TMP0, TMP1, TMP2, TMP3, TMP4, T, POLY, H;
   int i = 0;
   if (length == 0) return;
   int has_semi = length % 16;
   uint8_t B[16] = {0};
   length /= 16;

   H = _mm_loadu_si128(((__m128i*)pH));
   T = _mm_loadu_si128(((__m128i*)TAG));
   POLY = _mm_setr_epi32(0x1, 0, 0, 0xc2000000);

   for (i = 0; i < length; i++)
   {
      T = _mm_xor_si128(T, _mm_loadu_si128(&((__m128i*)inp)[i]));
      TMP1 = _mm_clmulepi64_si128(T, H, 0x00);
      TMP4 = _mm_clmulepi64_si128(T, H, 0x11);
      TMP2 = _mm_clmulepi64_si128(T, H, 0x10);
      TMP3 = _mm_clmulepi64_si128(T, H, 0x01);
      TMP2 = _mm_xor_si128(TMP2, TMP3);
      TMP3 = _mm_slli_si128(TMP2, 8);
      TMP2 = _mm_srli_si128(TMP2, 8);
      TMP1 = _mm_xor_si128(TMP3, TMP1);
      TMP4 = _mm_xor_si128(TMP4, TMP2);
      TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
      TMP3 = _mm_shuffle_epi32(TMP1, 78);
      TMP1 = _mm_xor_si128(TMP3, TMP2);
      TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
      TMP3 = _mm_shuffle_epi32(TMP1, 78);
      TMP1 = _mm_xor_si128(TMP3, TMP2);
      T = _mm_xor_si128(TMP4, TMP1);
   }

   if (has_semi != 0)
   {
      memcpy(B, inp + length * 16, has_semi);
      T = _mm_xor_si128(T, _mm_loadu_si128((__m128i*)B));
      TMP1 = _mm_clmulepi64_si128(T, H, 0x00);
      TMP4 = _mm_clmulepi64_si128(T, H, 0x11);
      TMP2 = _mm_clmulepi64_si128(T, H, 0x10);
      TMP3 = _mm_clmulepi64_si128(T, H, 0x01);
      TMP2 = _mm_xor_si128(TMP2, TMP3);
      TMP3 = _mm_slli_si128(TMP2, 8);
      TMP2 = _mm_srli_si128(TMP2, 8);
      TMP1 = _mm_xor_si128(TMP3, TMP1);
      TMP4 = _mm_xor_si128(TMP4, TMP2);
      TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
      TMP3 = _mm_shuffle_epi32(TMP1, 78);
      TMP1 = _mm_xor_si128(TMP3, TMP2);
      TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
      TMP3 = _mm_shuffle_epi32(TMP1, 78);
      TMP1 = _mm_xor_si128(TMP3, TMP2);
      T = _mm_xor_si128(TMP4, TMP1);
   }

   _mm_storeu_si128(((__m128i*)TAG), T);
}
__m128i shift_right_sse1(__m128i vec, int shift_num)
{
    if (shift_num == 8)
        return _mm_slli_si128(vec, 1);

    __m128i carryover = _mm_slli_si128(vec, 1);
    carryover = _mm_srli_epi64(carryover, 8 - (shift_num % 8));
    vec = _mm_slli_epi64(vec, shift_num % 8);
    return _mm_or_si128(vec, carryover);
}
static __m128i aes128_keyexpand(__m128i key, __m128i keygened, int shuf)
{
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    /* NB: shuf must resolve to a compile-time constant after inlining;
       _mm_shuffle_epi32 encodes it as an immediate. */
    keygened = _mm_shuffle_epi32(keygened, shuf);
    return _mm_xor_si128(key, keygened);
}
static __m128i assist128(__m128i a, __m128i b)
{
    __m128i tmp = _mm_slli_si128(a, 0x04);
    a = _mm_xor_si128(a, tmp);
    tmp = _mm_slli_si128(tmp, 0x04);
    a = _mm_xor_si128(_mm_xor_si128(a, tmp), _mm_slli_si128(tmp, 0x04));
    return _mm_xor_si128(a, _mm_shuffle_epi32(b, 0xff));
}
__m128i aes128_keyexpand(__m128i key, __m128i keygened)
{
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    keygened = _mm_shuffle_epi32(keygened, _MM_SHUFFLE(3,3,3,3));
    return _mm_xor_si128(key, keygened);
}
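/* Illustrative only: how a helper of this shape is typically driven to derive
   the eleven AES-128 round keys. The round constants must be literals because
   _mm_aeskeygenassist_si128 takes a compile-time immediate; the function name
   and array layout below are hypothetical. */
#include <stdint.h>
#include <wmmintrin.h>

static void aes128_load_key_sketch(const uint8_t enc_key[16], __m128i rk[11])
{
    rk[0]  = _mm_loadu_si128((const __m128i*)enc_key);
    rk[1]  = aes128_keyexpand(rk[0], _mm_aeskeygenassist_si128(rk[0], 0x01));
    rk[2]  = aes128_keyexpand(rk[1], _mm_aeskeygenassist_si128(rk[1], 0x02));
    rk[3]  = aes128_keyexpand(rk[2], _mm_aeskeygenassist_si128(rk[2], 0x04));
    rk[4]  = aes128_keyexpand(rk[3], _mm_aeskeygenassist_si128(rk[3], 0x08));
    rk[5]  = aes128_keyexpand(rk[4], _mm_aeskeygenassist_si128(rk[4], 0x10));
    rk[6]  = aes128_keyexpand(rk[5], _mm_aeskeygenassist_si128(rk[5], 0x20));
    rk[7]  = aes128_keyexpand(rk[6], _mm_aeskeygenassist_si128(rk[6], 0x40));
    rk[8]  = aes128_keyexpand(rk[7], _mm_aeskeygenassist_si128(rk[7], 0x80));
    rk[9]  = aes128_keyexpand(rk[8], _mm_aeskeygenassist_si128(rk[8], 0x1B));
    rk[10] = aes128_keyexpand(rk[9], _mm_aeskeygenassist_si128(rk[9], 0x36));
}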
void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_srli_si128(A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_srli_si128(A1, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
  __m128i row_0 = avg3_0;
  __m128i row_1 = avg3_1;
  __m128i avg2_avg3_left[2][2];
  int i, j;
  (void)bd;

  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);

  for (j = 0; j < 2; ++j) {
    for (i = 0; i < 2; ++i) {
      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
    }
  }
}
void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
  const __m128i B0 = _mm_load_si128((const __m128i *)above);
  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
  const __m128i C1 = _mm_alignr_epi8(B2, B1, 2);
  const __m128i C2 = _mm_alignr_epi8(B3, B2, 2);
  const __m128i C3 = _mm_srli_si128(B3, 2);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
  const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14);
  const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14);
  __m128i rowa_0 = avg3_0;
  __m128i rowa_1 = avg3_1;
  __m128i rowa_2 = avg3_2;
  __m128i rowa_3 = avg3_3;
  __m128i avg3_left[4];
  int i, j;
  (void)bd;

  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
  avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_);
  avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_);

  for (i = 0; i < 4; ++i) {
    __m128i avg_left = avg3_left[i];
    for (j = 0; j < 8; ++j) {
      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
      _mm_store_si128((__m128i *)dst, rowa_0);
      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
      dst += stride;
    }
  }
}
void png_read_filter_row_avg4_sse(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   png_size_t i;
   __m128i* rp = (__m128i*)row;
   const __m128i* prp = (const __m128i*)prev_row;
   __m128i pixel = _mm_setzero_si128();
   const __m128i mask = _mm_set1_epi8(0x01);

   for (i = (row_info->rowbytes + 15) >> 4; i > 0; i--)
   {
      __m128i prb = _mm_load_si128(prp++);
      __m128i rb = _mm_load_si128(rp);

      // First pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
      prb = _mm_srli_si128(prb, 4);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 4);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 12));
#else
      rb = _mm_alignr_epi8(pixel, rb, 4);
#endif

      // Second pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
      prb = _mm_srli_si128(prb, 4);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 4);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 12));
#else
      rb = _mm_alignr_epi8(pixel, rb, 4);
#endif

      // Third pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
      prb = _mm_srli_si128(prb, 4);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 4);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 12));
#else
      rb = _mm_alignr_epi8(pixel, rb, 4);
#endif

      // Fourth pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 4);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 12));
#else
      rb = _mm_alignr_epi8(pixel, rb, 4);
#endif

      _mm_store_si128(rp++, rb);
   }
}
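/* Scalar reference for the PNG Average filter on 4-byte pixels -- what the
   vector loop above reconstructs: each byte gets the truncated average of the
   reconstructed byte one pixel to the left (0 for the first pixel) and the
   byte directly above added back in. Illustrative only. */
#include <stddef.h>

static void png_avg4_scalar(unsigned char* row, const unsigned char* prev,
                            size_t rowbytes)
{
   size_t i;
   for (i = 0; i < rowbytes; i++) {
      unsigned left = (i >= 4) ? row[i - 4] : 0u;
      row[i] = (unsigned char)(row[i] + ((left + prev[i]) >> 1));
   }
}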
__m128i aes_256_key_expansion(__m128i key, __m128i key2)
{
   __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
   key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2,2,2,2));

   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   return _mm_xor_si128(key, key_with_rcon);
}
static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
{
    __m128i tmp4;

    *tmp2 = _mm_shuffle_epi32(*tmp2, 0xFF);
    tmp4 = _mm_slli_si128(*tmp1, 0x04);
    *tmp1 = _mm_xor_si128(*tmp1, tmp4);
    tmp4 = _mm_slli_si128(tmp4, 0x04);
    *tmp1 = _mm_xor_si128(*tmp1, tmp4);
    tmp4 = _mm_slli_si128(tmp4, 0x04);
    *tmp1 = _mm_xor_si128(*tmp1, tmp4);
    *tmp1 = _mm_xor_si128(*tmp1, *tmp2);
}
static inline void KEY_256_ASSIST_1(__m128i* temp1, __m128i* temp2)
{
    __m128i temp4;

    *temp2 = _mm_shuffle_epi32(*temp2, 0xff);
    temp4 = _mm_slli_si128(*temp1, 0x4);
    *temp1 = _mm_xor_si128(*temp1, temp4);
    temp4 = _mm_slli_si128(temp4, 0x4);
    *temp1 = _mm_xor_si128(*temp1, temp4);
    temp4 = _mm_slli_si128(temp4, 0x4);
    *temp1 = _mm_xor_si128(*temp1, temp4);
    *temp1 = _mm_xor_si128(*temp1, *temp2);
}
inline __m128i aesni_128_assist(__m128i t1, __m128i t2)
{
    __m128i t3;

    t2 = _mm_shuffle_epi32(t2, 0xff);
    t3 = _mm_slli_si128(t1, 0x4);
    t1 = _mm_xor_si128(t1, t3);
    t3 = _mm_slli_si128(t3, 0x4);
    t1 = _mm_xor_si128(t1, t3);
    t3 = _mm_slli_si128(t3, 0x4);
    t1 = _mm_xor_si128(t1, t3);
    t1 = _mm_xor_si128(t1, t2);
    return t1;
}
static inline void KEY_256_ASSIST_2(__m128i* temp1, __m128i* temp3)
{
    __m128i temp2, temp4;

    temp4 = _mm_aeskeygenassist_si128(*temp1, 0x0);
    temp2 = _mm_shuffle_epi32(temp4, 0xaa);
    temp4 = _mm_slli_si128(*temp3, 0x4);
    *temp3 = _mm_xor_si128(*temp3, temp4);
    temp4 = _mm_slli_si128(temp4, 0x4);
    *temp3 = _mm_xor_si128(*temp3, temp4);
    temp4 = _mm_slli_si128(temp4, 0x4);
    *temp3 = _mm_xor_si128(*temp3, temp4);
    *temp3 = _mm_xor_si128(*temp3, temp2);
}
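/* Illustrative only: the classic Intel-whitepaper-style driver that pairs the
   two assist helpers above to produce the fifteen AES-256 round keys. The
   function name and key-schedule layout here are hypothetical; the rcon
   arguments must be literals because _mm_aeskeygenassist_si128 takes a
   compile-time immediate. */
#include <wmmintrin.h>

static void aes256_key_expansion_sketch(const unsigned char* userkey, __m128i ks[15])
{
    __m128i temp1 = _mm_loadu_si128((const __m128i*)userkey);
    __m128i temp3 = _mm_loadu_si128((const __m128i*)(userkey + 16));
    __m128i temp2;

    ks[0] = temp1;
    ks[1] = temp3;
    temp2 = _mm_aeskeygenassist_si128(temp3, 0x01);
    KEY_256_ASSIST_1(&temp1, &temp2);  ks[2] = temp1;
    KEY_256_ASSIST_2(&temp1, &temp3);  ks[3] = temp3;
    temp2 = _mm_aeskeygenassist_si128(temp3, 0x02);
    KEY_256_ASSIST_1(&temp1, &temp2);  ks[4] = temp1;
    KEY_256_ASSIST_2(&temp1, &temp3);  ks[5] = temp3;
    temp2 = _mm_aeskeygenassist_si128(temp3, 0x04);
    KEY_256_ASSIST_1(&temp1, &temp2);  ks[6] = temp1;
    KEY_256_ASSIST_2(&temp1, &temp3);  ks[7] = temp3;
    temp2 = _mm_aeskeygenassist_si128(temp3, 0x08);
    KEY_256_ASSIST_1(&temp1, &temp2);  ks[8] = temp1;
    KEY_256_ASSIST_2(&temp1, &temp3);  ks[9] = temp3;
    temp2 = _mm_aeskeygenassist_si128(temp3, 0x10);
    KEY_256_ASSIST_1(&temp1, &temp2);  ks[10] = temp1;
    KEY_256_ASSIST_2(&temp1, &temp3);  ks[11] = temp3;
    temp2 = _mm_aeskeygenassist_si128(temp3, 0x20);
    KEY_256_ASSIST_1(&temp1, &temp2);  ks[12] = temp1;
    KEY_256_ASSIST_2(&temp1, &temp3);  ks[13] = temp3;
    temp2 = _mm_aeskeygenassist_si128(temp3, 0x40);
    KEY_256_ASSIST_1(&temp1, &temp2);  ks[14] = temp1;
}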
static __m128i aes_keygen_assist(__m128i temp1, __m128i temp2)
{
    __m128i temp3;

    temp2 = _mm_shuffle_epi32(temp2, 0xff);
    temp3 = _mm_slli_si128(temp1, 0x4);
    temp1 = vxor(temp1, temp3);
    temp3 = _mm_slli_si128(temp3, 0x4);
    temp1 = vxor(temp1, temp3);
    temp3 = _mm_slli_si128(temp3, 0x4);
    temp1 = vxor(temp1, temp3);
    temp1 = vxor(temp1, temp2);
    return temp1;
}
inline __m128i AES_128_ASSIST(__m128i temp1, __m128i temp2)
{
    __m128i temp3;

    temp2 = _mm_shuffle_epi32(temp2, 0xff);
    temp3 = _mm_slli_si128(temp1, 0x4);
    temp1 = _mm_xor_si128(temp1, temp3);
    temp3 = _mm_slli_si128(temp3, 0x4);
    temp1 = _mm_xor_si128(temp1, temp3);
    temp3 = _mm_slli_si128(temp3, 0x4);
    temp1 = _mm_xor_si128(temp1, temp3);
    temp1 = _mm_xor_si128(temp1, temp2);
    return temp1;
}
/** Performs a carryless multiplication of two 128bit integers modulo
 *  \f$ x^{128} + x^7 + x^2 + x + 1 \f$ */
static __m128i gmul(__m128i v, __m128i h)
{
    /* multiply */
    __m128i z0, z1, z2, tmp;
    z0 = _mm_clmulepi64_si128(v, h, 0x11);
    z2 = _mm_clmulepi64_si128(v, h, 0x00);
    __m128i tmpv = _mm_srli_si128(v, 8);
    tmpv = _mm_xor_si128(tmpv, v);
    __m128i tmph = _mm_srli_si128(h, 8);
    tmph = _mm_xor_si128(tmph, h);
    z1 = _mm_clmulepi64_si128(tmpv, tmph, 0x00);
    z1 = _mm_xor_si128(z1, z0);
    z1 = _mm_xor_si128(z1, z2);
    tmp = _mm_srli_si128(z1, 8);
    __m128i pl = _mm_xor_si128(z0, tmp);
    tmp = _mm_slli_si128(z1, 8);
    __m128i ph = _mm_xor_si128(z2, tmp);
    tmp = _mm_srli_epi64(ph, 63);
    tmp = _mm_srli_si128(tmp, 8);
    pl = shl(pl, 1);
    pl = _mm_xor_si128(pl, tmp);
    ph = shl(ph, 1);

    /* reduce */
    __m128i b, c;
    b = c = _mm_slli_si128(ph, 8);
    b = _mm_slli_epi64(b, 62);
    c = _mm_slli_epi64(c, 57);
    tmp = _mm_xor_si128(b, c);
    __m128i d = _mm_xor_si128(ph, tmp);
    __m128i e = shr(d, 1);
    __m128i f = shr(d, 2);
    __m128i g = shr(d, 7);
    pl = _mm_xor_si128(pl, d);
    pl = _mm_xor_si128(pl, e);
    pl = _mm_xor_si128(pl, f);
    pl = _mm_xor_si128(pl, g);
    return pl;
}
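/* gmul() above calls shl()/shr() helpers that are not shown. They are assumed
   to be full 128-bit shifts by a small bit count, with the carry moved across
   the 64-bit lane boundary via a byte shift. A minimal sketch for 0 < n < 64: */
#include <emmintrin.h>

static __m128i shl(__m128i v, int n)
{
    __m128i lo = _mm_slli_epi64(v, n);
    __m128i carry = _mm_srli_epi64(_mm_slli_si128(v, 8), 64 - n);
    return _mm_or_si128(lo, carry);
}

static __m128i shr(__m128i v, int n)
{
    __m128i hi = _mm_srli_epi64(v, n);
    __m128i carry = _mm_slli_epi64(_mm_srli_si128(v, 8), 64 - n);
    return _mm_or_si128(hi, carry);
}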
void Cryptor::assistKey256_1(__m128i *tmp, __m128i *tmp2)
{
  // Duplicate 4th part 4 times.
  *tmp2 = _mm_shuffle_epi32(*tmp2, SHUFFLE4_32(3, 3, 3, 3));

  __m128i tmp3 = _mm_slli_si128(*tmp, 0x4);
  *tmp = _mm_xor_si128(*tmp, tmp3);

  tmp3 = _mm_slli_si128(tmp3, 0x4);
  *tmp = _mm_xor_si128(*tmp, tmp3);

  tmp3 = _mm_slli_si128(tmp3, 0x4);
  *tmp = _mm_xor_si128(*tmp, tmp3);

  *tmp = _mm_xor_si128(*tmp, *tmp2);
}
static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
{
    __m128i tmp2, tmp4;

    tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x00);
    tmp2 = _mm_shuffle_epi32(tmp4, 0xAA);
    tmp4 = _mm_slli_si128(*tmp3, 0x04);
    *tmp3 = _mm_xor_si128(*tmp3, tmp4);
    tmp4 = _mm_slli_si128(tmp4, 0x04);
    *tmp3 = _mm_xor_si128(*tmp3, tmp4);
    tmp4 = _mm_slli_si128(tmp4, 0x04);
    *tmp3 = _mm_xor_si128(*tmp3, tmp4);
    *tmp3 = _mm_xor_si128(*tmp3, tmp2);
}
uint64_t siphash13(const unsigned char key[16], const unsigned char *m, size_t len) {
    xmmi k, v02, v20, v13, v11, v33, mi;
    uint64_t last7;
    uint32_t lo, hi;
    size_t i, blocks;

    k = _mm_loadu_si128((xmmi *)(key + 0));
    v02 = siphash_init[0].v;
    v13 = siphash_init[1].v;
    v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k));
    v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k));

    last7 = (uint64_t)(len & 0xff) << 56;

    for (i = 0, blocks = (len & ~7); i < blocks; i += 8) {
        mi = _mm_loadl_epi64((xmmi *)(m + i));
        v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
        sipcompress()
        v02 = _mm_xor_si128(v02, mi);
    }

    switch (len - blocks) {
        case 7: last7 |= (uint64_t)m[i + 6] << 48;
        case 6: last7 |= (uint64_t)m[i + 5] << 40;
        case 5: last7 |= (uint64_t)m[i + 4] << 32;
        case 4: last7 |= (uint64_t)m[i + 3] << 24;
        case 3: last7 |= (uint64_t)m[i + 2] << 16;
        case 2: last7 |= (uint64_t)m[i + 1] << 8;
        case 1: last7 |= (uint64_t)m[i + 0];
        case 0:
        default:;
    };

    mi = _mm_unpacklo_epi32(_mm_cvtsi32_si128((uint32_t)last7),
                            _mm_cvtsi32_si128((uint32_t)(last7 >> 32)));
    v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
    sipcompress()
    v02 = _mm_xor_si128(v02, mi);
    v02 = _mm_xor_si128(v02, siphash_final.v);
    sipcompress()
    sipcompress()
    sipcompress()

    v02 = _mm_xor_si128(v02, v13);
    v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2)));
    lo = _mm_cvtsi128_si32(v02);
    hi = _mm_cvtsi128_si32(_mm_srli_si128(v02, 4));
    return ((uint64_t)hi << 32) | lo;
}
template <bool align> void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height,
    const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha)
{
    assert(width >= HA);
    if(align)
    {
        assert(Aligned(blue) && Aligned(blueStride));
        assert(Aligned(green) && Aligned(greenStride));
        assert(Aligned(red) && Aligned(redStride));
        assert(Aligned(bgra) && Aligned(bgraStride));
    }

    __m128i _alpha = _mm_slli_si128(_mm_set1_epi16(alpha), 1);
    size_t alignedWidth = AlignLo(width, HA);
    for(size_t row = 0; row < height; ++row)
    {
        for(size_t col = 0, srcOffset = 0, dstOffset = 0; col < alignedWidth; col += HA, srcOffset += A, dstOffset += DA)
            Bgr48pToBgra32<align>(bgra + dstOffset, blue, green, red, srcOffset, _alpha);
        if(width != alignedWidth)
            Bgr48pToBgra32<false>(bgra + (width - HA)*4, blue, green, red, (width - HA)*2, _alpha);
        blue += blueStride;
        green += greenStride;
        red += redStride;
        bgra += bgraStride;
    }
}
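/* Hypothetical sketch of the per-block overload assumed by the loop above
   (destination pointer, source byte offset, prepared alpha): it packs 8 pixels
   of 16-bit planar B/G/R -- only the low byte of each sample is kept -- plus
   the constant alpha into 32 bytes of interleaved BGRA. Alignment handling is
   glossed over here (unaligned loads/stores regardless of the template flag);
   the real implementation likely uses its own aligned load/store wrappers. */
#include <emmintrin.h>
#include <cstdint>
#include <cstddef>

template <bool align>
inline void Bgr48pToBgra32(uint8_t* bgra, const uint8_t* blue, const uint8_t* green,
                           const uint8_t* red, size_t offset, __m128i alpha)
{
    const __m128i mask = _mm_set1_epi16(0x00FF);
    __m128i b = _mm_and_si128(_mm_loadu_si128((const __m128i*)(blue + offset)), mask);
    __m128i g = _mm_and_si128(_mm_loadu_si128((const __m128i*)(green + offset)), mask);
    __m128i r = _mm_and_si128(_mm_loadu_si128((const __m128i*)(red + offset)), mask);
    __m128i bg = _mm_or_si128(b, _mm_slli_si128(g, 1)); // B and G bytes interleaved
    __m128i ra = _mm_or_si128(r, alpha);                // R and A bytes interleaved
    _mm_storeu_si128((__m128i*)bgra + 0, _mm_unpacklo_epi16(bg, ra));
    _mm_storeu_si128((__m128i*)bgra + 1, _mm_unpackhi_epi16(bg, ra));
}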
// rot must be a compile-time constant: it is used both as a template argument
// and as the byte-shift immediates below, so it is taken as a template parameter.
template <size_t rot>
inline void single_compute_wrap(__m128 n0, __m128 n1, __m128 n2, __m128 n3,
                                float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
{
    __m128i r = single_compute<rot % 2 != 0>(n0, n1, n2, n3, cnt, rnd_c, sum);
    if(rot != 0)
        r = _mm_or_si128(_mm_slli_si128(r, 16 - rot), _mm_srli_si128(r, rot));
    out = _mm_xor_si128(out, r);
}