// Clang CodeGen/assembly test for the SSE2 masked-store intrinsic.
// The DAG:/ASM: comments below are FileCheck directives and must match the
// compiler output exactly — do not edit them as ordinary comments.
void test_mm_maskmoveu_si128(__m128i A, __m128i B, char* C) {
  // DAG-LABEL: test_mm_maskmoveu_si128
  // DAG: call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8* %{{.*}})
  //
  // ASM-LABEL: test_mm_maskmoveu_si128
  // ASM: maskmovdqu
  _mm_maskmoveu_si128(A, B, C);
}
// Filter scanlines of a 3-bytes-per-pixel image with SSE.
// Presumably PNG-style pre-compression filtering: each output row is
// prefixed with a filter-type byte (0 = None for the first row, 2 = Up for
// the rest), so each output row is 3*WIDTH+1 bytes — TODO confirm against
// the caller's encoder.
void filterScanlinesSSE( unsigned char* filtered, unsigned char* image, unsigned int WIDTH, unsigned int HEIGHT )
{
    // Number of whole 16-byte blocks per 3*WIDTH-byte scanline.
    int blocks = 3*WIDTH/16;

    // Create move-mask for last block of each scanline.
    // _mm_set_epi8 takes arguments high-lane first, so byte k of the first
    // operand holds k; cmplt then sets 0xFF in exactly the first
    // (3*WIDTH - 16*blocks) bytes — the scanline remainder.
    __m128i mask = _mm_cmplt_epi8( _mm_set_epi8( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ),
                                   _mm_set1_epi8( 3*WIDTH-16*blocks ) );

    // First scanline: filter type 0 (raw copy).
    {
        const unsigned char* in = image;
        unsigned char* out = filtered;
        *out++ = 0;
        for(int b=0; b<blocks; b++ ) {
            _mm_storeu_si128( (__m128i*)out, _mm_lddqu_si128( (__m128i const*)in ) );
            in += 16;
            out += 16;
        }
        // Masked store writes only the remainder bytes.
        // NOTE(review): the lddqu here still READS a full 16 bytes past the
        // last partial block; the mask only suppresses the writes. Confirm
        // the image buffer is padded or this over-read is acceptable.
        _mm_maskmoveu_si128( _mm_lddqu_si128( (__m128i const*)in ), mask, (char*)out );
    }

    // Remaining scanlines: filter type 2 (Up): out = current - previous row.
    for( unsigned int j=1; j<HEIGHT; j++ ) {
        // in points at row j-1; the current row is at in + 3*WIDTH.
        const unsigned char* in = image + 3*WIDTH*(j-1);
        unsigned char* out = filtered + (3*WIDTH+1)*j;
        *out++ = 2;
        for(int b=0; b<blocks; b++ ) {
            __m128i _t0 = _mm_lddqu_si128( (__m128i const*)in );                 // previous row
            __m128i _t1 = _mm_lddqu_si128( (__m128i const*)(in + 3*WIDTH ) );    // current row
            _mm_storeu_si128( (__m128i*)out, _mm_sub_epi8( _t1, _t0 ) );
            in += 16;
            out += 16;
        }
        // NOTE(review): tail bytes are stored UNFILTERED (raw previous-row
        // bytes, not the Up difference) — the lddqu loads row j-1 only.
        // Verify this is intentional for the consumer of `filtered`.
        _mm_maskmoveu_si128( _mm_lddqu_si128( (__m128i const*)in ), mask, (char*)out );
    }
}
/* Overwrite the alpha byte of every RGBA8 pixel with a constant value.
 * `size` is presumably the pixel count (4 pixels handled per iteration) —
 * the loop touches size/4 16-byte groups, i.e. size pixels in total.
 */
void replace_alpha_rgba8_sse2(const Uint8 alpha, const Uint32 size, Uint8* source)
{
    /* Broadcast the replacement alpha into all 16 byte lanes once. */
    const __m128i alpha_vec = _mm_set1_epi8(alpha);
    /* MASKMOVDQU writes only bytes whose mask high bit is set; 0xFF000000
     * selects byte 3 of each 32-bit pixel — the alpha channel of four
     * consecutive RGBA8 pixels per store. */
    const __m128i alpha_lane_mask = _mm_set1_epi32(0xFF000000);
    Uint32 block;

    for (block = 0; block < (size / 4); block++)
        _mm_maskmoveu_si128(alpha_vec, alpha_lane_mask, (char*)&source[block * 16]);
}
/* Copy a packed A8 alpha plane into the alpha channel of an RGBA8 image.
 * `alpha` holds one byte per pixel; `size` is presumably the pixel count
 * (4 pixels per iteration). Only the alpha byte of each destination pixel
 * is written, via a masked store.
 */
void replace_a8_rgba8_sse2(const Uint8* alpha, const Uint32 size, Uint8* source)
{
    __m128i t0;
    Uint32 i;
    for (i = 0; i < (size / 4); i++)
    {
        /* Load 4 alpha bytes into the low dword of a vector.
         * _mm_castps_si128 is the portable float->integer vector
         * reinterpret; the original `(__m128i)_mm_load_ss(...)` C-style
         * vector cast is a GCC/Clang extension that MSVC rejects. The
         * (const float*) also keeps const-correctness on `alpha`. */
        t0 = _mm_castps_si128(_mm_load_ss((const float*)&alpha[i * 4]));
        /* Widen a0..a3 so each lands in the TOP byte of its dword:
         * unpacklo_epi8 -> 0,a0,0,a1,... ; unpacklo_epi16 -> a0<<24, a1<<24, ... */
        t0 = _mm_unpacklo_epi8(_mm_setzero_si128(), t0);
        t0 = _mm_unpacklo_epi16(_mm_setzero_si128(), t0);
        /* Masked store: only byte 3 (alpha) of each pixel is written. */
        _mm_maskmoveu_si128(t0, _mm_set1_epi32(0xFF000000), (char*)&source[i * 16]);
    }
}
/* Runtime check for _mm_maskmoveu_si128: store `data` through a byte mask
 * built by mask_v() into an unaligned destination (dst+3), then verify that
 * exactly the masked bytes were written and the rest stayed zero.
 */
void static TEST (void)
{
  char src_bytes[16] = { 1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16 };
  char mask_bytes[16];
  char dst[20] = { 0 };
  int i;

  for (i = 0; i < 16; i++)
    mask_bytes[i] = mask_v (i);

  __m128i data = _mm_loadu_si128 ((__m128i *) src_bytes);
  __m128i sel  = _mm_loadu_si128 ((__m128i *) mask_bytes);

  /* Deliberately misaligned destination exercises the unaligned store. */
  _mm_maskmoveu_si128 (data, sel, dst + 3);

  for (i = 0; i < 16; i++)
    if (dst[i + 3] != (mask_bytes[i] ? src_bytes[i] : 0))
      abort ();
}
// Compute an 8-bit Local Binary Pattern (LBP) code for a 16-pixel-wide strip
// of `src`, writing results into `dst` at the same offset. Three row buffers
// are cycled via `phase_map` so each iteration loads only one new row; each
// of the 8 neighbor comparisons sets one bit of the code. Relies on the
// file-scope globals `ones.q` and `lbp_valid_mask.q` (not visible here) —
// presumably all-ones weights and a mask excluding the strip's edge lanes;
// TODO confirm their definitions.
static inline void calc_lbp_16_strip(IplImage * src, IplImage * dst, unsigned base)
{
    const signed char* src_data = (signed char*)(src->imageData + base);
    unsigned char * dst_data = (unsigned char*)(dst->imageData + base);
    // Last row that can serve as the "current" row (needs one row below it).
    const signed char* const src_end = (signed char*)src->imageData + (src->height-1) * src->widthStep;

    __m128i pixels[3];

    // Load first two rows
    //pixels[0] = *(__m128i*)src_data;//_mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    pixels[0] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    //pixels[0] = _mm_xor_si128(pixels[0], sign_bit.q); // conversion from unsigned to signed - invert sign bit
    src_data += src->widthStep;

    //pixels[1] = *(__m128i*)src_data;//_mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    pixels[1] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    //pixels[1] = _mm_xor_si128(pixels[1], sign_bit.q);
    src_data += src->widthStep;

    int phase = 2;

    // For each phase, maps to (row above, center row, row below) so the
    // three buffers rotate without copying.
    __m128i * phase_map[3][3] = {
        {pixels+1, pixels+2, pixels},
        {pixels+2, pixels, pixels+1},
        {pixels, pixels+1, pixels+2},
    };

    while (src_data < src_end)
    {
        register __m128i weight = ones.q;
        register __m128i code = _mm_setzero_si128();

        //pixels[phase] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
        //pixels[phase] = _mm_xor_si128(pixels[phase], sign_bit.q);
        //pixels[phase] = _mm_xor_si128(_mm_lddqu_si128((__m128i*)src_data), sign_bit.q);
        pixels[phase] = _mm_lddqu_si128((__m128i*)src_data);

        src_data += src->widthStep;
        dst_data += dst->widthStep;

        _mm_prefetch(src_data, _MM_HINT_T0);

        // a = row above, b = center row, c = row below (per phase_map).
        register __m128i a = *(phase_map[phase][0]);
        register __m128i b = *(phase_map[phase][1]);
        register __m128i c = *(phase_map[phase][2]);
        phase++;
        phase = (phase == 3) ? 0 : phase;

        // Each block compares the center pixel b against one neighbor
        // (byte-shifted vector = horizontal neighbor) and ORs the current
        // weight bit into the code where the neighbor is greater.

        // X . .     A
        // . o .     B
        // . . .     C
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(a, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . X .
        // .   .
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, a), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . . X
        // .   .
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(a, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . . .
        // .   X
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(b, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . . .
        // .   .
        // . . X
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(c, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . . .
        // .   .
        // . X .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, c), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . . .
        // .   .
        // X . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(c, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . . .
        // X   .
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(b, 1)), weight));

        _mm_maskmoveu_si128(code, lbp_valid_mask.q, (char*)dst_data); // store the results - unaligned write
    }
}
// Compile/link smoke test for _mm_maskmoveu_si128.
// With an all-zero mask MASKMOVDQU writes no bytes, so the null destination
// is never actually stored through.
int main(int, char**) {
    __m128i zero = _mm_setzero_si128();
    _mm_maskmoveu_si128(zero, zero, 0);
    return 0;
}
int haraka512256(unsigned char *hash, const unsigned char *msg) { // stuff we need int i, j; __m128i s[4], tmp, rcon; __m128i MSB64 = _mm_set_epi32(0xFFFFFFFF,0xFFFFFFFF,0,0); // set initial round constant rcon = _mm_set_epi32(1,1,1,1); // initialize state to msg s[0] = _mm_load_si128(&((__m128i*)msg)[0]); s[1] = _mm_load_si128(&((__m128i*)msg)[1]); s[2] = _mm_load_si128(&((__m128i*)msg)[2]); s[3] = _mm_load_si128(&((__m128i*)msg)[3]); //printf("= input state =\n"); //printstate512(s[0], s[1], s[2], s[3]); for (i = 0; i < ROUNDS; ++i) { // aes round(s) for (j = 0; j < AES_PER_ROUND; ++j) { s[0] = _mm_aesenc_si128(s[0], rcon); s[1] = _mm_aesenc_si128(s[1], rcon); s[2] = _mm_aesenc_si128(s[2], rcon); s[3] = _mm_aesenc_si128(s[3], rcon); rcon = _mm_slli_epi32(rcon, 1); } //printf("= round %d : after aes layer =\n", i); //printstate512(s[0], s[1], s[2], s[3]); // mixing tmp = _mm_unpacklo_epi32(s[0], s[1]); s[0] = _mm_unpackhi_epi32(s[0], s[1]); s[1] = _mm_unpacklo_epi32(s[2], s[3]); s[2] = _mm_unpackhi_epi32(s[2], s[3]); s[3] = _mm_unpacklo_epi32(s[0], s[2]); s[0] = _mm_unpackhi_epi32(s[0], s[2]); s[2] = _mm_unpackhi_epi32(s[1], tmp); s[1] = _mm_unpacklo_epi32(s[1], tmp); //printf("= round %d : after mix layer =\n", i); //printstate512(s[0], s[1], s[2], s[3]); // little-endian mixing (not used) // tmp = _mm_unpackhi_epi32(s[1], s[0]); // s[0] = _mm_unpacklo_epi32(s[1], s[0]); // s[1] = _mm_unpackhi_epi32(s[3], s[2]); // s[2] = _mm_unpacklo_epi32(s[3], s[2]); // s[3] = _mm_unpackhi_epi32(s[2], s[0]); // s[0] = _mm_unpacklo_epi32(s[2], s[0]); // s[2] = _mm_unpacklo_epi32(tmp, s[1]); // s[1] = _mm_unpackhi_epi32(tmp, s[1]); } //printf("= output from permutation =\n"); //printstate512(s[0], s[1], s[2], s[3]); // xor message to get DM effect s[0] = _mm_xor_si128(s[0], _mm_load_si128(&((__m128i*)msg)[0])); s[1] = _mm_xor_si128(s[1], _mm_load_si128(&((__m128i*)msg)[1])); s[2] = _mm_xor_si128(s[2], _mm_load_si128(&((__m128i*)msg)[2])); s[3] = _mm_xor_si128(s[3], 
_mm_load_si128(&((__m128i*)msg)[3])); //printf("= after feed-forward =\n"); //printstate512(s[0], s[1], s[2], s[3]); // truncate and store result _mm_maskmoveu_si128(s[0], MSB64, (hash-8)); _mm_maskmoveu_si128(s[1], MSB64, (hash+0)); _mm_storel_epi64((__m128i*)(hash + 16), s[2]); _mm_storel_epi64((__m128i*)(hash + 24), s[3]); }
/* Replace every occurrence of `val` in the n-byte array `p` with
 * `substitute`, in place. A scalar loop is always present; SSE2/AVX2 fast
 * paths are compiled in when the COREARRAY_SIMD_* macros are defined.
 * Fixes: removed the unused `zero`/`ones` locals and the stale TODO in the
 * AVX2 path; no behavioral change.
 */
void vec_i8_replace(int8_t *p, size_t n, int8_t val, int8_t substitute)
{
#ifdef COREARRAY_SIMD_SSE2

    // header 1: scalar steps until p is 16-byte aligned
    size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
    for (; (n > 0) && (h > 0); n--, h--, p++)
        if (*p == val) *p = substitute;

    // body, SSE2
    const __m128i mask = _mm_set1_epi8(val);      // compare pattern
    const __m128i sub  = _mm_set1_epi8(substitute);

#   ifdef COREARRAY_SIMD_AVX2

    // header 2: one 16-byte step so p becomes 32-byte aligned
    if ((n >= 16) && ((size_t)p & 0x10))
    {
        __m128i v = _mm_load_si128((__m128i const*)p);
        __m128i c = _mm_cmpeq_epi8(v, mask);
        if (_mm_movemask_epi8(c))
        {
            // blend: substitute where equal, original byte elsewhere
            _mm_store_si128((__m128i *)p,
                _mm_or_si128(_mm_and_si128(c, sub), _mm_andnot_si128(c, v)));
        }
        n -= 16; p += 16;
    }

    const __m256i mask2 = _mm256_set1_epi8(val);
    const __m256i sub32 = _mm256_set1_epi8(substitute);

    for (; n >= 32; n-=32, p+=32)
    {
        __m256i v = _mm256_load_si256((__m256i const*)p);
        __m256i c = _mm256_cmpeq_epi8(v, mask2);
        if (_mm256_movemask_epi8(c))
        {
            _mm256_store_si256((__m256i *)p,
                _mm256_or_si256(_mm256_and_si256(c, sub32),
                _mm256_andnot_si256(c, v)));
        }
    }

#   endif

    // SSE2 16-byte blocks: masked store touches only the matching bytes
    for (; n >= 16; n-=16, p+=16)
    {
        __m128i v = _mm_load_si128((__m128i const*)p);
        __m128i c = _mm_cmpeq_epi8(v, mask);
        if (_mm_movemask_epi8(c))
            _mm_maskmoveu_si128(sub, c, (char*)p);
    }

#endif

    // tail (and scalar fallback when SIMD is disabled)
    for (; n > 0; n--, p++)
        if (*p == val) *p = substitute;
}