Exemplo n.º 1
0
// Clang/LLVM lit test: checks that _mm_maskmoveu_si128 lowers to the
// llvm.x86.sse2.maskmov.dqu intrinsic at the IR level and to the
// MASKMOVDQU instruction in the final assembly. The DAG:/ASM: comments
// below are FileCheck directives and must not be reworded.
void test_mm_maskmoveu_si128(__m128i A, __m128i B, char* C) {
  // DAG-LABEL: test_mm_maskmoveu_si128
  // DAG: call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8* %{{.*}})
  //
  // ASM-LABEL: test_mm_maskmoveu_si128
  // ASM: maskmovdqu
  _mm_maskmoveu_si128(A, B, C);
}
Exemplo n.º 2
0
void
filterScanlinesSSE( unsigned char* filtered,
                    unsigned char* image,
                    unsigned int WIDTH,
                    unsigned int HEIGHT )
{
    // Apply PNG-style scanline filters with SSE2:
    //   row 0        -> filter type 0 (None): out = raw
    //   rows 1..H-1  -> filter type 2 (Up):   out = raw - prior (mod 256)
    // Each output scanline is a 1-byte filter tag followed by 3*WIDTH
    // payload bytes, so the output stride is 3*WIDTH+1.
    int blocks = 3*WIDTH/16;                 // full 16-byte SIMD blocks per scanline
    unsigned int rem = 3*WIDTH - 16*blocks;  // leftover tail bytes (0..15)

    // Byte-lane mask for the partial tail block: lane i is stored iff i < rem.
    // (_mm_set_epi8 lists lanes high-to-low, so byte i of the vector holds i.)
    __m128i mask = _mm_cmplt_epi8( _mm_set_epi8( 15, 14, 13, 12, 11, 10, 9, 8,
                                                  7,  6,  5,  4,  3,  2, 1, 0 ),
                                   _mm_set1_epi8( (char)rem ) );

    // Scanline 0: filter "None" — straight copy.
    // (_mm_loadu_si128 replaces _mm_lddqu_si128: identical semantics, SSE2-only.)
    {
        const unsigned char* in = image;
        unsigned char* out = filtered;
        *out++ = 0;                          // filter-type byte
        for(int b=0; b<blocks; b++ ) {
            _mm_storeu_si128( (__m128i*)out, _mm_loadu_si128( (__m128i const*)in ) );
            in += 16;
            out += 16;
        }
        if( rem )                            // guard: avoid an out-of-bounds load when 3*WIDTH%16==0
            _mm_maskmoveu_si128( _mm_loadu_si128( (__m128i const*)in ), mask, (char*)out );
    }

    // Scanlines 1..HEIGHT-1: filter "Up" — difference with the prior raw row.
    for( unsigned int j=1; j<HEIGHT; j++ ) {
        const unsigned char* in = image + 3*WIDTH*(j-1);   // prior raw row
        unsigned char* out = filtered + (3*WIDTH+1)*j;
        *out++ = 2;                          // filter-type byte
        for(int b=0; b<blocks; b++ ) {
            __m128i prior = _mm_loadu_si128( (__m128i const*)in );
            __m128i raw   = _mm_loadu_si128( (__m128i const*)(in + 3*WIDTH) );

            _mm_storeu_si128( (__m128i*)out,
                              _mm_sub_epi8( raw, prior ) );
            in += 16;
            out += 16;
        }
        if( rem ) {
            // BUGFIX: the tail must also store raw - prior; the original
            // masked-stored the prior row's raw bytes (lddqu(in)) instead.
            __m128i prior = _mm_loadu_si128( (__m128i const*)in );
            __m128i raw   = _mm_loadu_si128( (__m128i const*)(in + 3*WIDTH) );
            _mm_maskmoveu_si128( _mm_sub_epi8( raw, prior ), mask, (char*)out );
        }
    }
}
Exemplo n.º 3
0
void replace_alpha_rgba8_sse2(const Uint8 alpha, const Uint32 size, Uint8* source)
{
	// Overwrite the alpha byte (MSB of each little-endian RGBA dword) of
	// every pixel with the constant `alpha`, four pixels (16 bytes) per step.
	// `size` is the pixel count; each iteration covers 4 pixels.
	const __m128i alpha_block = _mm_set1_epi8(alpha);
	const __m128i alpha_lane_mask = _mm_set1_epi32(0xFF000000);
	Uint32 block;

	for (block = 0; block < size / 4; block++)
	{
		// Masked store: only bytes whose mask MSB is set are written,
		// so R, G and B channels are left untouched.
		_mm_maskmoveu_si128(alpha_block, alpha_lane_mask,
			(char*)(source + block * 16));
	}
}
Exemplo n.º 4
0
void replace_a8_rgba8_sse2(const Uint8* alpha, const Uint32 size, Uint8* source)
{
	// Copy an A8 alpha plane into the alpha channel of an RGBA8 image,
	// four pixels (4 alpha bytes / 16 image bytes) per iteration.
	// `size` is the pixel count.
	__m128i t0;
	Uint32 i;

	for (i = 0; i < (size / 4); i++)
	{
		// Load 4 alpha bytes into the low dword of an XMM register.
		// FIX: _mm_castps_si128 is the portable float->int reinterpret;
		// the original used a GCC-only vector cast `(__m128i)`, which does
		// not compile under MSVC. Same bits, no instruction emitted.
		// NOTE(review): loading byte data through a float* relies on the
		// compiler not exploiting strict aliasing here — confirm build flags.
		t0 = _mm_castps_si128(_mm_load_ss((const float*)&alpha[i * 4]));

		// Spread byte k of the low dword into byte 3 (MSB) of dword k:
		// after the two zero-interleaves each 32-bit lane is 0x00 00 00 aK.
		t0 = _mm_unpacklo_epi8(_mm_setzero_si128(), t0);
		t0 = _mm_unpacklo_epi16(_mm_setzero_si128(), t0);

		// Masked store writes only the alpha byte of each pixel.
		_mm_maskmoveu_si128(t0, _mm_set1_epi32(0xFF000000),
			(char*)&source[i * 16]);
	}
}
Exemplo n.º 5
0
static void
TEST (void)
{
    // Exercise _mm_maskmoveu_si128 against a scalar model: a destination
    // byte must equal the source byte where the mask byte selects it
    // (mask_v presumably sets the MSB for selected lanes — it is defined
    // elsewhere in this file) and remain zero everywhere else.
    char src_bytes[16] = { 1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16 };
    char mask_bytes[16];
    char dst[20] = { 0 };
    int k;

    for (k = 0; k < 16; k++)
        mask_bytes[k] = mask_v (k);

    __m128i value = _mm_loadu_si128 ((__m128i *)src_bytes);
    __m128i store_mask = _mm_loadu_si128 ((__m128i *)mask_bytes);

    // Deliberately unaligned destination (offset 3) — MASKMOVDQU permits it.
    _mm_maskmoveu_si128 (value, store_mask, dst + 3);

    for (k = 0; k < 16; k++)
        if (dst[k + 3] != (mask_bytes[k] ? src_bytes[k] : 0))
            abort ();
}
Exemplo n.º 6
0
// Compute the 8-neighbour LBP (local binary pattern) code for a 16-pixel-wide
// vertical strip of `src` starting at byte offset `base`, writing one code
// byte per centre pixel into the corresponding position of `dst`.
//
// The three most recent rows are kept in a rotating 3-slot buffer (`pixels`);
// `phase` selects the slot overwritten next, and `phase_map[phase]` names the
// slots as A (row above), B (centre row), C (row below) for that iteration.
//
// NOTE(review): relies on file-scope externals `ones.q` (per-byte weight
// seed) and `lbp_valid_mask.q` (store mask for the valid output lanes) not
// visible in this chunk — confirm their definitions.
// NOTE(review): `_mm_cmplt_epi8` is a *signed* byte compare; the commented-out
// `sign_bit` XORs suggest the pixel data is assumed to be already in signed
// form — confirm against the callers.
static inline void calc_lbp_16_strip(IplImage * src, IplImage * dst, unsigned base)
{
    const signed char* src_data = (signed char*)(src->imageData + base);
    unsigned char * dst_data = (unsigned char*)(dst->imageData + base);
    // Stop at the start of the last row: the loop always reads one row ahead.
    const signed char* const src_end = (signed char*)src->imageData + (src->height-1) * src->widthStep;

    __m128i pixels[3];  // rotating buffer holding three consecutive rows

    // Load first two rows
    //pixels[0] = *(__m128i*)src_data;//_mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    pixels[0] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    //pixels[0] = _mm_xor_si128(pixels[0], sign_bit.q); // conversion from unsigned to signed - invert sign bit
    src_data += src->widthStep;
    //pixels[1] = *(__m128i*)src_data;//_mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    pixels[1] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    //pixels[1] = _mm_xor_si128(pixels[1], sign_bit.q);
    src_data += src->widthStep;

    int phase = 2;  // slot of `pixels` to fill on the next iteration

    // phase_map[phase] = { A = row above, B = centre row, C = row below }
    __m128i * phase_map[3][3] = {
        {pixels+1, pixels+2, pixels},
        {pixels+2, pixels, pixels+1},
        {pixels, pixels+1, pixels+2},
    };

    while (src_data < src_end)
    {
        register __m128i weight = ones.q;             // bit value of the current neighbour (doubles each step)
        register __m128i code = _mm_setzero_si128();  // accumulated LBP codes for 16 pixels

        //pixels[phase] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
        //pixels[phase] = _mm_xor_si128(pixels[phase], sign_bit.q);
        //pixels[phase] = _mm_xor_si128(_mm_lddqu_si128((__m128i*)src_data), sign_bit.q);
        pixels[phase] = _mm_lddqu_si128((__m128i*)src_data);

        src_data += src->widthStep;
        dst_data += dst->widthStep;
        
        _mm_prefetch(src_data, _MM_HINT_T0);

        register __m128i a = *(phase_map[phase][0]);
        register __m128i b = *(phase_map[phase][1]);
        register __m128i c = *(phase_map[phase][2]);

        phase++;
        phase = (phase == 3) ? 0 : phase;

        // Each neighbour test sets the current weight bit where the centre
        // pixel B is less than the (byte-shifted) neighbour. _mm_slli_si128 /
        // _mm_srli_si128 shift by whole bytes, aligning the left/right
        // neighbour columns with the centre lane.

        // X . .   A
        // . o .   B
        // . . .   C
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(a, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . X .
        // .   .
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, a), weight));
        weight = _mm_slli_epi64(weight, 1);
        
        // . . X
        // .   .
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(a, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . . .
        // .   X
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(b, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);
        
        // . . .
        // .   .
        // . . X
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(c, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . . .
        // .   .
        // . X .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, c), weight));
        weight = _mm_slli_epi64(weight, 1);
        
        // . . .
        // .   .
        // X . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(c, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);
        
        // . . .
        // X   .
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(b, 1)), weight)); 

        _mm_maskmoveu_si128(code, lbp_valid_mask.q, (char*)dst_data); // store the results - unaligned write
    }
}
Exemplo n.º 7
0
int main(int, char**)
{
    // Configure-style probe: does the toolchain provide SSE2 intrinsics?
    // The all-zero mask makes the masked store write nothing, so the null
    // destination pointer is never dereferenced at runtime.
    const __m128i zero = _mm_setzero_si128();
    _mm_maskmoveu_si128(zero, zero, (char *)0);
    return 0;
}
Exemplo n.º 8
0
int haraka512256(unsigned char *hash, const unsigned char *msg) {
	// Haraka-512/256: permute the 512-bit input with ROUNDS rounds (each
	// AES_PER_ROUND AES rounds plus a column mix), XOR the input back in
	// (Davies-Meyer feed-forward), then truncate the state to 256 bits.
	// Returns 0 on success.
	int i, j;
	__m128i s[4], tmp, rcon;

	// set initial round constant (doubled per 32-bit lane each AES step)
	rcon = _mm_set_epi32(1,1,1,1);

	// initialize state to msg.
	// FIX: _mm_loadu_si128 instead of _mm_load_si128 — the aligned load
	// faults unless the caller guarantees 16-byte alignment of msg.
	s[0] = _mm_loadu_si128(&((const __m128i*)msg)[0]);
	s[1] = _mm_loadu_si128(&((const __m128i*)msg)[1]);
	s[2] = _mm_loadu_si128(&((const __m128i*)msg)[2]);
	s[3] = _mm_loadu_si128(&((const __m128i*)msg)[3]);

	for (i = 0; i < ROUNDS; ++i) {
		// aes round(s)
		for (j = 0; j < AES_PER_ROUND; ++j) {
			s[0] = _mm_aesenc_si128(s[0], rcon);
			s[1] = _mm_aesenc_si128(s[1], rcon);
			s[2] = _mm_aesenc_si128(s[2], rcon);
			s[3] = _mm_aesenc_si128(s[3], rcon);
			rcon = _mm_slli_epi32(rcon, 1);
		}

		// mixing: interleave 32-bit columns across the four state words
		tmp  = _mm_unpacklo_epi32(s[0], s[1]);
		s[0] = _mm_unpackhi_epi32(s[0], s[1]);
		s[1] = _mm_unpacklo_epi32(s[2], s[3]);
		s[2] = _mm_unpackhi_epi32(s[2], s[3]);
		s[3] = _mm_unpacklo_epi32(s[0], s[2]);
		s[0] = _mm_unpackhi_epi32(s[0], s[2]);
		s[2] = _mm_unpackhi_epi32(s[1],  tmp);
		s[1] = _mm_unpacklo_epi32(s[1],  tmp);
	}

	// xor message back in to get the DM (Davies-Meyer) effect
	s[0] = _mm_xor_si128(s[0], _mm_loadu_si128(&((const __m128i*)msg)[0]));
	s[1] = _mm_xor_si128(s[1], _mm_loadu_si128(&((const __m128i*)msg)[1]));
	s[2] = _mm_xor_si128(s[2], _mm_loadu_si128(&((const __m128i*)msg)[2]));
	s[3] = _mm_xor_si128(s[3], _mm_loadu_si128(&((const __m128i*)msg)[3]));

	// truncate and store: hash = s0.hi64 || s1.hi64 || s2.lo64 || s3.lo64.
	// FIX: the original emitted the first two qwords with _mm_maskmoveu_si128
	// through the pointer (hash - 8), which is out-of-bounds pointer
	// arithmetic (UB) when hash points at the start of its buffer. Plain
	// unaligned 64-bit stores write the identical bytes and avoid the
	// weakly-ordered MASKMOVDQU store.
	_mm_storel_epi64((__m128i*)(hash + 0), _mm_srli_si128(s[0], 8));
	_mm_storel_epi64((__m128i*)(hash + 8), _mm_srli_si128(s[1], 8));
	_mm_storel_epi64((__m128i*)(hash + 16), s[2]);
	_mm_storel_epi64((__m128i*)(hash + 24), s[3]);

	// FIX: the function is declared int but had no return statement
	// (undefined behavior if the caller uses the value).
	return 0;
}
Exemplo n.º 9
0
// Replace every occurrence of `val` in p[0..n) with `substitute`, in place.
// Uses aligned SSE2/AVX2 stores where the COREARRAY_SIMD_* macros enable
// them; otherwise falls through to the scalar tail loop for the whole range.
void vec_i8_replace(int8_t *p, size_t n, int8_t val, int8_t substitute)
{
#ifdef COREARRAY_SIMD_SSE2

	// header 1: advance byte-by-byte until p is 16-byte aligned
	size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p++)
		if (*p == val) *p = substitute;

	// body, SSE2
	const __m128i mask = _mm_set1_epi8(val);
	const __m128i sub  = _mm_set1_epi8(substitute);

#   ifdef COREARRAY_SIMD_AVX2

	// header 2: one 16-byte step, if needed, so that p becomes 32-byte aligned
	if ((n >= 16) && ((size_t)p & 0x10))
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c = _mm_cmpeq_epi8(v, mask);
		if (_mm_movemask_epi8(c))
		{
			// select `substitute` where matched, the original byte elsewhere
			_mm_store_si128((__m128i *)p,
				_mm_or_si128(_mm_and_si128(c, sub), _mm_andnot_si128(c, v)));
		}
		n -= 16; p += 16;
	}

	// (removed unused `zero`/`ones` constants and stale TODO)
	const __m256i mask2 = _mm256_set1_epi8(val);
	const __m256i sub32 = _mm256_set1_epi8(substitute);

	for (; n >= 32; n-=32, p+=32)
	{
		__m256i v = _mm256_load_si256((__m256i const*)p);
		__m256i c = _mm256_cmpeq_epi8(v, mask2);
		if (_mm256_movemask_epi8(c))
		{
			// select `substitute` where matched, the original byte elsewhere
			_mm256_store_si256((__m256i *)p,
				_mm256_or_si256(_mm256_and_si256(c, sub32),
				_mm256_andnot_si256(c, v)));
		}
	}

#   endif

	// body, SSE2: the masked store touches only the matching bytes
	for (; n >= 16; n-=16, p+=16)
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c = _mm_cmpeq_epi8(v, mask);
		if (_mm_movemask_epi8(c))
			_mm_maskmoveu_si128(sub, c, (char*)p);
	}

#endif

	// tail (scalar fallback; the whole loop when SIMD is disabled)
	for (; n > 0; n--, p++)
		if (*p == val) *p = substitute;
}