Example #1
inline static void sse2_memset128aligned(void *ptr, int n, stm_word_t word)
{
#ifdef __LP64__
    __m128d d = (__m128d)_mm_set_epi64((__m64)word, (__m64)word);
#else    
    __m128d d = (__m128d)_mm_set_epi32(word, word, word, word);
#endif
    assert(((stm_word_t)ptr)%16==0);
    assert(n%128==0);
    
    char *p, *endptr = (char *)ptr + n;
    for (p = (char *)ptr; p < endptr; p += 128) {
        _mm_stream_pd((double*)&p[0], d);
        _mm_stream_pd((double*)&p[16], d);
        _mm_stream_pd((double*)&p[32], d);
        _mm_stream_pd((double*)&p[48], d);
        _mm_stream_pd((double*)&p[64], d);
        _mm_stream_pd((double*)&p[80], d);
        _mm_stream_pd((double*)&p[96], d);
        _mm_stream_pd((double*)&p[112], d);
    }
    _mm_sfence();
}
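A minimal usage sketch for the routine above (not part of the original example): it assumes the definition above and the project's `stm_word_t` typedef (a machine word) are in scope, and that the buffer meets the asserted preconditions of 16-byte alignment and a size that is a multiple of 128 bytes. The name `example_caller` is hypothetical.

/* Hypothetical caller (not from the original source). Assumes the function
   above and the project's stm_word_t typedef (a machine word) are in scope. */
#include <stdlib.h>

int example_caller(void)
{
    size_t n = 4096;                      /* multiple of 128, as asserted */
    void *buf = aligned_alloc(16, n);     /* 16-byte aligned, as asserted */
    if (buf == NULL)
        return 1;
    sse2_memset128aligned(buf, (int)n, (stm_word_t)0);  /* zero-fill with streaming stores */
    free(buf);
    return 0;
}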
Example #2
TARGET_SSE4_1 static inline __m128i extract(__m128i xmm0, __m128i xmm1, __m128i xmm2, __m128i xmm3)
{
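	/* Note: `row` (a byte-lane offset) and the 8x256 table of 64-bit constants
	   `Ax` are defined in the enclosing scope of the original source and are
	   not shown in this excerpt.  `_mm_xor_64` is presumably a project-local
	   alias for the MMX `_mm_xor_si64` intrinsic. */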
	__m64 mm0, mm1;

	mm0 = _mm_cvtsi64_m64(Ax[0][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 0))]);
	mm0 = _mm_xor_64(mm0, Ax[1][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 8))]);
	mm0 = _mm_xor_64(mm0, Ax[2][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 0))]);
	mm0 = _mm_xor_64(mm0, Ax[3][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 8))]);
	mm0 = _mm_xor_64(mm0, Ax[4][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 0))]);
	mm0 = _mm_xor_64(mm0, Ax[5][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 8))]);
	mm0 = _mm_xor_64(mm0, Ax[6][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 0))]);
	mm0 = _mm_xor_64(mm0, Ax[7][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 8))]);

	mm1 = _mm_cvtsi64_m64(Ax[0][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 1))]);
	mm1 = _mm_xor_64(mm1, Ax[1][static_cast<quint8>(_mm_extract_epi8(xmm0, row + 9))]);
	mm1 = _mm_xor_64(mm1, Ax[2][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 1))]);
	mm1 = _mm_xor_64(mm1, Ax[3][static_cast<quint8>(_mm_extract_epi8(xmm1, row + 9))]);
	mm1 = _mm_xor_64(mm1, Ax[4][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 1))]);
	mm1 = _mm_xor_64(mm1, Ax[5][static_cast<quint8>(_mm_extract_epi8(xmm2, row + 9))]);
	mm1 = _mm_xor_64(mm1, Ax[6][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 1))]);
	mm1 = _mm_xor_64(mm1, Ax[7][static_cast<quint8>(_mm_extract_epi8(xmm3, row + 9))]);

	return _mm_set_epi64(mm1, mm0);
}
Example #3
static inline void calc_lbp_16_strip(IplImage * src, IplImage * dst, unsigned base)
{
    const signed char* src_data = (signed char*)(src->imageData + base);
    unsigned char * dst_data = (unsigned char*)(dst->imageData + base);
    const signed char* const src_end = (signed char*)src->imageData + (src->height-1) * src->widthStep;
   
    __m128i pixels[3];
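    /* Note: `ones` and `lbp_valid_mask` (and `sign_bit`, referenced only in the
       commented-out lines) are constants defined in the enclosing scope of the
       original source; each exposes an __m128i member `.q`. */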

    // Load first two rows
    //pixels[0] = *(__m128i*)src_data;//_mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    pixels[0] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    //pixels[0] = _mm_xor_si128(pixels[0], sign_bit.q); // conversion from unsigned to signed - invert sign bit
    src_data += src->widthStep;
    //pixels[1] = *(__m128i*)src_data;//_mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    pixels[1] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    //pixels[1] = _mm_xor_si128(pixels[1], sign_bit.q);
    src_data += src->widthStep;

    int phase = 2;

    __m128i * phase_map[3][3] = {
        {pixels+1, pixels+2, pixels},
        {pixels+2, pixels, pixels+1},
        {pixels, pixels+1, pixels+2},
    };

    while (src_data < src_end)
    {
        register __m128i weight = ones.q;
        register __m128i code = _mm_setzero_si128();

        //pixels[phase] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
        //pixels[phase] = _mm_xor_si128(pixels[phase], sign_bit.q);
        //pixels[phase] = _mm_xor_si128(_mm_lddqu_si128((__m128i*)src_data), sign_bit.q);
        pixels[phase] = _mm_lddqu_si128((__m128i*)src_data);

        src_data += src->widthStep;
        dst_data += dst->widthStep;
        
        _mm_prefetch((const char *)src_data, _MM_HINT_T0);

        register __m128i a = *(phase_map[phase][0]);
        register __m128i b = *(phase_map[phase][1]);
        register __m128i c = *(phase_map[phase][2]);

        phase++;
        phase = (phase == 3) ? 0 : phase;
        
        // X . .   A
        // . o .   B
        // . . .   C
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(a, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . X .
        // .   .
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, a), weight));
        weight = _mm_slli_epi64(weight, 1);
        
        // . . X
        // .   .
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(a, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . . .
        // .   X
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(b, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);
        
        // . . .
        // .   .
        // . . X
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(c, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . . .
        // .   .
        // . X .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, c), weight));
        weight = _mm_slli_epi64(weight, 1);
        
        // . . .
        // .   .
        // X . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(c, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);
        
        // . . .
        // X   .
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(b, 1)), weight)); 

        _mm_maskmoveu_si128(code, lbp_valid_mask.q, (char*)dst_data); // store the results - unaligned write
    }
}
Example #4
    int64_t similar = NEG_LIMIT;
    int64_t length = NEG_LIMIT;
    vec128i vNegLimit = _mm_set1_epi64(NEG_LIMIT);
    vec128i vPosLimit = _mm_set1_epi64(POS_LIMIT);
    vec128i vSaturationCheckMin = vPosLimit;
    vec128i vSaturationCheckMax = vNegLimit;
    vec128i vNegInf = _mm_set1_epi64(NEG_LIMIT);
    vec128i vOpen = _mm_set1_epi64(open);
    vec128i vGap  = _mm_set1_epi64(gap);
    vec128i vZero = _mm_set1_epi64(0);
    vec128i vNegInf0 = _mm_insert_epi64(vZero, NEG_LIMIT, 1);
    vec128i vOne = _mm_set1_epi64(1);
    vec128i vN = _mm_set1_epi64(N);
    vec128i vGapN = _mm_set1_epi64(gap*N);
    vec128i vNegOne = _mm_set1_epi64(-1);
    vec128i vI = _mm_set_epi64(0,1);
    vec128i vJreset = _mm_set_epi64(0,-1);
    vec128i vMaxH = vNegInf;
    vec128i vMaxM = vNegInf;
    vec128i vMaxS = vNegInf;
    vec128i vMaxL = vNegInf;
    vec128i vILimit = _mm_set1_epi64(s1Len);
    vec128i vILimit1 = _mm_sub_epi64(vILimit, vOne);
    vec128i vJLimit = _mm_set1_epi64(s2Len);
    vec128i vJLimit1 = _mm_sub_epi64(vJLimit, vOne);
    vec128i vIBoundary = _mm_set_epi64(
            -open-0*gap,
            -open-1*gap);

    /* convert _s1 from char to int in range 0-23 */
    for (i=0; i<s1Len; ++i) {
Example #5
static  int blake64_compress( state * state, const u8 * datablock ) {
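  /* Note: the types `state`, `u8` and `u64`, and the byte-loading macro
     `U8TO64`, are defined elsewhere in the original source; this excerpt
     does not show them. */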

  __m128i row1a,row1b;
  __m128i row2a,row2b;
  __m128i row3a,row3b;
  __m128i row4a,row4b;
  __m128i buf1a,buf2a;
  static const u8 rot16[16] = {2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9};
  __m128i r16 = _mm_load_si128((__m128i*)rot16);


  u64 m[16]; 
  u64 y[16]; 

  /* constants and permutation */
  static const int sig[][16] = {
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
    { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
    { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
    {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
    { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 } ,
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
  };

  static const u64 z[16] = {
    0x243F6A8885A308D3ULL,0x13198A2E03707344ULL,
    0xA4093822299F31D0ULL,0x082EFA98EC4E6C89ULL,
    0x452821E638D01377ULL,0xBE5466CF34E90C6CULL,
    0xC0AC29B7C97C50DDULL,0x3F84D5B5B5470917ULL,
    0x9216D5D98979FB1BULL,0xD1310BA698DFB5ACULL,
    0x2FFD72DBD01ADFB7ULL,0xB8E1AFED6A267E96ULL,
    0xBA7C9045F12C7F99ULL,0x24A19947B3916CF7ULL,
    0x0801F2E2858EFC16ULL,0x636920D871574E69ULL
  };

  /* get message */
  m[ 0] = U8TO64(datablock +  0);
  m[ 1] = U8TO64(datablock +  8);
  m[ 2] = U8TO64(datablock + 16);
  m[ 3] = U8TO64(datablock + 24);
  m[ 4] = U8TO64(datablock + 32);
  m[ 5] = U8TO64(datablock + 40);
  m[ 6] = U8TO64(datablock + 48);
  m[ 7] = U8TO64(datablock + 56);
  m[ 8] = U8TO64(datablock + 64);
  m[ 9] = U8TO64(datablock + 72);
  m[10] = U8TO64(datablock + 80);
  m[11] = U8TO64(datablock + 88);
  m[12] = U8TO64(datablock + 96);
  m[13] = U8TO64(datablock +104);
  m[14] = U8TO64(datablock +112);
  m[15] = U8TO64(datablock +120);

  row1b = _mm_set_epi64((__m64)state->h[3],(__m64)state->h[2]);
  row1a = _mm_set_epi64((__m64)state->h[1],(__m64)state->h[0]);
  row2b = _mm_set_epi64((__m64)state->h[7],(__m64)state->h[6]);
  row2a = _mm_set_epi64((__m64)state->h[5],(__m64)state->h[4]);
  row3b = _mm_set_epi64((__m64)0x082EFA98EC4E6C89ULL,
			(__m64)0xA4093822299F31D0ULL);
  row3a = _mm_set_epi64((__m64)0x13198A2E03707344ULL,
			(__m64)0x243F6A8885A308D3ULL);
  
  if (state->nullt) {
    row4b = _mm_set_epi64((__m64)0x3F84D5B5B5470917ULL,
			  (__m64)0xC0AC29B7C97C50DDULL);
    row4a = _mm_set_epi64((__m64)0xBE5466CF34E90C6CULL,
			  (__m64)0x452821E638D01377ULL);
  }
  else {
    row4b = _mm_set_epi64((__m64)(0x3F84D5B5B5470917ULL^state->t[1]),
			  (__m64)(0xC0AC29B7C97C50DDULL^state->t[1]));
    row4a = _mm_set_epi64((__m64)(0xBE5466CF34E90C6CULL^state->t[0]),
			  (__m64)(0x452821E638D01377ULL^state->t[0]));
  }
  /* initialization ok (beware of bug on Celeron and P4!) */

  

#define round(r)\
    /* column step */\
    /***************************************************/\
    /* high-order side: words 0, 1, 4, 5, 8, 9, 12, 13  */		\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][ 2]], (__m64)m[sig[r][ 0]] );	\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][ 3]], (__m64)z[sig[r][ 1]] );	\
    buf1a = _mm_xor_si128( buf1a, buf2a );					\
    row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a );		\
    row4a = _mm_xor_si128( row4a, row1a );				\
    row4a = _mm_shuffle_epi32(row4a, 0xB1); \
    row3a = _mm_add_epi64( row3a, row4a );				\
    row2a = _mm_xor_si128( row2a, row3a );				\
    row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 25 ),_mm_slli_epi64( row2a, 39 )); \
  									\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][ 3]], (__m64)m[sig[r][ 1]] );	\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][ 2]], (__m64)z[sig[r][ 0]] );	\
    buf1a = _mm_xor_si128( buf1a, buf2a );					\
    row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a );		\
    row4a = _mm_xor_si128( row4a, row1a );				\
    row4a = _mm_shuffle_epi8(row4a, r16); \
    row3a = _mm_add_epi64( row3a, row4a );				\
    row2a = _mm_xor_si128( row2a, row3a );				\
    row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 11 ),_mm_slli_epi64( row2a, 53 )); \
  									\
    /* same stuff for low-order side */\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][ 6]], (__m64)m[sig[r][ 4]] );\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][ 7]], (__m64)z[sig[r][ 5]] );\
    buf1a = _mm_xor_si128( buf1a, buf2a );				\
    row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b );		\
    row4b = _mm_xor_si128( row4b, row1b );				\
    row4b = _mm_shuffle_epi32(row4b, 0xB1); \
    row3b = _mm_add_epi64( row3b, row4b );				\
    row2b = _mm_xor_si128( row2b, row3b );				\
    row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 25 ),_mm_slli_epi64( row2b, 39 )); \
\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][ 7]], (__m64)m[sig[r][ 5]] );	\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][ 6]], (__m64)z[sig[r][ 4]] );	\
    buf1a = _mm_xor_si128( buf1a, buf2a );					\
    row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b );		\
    row4b = _mm_xor_si128( row4b, row1b );				\
    row4b = _mm_shuffle_epi8(row4b, r16); \
    row3b = _mm_add_epi64( row3b, row4b );				\
    row2b = _mm_xor_si128( row2b, row3b );				\
    row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 11 ),_mm_slli_epi64( row2b, 53 )); \
\
    /* shuffle */\
    _mm_store_si128( 0+ (__m128i *)y, row4a); \
    _mm_store_si128( 1+ (__m128i *)y, row4b); \
    row4a = row3a;\
    row3a = row3b;\
    row3b = row4a;\
    row4a  = _mm_set_epi64( (__m64)y[0], (__m64)y[3] );\
    row4b  = _mm_set_epi64( (__m64)y[2], (__m64)y[1] );\
    _mm_store_si128( 0+ (__m128i *)y, row2a);  \
    _mm_store_si128( 1+ (__m128i *)y, row2b);  \
    row2a  = _mm_set_epi64( (__m64)y[2], (__m64)y[1] );  \
    row2b  = _mm_set_epi64( (__m64)y[0], (__m64)y[3] );  \
    /* diagonal step */\
    /***************************************************/\
    /* high-order side: words 0, 1, 4, 5, 8, 9, 12, 13  */\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][10]], (__m64)m[sig[r][ 8]] );\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][11]], (__m64)z[sig[r][ 9]] );\
    buf1a = _mm_xor_si128( buf1a, buf2a );\
    row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a );\
    row4a = _mm_xor_si128( row4a, row1a );		      \
    row4a = _mm_shuffle_epi32(row4a, 0xB1); \
    row3a = _mm_add_epi64( row3a, row4a );					\
    row2a = _mm_xor_si128( row2a, row3a );					\
    row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 25 ),_mm_slli_epi64( row2a, 39 )); \
\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][11]], (__m64)m[sig[r][ 9]] );\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][10]], (__m64)z[sig[r][ 8]] );\
    buf1a = _mm_xor_si128( buf1a, buf2a );\
    row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a );\
    row4a = _mm_xor_si128( row4a, row1a );			\
    row4a = _mm_shuffle_epi8(row4a, r16); \
    row3a = _mm_add_epi64( row3a, row4a );					\
    row2a = _mm_xor_si128( row2a, row3a );					\
    row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 11 ),_mm_slli_epi64( row2a, 53 )); \
\
    /* same stuff for low-order side */\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][14]], (__m64)m[sig[r][12]] );\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][15]], (__m64)z[sig[r][13]] );\
    buf1a = _mm_xor_si128( buf1a, buf2a );\
    row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b );\
    row4b = _mm_xor_si128( row4b, row1b );			\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][15]], (__m64)m[sig[r][13]] );\
    row4b = _mm_shuffle_epi32(row4b, 0xB1); \
    row3b = _mm_add_epi64( row3b, row4b );					\
    row2b = _mm_xor_si128( row2b, row3b );					\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][14]], (__m64)z[sig[r][12]] );\
    row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 25 ),_mm_slli_epi64( row2b, 39 )); \
\
    buf1a = _mm_xor_si128( buf1a, buf2a );\
    row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b );\
    row4b = _mm_xor_si128( row4b, row1b );			\
    row4b = _mm_shuffle_epi8(row4b, r16); \
    row3b = _mm_add_epi64( row3b, row4b );					\
    row2b = _mm_xor_si128( row2b, row3b );					\
    row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 11 ),_mm_slli_epi64( row2b, 53 )); \
\
    /* shuffle back */\
    buf1a = row3a;\
    row3a = row3b;\
    row3b = buf1a;\
    _mm_store_si128( 0+ (__m128i *)y, row2a);	\
    _mm_store_si128( 1+ (__m128i *)y, row2b);  \
    row2a  = _mm_set_epi64( (__m64)y[0], (__m64)y[3] );  \
    row2b  = _mm_set_epi64( (__m64)y[2], (__m64)y[1] );  \
    _mm_store_si128( 0+ (__m128i *)y, row4a);  \
    _mm_store_si128( 1+ (__m128i *)y, row4b);  \
    row4a  = _mm_set_epi64( (__m64)y[2], (__m64)y[1] );  \
    row4b  = _mm_set_epi64( (__m64)y[0], (__m64)y[3] );  \
    							 \

  round(0);
  round(1);
  round(2);
  round(3);
  round(4);
  round(5);
  round(6);
  round(7);
  round(8);
  round(9);
  round(10);
  round(11);
  round(12);
  round(13);

  row1a = _mm_xor_si128(row3a,row1a);
  row1b = _mm_xor_si128(row3b,row1b);
  _mm_store_si128(  (__m128i *)m, row1a);
  state->h[0] ^= m[ 0]; 
  state->h[1] ^= m[ 1];    
  _mm_store_si128(  (__m128i *)m, row1b);
  state->h[2] ^= m[ 0]; 
  state->h[3] ^= m[ 1];    

  row2a = _mm_xor_si128(row4a,row2a);
  row2b = _mm_xor_si128(row4b,row2b);
  _mm_store_si128(  (__m128i *)m, row2a);
  state->h[4] ^= m[ 0];    
  state->h[5] ^= m[ 1];    
  _mm_store_si128(  (__m128i *)m, row2b);
  state->h[6] ^= m[ 0];    
  state->h[7] ^= m[ 1];    

  return 0;
}
Example #6
static __m128i cielabv (union hvrgbpix rgb)
{
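    /* Note: `union hvrgbpix`, the coefficient table `cielab_xyz_cam` and the
       lookup table `cielab_cbrt` (indexed 0..0xffff, see the clamp below) are
       defined in the enclosing scope of the original source. */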
    __m128 xvxyz[2] = { _mm_set1_ps(0.5), _mm_set1_ps(0.5) };

    __m128 vcam0 = _mm_setr_ps(cielab_xyz_cam[0][0],cielab_xyz_cam[1][0],cielab_xyz_cam[2][0],0);
    __m128 vcam1 = _mm_setr_ps(cielab_xyz_cam[0][1],cielab_xyz_cam[1][1],cielab_xyz_cam[2][1],0);
    __m128 vcam2 = _mm_setr_ps(cielab_xyz_cam[0][2],cielab_xyz_cam[1][2],cielab_xyz_cam[2][2],0);
    __m128 vrgb0h = _mm_set1_ps(rgb.h.c[0]);
    __m128 vrgb1h = _mm_set1_ps(rgb.h.c[1]);
    __m128 vrgb2h = _mm_set1_ps(rgb.h.c[2]);
    __m128 vrgb0v = _mm_set1_ps(rgb.v.c[0]);
    __m128 vrgb1v = _mm_set1_ps(rgb.v.c[1]);
    __m128 vrgb2v = _mm_set1_ps(rgb.v.c[2]);

    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam0,vrgb0h));
    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam1,vrgb1h));
    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam2,vrgb2h));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam0,vrgb0v));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam1,vrgb1v));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam2,vrgb2v));

    xvxyz[0] = _mm_max_ps(_mm_set1_ps(0),
                          _mm_min_ps(_mm_set1_ps(0xffff),
                                     _mm_round_ps(xvxyz[0], _MM_FROUND_TO_ZERO)));
    xvxyz[1] = _mm_max_ps(_mm_set1_ps(0),
                          _mm_min_ps(_mm_set1_ps(0xffff),
                                     _mm_round_ps(xvxyz[1], _MM_FROUND_TO_ZERO)));
    __m128i loadaddrh = _mm_cvttps_epi32(xvxyz[0]);
    __m128i loadaddrv = _mm_cvttps_epi32(xvxyz[1]);
#ifdef __AVX__
    __m256 vlab,
           vxyz = { cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                    0,
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,0)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                    0},
           vxyz2 =  {0,
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,2)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                     0,
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,2)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,0)]};

    vlab = _mm256_sub_ps(vxyz,vxyz2);
    vlab = _mm256_mul_ps(vlab, _mm256_setr_ps(116,500,200,0,116,500,200,0));
    vlab = _mm256_sub_ps(vlab, _mm256_setr_ps(16,0,0,0,16,0,0,0));
    vlab = _mm256_mul_ps(vlab,_mm256_set1_ps(64));
    vlab = _mm256_round_ps(vlab, _MM_FROUND_TO_ZERO);
    __m256i vlabi = _mm256_cvtps_epi32(vlab);
    return _mm_packs_epi32(_mm256_castsi256_si128(vlabi), ((__m128i*)&vlabi)[1]);
#else
    __m128 vlabh, vxyzh = {cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrh,2)],
                           0};
    __m128 vlabv, vxyzv = {cielab_cbrt[_mm_extract_epi32(loadaddrv,0)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrv,2)],
                           0};

    vlabh = _mm_sub_ps(_mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,1,0,1)),
                       _mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,2,1,3)));
    vlabh = _mm_mul_ps(vlabh,_mm_setr_ps(116,500,200,0));
    vlabh = _mm_sub_ps(vlabh,_mm_setr_ps(16,0,0,0));
    vlabh = _mm_mul_ps(vlabh,_mm_set_ps1(64));
    vlabh = _mm_round_ps(vlabh, _MM_FROUND_TO_ZERO);

    vlabv = _mm_sub_ps(_mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,1,0,1)),
                       _mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,2,1,3)));
    vlabv = _mm_mul_ps(vlabv,_mm_setr_ps(116,500,200,0));
    vlabv = _mm_sub_ps(vlabv,_mm_setr_ps(16,0,0,0));
    vlabv = _mm_mul_ps(vlabv,_mm_set_ps1(64));
    vlabv = _mm_round_ps(vlabv, _MM_FROUND_TO_ZERO);

    return _mm_set_epi64(_mm_cvtps_pi16(vlabv),_mm_cvtps_pi16(vlabh));
#endif
}
Example #7
#endif
#endif
    int32_t i = 0;
    int32_t j = 0;
    int32_t end_query = 0;
    int32_t end_ref = 0;
    int64_t score = NEG_INF;
    vec128i vNegInf = _mm_set1_epi64(NEG_INF);
    vec128i vNegInf0 = _mm_srli_si128(vNegInf, 8); /* shift in a 0 */
    vec128i vOpen = _mm_set1_epi64(open);
    vec128i vGap  = _mm_set1_epi64(gap);
    vec128i vZero = _mm_set1_epi64(0);
    vec128i vOne = _mm_set1_epi64(1);
    vec128i vN = _mm_set1_epi64(N);
    vec128i vNegOne = _mm_set1_epi64(-1);
    vec128i vI = _mm_set_epi64(0,1);
    vec128i vJreset = _mm_set_epi64(0,-1);
    vec128i vMaxH = vNegInf;
    vec128i vEndI = vNegInf;
    vec128i vEndJ = vNegInf;
    vec128i vILimit = _mm_set1_epi64(s1Len);
    vec128i vJLimit = _mm_set1_epi64(s2Len);
    

    /* convert _s1 from char to int in range 0-23 */
    for (i=0; i<s1Len; ++i) {
        s1[i] = matrix->mapper[(unsigned char)_s1[i]];
    }
    /* pad back of s1 with dummy values */
    for (i=s1Len; i<s1Len_PAD; ++i) {
        s1[i] = 0; /* point to first matrix row because we don't care */