Example #1
0
// Sum of signed bytes using psadbw, 4x unrolled.
// Positive and negative lanes are accumulated separately and combined at
// the end.  `size` must be a multiple of 64 (four 16-byte vectors per
// iteration); no scalar tail is handled here.
int32_t sse_sadbw_unrolled4_sumsignedbytes(int8_t* array, size_t size) {

    const __m128i zero = _mm_setzero_si128();
    __m128i positive = zero;
    __m128i negative = zero;

    // Fold one 16-byte vector into both accumulators.
    const auto accumulate = [&](const __m128i v) {
        // 0xff in every lane holding a negative byte.
        const __m128i neg_mask = _mm_cmplt_epi8(v, zero);
        // psadbw against zero = horizontal sum of unsigned bytes, yielding
        // two 64-bit partial sums.
        const __m128i pos_sum = _mm_sad_epu8(_mm_andnot_si128(neg_mask, v), zero);
        // |v| for negative lanes via (0 - v): SSE2 equivalent of abs_epi8
        // under the mask, including the -128 -> 0x80 case.
        const __m128i neg_abs = _mm_and_si128(neg_mask, _mm_sub_epi8(zero, v));
        const __m128i neg_sum = _mm_sad_epu8(neg_abs, zero);
        positive = _mm_add_epi32(positive, pos_sum);
        negative = _mm_sub_epi32(negative, neg_sum);
    };

    for (size_t i = 0; i < size; i += 16 * 4) {
        accumulate(_mm_loadu_si128((__m128i*)(array + i + 0 * 16)));
        accumulate(_mm_loadu_si128((__m128i*)(array + i + 1 * 16)));
        accumulate(_mm_loadu_si128((__m128i*)(array + i + 2 * 16)));
        accumulate(_mm_loadu_si128((__m128i*)(array + i + 3 * 16)));
    }

    const __m128i accumulator = _mm_add_epi32(positive, negative);

    // Combine the two 64-bit partial sums (32-bit lanes 0 and 2).
    return int32_t(_mm_cvtsi128_si32(accumulator)) +
           int32_t(_mm_cvtsi128_si32(_mm_srli_si128(accumulator, 8)));
}
Example #2
0
// Sum of signed bytes using psadbw (non-unrolled version).
// `size` must be a multiple of 16; no scalar tail is handled here.
int32_t sse_sadbw_sumsignedbytes(int8_t* array, size_t size) {

    const __m128i zero = _mm_setzero_si128();
    __m128i positive = zero;   // running sum of the positive lanes
    __m128i negative = zero;   // negated running sum of |negative lanes|

    for (size_t i = 0; i < size; i += 16) {
        const __m128i v        = _mm_loadu_si128((__m128i*)(array + i));
        // 0xff wherever the byte is negative.
        const __m128i neg_mask = _mm_cmplt_epi8(v, zero);

        // Horizontal sum of the positive lanes (psadbw vs. zero gives two
        // 64-bit partial sums).
        const __m128i pos_sum = _mm_sad_epu8(_mm_andnot_si128(neg_mask, v), zero);

        // Horizontal sum of |negative lanes|; (0 - v) is the SSE2
        // equivalent of abs_epi8 under the mask (covers -128 -> 0x80 too).
        const __m128i neg_sum = _mm_sad_epu8(
            _mm_and_si128(neg_mask, _mm_sub_epi8(zero, v)), zero);

        positive = _mm_add_epi32(positive, pos_sum);
        negative = _mm_sub_epi32(negative, neg_sum);
    }

    const __m128i accumulator = _mm_add_epi32(positive, negative);

    // Combine the two 64-bit partial sums (32-bit lanes 0 and 2).
    return int32_t(_mm_cvtsi128_si32(accumulator)) +
           int32_t(_mm_cvtsi128_si32(_mm_srli_si128(accumulator, 8)));
}
Example #3
0
// FileCheck regression test: _mm_cmplt_epi8(A, B) has no dedicated x86
// instruction, so it must lower to an `icmp sgt` / PCMPGTB with the
// operands swapped (B > A).  Do not edit the CHECK directives below.
__m128i test_mm_cmplt_epi8(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_cmplt_epi8
  // DAG: icmp sgt <16 x i8>
  //
  // ASM-LABEL: test_mm_cmplt_epi8
  // ASM: pcmpgtb
  return _mm_cmplt_epi8(A, B);
}
    // Per-byte signed a < b comparison: each lane of the result is 0xFF
    // when aValue's byte is less than bValue's, 0x00 otherwise.
    SIMDValue SIMDInt8x16Operation::OpLessThan(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        X86SIMDValue lhs = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue rhs = X86SIMDValue::ToX86SIMDValue(bValue);

        X86SIMDValue result;
        result.m128i_value = _mm_cmplt_epi8(lhs.m128i_value, rhs.m128i_value); // compare a < b?
        return X86SIMDValue::ToSIMDValue(result);
    }
Example #5
0
        inline __m128i load_aligned_int32(const int8_t* src)
        {
            __m128i tmp = _mm_loadl_epi64((const __m128i*)src);
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION
            __m128i res = _mm_cvtepi8_epi32(tmp);
#else
            __m128i mask = _mm_cmplt_epi8(tmp, _mm_set1_epi8(0));
            __m128i tmp1 = _mm_unpacklo_epi8(tmp, mask);
            mask = _mm_cmplt_epi16(tmp1, _mm_set1_epi16(0));
            __m128i res = _mm_unpacklo_epi16(tmp1, mask);
#endif
            return res;
        }
Example #6
0
File: main.cpp  Project: CCJY/coliru
// Small demo: per-byte signed compare of two SSE registers.
// `ssereg` is a project-local wrapper type -- presumably it converts
// to/from __m128i and print() dumps the lane contents; TODO confirm
// against its definition.
int main()
{
	ssereg a(0xffffffff);	// presumably splats the 32-bit pattern (every byte 0xff = -1)
	ssereg b(0x00000000);	// all bytes zero

	// Signed per-byte a < b: -1 < 0 in every lane, so c should come out
	// all-ones -- assuming ssereg converts implicitly to/from __m128i.
	ssereg c = _mm_cmplt_epi8(a, b);

	a.print();
	b.print();
	c.print();

    return 0;
}
    // Per-byte UNSIGNED a < b comparison.  SSE2 only offers a signed byte
    // compare, but flipping the top bit of every byte maps unsigned order
    // onto signed order, so both sides are xored with 0x80 first.
    SIMDValue SIMDUint8x16Operation::OpLessThan(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        const X86SIMDValue signBits = { {0x80808080, 0x80808080, 0x80808080, 0x80808080} };
        X86SIMDValue lhs = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue rhs = X86SIMDValue::ToX86SIMDValue(bValue);

        lhs.m128i_value = _mm_xor_si128(lhs.m128i_value, signBits.m128i_value);
        rhs.m128i_value = _mm_xor_si128(rhs.m128i_value, signBits.m128i_value);

        X86SIMDValue result;
        result.m128i_value = _mm_cmplt_epi8(lhs.m128i_value, rhs.m128i_value); // compare a < b?
        return X86SIMDValue::ToSIMDValue(result);
    }
Example #8
0
/// Clamp genotype codes: any value greater than 3 is replaced by 3
/// (i.e. every out-of-range code becomes the "missing" value 3;
/// valid codes 0..3 pass through unchanged).
COREARRAY_DLL_DEFAULT void vec_u8_geno_valid(C_UInt8 *p, size_t n)
{
#if defined(COREARRAY_SIMD_SSE2)

	// Scalar prologue until p reaches a 16-byte boundary.
	size_t head = (16 - ((size_t)p & 0x0F)) & 0x0F;
	while ((n > 0) && (head > 0))
	{
		if (*p > 3) *p = 3;
		p++; n--; head--;
	}

	const __m128i zero  = _mm_setzero_si128();
	const __m128i three = _mm_set1_epi8(3);
	while (n >= 16)
	{
		__m128i v = _mm_load_si128((__m128i*)p);
		// Lanes outside [0, 3]: either "negative" as signed bytes
		// (i.e. >= 0x80 unsigned) or greater than 3.
		__m128i bad = _mm_or_si128(_mm_cmplt_epi8(v, zero),
			_mm_cmplt_epi8(three, v));
		// Only write back when something actually needs clamping.
		if (_mm_movemask_epi8(bad) > 0)
			_mm_store_si128((__m128i*)p, _mm_min_epu8(v, three));
		p += 16; n -= 16;
	}

#endif

	// Scalar tail (and the whole job when SSE2 is unavailable).
	for (; n > 0; n--, p++)
	{
		if (*p > 3) *p = 3;
	}
}
//
// Masked read-modify-write of eight strided rows: in each row the first
// `p` bytes come from the x inputs while the remaining bytes keep the
// values already present at dst.
//
// If we assumed AVX512, this routine could be implemented more efficiently
// and straightforwardly with _mm_mask_storeu_epi8().
//
inline void _assembler_store_partial(__m128i *dst, int s, int p, __m128i x0, __m128i x1, __m128i x2, __m128i x3, __m128i x4, __m128i x5, __m128i x6, __m128i x7)
{
    // Lane indices 0..15; lanes with index < p select from x.
    static const __m128i r = _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
    const __m128i mask = _mm_cmplt_epi8(r, _mm_set1_epi8(p));

    __m128i x[8] = { x0, x1, x2, x3, x4, x5, x6, x7 };
    __m128i y[8];

    for (int i = 0; i < 8; i++) {
        __m128i old = _mm_loadu_si128(dst + i*s);
        // Blend: (mask & x) | (~mask & old).
        y[i] = _mm_or_si128(_mm_and_si128(mask, x[i]),
                            _mm_andnot_si128(mask, old));
    }

    _assembler_store_full(dst, s, y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7]);
}
Example #10
0
// Sum of squared differences between two exponent arrays.
// The SSE2 path processes 16 bytes per iteration; a scalar loop finishes
// the remainder (ncoefs % 16 entries).
int
exponent_sum_square_error_sse2(uint8_t *exp0, uint8_t *exp1, int ncoefs)
{
    union {
        __m128i v;
        int32_t res[4];
    } ures;
    const __m128i vzero = _mm_setzero_si128();
    __m128i vres = vzero;
    int i;

    for (i = 0; i < (ncoefs & ~15); i += 16) {
        __m128i va = _mm_loadu_si128((__m128i*)&exp0[i]);
        __m128i vb = _mm_loadu_si128((__m128i*)&exp1[i]);
#if 0
        //safer but needed?
        __m128i vexphi = _mm_unpackhi_epi8(va, vzero);
        __m128i vexp2hi = _mm_unpackhi_epi8(vb, vzero);
        __m128i vexplo = _mm_unpacklo_epi8(va, vzero);
        __m128i vexp2lo = _mm_unpacklo_epi8(vb, vzero);
        __m128i vdhi = _mm_sub_epi16(vexphi, vexp2hi);
        __m128i vdlo = _mm_sub_epi16(vexplo, vexp2lo);
#else
        // 8-bit difference, sign-extended to 16 bits by interleaving with
        // the sign mask (assumes |difference| fits in 8 bits, as the
        // original 8-bit subtraction did).
        __m128i vdiff = _mm_sub_epi8(va, vb);
        __m128i vsign = _mm_cmplt_epi8(vdiff, vzero);
        __m128i vdhi = _mm_unpackhi_epi8(vdiff, vsign);
        __m128i vdlo = _mm_unpacklo_epi8(vdiff, vsign);
#endif
        // Square and horizontally add adjacent pairs into 32-bit lanes.
        vdhi = _mm_madd_epi16(vdhi, vdhi);
        vdlo = _mm_madd_epi16(vdlo, vdlo);
        vres = _mm_add_epi32(vres, _mm_add_epi32(vdhi, vdlo));
    }

    _mm_store_si128(&ures.v, vres);
    int exp_error = ures.res[0] + ures.res[1] + ures.res[2] + ures.res[3];

    // Scalar tail.
    for (; i < ncoefs; ++i) {
        int err = exp0[i] - exp1[i];
        exp_error += err * err;
    }
    return exp_error;
}
Example #11
0
// Apply PNG-style scanline filters to an image of HEIGHT rows of
// 3*WIDTH bytes (8-bit RGB).  Output holds, per scanline, one filter-type
// byte followed by the filtered bytes:
//   - row 0 uses filter 0 (None): bytes are copied verbatim;
//   - rows 1.. use filter 2 (Up): each byte stores cur - prev (mod 256).
// NOTE: _mm_loadu_si128 replaces the original SSE3 _mm_lddqu_si128 -- the
// two are documented as functionally identical, and this keeps the whole
// routine at the SSE2 baseline.  Loads may read up to 15 bytes past a
// scanline end, as in the original; callers must pad `image` accordingly.
void
filterScanlinesSSE( unsigned char* filtered,
                    unsigned char* image,
                    unsigned int WIDTH,
                    unsigned int HEIGHT )
{
    int blocks = 3*WIDTH/16;

    // Byte mask selecting the 3*WIDTH % 16 leftover bytes of a scanline.
    __m128i mask = _mm_cmplt_epi8( _mm_set_epi8( 15, 14, 13, 12, 11, 10, 9, 8,
                                                  7,  6,  5,  4,  3,  2, 1, 0 ),
                                   _mm_set1_epi8( 3*WIDTH-16*blocks ) );
    {
        // First scanline: filter type 0 (None), raw copy.
        const unsigned char* in = image;
        unsigned char* out = filtered;
        *out++ = 0;
        for(int b=0; b<blocks; b++ ) {
            _mm_storeu_si128( (__m128i*)out, _mm_loadu_si128( (__m128i const*)in ) );
            in += 16;
            out += 16;
        }
        _mm_maskmoveu_si128( _mm_loadu_si128( (__m128i const*)in ), mask, (char*)out );
    }

    for( unsigned int j=1; j<HEIGHT; j++ ) {
        // `in` walks the PREVIOUS row; the current row is at in + 3*WIDTH.
        const unsigned char* in = image + 3*WIDTH*(j-1);
        unsigned char* out = filtered + (3*WIDTH+1)*j;
        *out++ = 2;     // filter type 2 (Up)
        for(int b=0; b<blocks; b++ ) {
            __m128i _t0 = _mm_loadu_si128( (__m128i const*)in );
            __m128i _t1 = _mm_loadu_si128( (__m128i const*)(in + 3*WIDTH ) );

            _mm_storeu_si128( (__m128i*)out,
                              _mm_sub_epi8( _t1, _t0 ) );
            in += 16;
            out += 16;
        }
        // BUG FIX: the scanline tail must be Up-filtered as well.  The
        // original stored the raw previous-row bytes here instead of
        // cur - prev, corrupting every row whose length is not a multiple
        // of 16.
        __m128i _t0 = _mm_loadu_si128( (__m128i const*)in );
        __m128i _t1 = _mm_loadu_si128( (__m128i const*)(in + 3*WIDTH ) );
        _mm_maskmoveu_si128( _mm_sub_epi8( _t1, _t0 ),
                             mask,
                             (char*)out );

    }
}
Example #12
0
File: art.c  Project: biokoda/actordb_qdrv
// Insert child `child` under key byte `c` into a node16, keeping the key
// array sorted.  When the node is already full it is promoted to a node48
// and the insert is retried there (`*ref` is updated to the new node).
static void add_child16(art_node16 *n, art_node **ref, unsigned char c, void *child) {
    if (n->n.num_children < 16) {
        // Compare c against all 16 stored keys at once; keep only the
        // lanes that correspond to live children.
        __m128i lt = _mm_cmplt_epi8(_mm_set1_epi8(c),
                _mm_loadu_si128((__m128i*)n->keys));
        unsigned live = (1 << n->n.num_children) - 1;
        unsigned lt_bits = _mm_movemask_epi8(lt) & live;

        unsigned idx;
        if (lt_bits == 0) {
            // c is >= every existing key: append at the end.
            idx = n->n.num_children;
        } else {
            // Open a slot at the first strictly-greater key.
            idx = __builtin_ctz(lt_bits);
            memmove(n->keys+idx+1, n->keys+idx, n->n.num_children-idx);
            memmove(n->children+idx+1, n->children+idx,
                    (n->n.num_children-idx)*sizeof(void*));
        }

        n->keys[idx] = c;
        n->children[idx] = (art_node*)child;
        n->n.num_children++;

    } else {
        // Node is full: grow into a node48, copy the children, build the
        // byte->slot key map, then retry the insert on the new node.
        art_node48 *new_node = (art_node48*)alloc_node(NODE48);

        memcpy(new_node->children, n->children,
                sizeof(void*)*n->n.num_children);
        for (int i=0; i<n->n.num_children; i++) {
            new_node->keys[n->keys[i]] = i + 1;
        }
        copy_header((art_node*)new_node, (art_node*)n);
        *ref = (art_node*)new_node;
        free(n);
        add_child48(new_node, ref, c, child);
    }
}
Example #13
0
File: lbp.cpp  Project: arctanbell/libfr
// Compute one 16-pixel-wide vertical strip of 8-bit LBP (Local Binary
// Pattern) codes: each destination byte packs eight neighbour-vs-centre
// comparisons, one per bit, in the order annotated by the 3x3 grids below.
// NOTE(review): the comparisons use _mm_cmplt_epi8, i.e. *signed* 8-bit
// compares; the commented-out `sign_bit` xors suggest the data was once
// converted from unsigned first -- confirm signed pixel data is intended.
// `ones` and `lbp_valid_mask` are file-scope constants defined elsewhere;
// `base` selects which 16-byte horizontal strip of the image to process.
// NOTE(review): the `register` keyword below is removed in C++17; this
// compiles only as C++14 or earlier (or with it treated as a warning).
static inline void calc_lbp_16_strip(IplImage * src, IplImage * dst, unsigned base)
{
    const signed char* src_data = (signed char*)(src->imageData + base);
    unsigned char * dst_data = (unsigned char*)(dst->imageData + base);
    const signed char* const src_end = (signed char*)src->imageData + (src->height-1) * src->widthStep;
   
    // Rolling window of three consecutive image rows.
    __m128i pixels[3];

    // Load first two rows
    //pixels[0] = *(__m128i*)src_data;//_mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    pixels[0] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    //pixels[0] = _mm_xor_si128(pixels[0], sign_bit.q); // conversion from unsigned to signed - invert sign bit
    src_data += src->widthStep;
    //pixels[1] = *(__m128i*)src_data;//_mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    pixels[1] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
    //pixels[1] = _mm_xor_si128(pixels[1], sign_bit.q);
    src_data += src->widthStep;

    // Index of the pixels[] slot the NEXT row will be loaded into.
    int phase = 2;

    // For each phase, the roles (row above, centre row, row below) of the
    // three slots -- the window rotates instead of re-loading rows.
    __m128i * phase_map[3][3] = {
        {pixels+1, pixels+2, pixels},
        {pixels+2, pixels, pixels+1},
        {pixels, pixels+1, pixels+2},
    };

    while (src_data < src_end)
    {
        register __m128i weight = ones.q;
        register __m128i code = _mm_setzero_si128();

        //pixels[phase] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data));
        //pixels[phase] = _mm_xor_si128(pixels[phase], sign_bit.q);
        //pixels[phase] = _mm_xor_si128(_mm_lddqu_si128((__m128i*)src_data), sign_bit.q);
        pixels[phase] = _mm_lddqu_si128((__m128i*)src_data);

        src_data += src->widthStep;
        dst_data += dst->widthStep;
        
        _mm_prefetch(src_data, _MM_HINT_T0);

        // a = row above, b = centre row, c = row below (see phase_map).
        register __m128i a = *(phase_map[phase][0]);
        register __m128i b = *(phase_map[phase][1]);
        register __m128i c = *(phase_map[phase][2]);

        phase++;
        phase = (phase == 3) ? 0 : phase;
        
        // Each step tests one neighbour (byte-shifted row) against the
        // centre, ORs the weight bit in where centre < neighbour, then
        // doubles the weight for the next neighbour.
        // X . .   A
        // . o .   B
        // . . .   C
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(a, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . X .
        // .   .
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, a), weight));
        weight = _mm_slli_epi64(weight, 1);
        
        // . . X
        // .   .
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(a, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . . .
        // .   X
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(b, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);
        
        // . . .
        // .   .
        // . . X
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(c, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);

        // . . .
        // .   .
        // . X .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, c), weight));
        weight = _mm_slli_epi64(weight, 1);
        
        // . . .
        // .   .
        // X . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(c, 1)), weight));
        weight = _mm_slli_epi64(weight, 1);
        
        // . . .
        // X   .
        // . . .
        code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(b, 1)), weight)); 

        _mm_maskmoveu_si128(code, lbp_valid_mask.q, (char*)dst_data); // store the results - unaligned write
    }
}
}bool validate_utf8_sse(const char *src, size_t len) {
  // Validate `len` bytes as UTF-8.  16-byte chunks are vetted with SSE;
  // the scalar validate_utf8() (defined elsewhere) handles the final
  // partial chunk and any chunk containing a 4-byte (>= 0xF0) lead.
  const char *end = src + len;
  while (src + 16 < end) {
    __m128i chunk = _mm_loadu_si128((const __m128i *)(src));

    // High bit of every byte; zero means the whole chunk is pure ASCII.
    int asciiMask = _mm_movemask_epi8(chunk);
    if (!asciiMask) {
      src += 16;
      continue;
    }

    // Bias by 0x80 so unsigned thresholds can use signed byte compares.
    __m128i chunk_signed = _mm_add_epi8(chunk, _mm_set1_epi8(0x80));
    // cond2: bytes >= 0xC2, i.e. lead bytes of 2(+)-byte sequences.
    __m128i cond2 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xc2 - 1 - 0x80), chunk_signed);
    __m128i state = _mm_set1_epi8((char)(0x0 | 0x80));
    state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x2 | 0xc0)), cond2);

    // cond3: bytes >= 0xE0, lead bytes of 3(+)-byte sequences.
    __m128i cond3 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xe0 - 1 - 0x80), chunk_signed);

    state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x3 | 0xe0)), cond3);
    __m128i mask3 = _mm_slli_si128(cond3, 1);

    // cond4: bytes >= 0xF0 (4-byte sequences).
    __m128i cond4 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xf0 - 1 - 0x80), chunk_signed);

    // Fall back to the scalar processing
    if (_mm_movemask_epi8(cond4)) {
      break;
    }

    // Low 3 bits of state = length of the sequence led at this byte
    // (0 for ASCII / continuation bytes).
    __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7));

    __m128i count_sub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1));

    // counts: per byte, how many bytes of its sequence remain (counting
    // itself), propagated from the lead byte by shifted saturating adds.
    __m128i counts = _mm_add_epi8(count, _mm_slli_si128(count_sub1, 1));

    __m128i shifts = count_sub1;
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1));
    counts = _mm_add_epi8(
        counts, _mm_slli_si128(_mm_subs_epu8(counts, _mm_set1_epi8(0x2)), 2));
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2));

    // Every non-ASCII byte must be covered by a sequence, and vice versa.
    if (asciiMask ^ _mm_movemask_epi8(_mm_cmpgt_epi8(counts, _mm_set1_epi8(0))))
      return false; // error
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4));

    // Within a sequence, counts may only step down by one.
    if (_mm_movemask_epi8(_mm_cmpgt_epi8(
            _mm_sub_epi8(_mm_slli_si128(counts, 1), counts), _mm_set1_epi8(1))))
      return false; // error

    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8));

    __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8));
    shifts =
        _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2))); // <=1

    chunk =
        _mm_andnot_si128(mask, chunk); // from now on, we only have usefull bits

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 7), 1));

    __m128i chunk_right = _mm_slli_si128(chunk, 1);

    // Assemble the low byte of each decoded code point from payload bits.
    __m128i chunk_low = _mm_blendv_epi8(
        chunk,
        _mm_or_si128(chunk, _mm_and_si128(_mm_slli_epi16(chunk_right, 6),
                                          _mm_set1_epi8(0xc0))),
        _mm_cmpeq_epi8(counts, _mm_set1_epi8(1)));

    __m128i chunk_high =
        _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2)));

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 6), 2));
    chunk_high = _mm_srli_epi32(chunk_high, 2);

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 5), 4));
    chunk_high = _mm_or_si128(
        chunk_high, _mm_and_si128(_mm_and_si128(_mm_slli_epi32(chunk_right, 4),
                                                _mm_set1_epi8(0xf0)),
                                  mask3));
    // Bytes of an incomplete sequence at the chunk tail are re-read on
    // the next iteration (advance by 14/15/16 accordingly).
    int c = _mm_extract_epi16(counts, 7);
    int source_advance = !(c & 0x0200) ? 16 : !(c & 0x02) ? 15 : 14;

    // Reject overlong 3-byte encodings (high byte 0x00-0x07) and UTF-16
    // surrogates (high byte 0xD8-0xDF).
    __m128i high_bits = _mm_and_si128(chunk_high, _mm_set1_epi8(0xf8));
    if (!_mm_testz_si128(
            mask3,
            _mm_or_si128(_mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0x00)),
                         _mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0xd8)))))
      return false;

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 8),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 4), 8));

    chunk_high = _mm_slli_si128(chunk_high, 1);

    // Compact the per-code-point bytes with the computed shuffle.
    __m128i shuf =
        _mm_add_epi8(shifts, _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5,
                                          4, 3, 2, 1, 0));

    chunk_low = _mm_shuffle_epi8(chunk_low, shuf);
    chunk_high = _mm_shuffle_epi8(chunk_high, shuf);
    __m128i utf16_low = _mm_unpacklo_epi8(chunk_low, chunk_high);
    __m128i utf16_high = _mm_unpackhi_epi8(chunk_low, chunk_high);

    // Range-check the UTF-16 lanes against 0xFFFE-0xFFFF and
    // 0xFDD0-0xFDEF (Unicode noncharacters).
    if (_mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_high, 8,
                     _SIDD_UWORD_OPS | _SIDD_CMP_RANGES) |
        _mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_low, 8,
                     _SIDD_UWORD_OPS | _SIDD_CMP_RANGES)) {
      return false;
    }

    src += source_advance;
  }
  return validate_utf8(src, end - src);
}
Example #15
0
// Scan one partition of 4-bit-quantized product-quantizer codes in the
// "fast scan" layout: per-component distance contributions are looked up
// with PSHUFB from the quantized tables `min4`/`ft4`, accumulated with
// saturating 8-bit adds, and compared against the quantized heap bound.
// Only candidates that beat the bound are re-scored exactly
// (scan_pqcode_in_simd_block_1) and pushed onto the binary heap `bh`.
// [qmin, qmax] is the quantization range used by Q127; `labels`/`dists`
// feed the exact re-scoring; `scan_pqcode_count` is the global index of
// the first code in this partition.  The group list is terminated by a
// header whose size field is all-ones.
FORCE_INLINE
static inline void fast_scan_1(const std::uint8_t* partition, const unsigned* labels,
		const float* dists, const __m128i (&min4)[4], __m128i (&ft4)[4][16],
		const float qmin, const float qmax, binheap* bh, unsigned scan_pqcode_count) {
	const unsigned simd_pqcode_count = 16;
	const int comp_block_size = 16;
	// 16 codes x (4 one-byte components + 4 packed-nibble components)
	// = 96 bytes per SIMD block.
	const unsigned simd_block_size = simd_pqcode_count * (4 * 1 + 4 * 0.5);
	const group_header* hdr;

	float bh_bound = qmax;
	// NOTE(review): third Q127 argument is bh_bound where every later call
	// passes qmax; equivalent here only because bh_bound == qmax at this
	// point -- Q127(bh_bound, qmin, qmax) would be clearer.
	__m128i bh_bound_quant = _mm_set1_epi8(Q127(bh_bound, qmin, bh_bound)); // CHK. Is 127

	for (;;) {
		// Parse group header
		hdr = reinterpret_cast<const group_header*>(partition);
		// Check if last group (All bits of size set to 1)
		if (hdr->size == std::numeric_limits<decltype(hdr->size)>::max()) {
			return;
		}
		partition += sizeof(*hdr);
		unsigned simd_block_count = (static_cast<unsigned>(hdr->size)
				+ simd_pqcode_count - 1) / simd_pqcode_count;
		// Load tables
		__m128i ft4_group[4];
		ft4_group[0] = ft4[0][hdr->values[0] >> 4];
		ft4_group[1] = ft4[1][hdr->values[1] >> 4];
		ft4_group[2] = ft4[2][hdr->values[2] >> 4];
		ft4_group[3] = ft4[3][hdr->values[3] >> 4];
		// Scan SIMD Blocks
		while (simd_block_count--) {
			const __m128i low_bits_mask = _mm_set_epi64x(0x0f0f0f0f0f0f0f0f,
					0x0f0f0f0f0f0f0f0f);

			// Component 0
			const __m128i comps_0 = _mm_loadu_si128(
					reinterpret_cast<const __m128i *>(partition));
			const __m128i masked_comps_0 = _mm_and_si128(comps_0, low_bits_mask);
			__m128i candidates = _mm_shuffle_epi8(min4[0], masked_comps_0);
			// Components 1..3
			for (int comp_i = 1; comp_i < 4; ++comp_i) {
				const __m128i comps = _mm_loadu_si128(
						reinterpret_cast<const __m128i *>(partition
								+ comp_i * comp_block_size));
				const __m128i masked_comps = _mm_and_si128(comps, low_bits_mask);
				const __m128i partial = _mm_shuffle_epi8(min4[comp_i], masked_comps);
				candidates = _mm_adds_epi8(candidates, partial);
			}
			// Components 4-5 (two nibbles packed per byte)
			__m128i comps_45 = _mm_loadu_si128(
					reinterpret_cast<const __m128i *>(partition
							+ 4 * comp_block_size));
			const __m128i masked_comps_4 = _mm_and_si128(comps_45, low_bits_mask);
			const __m128i partial_4 = _mm_shuffle_epi8(ft4_group[0], masked_comps_4);
			candidates = _mm_adds_epi8(candidates, partial_4);

			comps_45 = _mm_srli_epi64(comps_45, 4);
			const __m128i masked_comps_5 = _mm_and_si128(comps_45, low_bits_mask);
			const __m128i partial_5 = _mm_shuffle_epi8(ft4_group[1], masked_comps_5);
			candidates = _mm_adds_epi8(candidates, partial_5);

			// Components 6-7
			__m128i comps_67 = _mm_loadu_si128(
					reinterpret_cast<const __m128i *>(partition
							+ 5 * comp_block_size));
			const __m128i masked_comps_6 = _mm_and_si128(comps_67, low_bits_mask);
			const __m128i partial_6 = _mm_shuffle_epi8(ft4_group[2], masked_comps_6);
			candidates = _mm_adds_epi8(candidates, partial_6);

			const __m128i comps_7 = _mm_srli_epi64(comps_67, 4);
			const __m128i masked_comp_7 = _mm_and_si128(comps_7, low_bits_mask);
			const __m128i partial_7 = _mm_shuffle_epi8(ft4_group[3], masked_comp_7);
			candidates = _mm_adds_epi8(candidates, partial_7);

			// Compare: lanes whose quantized distance beats the bound.
			const __m128i compare = _mm_cmplt_epi8(candidates, bh_bound_quant);
			int cmp = _mm_movemask_epi8(compare);
			//std::uint64_t cmp_low = (_mm_cvtsi128_si64(compare));
			//std::uint64_t cmp_high = (_mm_extract_epi64(compare, 1));

			// Compute current block size; the final block of a group may
			// be partial, so stray match bits beyond it are masked off.
			int current_block_actual_size = 0;
			if(simd_block_count == 0) {
				current_block_actual_size = hdr->size % simd_pqcode_count;
				if(current_block_actual_size == 0) {
					current_block_actual_size = simd_pqcode_count;
				} else {
					/*__m128i mask;
					compute_simd_mask(current_block_actual_size, mask);
					compare = _mm_and_si128(compare, mask);*/
					/*
					std::uint64_t low_mask;
					std::uint64_t high_mask;
					compute_high_low_mask(current_block_actual_size, low_mask, high_mask);
					cmp_low = cmp_low & low_mask;
					cmp_high = cmp_high & high_mask;
					*/
					cmp = cmp & BITMASK(current_block_actual_size);
				}
			} else {
				current_block_actual_size = simd_pqcode_count;
			}

			if(cmp) {
				// Check low quadword
				const std::uint8_t cmp_low = cmp & 0xff;
				if (cmp_low) {
					/*const std::uint64_t low_possible_positions = 0x0706050403020100;
					const std::uint64_t match_positions = _pext_u64(
							low_possible_positions, cmp_low);*/
					const int match_count = _popcnt32(cmp_low);
					// masktable presumably maps an 8-bit mask to its set-bit
					// positions packed one per byte -- verify its definition.
					std::uint64_t match_pos = masktable[cmp_low];


					for (int i = 0; i < match_count; ++i) {
						const std::uint8_t pos = match_pos & 0xff;
						match_pos >>= 8;
						// Exact re-scoring; tighten the (quantized) bound on
						// every successful heap push.
						const float candidate = scan_pqcode_in_simd_block_1(pos,
								partition, hdr->values, dists);
						if (candidate < bh_bound) {
							bh->push(labels[scan_pqcode_count + pos],
									candidate);
							bh_bound = bh->max();
							bh_bound_quant = _mm_set1_epi8(
									Q127(bh_bound, qmin, qmax));
						}
					}
				}

				// Check high quadword
				const std::uint8_t cmp_high = (cmp >> 8);
				if (cmp_high) {
					/*const std::uint64_t high_possible_positions = 0x0f0e0d0c0b0a0908;
					const std::uint64_t match_positions = _pext_u64(
							high_possible_positions, cmp_high);*/
					const int match_count = _popcnt32(cmp_high);
					std::uint64_t match_pos = masktable[cmp_high] + 0x0808080808080808;

					for (int i = 0; i < match_count; ++i) {
						const std::uint8_t pos = match_pos & 0xff;
						match_pos >>= 8;
						const float candidate = scan_pqcode_in_simd_block_1(pos,
								partition, hdr->values, dists);
						if (candidate < bh_bound) {
							bh->push(labels[scan_pqcode_count + pos],
									candidate);
							bh_bound = bh->max();
							bh_bound_quant = _mm_set1_epi8(
									Q127(bh_bound, qmin, qmax));
						}
					}
				}
			}

			partition += simd_block_size;
			scan_pqcode_count += current_block_actual_size;
		}
	}
}