예제 #1
/* Run _mm_testz_si128 and _mm_testc_si128 on every ordered pair drawn from
   four 128-bit patterns and compare each result with the value returned by
   make_ptestz / make_ptestc for the same operands.  Calls abort () on the
   first mismatch.  */
static void
TEST (void)
{
  /* Each pattern is written lane by lane and read back as a vector.  */
  union
    {
      __m128i x;
      unsigned int i[4];
    } val[4];
  int i, j, l;
  int res[32];

  /* val[0] and val[1] set complementary 32-bit lanes.  */
  val[0].i[0] = 0x11111111;
  val[0].i[1] = 0x00000000;
  val[0].i[2] = 0x00000000;
  val[0].i[3] = 0x11111111;
  val[1].i[0] = 0x00000000;
  val[1].i[1] = 0x11111111;
  val[1].i[2] = 0x11111111;
  val[1].i[3] = 0x00000000;

  /* All bits clear.  */
  val[2].i[0] = 0;
  val[2].i[1] = 0;
  val[2].i[2] = 0;
  val[2].i[3] = 0;

  /* All bits set.  */
  val[3].i[0] = 0xffffffff;
  val[3].i[1] = 0xffffffff;
  val[3].i[2] = 0xffffffff;
  val[3].i[3] = 0xffffffff;

  /* Two results per (j, i) pair: 4 * 4 * 2 == 32 slots, which is exactly
     the size of res[].  Both stores must sit inside the inner loop.  */
  l = 0;
  for (i = 0; i < 4; i++)
    for (j = 0; j < 4; j++)
      {
	res[l++] = _mm_testz_si128 (val[j].x, val[i].x);
	res[l++] = _mm_testc_si128 (val[j].x, val[i].x);
      }

  /* Walk res[] in the same order and compare against the helpers.  */
  l = 0;
  for (i = 0; i < 4; i++)
    for (j = 0; j < 4; j++)
      {
	if (res[l++] != make_ptestz (val[j].x, val[i].x))
	  abort ();
	if (res[l++] != make_ptestc (val[j].x, val[i].x))
	  abort ();
      }

  /* res[2] and res[3] were produced for j == 1, i == 0; recompute that
     pair directly.  */
  if (res[2] != _mm_testz_si128 (val[1].x, val[0].x))
    abort ();

  if (res[3] != _mm_testc_si128 (val[1].x, val[0].x))
    abort ();
}
예제 #2
 // Reports whether a row of `mask` contains a byte equal to the corresponding
 // byte of `index`.
 //
 // The row is scanned in steps of A bytes over [0, alignedSize). When
 // alignedSize != fullSize, the A bytes that end at offset fullSize are
 // checked as well, so the tail is covered by one overlapping load.
 // Presumably A is the byte width of __m128i and K_INV_ZERO has all bits
 // set (both are defined outside this snippet) - verify against the header.
 // NOTE(review): if fullSize < A while alignedSize != fullSize, the tail load
 // starts before `mask`; callers presumably guarantee fullSize >= A - confirm.
 SIMD_INLINE bool RowHasIndex(const uint8_t * mask, size_t alignedSize, size_t fullSize, __m128i index)
 {
     for (size_t col = 0; col < alignedSize; col += A)
     {
         // cmpeq yields 0xFF in every matching byte; testz against K_INV_ZERO
         // returns 0 as soon as any tested bit survives.
         if (!_mm_testz_si128(_mm_cmpeq_epi8(_mm_loadu_si128((const __m128i*)(mask + col)), index), K_INV_ZERO))
             return true;
     }
     if (alignedSize != fullSize)
     {
         if (!_mm_testz_si128(_mm_cmpeq_epi8(_mm_loadu_si128((const __m128i*)(mask + fullSize - A)), index), K_INV_ZERO))
             return true;
     }
     return false;
 }
예제 #3
  // これはコードが見難くなるけど仕方ない。
  bool andIsNot0(const Bitboard& bb) const {
#ifdef HAVE_SSE4
    return !(_mm_testz_si128(this->m_, bb.m_));
    return (*this & bb).isNot0();
예제 #4
 // Scans `size` rows of `mask` (16 bytes per row, rows `stride` bytes apart)
 // and records, per column, whether any row holds a byte equal to the
 // corresponding byte of `index`.
 //
 // The per-column result (0xFF where a match was seen, 0x00 otherwise) is
 // stored to the 16 bytes at `cols`. Returns true when at least one column
 // matched; this assumes K_INV_ZERO, defined outside this snippet, has all
 // bits set - verify against the header.
 SIMD_INLINE bool ColsHasIndex(const uint8_t * mask, size_t stride, size_t size, __m128i index, uint8_t * cols)
 {
     __m128i _cols = _mm_setzero_si128();
     for (size_t row = 0; row < size; ++row)
     {
         _cols = _mm_or_si128(_cols, _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i*)mask), index));
         // Advance inside the loop so each iteration reads the next row.
         mask += stride;
     }
     _mm_storeu_si128((__m128i*)cols, _cols);
     return !_mm_testz_si128(_cols, K_INV_ZERO);
 }
예제 #5
파일: DRS.c 프로젝트: doge427538/UVa-1
/**
 * Sum the weights of all points that lie inside a rectangle.
 *
 * A point i (0 <= i < n) is counted when
 *   rect.lx <= x[i] <= rect.rx  and  rect.ly <= y[i] <= rect.ry
 * (all bounds inclusive).
 *
 * @param rect  query rectangle; only its lx, ly, rx, ry fields are read
 * @param x     x coordinates, n entries
 * @param y     y coordinates, n entries
 * @param w     weights, n entries
 * @param n     number of points; values <= 0 yield 0
 * @return      sum of w[i] over the points inside rect
 *
 * When the translation unit is built with AVX2 enabled, points are processed
 * four at a time and the scalar loop only handles the remaining 0-3 points;
 * otherwise the scalar loop handles everything. Both paths apply the same
 * inclusive bounds. The arrays need no particular alignment.
 */
int32_t search_range(Rect rect, int32_t x[], int32_t y[], int32_t w[], int32_t n) {
	int32_t sum = 0;
	int32_t i = 0;

#if defined(__AVX2__)
	/* Bounds are broadcast by value; rect itself is left untouched so the
	 * scalar tail below compares against the same inclusive limits. */
	const __m128i lx = _mm_set1_epi32(rect.lx);
	const __m128i ly = _mm_set1_epi32(rect.ly);
	const __m128i rx = _mm_set1_epi32(rect.rx);
	const __m128i ry = _mm_set1_epi32(rect.ry);
	const __m128i zero = _mm_setzero_si128();
	const __m128i all_ones = _mm_set1_epi32(-1);
	const __m128i lane = _mm_set_epi32(3, 2, 1, 0);
	__m128i acc = _mm_setzero_si128();

	/* n - i cannot overflow because i >= 0. */
	for (; n - i >= 4; i += 4) {
		const __m128i sx = _mm_loadu_si128((const __m128i *) (x + i));
		const __m128i sy = _mm_loadu_si128((const __m128i *) (y + i));
		/* A lane is outside when it violates any inclusive bound; testing
		 * the negation avoids widening the bounds by one, which could
		 * overflow at the extremes of int32_t. */
		const __m128i out_x = _mm_or_si128(_mm_cmpgt_epi32(lx, sx), _mm_cmpgt_epi32(sx, rx));
		const __m128i out_y = _mm_or_si128(_mm_cmpgt_epi32(ly, sy), _mm_cmpgt_epi32(sy, ry));
		const __m128i inside = _mm_xor_si128(_mm_or_si128(out_x, out_y), all_ones);
		/* Skip the gather entirely when no lane is inside. */
		if (!_mm_testz_si128(inside, inside)) {
			/* Lanes whose mask is clear take the value from `zero`. */
			const __m128i picked = _mm_mask_i32gather_epi32(zero, w + i, lane, inside, 4);
			acc = _mm_add_epi32(acc, picked);
		}
	}

	int32_t lanes[4];
	_mm_storeu_si128((__m128i *) lanes, acc);
	sum += lanes[0] + lanes[1] + lanes[2] + lanes[3];
#endif

	/* Scalar path: the whole input without AVX2, the 0-3 leftovers with it. */
	for (; i < n; i++) {
		if (rect.lx <= x[i] && x[i] <= rect.rx &&
			rect.ly <= y[i] && y[i] <= rect.ry) {
			sum += w[i];
		}
	}
	return sum;
}
예제 #6
파일: logical.hpp 프로젝트: dlevin256/kfr
// True when at least one bit of the 128-bit value x is set.
KFR_SINTRIN bool bittestany(const i64sse& x)
{
    // Testing a register against itself yields 1 only when it is all zero.
    const __m128i bits = *x;
    return _mm_testz_si128(bits, bits) == 0;
}
예제 #7
파일: logical.hpp 프로젝트: dlevin256/kfr
// True when at least one bit of the 128-bit value x is set; the operand is
// reinterpreted through bitcast<u8> before the integer test.
KFR_SINTRIN bool bittestany(const f64sse& x)
{
    // Testing a register against itself yields 1 only when it is all zero.
    const __m128i bits = *bitcast<u8>(x);
    return _mm_testz_si128(bits, bits) == 0;
}
}bool validate_utf8_sse(const char *src, size_t len) {
  // Validates the bytes in [src, src + len) with SSE intrinsics, 16 bytes per
  // loop iteration. Returns false as soon as one of the vector checks below
  // fails; whatever input is left after the loop is handed to
  // validate_utf8(), whose result is returned.
  //
  // NOTE(review): this listing appears to be missing lines - closing braces,
  // the rest of the branches opened at `if (!asciiMask)` and
  // `if (_mm_movemask_epi8(cond4))`, parts of the chunk_low / chunk_high
  // expressions and the second argument of _mm_testz_si128. It does not
  // compile as shown; recover the original before changing any logic.
  const char *end = src + len;
  // The vector loop only runs while more than 16 bytes remain.
  while (src + 16 < end) {
    __m128i chunk = _mm_loadu_si128((const __m128i *)(src));

    // Bit k of asciiMask is the top bit of byte k, so zero means every byte
    // of the chunk is below 0x80 and the chunk can be stepped over.
    int asciiMask = _mm_movemask_epi8(chunk);
    if (!asciiMask) {
      src += 16;

    // Adding 0x80 flips the top bit of every byte, so the signed byte
    // compares below order the bytes as if they were unsigned.
    __m128i chunk_signed = _mm_add_epi8(chunk, _mm_set1_epi8(0x80));
    // cond2: bytes above 0xc1.
    __m128i cond2 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xc2 - 1 - 0x80), chunk_signed);
    // state packs two fields per byte: the low three bits are read back as a
    // count (`& 0x7`) and the upper bits as a bit mask (`& 0xf8`) further down.
    __m128i state = _mm_set1_epi8((char)(0x0 | 0x80));
    state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x2 | 0xc0)), cond2);

    // cond3: bytes above 0xdf.
    __m128i cond3 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xe0 - 1 - 0x80), chunk_signed);

    state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x3 | 0xe0)), cond3);
    __m128i mask3 = _mm_slli_si128(cond3, 1);

    // cond4: bytes above 0xef.
    __m128i cond4 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xf0 - 1 - 0x80), chunk_signed);

    // Fall back to the scalar processing
    if (_mm_movemask_epi8(cond4)) {

    __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7));

    __m128i count_sub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1));

    // Byte-wise shifts by 1, 2, 4 and 8 positions accumulate values from
    // earlier bytes into later ones in both `counts` and `shifts`.
    __m128i counts = _mm_add_epi8(count, _mm_slli_si128(count_sub1, 1));

    __m128i shifts = count_sub1;
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1));
    counts = _mm_add_epi8(
        counts, _mm_slli_si128(_mm_subs_epu8(counts, _mm_set1_epi8(0x2)), 2));
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2));

    // The set of bytes with a positive count must equal the set of bytes
    // whose top bit is set.
    if (asciiMask ^ _mm_movemask_epi8(_mm_cmpgt_epi8(counts, _mm_set1_epi8(0))))
      return false; // error
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4));

    // Reject when the previous byte's count exceeds this byte's by more than 1.
    if (_mm_movemask_epi8(_mm_cmpgt_epi8(
            _mm_sub_epi8(_mm_slli_si128(counts, 1), counts), _mm_set1_epi8(1))))
      return false; // error

    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8));

    __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8));
    shifts =
        _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2))); // <=1

    chunk =
        _mm_andnot_si128(mask, chunk); // from now on, we only have useful bits

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 7), 1));

    __m128i chunk_right = _mm_slli_si128(chunk, 1);

    // NOTE(review): the expression below is incomplete in this listing.
    __m128i chunk_low = _mm_blendv_epi8(
        _mm_or_si128(chunk, _mm_and_si128(_mm_slli_epi16(chunk_right, 6),
        _mm_cmpeq_epi8(counts, _mm_set1_epi8(1)));

    __m128i chunk_high =
        _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2)));

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 6), 2));
    chunk_high = _mm_srli_epi32(chunk_high, 2);

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 5), 4));
    // NOTE(review): the expression below is incomplete in this listing.
    chunk_high = _mm_or_si128(
        chunk_high, _mm_and_si128(_mm_and_si128(_mm_slli_epi32(chunk_right, 4),
    // c holds the counts of the last two bytes of the chunk (16-bit lane 7);
    // they decide whether 16, 15 or 14 source bytes are consumed.
    int c = _mm_extract_epi16(counts, 7);
    int source_advance = !(c & 0x0200) ? 16 : !(c & 0x02) ? 15 : 14;

    // NOTE(review): _mm_testz_si128 takes two operands; the second one is
    // missing from this listing.
    __m128i high_bits = _mm_and_si128(chunk_high, _mm_set1_epi8(0xf8));
    if (!_mm_testz_si128(
            _mm_or_si128(_mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0x00)),
                         _mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0xd8)))))
      return false;

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 8),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 4), 8));

    chunk_high = _mm_slli_si128(chunk_high, 1);

    // shuf adds the per-byte shift amounts to the identity byte indices 0..15.
    __m128i shuf =
        _mm_add_epi8(shifts, _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5,
                                          4, 3, 2, 1, 0));

    chunk_low = _mm_shuffle_epi8(chunk_low, shuf);
    chunk_high = _mm_shuffle_epi8(chunk_high, shuf);
    // Interleave low and high bytes into 16-bit words.
    __m128i utf16_low = _mm_unpacklo_epi8(chunk_low, chunk_high);
    __m128i utf16_high = _mm_unpackhi_epi8(chunk_low, chunk_high);

    // Range compare on unsigned words: the constant encodes the two ranges
    // [0xfffe, 0xffff] and [0xfdd0, 0xfdef]; any word inside either range
    // makes the chunk invalid.
    if (_mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_high, 8,
                     _SIDD_UWORD_OPS | _SIDD_CMP_RANGES) |
        _mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_low, 8,
                     _SIDD_UWORD_OPS | _SIDD_CMP_RANGES)) {
      return false;

    src += source_advance;
  // Remaining bytes go through the scalar validator.
  return validate_utf8(src, end - src);
예제 #9
// Codegen test for _mm_testz_si128: the directives inside the body state the
// IR call and the assembly instruction expected for this function.
int test_mm_testz_si128(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_testz_si128
  // CHECK: call i32 @llvm.x86.sse41.ptestz
  // CHECK-ASM: ptest %xmm{{.*}}, %xmm{{.*}}
  return _mm_testz_si128(x, y);
}
예제 #10
// Codegen test for _mm_testz_si128: the directives inside the body state the
// IR call, including operand types, expected for this function.
int test_mm_testz_si128(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_testz_si128
  // CHECK: call i32 @llvm.x86.sse41.ptestz(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
  return _mm_testz_si128(x, y);
}