Example No. 1
uint32_t sse_sumbytes_variant2(uint8_t* array, size_t size) {

    const __m128i lobyte_mask = _mm_set1_epi32(0x000000ff);

    __m128i accumulator = _mm_setzero_si128();

    for (size_t i=0; i < size; i += 16) {
        const __m128i v = _mm_loadu_si128((__m128i*)(array + i));
        const __m128i v0 = v;
        const __m128i v1 = _mm_srli_epi32(v, 1*8);
        const __m128i v2 = _mm_srli_epi32(v, 2*8);
        const __m128i v3 = _mm_srli_epi32(v, 3*8);
        
        const __m128i t0 = _mm_and_si128(lobyte_mask, v0);
        const __m128i t1 = _mm_and_si128(lobyte_mask, v1);
        const __m128i t2 = _mm_and_si128(lobyte_mask, v2);
        const __m128i t3 = v3;

        accumulator = _mm_add_epi32(accumulator, t0);
        accumulator = _mm_add_epi32(accumulator, t1);
        accumulator = _mm_add_epi32(accumulator, t2);
        accumulator = _mm_add_epi32(accumulator, t3);
    }

    return uint32_t(_mm_extract_epi32(accumulator, 0)) +
           uint32_t(_mm_extract_epi32(accumulator, 1)) +
           uint32_t(_mm_extract_epi32(accumulator, 2)) +
           uint32_t(_mm_extract_epi32(accumulator, 3));
}
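The loop above advances 16 bytes per iteration, so it implicitly requires size to be a multiple of 16. A minimal wrapper sketch (the helper name is mine, not from the source) covering the general case with a scalar tail:

uint32_t sumbytes_with_tail(uint8_t* array, size_t size) {
    // run the SSE kernel on the largest 16-byte-multiple prefix...
    const size_t vec_size = size & ~size_t(15);
    uint32_t sum = sse_sumbytes_variant2(array, vec_size);
    // ...and finish the remainder with plain scalar additions
    for (size_t i = vec_size; i < size; i++)
        sum += array[i];
    return sum;
}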
Example No. 2
uint32_t sse_sumbytes(uint8_t* array, size_t size) {

    __m128i accumulator = _mm_setzero_si128();

    for (size_t i=0; i < size; i += 16) {
        const __m128i v = _mm_loadu_si128((__m128i*)(array + i));
        const __m128i v0_3   = v;
        const __m128i v4_7   = _mm_bsrli_si128(v, 1*4);
        const __m128i v8_11  = _mm_bsrli_si128(v, 2*4);
        const __m128i v12_15 = _mm_bsrli_si128(v, 3*4);
        
        const __m128i t0 = _mm_cvtepu8_epi32(v0_3);
        const __m128i t1 = _mm_cvtepu8_epi32(v4_7);
        const __m128i t2 = _mm_cvtepu8_epi32(v8_11);
        const __m128i t3 = _mm_cvtepu8_epi32(v12_15);

        const __m128i t01 = _mm_add_epi32(t0, t1);
        const __m128i t23 = _mm_add_epi32(t2, t3);

        accumulator = _mm_add_epi32(accumulator, t01);
        accumulator = _mm_add_epi32(accumulator, t23);
    }

    return uint32_t(_mm_extract_epi32(accumulator, 0)) +
           uint32_t(_mm_extract_epi32(accumulator, 1)) +
           uint32_t(_mm_extract_epi32(accumulator, 2)) +
           uint32_t(_mm_extract_epi32(accumulator, 3));
}
Example No. 3
uint32_t probe(uint32_t key)
{
  /* create a vector with all values initialized to key */
  __m128i keyVector = _mm_set1_epi32(key);

  /* find the appropriate buckets using multiplicative hashing */
  __m128i bucketIds = _mm_mullo_epi32(keyVector, hashes.vec128);
  bucketIds  = _mm_srli_epi32(bucketIds, hashShift);
  size_t b0 = _mm_extract_epi32(bucketIds, 0);
  size_t b1 = _mm_extract_epi32(bucketIds, 1);

  __m128i keys;
  __m128i values0, values1;

  /* load keys, compare with lookup key (to produce a bitmask).
   * AND the result with the corresponding values. */
  keys = _mm_load_si128((const __m128i *) buckets[b0].keys);
  keys = _mm_cmpeq_epi32(keys, keyVector);
  values0 = _mm_load_si128((const __m128i *) buckets[b0].values);
  values0 = _mm_and_si128(values0, keys);

  keys = _mm_load_si128((const __m128i *) buckets[b1].keys);
  keys = _mm_cmpeq_epi32(keys, keyVector);
  values1 = _mm_load_si128((const __m128i *) buckets[b1].values);
  values1 = _mm_and_si128(values1, keys);

  /* OR all of the (key AND value) pairs to get result */
  union QuadInt qi;
  qi.vec128 = _mm_or_si128(values0, values1);
  qi.vec64[0] = _mm_or_si64(qi.vec64[0], qi.vec64[1]);
  return qi.arr[0] | qi.arr[1];
}
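probe() depends on several declarations the snippet does not show. A hedged sketch of what it appears to assume (names QuadInt, buckets, hashes and hashShift come from the code above; the exact layout is a guess, not taken from the source):

/* Hypothetical supporting declarations for probe(); the real layout may
 * differ. Each bucket holds four keys and four values so that a single
 * aligned 128-bit load covers a whole bucket. */
union QuadInt {
    __m128i  vec128;
    __m64    vec64[2];
    uint32_t arr[4];
};
struct Bucket {
    uint32_t keys[4];     /* must be 16-byte aligned for _mm_load_si128 */
    uint32_t values[4];
};
static struct Bucket *buckets;  /* the hash table */
static union QuadInt hashes;    /* multiplicative hash constants, one per lane */
static uint32_t hashShift;      /* e.g. 32 - log2(bucket count) */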
Example No. 4
int32_t sse_sadbw_sumsignedbytes(int8_t* array, size_t size) {

    const __m128i zero = _mm_setzero_si128();
    __m128i positive = zero;
    __m128i negative = zero;

    for (size_t i=0; i < size; i += 16) {
        const __m128i v  = _mm_loadu_si128((__m128i*)(array + i));
        const __m128i m  = _mm_cmplt_epi8(v, zero);
        const __m128i va = _mm_abs_epi8(v);

        // sum just positive numbers
        const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);

        // sum just negative numbers
        const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);
        
        positive = _mm_add_epi32(positive, t0);
        negative = _mm_sub_epi32(negative, t1);
    }

    const __m128i accumulator = _mm_add_epi32(positive, negative);

    return int32_t(_mm_extract_epi32(accumulator, 0)) +
           int32_t(_mm_extract_epi32(accumulator, 2));
}
Example No. 5
void t_print__m128i (__m128i a) 
{
	printf("%d = %d\n", 0, _mm_extract_epi32(a, 0));
	printf("%d = %d\n", 1, _mm_extract_epi32(a, 1));
	printf("%d = %d\n", 2, _mm_extract_epi32(a, 2));
	printf("%d = %d\n", 3, _mm_extract_epi32(a, 3));
	printf("\n");
}
Example No. 6
int32_t sse_sadbw_unrolled4_sumsignedbytes(int8_t* array, size_t size) {

    const __m128i zero = _mm_setzero_si128();
    __m128i positive = zero;
    __m128i negative = zero;

    for (size_t i=0; i < size; i += 16*4) {
        const __m128i v0 = _mm_loadu_si128((__m128i*)(array + i + 0*16));
        const __m128i v1 = _mm_loadu_si128((__m128i*)(array + i + 1*16));
        const __m128i v2 = _mm_loadu_si128((__m128i*)(array + i + 2*16));
        const __m128i v3 = _mm_loadu_si128((__m128i*)(array + i + 3*16));

        {
            const __m128i v   = v0;
            const __m128i m   = _mm_cmplt_epi8(v, zero);
            const __m128i t0  = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);
            const __m128i va = _mm_abs_epi8(v);
            const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);
            positive = _mm_add_epi32(positive, t0);
            negative = _mm_sub_epi32(negative, t1);
        }

        {
            const __m128i v   = v1;
            const __m128i m   = _mm_cmplt_epi8(v, zero);
            const __m128i t0  = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);
            const __m128i va = _mm_abs_epi8(v);
            const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);
            positive = _mm_add_epi32(positive, t0);
            negative = _mm_sub_epi32(negative, t1);
        }

        {
            const __m128i v   = v2;
            const __m128i m   = _mm_cmplt_epi8(v, zero);
            const __m128i t0  = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);
            const __m128i va = _mm_abs_epi8(v);
            const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);
            positive = _mm_add_epi32(positive, t0);
            negative = _mm_sub_epi32(negative, t1);
        }

        {
            const __m128i v   = v3;
            const __m128i m   = _mm_cmplt_epi8(v, zero);
            const __m128i t0  = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);
            const __m128i va = _mm_abs_epi8(v);
            const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);
            positive = _mm_add_epi32(positive, t0);
            negative = _mm_sub_epi32(negative, t1);
        }
    }

    const __m128i accumulator = _mm_add_epi32(positive, negative);

    return int32_t(_mm_extract_epi32(accumulator, 0)) +
           int32_t(_mm_extract_epi32(accumulator, 2));
}
Example No. 7
static inline void arr_store_col(
        int *col,
        __m128i vH,
        int32_t t,
        int32_t seglen)
{
    col[0*seglen+t] = (int32_t)_mm_extract_epi32(vH, 0);
    col[1*seglen+t] = (int32_t)_mm_extract_epi32(vH, 1);
    col[2*seglen+t] = (int32_t)_mm_extract_epi32(vH, 2);
    col[3*seglen+t] = (int32_t)_mm_extract_epi32(vH, 3);
}
Example No. 8
static inline void do_encode_6bytes(const char (*alphabet)[2], char *out, __m128i chunk)
{
    uint32_t v0, v1, v2, v3;
    v0 = _mm_extract_epi32(chunk, 0);
    v1 = _mm_extract_epi32(chunk, 1);
    v2 = _mm_extract_epi32(chunk, 2);
    v3 = _mm_extract_epi32(chunk, 3);
    memcpy(out + 0, alphabet[v0], 2);
    memcpy(out + 2, alphabet[v1], 2);
    memcpy(out + 4, alphabet[v2], 2);
    memcpy(out + 6, alphabet[v3], 2);
}
Example No. 9
static inline void arr_store_si128(
        int *array,
        __m128i vH,
        int32_t t,
        int32_t seglen,
        int32_t d,
        int32_t dlen)
{
    array[1LL*(0*seglen+t)*dlen + d] = (int32_t)_mm_extract_epi32(vH, 0);
    array[1LL*(1*seglen+t)*dlen + d] = (int32_t)_mm_extract_epi32(vH, 1);
    array[1LL*(2*seglen+t)*dlen + d] = (int32_t)_mm_extract_epi32(vH, 2);
    array[1LL*(3*seglen+t)*dlen + d] = (int32_t)_mm_extract_epi32(vH, 3);
}
Example No. 10
// useful for debugging
inline bool equal128(__m128i x, __m128i y)
{
    return ((_mm_extract_epi32(x,0) == _mm_extract_epi32(y,0)) &&
	    (_mm_extract_epi32(x,1) == _mm_extract_epi32(y,1)) &&
	    (_mm_extract_epi32(x,2) == _mm_extract_epi32(y,2)) &&
	    (_mm_extract_epi32(x,3) == _mm_extract_epi32(y,3)));
}
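_mm_extract_epi32 is an SSE4.1 instruction; on SSE2-only targets the same whole-register comparison can be done with a compare plus byte movemask. A small alternative sketch:

// SSE2-only variant: compare all four lanes at once and check that all
// 16 bytes of the comparison result are 0xFF.
inline bool equal128_sse2(__m128i x, __m128i y)
{
    return _mm_movemask_epi8(_mm_cmpeq_epi32(x, y)) == 0xFFFF;
}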
Example No. 11
// credit: Harold Aptroot
uint32_t maskedvectorsum(uint32_t * z, uint32_t N, uint32_t * accesses,
     uint32_t nmbr) {
  __m256i Nvec = _mm256_set1_epi32(N - 1);
  __m256i sum = _mm256_setzero_si256();
  for(uint32_t j = 0; j < nmbr ; j += 8) {
     __m256i indexes = _mm256_loadu_si256((__m256i*)(accesses + j));
     indexes = _mm256_and_si256(indexes, Nvec);
     __m256i fi = _mm256_i32gather_epi32((int*)z, indexes, 4);
     sum = _mm256_add_epi32(sum, fi);
  }
  __m128i sum128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 0), _mm256_extracti128_si256(sum, 1));
  sum128 = _mm_hadd_epi32(sum128, sum128);
  return _mm_extract_epi32(sum128, 0) + _mm_extract_epi32(sum128, 1);
}
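Note that the N - 1 mask wraps indexes correctly only when N is a power of two, and the 8-wide gather loop requires nmbr to be a multiple of 8.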
Example No. 12
static inline void arr_store_rowcol(
        int *row,
        int *col,
        __m128i vWH,
        int32_t i,
        int32_t s1Len,
        int32_t j,
        int32_t s2Len)
{
    if (i+0 == s1Len-1 && 0 <= j-0 && j-0 < s2Len) {
        row[j-0] = (int32_t)_mm_extract_epi32(vWH, 3);
    }
    if (j-0 == s2Len-1 && 0 <= i+0 && i+0 < s1Len) {
        col[(i+0)] = (int32_t)_mm_extract_epi32(vWH, 3);
    }
    if (i+1 == s1Len-1 && 0 <= j-1 && j-1 < s2Len) {
        row[j-1] = (int32_t)_mm_extract_epi32(vWH, 2);
    }
    if (j-1 == s2Len-1 && 0 <= i+1 && i+1 < s1Len) {
        col[(i+1)] = (int32_t)_mm_extract_epi32(vWH, 2);
    }
    if (i+2 == s1Len-1 && 0 <= j-2 && j-2 < s2Len) {
        row[j-2] = (int32_t)_mm_extract_epi32(vWH, 1);
    }
    if (j-2 == s2Len-1 && 0 <= i+2 && i+2 < s1Len) {
        col[(i+2)] = (int32_t)_mm_extract_epi32(vWH, 1);
    }
    if (i+3 == s1Len-1 && 0 <= j-3 && j-3 < s2Len) {
        row[j-3] = (int32_t)_mm_extract_epi32(vWH, 0);
    }
    if (j-3 == s2Len-1 && 0 <= i+3 && i+3 < s1Len) {
        col[(i+3)] = (int32_t)_mm_extract_epi32(vWH, 0);
    }
}
Example No. 13
int	vector_ps_short (const short* pa,const short* pb,size_t n)
{
    size_t k;
    size_t q = n / 16;
    size_t r = n % 16;
    int w;
    if (q > 0) {
	__m128i acc1 = _mm_setzero_si128();
	__m128i acc2 = _mm_setzero_si128();
	if (ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
	    for (k=0;k<q;k++) {
		/* Load 16 words from each array */
		__m128i a1 = _mm_load_si128((__m128i*)pa);
		__m128i b1 = _mm_load_si128((__m128i*)pb);
		__m128i a2 = _mm_load_si128((__m128i*)(pa+8));
		__m128i b2 = _mm_load_si128((__m128i*)(pb+8));
		/* Multiply, sum and convert to double words */
		__m128i s1 = _mm_madd_epi16(a1,b1);
		__m128i s2 = _mm_madd_epi16(a2,b2);
		pa += 16;
		pb += 16;
		/* Accumulate */
		acc1 = _mm_add_epi32(acc1,s1);
		acc2 = _mm_add_epi32(acc2,s2);
	    }
	}
	else {
	    for (k=0;k<q;k++) {
		/* Load 16 words from each array */
		__m128i a1 = _mm_loadu_si128((__m128i*)pa);
		__m128i b1 = _mm_loadu_si128((__m128i*)pb);
		__m128i a2 = _mm_loadu_si128((__m128i*)(pa+8));
		__m128i b2 = _mm_loadu_si128((__m128i*)(pb+8));
		/* Multiply, sum and convert to double words */
		__m128i s1 = _mm_madd_epi16(a1,b1);
		__m128i s2 = _mm_madd_epi16(a2,b2);
		pa += 16;
		pb += 16;
		/* Accumulate */
		acc1 = _mm_add_epi32(acc1,s1);
		acc2 = _mm_add_epi32(acc2,s2);
	    }
	}
	/* Final sum */
	acc1 = _mm_add_epi32(acc1,acc2);
	acc1 = _mm_hadd_epi32(acc1,acc1);
	acc1 = _mm_hadd_epi32(acc1,acc1);
	w = _mm_extract_epi32(acc1,0);
    }
    else {
	w = 0;
    }
    for (k=0;k<r;k++)
	w += (*pa++) * (*pb++);
    return w;
}
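One caveat: _mm_madd_epi16 yields pairwise 32-bit sums, and the 32-bit accumulators can overflow for very long inputs with near-maximal 16-bit values; the function assumes the true dot product fits in an int.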
Example No. 14
File: main.cpp Project: CCJY/coliru
uint seqRank(uint *vector, byte searchedByte, uint position)
{
    register uint i, cont = 0;
    __m128i patt, window, returnValue;
    byte *c1, patt_code[16];
    uint d = position >> 4, r = position & 0xf;
    for (i = 0; i < 16; i++)
        patt_code[i] = searchedByte;
    long long *pat_array = (long long *)patt_code;
    patt = _mm_set_epi64x(pat_array[1], pat_array[0]);
    long long *text_array = (long long *)vector;
    for (i = 0; i < d; i++) {
        window = _mm_set_epi64x(text_array[1], text_array[0]);
        returnValue = _mm_cmpestrm(patt, 16, window, 16, mode);
        cont += _mm_popcnt_u32(_mm_extract_epi32(returnValue, 0));
        text_array += 2;
    }
    window = _mm_set_epi64x(text_array[1], text_array[0]);
    returnValue = _mm_cmpestrm(patt, r, window, r, mode);
    cont += _mm_popcnt_u32(_mm_extract_epi32(returnValue, 0)) + r - 16;
    return cont;
}
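Here mode is assumed to be a constant defined elsewhere; the last operand of _mm_cmpestrm is an immediate and must be a compile-time constant, presumably something like _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_BIT_MASK in this code.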
Example No. 15
static inline void arr_store_si128(
        int *array,
        __m128i vWH,
        int32_t i,
        int32_t s1Len,
        int32_t j,
        int32_t s2Len)
{
    if (0 <= i+0 && i+0 < s1Len && 0 <= j-0 && j-0 < s2Len) {
        array[1LL*(i+0)*s2Len + (j-0)] = (int32_t)_mm_extract_epi32(vWH, 3);
    }
    if (0 <= i+1 && i+1 < s1Len && 0 <= j-1 && j-1 < s2Len) {
        array[1LL*(i+1)*s2Len + (j-1)] = (int32_t)_mm_extract_epi32(vWH, 2);
    }
    if (0 <= i+2 && i+2 < s1Len && 0 <= j-2 && j-2 < s2Len) {
        array[1LL*(i+2)*s2Len + (j-2)] = (int32_t)_mm_extract_epi32(vWH, 1);
    }
    if (0 <= i+3 && i+3 < s1Len && 0 <= j-3 && j-3 < s2Len) {
        array[1LL*(i+3)*s2Len + (j-3)] = (int32_t)_mm_extract_epi32(vWH, 0);
    }
}
Example No. 16
static void
sfid_render_cache_rt_write_simd8_r_uint8_ymajor(struct thread *t,
        const struct sfid_render_cache_args *args)
{
    const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
    const int x = t->grf[1].uw[4];
    const int y = t->grf[1].uw[5] + slice_y;
    const int cpp = 1;

    void *base = ymajor_offset(args->rt.pixels, x, y, args->rt.stride, cpp);

    struct reg *src = &t->grf[args->src];

    __m256i r32 = _mm256_permute4x64_epi64(src[0].ireg, SWIZZLE(0, 2, 1, 3));

    __m128i lo = _mm256_extractf128_si256(r32, 0);
    __m128i hi = _mm256_extractf128_si256(r32, 1);
    __m128i r16 = _mm_packus_epi32(lo, hi);
    __m128i r8 = _mm_packus_epi16(r16, r16);

    /* FIXME: Needs masking. */
    *(uint32_t *) (base +  0) = _mm_extract_epi32(r8, 0);
    *(uint32_t *) (base + 16) = _mm_extract_epi32(r8, 1);
}
Example No. 17
static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,
                                                     unsigned int *const sse) {
  // extract the low lane and add it to the high lane
  const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);

  // unpack sse and sum registers and add
  const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
  const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
  const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);

  // perform the final summation and extract the results
  const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
  *((int *)sse) = _mm_cvtsi128_si32(res);
  return _mm_extract_epi32(res, 1);
}
Example No. 18
static __m128i cielabv (union hvrgbpix rgb)
{
    __m128 xvxyz[2] = {_mm_set1_ps(0.5),_mm_set1_ps(0.5) }; //,0.5,0.5,0.5);

    __m128 vcam0 = _mm_setr_ps(cielab_xyz_cam[0][0],cielab_xyz_cam[1][0],cielab_xyz_cam[2][0],0);
    __m128 vcam1 = _mm_setr_ps(cielab_xyz_cam[0][1],cielab_xyz_cam[1][1],cielab_xyz_cam[2][1],0);
    __m128 vcam2 = _mm_setr_ps(cielab_xyz_cam[0][2],cielab_xyz_cam[1][2],cielab_xyz_cam[2][2],0);
    __m128 vrgb0h = _mm_set1_ps(rgb.h.c[0]);
    __m128 vrgb1h = _mm_set1_ps(rgb.h.c[1]);
    __m128 vrgb2h = _mm_set1_ps(rgb.h.c[2]);
    __m128 vrgb0v = _mm_set1_ps(rgb.v.c[0]);
    __m128 vrgb1v = _mm_set1_ps(rgb.v.c[1]);
    __m128 vrgb2v = _mm_set1_ps(rgb.v.c[2]);

    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam0,vrgb0h));
    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam1,vrgb1h));
    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam2,vrgb2h));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam0,vrgb0v));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam1,vrgb1v));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam2,vrgb2v));

    xvxyz[0] = _mm_max_ps(_mm_set1_ps(0),
                          _mm_min_ps(_mm_set1_ps(0xffff),
                                     _mm_round_ps(xvxyz[0], _MM_FROUND_TO_ZERO)));
    xvxyz[1] = _mm_max_ps(_mm_set1_ps(0),
                          _mm_min_ps(_mm_set1_ps(0xffff),
                                     _mm_round_ps(xvxyz[1], _MM_FROUND_TO_ZERO)));
    __m128i loadaddrh = _mm_cvttps_epi32(xvxyz[0]);
    __m128i loadaddrv = _mm_cvttps_epi32(xvxyz[1]);
#ifdef __AVX__
    __m256 vlab,
           vxyz = { cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                    0,
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,0)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                    0},
           vxyz2 =  {0,
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,2)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                     0,
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,2)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,0)]};

    vlab = _mm256_sub_ps(vxyz,vxyz2);
    vlab = _mm256_mul_ps(vlab, _mm256_setr_ps(116,500,200,0,116,500,200,0));
    vlab = _mm256_sub_ps(vlab, _mm256_setr_ps(16,0,0,0,16,0,0,0));
    vlab = _mm256_mul_ps(vlab,_mm256_set1_ps(64));
    vlab = _mm256_round_ps(vlab, _MM_FROUND_TO_ZERO);
    __m256i vlabi = _mm256_cvtps_epi32(vlab);
    return _mm_packs_epi32(_mm256_castsi256_si128(vlabi), ((__m128i*)&vlabi)[1]);
#else
    __m128 vlabh, vxyzh = {cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrh,2)],
                           0};
    __m128 vlabv, vxyzv = {cielab_cbrt[_mm_extract_epi32(loadaddrv,0)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrv,2)],
                           0};

    vlabh = _mm_sub_ps(_mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,1,0,1)),
                       _mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,2,1,3)));
    vlabh = _mm_mul_ps(vlabh,_mm_setr_ps(116,500,200,0));
    vlabh = _mm_sub_ps(vlabh,_mm_setr_ps(16,0,0,0));
    vlabh = _mm_mul_ps(vlabh,_mm_set_ps1(64));
    vlabh = _mm_round_ps(vlabh, _MM_FROUND_TO_ZERO);

    vlabv = _mm_sub_ps(_mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,1,0,1)),
                       _mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,2,1,3)));
    vlabv = _mm_mul_ps(vlabv,_mm_setr_ps(116,500,200,0));
    vlabv = _mm_sub_ps(vlabv,_mm_setr_ps(16,0,0,0));
    vlabv = _mm_mul_ps(vlabv,_mm_set_ps1(64));
    vlabv = _mm_round_ps(vlabv, _MM_FROUND_TO_ZERO);

    return _mm_set_epi64(_mm_cvtps_pi16(vlabv),_mm_cvtps_pi16(vlabh));
#endif
}
Example No. 19
/*****************************************************************************
 * This function utilises 3 properties of the cost function lookup tables,   *
 * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in       *
 * vp9_encoder.c.                                                            *
 * For the joint cost:                                                       *
 *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]           *
 * For the component costs:                                                  *
 *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                         *
 *         (Equal costs for both components)                                 *
 *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                        *
 *         (Cost function is even)                                           *
 * If these do not hold, then this function cannot be used without           *
 * modification, in which case you can revert to using the C implementation, *
 * which does not rely on these properties.                                  *
 *****************************************************************************/
int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
                               const search_site_config *cfg,
                               MV *ref_mv, MV *best_mv, int search_param,
                               int sad_per_bit, int *num00,
                               const vp9_variance_fn_ptr_t *fn_ptr,
                               const MV *center_mv) {
  const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max);
  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
  const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min);
  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);

  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);

  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);

  // search_param determines the length of the initial step and hence the number
  // of iterations.
  // 0 = initial step (MAX_FIRST_STEP) pel
  // 1 = (MAX_FIRST_STEP/2) pel,
  // 2 = (MAX_FIRST_STEP/4) pel...
  const       MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
  const int tot_steps = cfg->total_steps - search_param;

  const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3,
                                        center_mv->col >> 3);
  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);

  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);

  int_mv bmv = pack_int_mv(ref_row, ref_col);
  int_mv new_bmv = bmv;
  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);

  const int what_stride = x->plane[0].src.stride;
  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
  const uint8_t *const what = x->plane[0].src.buf;
  const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
                                 ref_row * in_what_stride + ref_col;

  // Work out the start point for the search
  const uint8_t *best_address = in_what;
  const uint8_t *new_best_address = best_address;
#if ARCH_X86_64
  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

  unsigned int best_sad;

  int i;
  int j;
  int step;

  // Check the prerequisite cost function properties that are easy to check
  // in an assert. See the function-level documentation for details on all
  // prerequisites.
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);

  // Check the starting position
  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);

  *num00 = 0;

  for (i = 0, step = 0; step < tot_steps; step++) {
    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
      __m128i v_sad_d;
      __m128i v_cost_d;
      __m128i v_outside_d;
      __m128i v_inside_d;
      __m128i v_diff_mv_w;
#if ARCH_X86_64
      __m128i v_blocka[2];
#else
      __m128i v_blocka[1];
#endif

      // Compute the candidate motion vectors
      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]);
      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
      // Clamp them to the search bounds
      __m128i v_these_mv_clamp_w = v_these_mv_w;
      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);
      // The ones that did not change are inside the search area
      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);

      // If none of them are inside, then move on
      if (__likely__(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
        continue;
      }

      // The inverse mask indicates which of the MVs are outside
      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
      // Shift right to keep the sign bit clear, we will use this later
      // to set the cost to the maximum value.
      v_outside_d = _mm_srli_epi32(v_outside_d, 1);

      // Compute the difference MV
      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
      // We utilise the fact that the cost function is even, and use the
      // absolute difference. This allows us to use unsigned indexes later
      // and reduces cache pressure somewhat as only a half of the table
      // is ever referenced.
      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);

      // Compute the SIMD pointer offsets.
      {
#if ARCH_X86_64  //  sizeof(intptr_t) == 8
        // Load the offsets
        __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]);
        __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]);
        // Set the ones falling outside to zero
        v_bo10_q = _mm_and_si128(v_bo10_q,
                                 _mm_cvtepi32_epi64(v_inside_d));
        v_bo32_q = _mm_and_si128(v_bo32_q,
                                 _mm_unpackhi_epi32(v_inside_d, v_inside_d));
        // Compute the candidate addresses
        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
#else  // ARCH_X86 //  sizeof(intptr_t) == 4
        __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]);
        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
#endif
      }

      fn_ptr->sdx4df(what, what_stride,
                     (const uint8_t **)&v_blocka[0], in_what_stride,
                     (uint32_t*)&v_sad_d);

      // Look up the component cost of the residual motion vector
      {
        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);

        // Note: This is a use case for vpgather in AVX2
        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];

        __m128i v_cost_10_d, v_cost_32_d;

        v_cost_10_d = _mm_cvtsi32_si128(cost0);
        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);

        v_cost_32_d = _mm_cvtsi32_si128(cost2);
        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);

        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
      }

      // Now add in the joint cost
      {
        const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
                                                _mm_setzero_si128());
        const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
                                                       v_joint_cost_0_d,
                                                       v_sel_d);
        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
      }

      // Multiply by sad_per_bit
      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
      // ROUND_POWER_OF_TWO(v_cost_d, 8)
      v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80));
      v_cost_d = _mm_srai_epi32(v_cost_d, 8);
      // Add the cost to the sad
      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);

      // Make the motion vectors outside the search area have max cost
      // by or'ing in the comparison mask, this way the minimum search won't
      // pick them.
      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);

      // Find the minimum value and index horizontally in v_sad_d
      {
        // Try speculatively on 16 bits, so we can use the minpos intrinsic
        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);

        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);

        // If the local best value is not saturated, just use it, otherwise
        // find the horizontal minimum again the hard way on 32 bits.
        // This is executed rarely.
        if (__unlikely__(local_best_sad == 0xffff)) {
          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;

          v_loval_d = v_sad_d;
          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
          v_hival_d = _mm_srli_si128(v_loval_d, 8);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
          v_hival_d = _mm_srli_si128(v_loval_d, 4);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);

          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
        }

        // Update the global minimum if the local minimum is smaller
        if (__likely__(local_best_sad < best_sad)) {
          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];

          best_sad = local_best_sad;
        }
      }
    }

    bmv = new_bmv;
    best_address = new_best_address;

    v_bmv_w = _mm_set1_epi32(bmv.as_int);
#if ARCH_X86_64
    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

    if (__unlikely__(best_address == in_what)) {
      (*num00)++;
    }
  }

  *best_mv = bmv.as_mv;
  return best_sad;
}
Example No. 20
unsigned long long int foo32(__m128i x)
{ 
  return (unsigned int) _mm_extract_epi32(x, 2);
}
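Since _mm_extract_epi32 returns a signed int, the intermediate cast to unsigned int makes the widening to unsigned long long a zero-extension; contrast the foo32 in Example No. 25 below, whose cast through int sign-extends instead.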
Example No. 21
ARGB32 SoftTexture2D::Sample_Point_Wrap( F4 u, F4 v ) const
{
	//F4 fU = clampf(u,0,1);
	//F4 fV = clampf(v,0,1);

	F4 fWidth = SIZE_X;
	F4 fHeight = SIZE_Y;

	//
	const int iU = iround( u * fWidth ) & (SIZE_X-1);	// [0..1] -> [0..W]
	const int iV = iround( v * fHeight ) & (SIZE_Y-1);	// [0..1] -> [0..H]

	return m_data.ToPtr()[ iV * SIZE_X + iU ];
/*
	// 28.4 fixed-point coordinates
	const INT32 FU = iround( 16.0f * u );
	const INT32 FV = iround( 16.0f * v );

	enum { FIX_POINT_PRE = 9 };

	const UINT offset =
		(( ( FU & t->textureYMask ) >> FIX_POINT_PRE ) << t->pitchlog2)
		| (( FV & t->textureXMask ) >> FIX_POINT_PRE)
		;

	//*(span++) += textureBuffer[((iv>>10)&0xffffffC0) + (iu>>16)];

*/

#if 0
	float4 res;
	__m128i tU, tV;

	__m128 conv = _mm_rcp_ps( f255 );

	// Formula for computing U and V:
	// tX = (int)( min(iX/iW, 1.0f) * (tex_width - 1) );
	{
		__m128 t2 = _mm_mul_ps( MSR_Wrap(u), _mm_set_ps1((float)(tex->clip_rect.w-1)) );
		__m128 t3 = _mm_mul_ps( MSR_Wrap(v), _mm_set_ps1((float)(tex->clip_rect.h-1)) );

		tU = _mm_cvtps_epi32(t2);
		tV = _mm_cvtps_epi32(t3);
	}

	// tSample = tV * pitch + tU * bytesPerPixel
	__m128i tIdx = _mm_add_epi32( mul_epi32(tU, _mm_set1_epi32(tex->format->BytesPerPixel)), mul_epi32(tV, _mm_set1_epi32(tex->pitch)));

	// Since SSE doesn't support arbitrary indexing out of an array, we have to extract the indexes,
	// grab the sample, and recreate an SSE register with the new samples.
	Uint8 *ptr = (Uint8*&)tex->pixels;
	Uint8 *sample3 = &ptr[_mm_extract_epi32(tIdx, 3)];
	Uint8 *sample2 = &ptr[_mm_extract_epi32(tIdx, 2)];
	Uint8 *sample1 = &ptr[_mm_extract_epi32(tIdx, 1)];
	Uint8 *sample0 = &ptr[_mm_extract_epi32(tIdx, 0)];

	__m128i tSample = _mm_set_epi32( *(Uint32*)sample3, *(Uint32*)sample2, *(Uint32*)sample1, *(Uint32*)sample0 );

	// Finally, grab each of the channels out by shifting and masking.
	res.r = _mm_cvtepi32_ps(_mm_srl_epi32( _mm_and_si128( tSample, _mm_set1_epi32(tex->format->Rmask) ), _mm_set_epi32(0, 0, 0, tex->format->Rshift) ) );
	res.g = _mm_cvtepi32_ps(_mm_srl_epi32( _mm_and_si128( tSample, _mm_set1_epi32(tex->format->Gmask) ), _mm_set_epi32(0, 0, 0, tex->format->Gshift) ) );
	res.b = _mm_cvtepi32_ps(_mm_srl_epi32( _mm_and_si128( tSample, _mm_set1_epi32(tex->format->Bmask) ), _mm_set_epi32(0, 0, 0, tex->format->Bshift) ) );

	*res.r = _mm_mul_ps( *res.r, conv );
	*res.g = _mm_mul_ps( *res.g, conv );
	*res.b = _mm_mul_ps( *res.b, conv );

	return res;
#endif
}
Example No. 22
static inline int32_t _mm_hmax_epi32_rpl(__m128i a) {
    a = _mm_max_epi32(a, _mm_srli_si128(a, 8));
    a = _mm_max_epi32(a, _mm_srli_si128(a, 4));
    return _mm_extract_epi32(a, 0);
}
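Each _mm_srli_si128/_mm_max_epi32 pair halves the number of candidate lanes, so two steps reduce the four 32-bit lanes to a single maximum in lane 0; the same ladder with _mm_min_epi32 yields a horizontal minimum.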
Example No. 23
void AVXFMA4DNoise(Vector3d& result, const Vector3d& EPoint)
{
    DBL x, y, z;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /*ix = (int)x; iy = (int)y; iz = (int)z;
    x_ix = x - ix; y_iy = y - iy; z_iz = z - iz;*/
                /* JB fix for the range problem */

    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    DBL* mp1 = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL* mp2 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL* mp3 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL* mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL* mp5 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL* mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];
    DBL* mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL* mp8 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    __m128d mm_tz = _mm_sub_pd(one, mm_sz);

    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));

    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    __m128d mm_txty_txsy_tz = _mm_mul_pd(mm_txty_txsy, mm_tz);
    __m128d mm_txty_txsy_sz = _mm_mul_pd(mm_txty_txsy, mm_sz);
    __m128d mm_sxty_sxsy_tz = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    __m128d mm_sxty_sxsy_sz = _mm_mul_pd(mm_sxty_sxsy, mm_sz);

    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p;
    __m128d sum_X_Y = _mm_setzero_pd();
    __m128d sum__Z = _mm_setzero_pd();

    __m128d mm_s1 = _mm_unpacklo_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp1, mp1 + 8, mm_s1, ix_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s2 = _mm_unpacklo_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp2, mp2 + 8, mm_s2, jx_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s3 = _mm_unpackhi_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp3, mp3 + 8, mm_s3, jx_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s4 = _mm_unpackhi_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp4, mp4 + 8, mm_s4, ix_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s5 = _mm_unpackhi_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp5, mp5 + 8, mm_s5, ix_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s6 = _mm_unpackhi_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp6, mp6 + 8, mm_s6, jx_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s7 = _mm_unpacklo_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp7, mp7 + 8, mm_s7, jx_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d mm_s8 = _mm_unpacklo_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp8, mp8 + 8, mm_s8, ix_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d iy_jy = _mm_unpacklo_pd(iy_mm, jy_mm);
    INCRSUMP2(mp1 + 16, mp4 + 16, mm_txty_txsy_tz, ix_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp8 + 16, mp5 + 16, mm_txty_txsy_sz, ix_mm, iy_jy, jz_mm, sum__Z);
    INCRSUMP2(mp2 + 16, mp3 + 16, mm_sxty_sxsy_tz, jx_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp7 + 16, mp6 + 16, mm_sxty_sxsy_sz, jx_mm, iy_jy, jz_mm, sum__Z);

    sum__Z = _mm_hadd_pd(sum__Z, sum__Z);

    _mm_storeu_pd(*result, sum_X_Y);
    _mm_store_sd(&result[Z], sum__Z);
}
Example No. 24
DBL AVXFMA4Noise(const Vector3d& EPoint, int noise_generator)
{
    DBL x, y, z;
    DBL *mp;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;
    DBL sum;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator==kNoiseGen_Perlin)
    {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables.  Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0.  Mean
        // value is 0.49 (ideally it would be 0.5).
        sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
        if (sum < 0.0) sum = 0.0;
        if (sum > 1.0) sum = 1.0;

        return sum;
    }

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /* ix = (int)x; iy = (int)y; iz = (long)z; */
    /* JB fix for the range problem */

    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    mp = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL *mp2 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL *mp3 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];
    DBL *mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL *mp5 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL *mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL *mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL *mp8 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));
    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    __m128d mm_tz = _mm_sub_pd(one, mm_sz);
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    __m128d y_mm = _mm_unpacklo_pd(iy_mm, jy_mm);

    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p, s_mm;
    __m128d int_sum1 = _mm_setzero_pd();

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_tz);
    INCRSUMP2(mp, mp2, s_mm, ix_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_sz);
    INCRSUMP2(mp3, mp4, s_mm, ix_mm, y_mm, jz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    INCRSUMP2(mp5, mp6, s_mm, jx_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_sz);
    INCRSUMP2(mp7, mp8, s_mm, jx_mm, y_mm, jz_mm, int_sum1);

    int_sum1 = _mm_hadd_pd(int_sum1, int_sum1);

    if(noise_generator==kNoiseGen_RangeCorrected)
    {
        /* details of range here:
        Min, max: -1.05242, 0.988997
        Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

        We want to change it to as close to [0,1] as possible.
        */
        const __m128d r2 = _mm_set_sd(0.48985582);
        const __m128d r1r2 = _mm_set_sd(1.05242*0.48985582);
        int_sum1 = _mm_macc_sd(int_sum1, r2, r1r2);
    }
    else
    {
        int_sum1 = _mm_add_sd(int_sum1, _mm_set_sd(0.5));
    }

    int_sum1 = _mm_min_sd(one, int_sum1);
    int_sum1 = _mm_max_sd(_mm_setzero_pd(), int_sum1);
    _mm_store_sd(&sum, int_sum1);

    return (sum);
}
Example No. 25
long long int foo32(__m128i x) { return (int) _mm_extract_epi32(x, 2); }
int check(size_t N, size_t Nq) {
    int * queries = (int*)malloc(Nq*sizeof(int));
    int * source = (int*)malloc(N*sizeof(int));
    size_t i, k;
    int displaytest = 0;
    for(i = 0; i < N; ++i) {
        source[i] = rand();
    }
    qsort (source, N, sizeof(int), compare);
    if(displaytest) {
        for(i = 0; i < N; ++i) {
            printf(" %d ",source[i]);
        }
        printf("\n");
    }
    int maxval = source[N-1];
    for(i = 0; i < Nq; ++i) {
        queries[i] = rand()%(maxval+1);
    }
    for(k = 0; k < Nq; ++k)
        if(branchy_search(source,N,queries[k]) != branchfree_search(source,N,queries[k])) {
            printf("bug1\n");
            free(source);
            free(queries);

            return -1;
        }
    for(k = 0; k+1 < Nq; k+=2) {
        size_t i1, i2;
        branchfree_search2(source,N,queries[k],queries[k+1],&i1,&i2);
        if(branchfree_search(source,N,queries[k]) != i1) {
            printf("bug2\n");
            free(source);
            free(queries);

            return -1;
        }
        if(branchfree_search(source,N,queries[k+1]) != i2) {
            printf("bug3\n");
            free(source);
            free(queries);

            return -1;
        }
    }
#ifdef MYAVX

    for(k = 0; k+3 < Nq; k+=4) {
        size_t i1, i2, i3, i4;
        __m128i q = _mm_lddqu_si128((__m128i const*)(queries +k));
        __m128i bog = branchfree_search4_avx(source,N,q);
        branchfree_search4(source,N,queries[k],queries[k+1],queries[k+2],queries[k+3],&i1,&i2,&i3,&i4);
        if((_mm_extract_epi32(bog,0)!= i1) || (_mm_extract_epi32(bog,1)!= i2) || (_mm_extract_epi32(bog,2)!= i3) || (_mm_extract_epi32(bog,3)!= i4)) {
            printf("bug3\n");
            printf("%zu %zu %zu %zu\n",i1,i2,i3,i4);
            printf("%d %d %d %d\n",_mm_extract_epi32(bog,0),_mm_extract_epi32(bog,1),_mm_extract_epi32(bog,2),_mm_extract_epi32(bog,3));
            return -1;
        }
    }
#endif

    free(source);
    free(queries);
    return 0;
}
int demo(size_t N, size_t Nq) {
    int * queries = (int*)malloc(Nq*sizeof(int));
    int * source = (int*)malloc(N*sizeof(int));
    size_t bogus = 0;
    size_t bogus1 = 0;
    size_t bogus2 = 0;
    size_t bogus3 = 0;
    size_t bogus4 = 0;
    __m128i bog = _mm_setzero_si128();
    size_t i, k, ti;
    printf("===============\n");

    printf("array size (N)=%zu,  number of queries (Nq)=%zu...\n",N,Nq);
    printf("preparing data...\n");
    for(i = 0; i < N; ++i) {
        source[i] = rand();
    }
    qsort (source, N, sizeof(int), compare);

    int maxval = source[N-1];
    for(i = 0; i < Nq; ++i) {
        queries[i] = rand()%(maxval+1);
    }
    printf("beginning tests...\n");
    printf("\n");

    for(ti = 0; ti < 3; ++ti) {
        struct timeval t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13;

        gettimeofday(&t6, 0);
        for(k = 0; k+1 < Nq; k+=2)
            branchfree_search2_prefetch(source,N,queries[k],queries[k+1],&bogus1,&bogus2);
        gettimeofday(&t1, 0);
        for(k = 0; k < Nq; ++k)
            bogus += branchfree_search(source,N,queries[k]);
        gettimeofday(&t2, 0);
        for(k = 0; k < Nq; ++k)
            bogus += branchy_search(source,N,queries[k]);
        gettimeofday(&t3, 0);
        for(k = 0; k < Nq; ++k)
            bogus += branchfree_search_prefetch(source,N,queries[k]);
        gettimeofday(&t4, 0);
        for(k = 0; k+1 < Nq; k+=2)
            branchfree_search2(source,N,queries[k],queries[k+1],&bogus1,&bogus2);
        gettimeofday(&t5, 0);
        for(k = 0; k+3 < Nq; k+=4)
            branchfree_search4(source,N,queries[k],queries[k+1],queries[k+2],queries[k+3],&bogus1,&bogus2,&bogus3,&bogus4);
        gettimeofday(&t7, 0);
        for(k = 0; k+3 < Nq; k+=4)
            branchfree_search4_prefetch(source,N,queries[k],queries[k+1],queries[k+2],queries[k+3],&bogus1,&bogus2,&bogus3,&bogus4);
        gettimeofday(&t8, 0);
#ifdef MYAVX
        for(k = 0; k+3 < Nq; k+=4) {
            __m128i q = _mm_lddqu_si128((__m128i const*)(queries +k));
            bog = _mm_add_epi32(bog,branchfree_search4_avx(source,N,q));
        }
        gettimeofday(&t9, 0);

        for(k = 0; k+7 < Nq; k+=8) {
            __m256i q = _mm256_lddqu_si256((__m256i const*)(queries +k));
            bog = _mm_add_epi32(bog,_mm256_castsi256_si128(branchfree_search8_avx(source,N,q)));
        }

#endif
        gettimeofday(&t10, 0);
        for(k = 0; k+7 < Nq; k+=8) {
            branchfree_search8(source,N,queries[k],queries[k+1],queries[k+2],queries[k+3],queries[k+4],queries[k+5],queries[k+6],queries[k+7],&bogus1,&bogus2,&bogus3,&bogus4,&bogus1,&bogus2,&bogus3,&bogus4);
        }
        gettimeofday(&t11, 0);
        for(k = 0; k+7 < Nq; k+=8) {
            branchfree_search8_prefetch(source,N,queries[k],queries[k+1],queries[k+2],queries[k+3],queries[k+4],queries[k+5],queries[k+6],queries[k+7],&bogus1,&bogus2,&bogus3,&bogus4,&bogus1,&bogus2,&bogus3,&bogus4);
        }
        gettimeofday(&t12, 0);
        for(k = 0; k < Nq; ++k)
            bogus += hackedbranchfree_search(source,N,queries[k]);
        gettimeofday(&t13, 0);



        printf("branchless time=%llu  \n",t2.tv_sec  * 1000ULL * 1000ULL + t2.tv_usec - (t1.tv_sec  * 1000ULL * 1000ULL + t1.tv_usec));
        printf("branchy time=%llu  \n",t3.tv_sec  * 1000ULL * 1000ULL + t3.tv_usec - (t2.tv_sec  * 1000ULL * 1000ULL + t2.tv_usec));
        printf("branchless time with prefetch=%llu \n",t4.tv_sec  * 1000ULL * 1000ULL + t4.tv_usec - (t3.tv_sec  * 1000ULL * 1000ULL + t3.tv_usec));
        printf("branchless interleaved (2) time=%llu  \n",t5.tv_sec  * 1000ULL * 1000ULL + t5.tv_usec - (t4.tv_sec  * 1000ULL * 1000ULL + t4.tv_usec));
        printf("branchless interleaved (2) (prefetch) time=%llu  \n",t1.tv_sec  * 1000ULL * 1000ULL + t1.tv_usec - (t6.tv_sec  * 1000ULL * 1000ULL + t6.tv_usec));
        printf("branchless interleaved (4) time=%llu  \n",t7.tv_sec  * 1000ULL * 1000ULL + t7.tv_usec - (t5.tv_sec  * 1000ULL * 1000ULL + t5.tv_usec));
        printf("branchless interleaved (4) (prefetch) time=%llu  \n",t8.tv_sec  * 1000ULL * 1000ULL + t8.tv_usec - (t7.tv_sec  * 1000ULL * 1000ULL + t7.tv_usec));
#ifdef MYAVX
        printf("branchless interleaved (4) (AVX) time=%llu  \n",t9.tv_sec  * 1000ULL * 1000ULL + t9.tv_usec - (t8.tv_sec  * 1000ULL * 1000ULL + t8.tv_usec));
        printf("branchless interleaved (8) (AVX) time=%llu  \n",t10.tv_sec  * 1000ULL * 1000ULL + t10.tv_usec - (t9.tv_sec  * 1000ULL * 1000ULL + t9.tv_usec));
#endif
        printf("branchless interleaved (8) time=%llu  \n",t11.tv_sec  * 1000ULL * 1000ULL + t11.tv_usec - (t10.tv_sec  * 1000ULL * 1000ULL + t10.tv_usec));
        printf("branchless interleaved (8) (prefetch) time=%llu  \n",t12.tv_sec  * 1000ULL * 1000ULL + t12.tv_usec - (t11.tv_sec  * 1000ULL * 1000ULL + t11.tv_usec));
        printf("hacked branchless time=%llu  \n",t13.tv_sec  * 1000ULL * 1000ULL + t13.tv_usec - (t12.tv_sec  * 1000ULL * 1000ULL + t12.tv_usec));



        printf("\n");
    }
#ifdef MYAVX
    bogus += _mm_extract_epi32(bog,0);
#endif

    free(source);
    free(queries);
    return (int) bogus+bogus1+bogus2+bogus3+bogus4;

}
void print(__m128i bog) {
    printf("%u %u %u %u \n",_mm_extract_epi32(bog,0),_mm_extract_epi32(bog,1),_mm_extract_epi32(bog,2),_mm_extract_epi32(bog,3));

}
Example No. 29
float	vector_cos_short (const short* pa,const short* pb,size_t n)
{
    size_t k;
    double norm;
    size_t q = n / 16;
    size_t r = n % 16;
    int ps,na,nb;
    if (q > 0) {
        __m128i acc;
	__m128i acc_ps1 = _mm_setzero_si128();
	__m128i acc_ps2 = _mm_setzero_si128();
	__m128i acc_na1 = _mm_setzero_si128();
	__m128i acc_na2 = _mm_setzero_si128();
	__m128i acc_nb1 = _mm_setzero_si128();
	__m128i acc_nb2 = _mm_setzero_si128();
	if (ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
	    for (k=0;k<q;k++) {
		/* Load 16 words from each array */
		__m128i a1 = _mm_load_si128((__m128i*)pa);
		__m128i b1 = _mm_load_si128((__m128i*)pb);
		__m128i a2 = _mm_load_si128((__m128i*)(pa+8));
		__m128i b2 = _mm_load_si128((__m128i*)(pb+8));
		/* Multiply, sum and convert to double words */
		__m128i ps1 = _mm_madd_epi16(a1,b1);
		__m128i ps2 = _mm_madd_epi16(a2,b2);
		__m128i na1 = _mm_madd_epi16(a1,a1);
		__m128i na2 = _mm_madd_epi16(a2,a2);
		__m128i nb1 = _mm_madd_epi16(b1,b1);
		__m128i nb2 = _mm_madd_epi16(b2,b2);
		pa += 16;
		pb += 16;
		/* Accumulate */
		acc_ps1 = _mm_add_epi32(acc_ps1,ps1);
		acc_ps2 = _mm_add_epi32(acc_ps2,ps2);
		acc_na1 = _mm_add_epi32(acc_na1,na1);
		acc_na2 = _mm_add_epi32(acc_na2,na2);
		acc_nb1 = _mm_add_epi32(acc_nb1,nb1);
		acc_nb2 = _mm_add_epi32(acc_nb2,nb2);
	    }
	}
	else {
	    for (k=0;k<q;k++) {
		/* Load 16 words from each array (unaligned) */
		__m128i a1 = _mm_loadu_si128((__m128i*)pa);
		__m128i b1 = _mm_loadu_si128((__m128i*)pb);
		__m128i a2 = _mm_loadu_si128((__m128i*)(pa+8));
		__m128i b2 = _mm_loadu_si128((__m128i*)(pb+8));
		/* Multiply, sum and convert to double words */
		__m128i ps1 = _mm_madd_epi16(a1,b1);
		__m128i ps2 = _mm_madd_epi16(a2,b2);
		__m128i na1 = _mm_madd_epi16(a1,a1);
		__m128i na2 = _mm_madd_epi16(a2,a2);
		__m128i nb1 = _mm_madd_epi16(b1,b1);
		__m128i nb2 = _mm_madd_epi16(b2,b2);
		pa += 16;
		pb += 16;
		/* Accumulate */
		acc_ps1 = _mm_add_epi32(acc_ps1,ps1);
		acc_ps2 = _mm_add_epi32(acc_ps2,ps2);
		acc_na1 = _mm_add_epi32(acc_na1,na1);
		acc_na2 = _mm_add_epi32(acc_na2,na2);
		acc_nb1 = _mm_add_epi32(acc_nb1,nb1);
		acc_nb2 = _mm_add_epi32(acc_nb2,nb2);
	    }
	}
	/* Final sum */
	acc = _mm_add_epi32(acc_ps1,acc_ps2);
	acc = _mm_hadd_epi32(acc,acc);
	acc = _mm_hadd_epi32(acc,acc);
	ps = _mm_extract_epi32(acc,0);

	acc = _mm_add_epi32(acc_na1,acc_na2);
	acc = _mm_hadd_epi32(acc,acc);
	acc = _mm_hadd_epi32(acc,acc);
	na = _mm_extract_epi32(acc,0);

	acc = _mm_add_epi32(acc_nb1,acc_nb2);
	acc = _mm_hadd_epi32(acc,acc);
	acc = _mm_hadd_epi32(acc,acc);
	nb = _mm_extract_epi32(acc,0);
    }
    else {
	ps = 0;
	na = 0;
	nb = 0;
    }
    for (k=0;k<r;k++) {
	int a = *pa++;
	int b = *pb++;
	ps += a*b;
	na += a*a;
	nb += b*b;
    }
    norm = sqrt( ((double)na) * ((double)nb) );
    if (norm < 1E-5f)
	return 0;
    return ps / norm;
}
Example No. 30
void ahd_interpolate_tile(int top, char * buffer)
{
    int row, col, tr, tc, c, val;
    const int dir[4] = { -1, 1, -width, width };
    __m128i ldiff[2], abdiff[2];
    union hvrgbpix (*rgb)[width] = (union hvrgbpix (*)[width])buffer;
    union hvrgbpix *rix;
    union rgbpix * pix;
    union hvrgbpix (*lab)[width];
    short (*lix)[8];
    char (*homo)[width][2];
    lab  = (union hvrgbpix (*)[width])(buffer + 16*width*TS);
    homo = (char  (*)[width][2])(buffer + 32*width*TS);

    const int left=2;

    if ((uintptr_t)(image+top*width)&0xf || (uintptr_t)buffer&0xf) {
        fprintf(stderr, "unaligned buffers defeat speed!\n"); abort();
    }

    /*  Interpolate green horz&vert, red and blue, and convert to CIELab:  */
    //do the first two rows of green first.
    //then one green, and rgb through the tile.. this because R/B needs down-right green value
    for (row=top; row < top+2 && row < height-2; row++) {
        col = left + (FC(row,left) & 1);
        for (c = FC(row,col); col < width-2; col+=2) {
            pix = (union rgbpix*)image + row*width+col;
            val = ((pix[-1].g + pix[0].c[c] + pix[1].g) * 2 - pix[-2].c[c] - pix[2].c[c]) >> 2;
            rgb[row-top][col-left].h.g = ULIM(val,pix[-1].g,pix[1].g);
            val = ((pix[-width].g + pix[0].c[c] + pix[width].g) * 2 - pix[-2*width].c[c] - pix[2*width].c[c]) >> 2;
            rgb[row-top][col-left].v.g = ULIM(val,pix[-width].g,pix[width].g);
        }
    }

    for (; row < top+TS && row < height-2; row++) {
        int rowx = row-1;

        if (FC(rowx,left+1)==1) {
            int c1 = FC(rowx+1,left+1),
                c2 = FC(rowx,left+2);

            pix = (union rgbpix*)image + row*width+left+1;
            rix = &rgb[row-top][1];

            val = ((pix[-1].g + pix[0].c[c1] + pix[1].g) * 2 - pix[-2].c[c1] - pix[2].c[c1]) >> 2;
            rix[0].h.g = ULIM(val,pix[-1].g,pix[1].g);
            val = ((pix[-width].g + pix[0].c[c1] + pix[width].g) * 2 - pix[-2*width].c[c1] - pix[2*width].c[c1]) >> 2;
            rix[0].v.g = ULIM(val,pix[-width].g,pix[width].g);
            for (col=left+1; col < width-3; col+=2) {
                pix = (union rgbpix*)image + rowx*width+col+1;

                union hvrgbpix rixr, rix0;

                rix = &rgb[rowx-top][col-left]+1;

                signed pix_diag = pix[-width-1].c[c1] + pix[-width+1].c[c1];
                signed pix_ul = pix[-width-1].c[c1];
                rixr.vec = _mm_set1_epi16(pix[-1].g);
                signed pix_lr = pix[-2].c[c2] + pix[0].c[c2];
                rix0.h.c[c2] = rix0.v.c[c2]  = pix[0].c[c2];
                pix_diag += pix[width-1].c[c1] + pix[width+1].c[c1] + 1;
                signed pix_dl = pix[width-1].c[c1];

                //fully loaded
                __m128i rix_dr =               _mm_setr_epi32(pix[width].g,       pix[width-1].c[c1], pix[1].g, pix[-width+1].c[c1]);
                rix_dr = _mm_add_epi32(rix_dr,_mm_setr_epi32(pix[width+1].c[c1],  pix[width+3].c[c1], pix[width+1].c[c1], 0));
                rix_dr = _mm_add_epi32(rix_dr,_mm_setr_epi32(pix[width+2].g,      0,                  pix[2*width+1].g, pix[3*width+1].c[c1]));
                rix_dr = _mm_mullo_epi32(rix_dr,_mm_setr_epi32(2,1,2,1));
                //half loaded
                rix_dr = _mm_hsub_epi32(rix_dr,_mm_setzero_si128());
                rix_dr = _mm_srai_epi32(rix_dr,2);
                __m128i a = _mm_setr_epi32(pix[width].g,pix[1].g,0,0);
                __m128i b = _mm_setr_epi32(pix[width+2].g,pix[2*width+1].g,0,0);
                __m128i m = _mm_min_epi32(a,b);
                __m128i M = _mm_max_epi32(a,b);
                rix_dr = _mm_min_epi32(rix_dr,M);
                rix_dr = _mm_max_epi32(rix_dr,m);

                signed pix_udr = pix_ul + pix_dl;

                signed rix0_ul = rix[-width-1].h.g;
                signed rix1_ul = rix[-width-1].v.g;
                __m128i rix_ur = _mm_setr_epi32(rix[-width+1].h.g, rix[-width+1].v.g, 0, 0);
                signed rix0_rr = rix[-2].h.g;
                signed rix1_rr = rix[-2].v.g;

                rix0.h.g = rix[0].h.g;
                rix0.v.g = rix[0].v.g;
                signed rix0_dl = rix[width-1].h.g;
                signed rix1_dl = rix[width-1].v.g;

                // fully loaded
                __m128i rix_udr = _mm_setr_epi32(rix0_ul, rix1_ul, rix0_rr, rix1_rr);
                rix_udr = _mm_add_epi32(rix_udr, _mm_setr_epi32(rix0_dl, rix1_dl, rix0.h.g, rix0.v.g));
                __m128i v2 = _mm_set_epi32(pix_lr, pix_lr, pix_udr, pix_udr);
                v2 = _mm_sub_epi32(v2, rix_udr);
                v2 = _mm_srai_epi32(v2,1);
                v2 = _mm_add_epi32(v2,_mm_cvtepu16_epi32(rixr.vec));
                v2 = _mm_max_epi32(v2, _mm_setzero_si128());
                v2 = _mm_min_epi32(v2, _mm_set1_epi32(0xffff));
                rixr.h.c[c2] = _mm_extract_epi32(v2,2);
                rixr.v.c[c2] = _mm_extract_epi32(v2,3);
                rixr.h.c[c1] = _mm_extract_epi32(v2,0);
                rixr.v.c[c1] = _mm_extract_epi32(v2,1);

                // following only uses 64 bit
                __m128i v1 = _mm_set1_epi32(pix_diag);
                v1 = _mm_sub_epi32(v1, rix_ur);
                v1 = _mm_sub_epi32(v1, rix_dr);
                v1 = _mm_sub_epi32(v1, rix_udr);
                v1 = _mm_srai_epi32(v1,2);
                v1 = _mm_add_epi32(v1, _mm_setr_epi32(rix0.h.g, rix0.v.g, 0, 0));
                v1 = _mm_max_epi32(v1, _mm_setzero_si128());
                v1 = _mm_min_epi32(v1, _mm_set1_epi32(0xffff));
                rix0.h.c[c1] = _mm_extract_epi32(v1,0);
                rix0.v.c[c1] = _mm_extract_epi32(v1,1);


                lab[rowx-top][col-left].vec = cielabv(rixr);
                lab[rowx-top][col-left+1].vec = cielabv(rix0);

                _mm_store_si128(&rix[-1].vec,rixr.vec);
                _mm_store_si128(&rix[0].vec,rix0.vec);

                rix[width+1].h.g = _mm_extract_epi32(rix_dr,0);
                rix[width+1].v.g = _mm_extract_epi32(rix_dr,1);
            }
        } else {