Example no. 1
static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
  uint8_t levels[16], ctxs[16];
  uint16_t abs_levels[16];
  int n = res->first;
  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
  const int p0 = res->prob[n][ctx0][0];
  CostArrayPtr const costs = res->costs;
  const uint16_t* t = costs[n][ctx0];
  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
  // be missing during the loop.
  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;

  if (res->last < 0) {
    return VP8BitCost(0, p0);
  }

  {   // precompute clamped levels and contexts, packed to 8b.
    const __m128i zero = _mm_setzero_si128();
    const __m128i kCst2 = _mm_set1_epi8(2);
    const __m128i kCst67 = _mm_set1_epi8(MAX_VARIABLE_LEVEL);
    const __m128i c0 = _mm_loadu_si128((const __m128i*)&res->coeffs[0]);
    const __m128i c1 = _mm_loadu_si128((const __m128i*)&res->coeffs[8]);
    const __m128i D0 = _mm_sub_epi16(zero, c0);
    const __m128i D1 = _mm_sub_epi16(zero, c1);
    const __m128i E0 = _mm_max_epi16(c0, D0);   // abs(v), 16b
    const __m128i E1 = _mm_max_epi16(c1, D1);
    const __m128i F = _mm_packs_epi16(E0, E1);
    const __m128i G = _mm_min_epu8(F, kCst2);    // context = 0,1,2
    const __m128i H = _mm_min_epu8(F, kCst67);   // clamp_level in [0..67]

    _mm_storeu_si128((__m128i*)&ctxs[0], G);
    _mm_storeu_si128((__m128i*)&levels[0], H);

    _mm_storeu_si128((__m128i*)&abs_levels[0], E0);
    _mm_storeu_si128((__m128i*)&abs_levels[8], E1);
  }
  for (; n < res->last; ++n) {
    const int ctx = ctxs[n];
    const int level = levels[n];
    const int flevel = abs_levels[n];   // full level
    cost += VP8LevelFixedCosts[flevel] + t[level];  // simplified VP8LevelCost()
    t = costs[n + 1][ctx];
  }
  // Last coefficient is always non-zero
  {
    const int level = levels[n];
    const int flevel = abs_levels[n];
    assert(flevel != 0);
    cost += VP8LevelFixedCosts[flevel] + t[level];
    if (n < 15) {
      const int b = VP8EncBands[n + 1];
      const int ctx = ctxs[n];
      const int last_p0 = res->prob[b][ctx][0];
      cost += VP8BitCost(0, last_p0);
    }
  }
  return cost;
}
Example no. 2
static void
clamphigh_u8_sse (uint8_t *dest, const uint8_t *src1, int n,
    const uint8_t *src2_1)
{
  __m128i xmm1;
  uint8_t max = *src2_1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    uint8_t x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
  xmm1 = _mm_set1_epi8(max);
  for (; n >= 16; n -= 16) {
    __m128i xmm0;
    xmm0 = _mm_loadu_si128((__m128i *)src1);
    xmm0 = _mm_min_epu8(xmm0, xmm1);
    _mm_store_si128((__m128i *)dest, xmm0);
    dest += 16;
    src1 += 16;
  }
  for (; n > 0; n--) {
    uint8_t x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
Example no. 3
// Denoise a 16x1 vector with a weaker filter.
static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const __m128i k_0, const __m128i k_delta, __m128i acc_diff) {
  __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
  // Calculate differences.
  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
  const __m128i v_mc_running_avg_y =
      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
  // Obtain the sign. FF if diff is negative.
  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
  // Clamp absolute difference to delta to get the adjustment.
  const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
  // Restore the sign and get positive and negative adjustments.
  __m128i padj, nadj;
  padj = _mm_andnot_si128(diff_sign, adj);
  nadj = _mm_and_si128(diff_sign, adj);
  // Calculate filtered value.
  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
  v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

  // Accumulate the adjustments.
  acc_diff = _mm_subs_epi8(acc_diff, padj);
  acc_diff = _mm_adds_epi8(acc_diff, nadj);
  return acc_diff;
}
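The denoiser above builds |a - b| for unsigned bytes by OR-ing the two saturating differences, since at most one of them is non-zero. A minimal standalone sketch of that idiom, assuming plain SSE2 (the helper name abs_diff_u8 is illustrative, not from the source):

#include <emmintrin.h>

/* Per-lane |a - b| for unsigned bytes: at most one of the two
 * saturating differences is non-zero, so OR-ing them gives the
 * absolute difference. */
static inline __m128i abs_diff_u8(__m128i a, __m128i b)
{
    return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}

Examples no. 13 and no. 18 below additionally clamp this result with _mm_min_epu8 so that the later signed _mm_cmpgt_epi8 comparisons cannot misread large differences as negative values.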
Example no. 4
// `type` (kDilate / kErode) and `direction` (kX / kY) are compile-time
// template parameters of this function in the Skia source; the snippet shows
// the body for one instantiation.
static void SkMorph_SSE2(const SkPMColor* src, SkPMColor* dst, int radius,
                         int width, int height, int srcStride, int dstStride)
{
    const int srcStrideX = direction == kX ? 1 : srcStride;
    const int dstStrideX = direction == kX ? 1 : dstStride;
    const int srcStrideY = direction == kX ? srcStride : 1;
    const int dstStrideY = direction == kX ? dstStride : 1;
    radius = SkMin32(radius, width - 1);
    const SkPMColor* upperSrc = src + radius * srcStrideX;
    for (int x = 0; x < width; ++x) {
        const SkPMColor* lp = src;
        const SkPMColor* up = upperSrc;
        SkPMColor* dptr = dst;
        for (int y = 0; y < height; ++y) {
            __m128i max = type == kDilate ? _mm_setzero_si128() : _mm_set1_epi32(0xFFFFFFFF);
            for (const SkPMColor* p = lp; p <= up; p += srcStrideX) {
                __m128i src_pixel = _mm_cvtsi32_si128(*p);
                max = type == kDilate ? _mm_max_epu8(src_pixel, max) : _mm_min_epu8(src_pixel, max);
            }
            *dptr = _mm_cvtsi128_si32(max);
            dptr += dstStrideY;
            lp += srcStrideY;
            up += srcStrideY;
        }
        if (x >= radius) {
            src += srcStrideX;
        }
        if (x + radius < width - 1) {
            upperSrc += srcStrideX;
        }
        dst += dstStrideX;
    }
}
Example no. 5
__m128i test_mm_min_epu8(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_min_epu8
  // DAG: call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
  //
  // ASM-LABEL: test_mm_min_epu8
  // ASM: pminub
  return _mm_min_epu8(A, B);
}
Example no. 6
mlib_status
__mlib_VectorConvert_S8_U8_Sat(
	mlib_s8 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	if (n < 1)
		return (MLIB_FAILURE);

	mlib_s32 i, ax, az, nstep, n1, n2, n3, xval;
	mlib_u8 *px = (mlib_u8 *)x;
	mlib_s8 *pz = (mlib_s8 *)z;
	__m128i zbuf, xbuf, mask;
	mask = _mm_set1_epi8(127);

	ax = (mlib_addr)x & 15;
	az = (mlib_addr)z & 15;

	nstep = 16 / sizeof (mlib_u8);
	n1 = ((16 - ax) & 15) / sizeof (mlib_u8);
	n2 = (n - n1) / nstep;
	n3 = n - n1 - n2 * nstep;

	if (n2 < 1) {
		for (i = 0; i < n; i++) {
			xval = *px++;
			if (xval > 127)
				xval = 127;
			*pz++ = xval;
		}
	} else {
		for (i = 0; i < n1; i++) {
			xval = *px++;
			if (xval > 127)
				xval = 127;
			*pz++ = xval;
		}

		for (i = 0; i < n2; i++) {
			xbuf = _mm_load_si128((__m128i *)px);
			zbuf = _mm_min_epu8(xbuf, mask);
			_mm_storeu_si128((__m128i *)pz, zbuf);
			px += nstep;
			pz += nstep;
		}

		for (i = 0; i < n3; i++) {
			xval = *px++;
			if (xval > 127)
				xval = 127;
			*pz++ = xval;
		}
	}

	return (MLIB_SUCCESS);
}
Example no. 7
void FREAK::extractDescriptor(uchar *pointsValue, void ** ptr) const
{
    __m128i** ptrSSE = (__m128i**) ptr;

    // note that the comparison order is modified in each block (but the first 128 comparisons remain globally the same --> this does not affect the 128,384 bits segmented matching strategy)
    int cnt = 0;
    for( int n = FREAK_NB_PAIRS/128; n-- ; )
    {
        __m128i result128 = _mm_setzero_si128();
        for( int m = 128/16; m--; cnt += 16 )
        {
            __m128i operand1 = _mm_set_epi8(pointsValue[descriptionPairs[cnt+0].i],
                                            pointsValue[descriptionPairs[cnt+1].i],
                                            pointsValue[descriptionPairs[cnt+2].i],
                                            pointsValue[descriptionPairs[cnt+3].i],
                                            pointsValue[descriptionPairs[cnt+4].i],
                                            pointsValue[descriptionPairs[cnt+5].i],
                                            pointsValue[descriptionPairs[cnt+6].i],
                                            pointsValue[descriptionPairs[cnt+7].i],
                                            pointsValue[descriptionPairs[cnt+8].i],
                                            pointsValue[descriptionPairs[cnt+9].i],
                                            pointsValue[descriptionPairs[cnt+10].i],
                                            pointsValue[descriptionPairs[cnt+11].i],
                                            pointsValue[descriptionPairs[cnt+12].i],
                                            pointsValue[descriptionPairs[cnt+13].i],
                                            pointsValue[descriptionPairs[cnt+14].i],
                                            pointsValue[descriptionPairs[cnt+15].i]);

            __m128i operand2 = _mm_set_epi8(pointsValue[descriptionPairs[cnt+0].j],
                                            pointsValue[descriptionPairs[cnt+1].j],
                                            pointsValue[descriptionPairs[cnt+2].j],
                                            pointsValue[descriptionPairs[cnt+3].j],
                                            pointsValue[descriptionPairs[cnt+4].j],
                                            pointsValue[descriptionPairs[cnt+5].j],
                                            pointsValue[descriptionPairs[cnt+6].j],
                                            pointsValue[descriptionPairs[cnt+7].j],
                                            pointsValue[descriptionPairs[cnt+8].j],
                                            pointsValue[descriptionPairs[cnt+9].j],
                                            pointsValue[descriptionPairs[cnt+10].j],
                                            pointsValue[descriptionPairs[cnt+11].j],
                                            pointsValue[descriptionPairs[cnt+12].j],
                                            pointsValue[descriptionPairs[cnt+13].j],
                                            pointsValue[descriptionPairs[cnt+14].j],
                                            pointsValue[descriptionPairs[cnt+15].j]);

            __m128i workReg = _mm_min_epu8(operand1, operand2); // emulated "not less than" for 8-bit UNSIGNED integers
            workReg = _mm_cmpeq_epi8(workReg, operand2);        // emulated "not less than" for 8-bit UNSIGNED integers

            workReg = _mm_and_si128(_mm_set1_epi16(short(0x8080 >> m)), workReg); // merge the last 16 bits with the 128bits std::vector until full
            result128 = _mm_or_si128(result128, workReg);
        }
        (**ptrSSE) = result128;
        ++(*ptrSSE);
    }
    (*ptrSSE) -= 8;
}
Example no. 8
    SIMDValue SIMDUint8x16Operation::OpMin(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

        x86Result.m128i_value = _mm_min_epu8(tmpaValue.m128i_value, tmpbValue.m128i_value);

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
Example no. 9
__m64 _m_pminub(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;

    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_min_epu8(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
Example no. 10
// `PixelType` is a compile-time template parameter in the enclosing source.
// SSE2 has no _mm_min_epu16, so the 16-bit branch biases both operands by
// 32768 to turn the unsigned ordering into the signed ordering expected by
// _mm_min_epi16.
static FORCE_INLINE __m128i mm_min_epu(const __m128i &a, const __m128i &b) {
    if (sizeof(PixelType) == 1)
        return _mm_min_epu8(a, b);
    else {
        __m128i word_32768 = _mm_set1_epi16(32768);

        __m128i a_minus = _mm_sub_epi16(a, word_32768);
        __m128i b_minus = _mm_sub_epi16(b, word_32768);

        return _mm_add_epi16(_mm_min_epi16(a_minus, b_minus), word_32768);
    }
}
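SSE2 lacks an unsigned 16-bit minimum, hence the bias by 32768 above, which maps unsigned order onto signed order for _mm_min_epi16. A small standalone sketch of the same trick next to the SSE4.1 instruction that replaces it; the helper names are illustrative, and the SSE4.1 variant is an assumption about the target, not part of the original SSE2-only code:

#include <emmintrin.h>   /* SSE2 */
#include <smmintrin.h>   /* SSE4.1: _mm_min_epu16 */

/* Unsigned 16-bit minimum on plain SSE2: biasing both operands by
 * 0x8000 turns the unsigned ordering into the signed ordering. */
static inline __m128i min_epu16_sse2(__m128i a, __m128i b)
{
    const __m128i bias = _mm_set1_epi16((short)0x8000);
    __m128i m = _mm_min_epi16(_mm_sub_epi16(a, bias), _mm_sub_epi16(b, bias));
    return _mm_add_epi16(m, bias);
}

/* With SSE4.1 the dedicated instruction does the same in one step. */
static inline __m128i min_epu16_sse41(__m128i a, __m128i b)
{
    return _mm_min_epu16(a, b);
}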
Example no. 11
void
exponent_min_sse2(uint8_t *expTarget, uint8_t *exp, uint8_t *exp1, int n)
{
    int i;

    for (i = 0; i < (n & ~15); i += 16) {
        __m128i vexp = _mm_loadu_si128((__m128i*)&exp[i]);
        __m128i vexp1 = _mm_loadu_si128((__m128i*)&exp1[i]);
        vexp = _mm_min_epu8(vexp, vexp1);
        _mm_storeu_si128 ((__m128i*)&expTarget[i], vexp);
    }
    for (; i < n; ++i)
        expTarget[i] = MIN(exp[i], exp1[i]);
}
Example no. 12
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
               uint8_t *dstp, const uint8_t *srcp, int th)
{
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *orig = p0, *end = p2;

    line_copy8(p0, srcp + stride, width, 1);
    line_copy8(p1, srcp, width, 1);

    uint8_t threshold = (uint8_t)th;

    __m128i zero = _mm_setzero_si128();
    __m128i xth = _mm_set1_epi8((int8_t)threshold);

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 1 ? 1 : -1);
        line_copy8(p2, srcp, width, 1);
        uint8_t *coordinates[] = COORDINATES;
        for (int x = 0; x < width; x += 16) {
            __m128i sumlo = zero;
            __m128i sumhi = zero;

            for (int i = 0; i < 8; i++) {
                __m128i target = _mm_loadu_si128((__m128i *)(coordinates[i] + x));
                sumlo  = _mm_add_epi16(sumlo, _mm_unpacklo_epi8(target, zero));
                sumhi  = _mm_add_epi16(sumhi, _mm_unpackhi_epi8(target, zero));
            }

            sumlo = _mm_srai_epi16(sumlo, 3);
            sumhi = _mm_srai_epi16(sumhi, 3);
            sumlo = _mm_packus_epi16(sumlo, sumhi);

            __m128i src = _mm_load_si128((__m128i *)(p1 + x));
            __m128i limit = _mm_adds_epu8(src, xth);

            sumlo = _mm_max_epu8(sumlo, src);
            sumlo = _mm_min_epu8(sumlo, limit);

            _mm_store_si128((__m128i *)(dstp + x), sumlo);
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = (p2 == end) ? orig : p2 + bstride;
    }
}
Example no. 13
// Denoise a 16x1 vector.
static INLINE __m128i vp9_denoiser_16x1_sse2(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const __m128i *k_0, const __m128i *k_4, const __m128i *k_8,
    const __m128i *k_16, const __m128i *l3, const __m128i *l32,
    const __m128i *l21, __m128i acc_diff) {
  // Calculate differences
  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
  const __m128i v_mc_running_avg_y =
      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
  __m128i v_running_avg_y;
  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
  // Obtain the sign. FF if diff is negative.
  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
  // Clamp absolute difference to 16 to be used to get mask. Doing this
  // allows us to use _mm_cmpgt_epi8, which operates on signed byte.
  const __m128i clamped_absdiff =
      _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
  // Get masks for l2 l1 and l0 adjustments.
  const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
  const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
  const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
  // Get adjustments for l2, l1, and l0.
  __m128i adj2 = _mm_and_si128(mask2, *l32);
  const __m128i adj1 = _mm_and_si128(mask1, *l21);
  const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
  __m128i adj, padj, nadj;

  // Combine the adjustments and get absolute adjustments.
  adj2 = _mm_add_epi8(adj2, adj1);
  adj = _mm_sub_epi8(*l3, adj2);
  adj = _mm_andnot_si128(mask0, adj);
  adj = _mm_or_si128(adj, adj0);

  // Restore the sign and get positive and negative adjustments.
  padj = _mm_andnot_si128(diff_sign, adj);
  nadj = _mm_and_si128(diff_sign, adj);

  // Calculate filtered value.
  v_running_avg_y = _mm_adds_epu8(v_sig, padj);
  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

  // Adjustments <=7, and each element in acc_diff can fit in signed
  // char.
  acc_diff = _mm_adds_epi8(acc_diff, padj);
  acc_diff = _mm_subs_epi8(acc_diff, nadj);
  return acc_diff;
}
Example no. 14
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
               uint8_t *dstp, const uint8_t *srcp, int th, int *enable)
{
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *orig = p0, *end = p2;
    uint8_t threshold = th > 255 ? 255 : (uint8_t)th;

    line_copy8(p0, srcp, width, 1);
    line_copy8(p1, srcp, width, 1);

    __m128i xth = _mm_set1_epi8((int8_t)threshold);

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 1 ? 1 : -1);
        line_copy8(p2, srcp, width, 1);
        uint8_t *coordinates[] = {p0 - 1, p0, p0 + 1,
                                  p1 - 1,     p1 + 1,
                                  p2 - 1, p2, p2 + 1};
        for (int x = 0; x < width; x += 16) {
            __m128i src = _mm_load_si128((__m128i *)(p1 + x));
            __m128i min = src;

            for (int i = 0; i < 8; i++) {
                if (enable[i]) {
                    __m128i target = _mm_loadu_si128((__m128i *)(coordinates[i] + x));
                    min = _mm_min_epu8(target, min);
                }
            }

            __m128i limit = _mm_subs_epu8(src, xth);
            min = _mm_max_epu8(min, limit);
            _mm_store_si128((__m128i *)(dstp + x), min);
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = (p2 == end) ? orig : p2 + bstride;
    }
}
Example no. 15
/// any (*p > 3) is set to be 3
COREARRAY_DLL_DEFAULT void vec_u8_geno_valid(C_UInt8 *p, size_t n)
{
#if defined(COREARRAY_SIMD_SSE2)

	// header 1, 16-byte aligned
	size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p++)
		if (*p > 3) *p = 3;

	const __m128i zero  = _mm_setzero_si128();
	const __m128i three = _mm_set1_epi8(3);
	for (; n >= 16; n-=16, p+=16)
	{
		__m128i v = _mm_load_si128((__m128i*)p);
		__m128i mask = _mm_or_si128(_mm_cmplt_epi8(v, zero),
			_mm_cmplt_epi8(three, v));
		if (_mm_movemask_epi8(mask) > 0)
			_mm_store_si128((__m128i*)p, _mm_min_epu8(v, three));
	}

#endif

	for (; n > 0; n--, p++) if (*p > 3) *p = 3;
}
Example no. 16
// count genotype sum and number of calls, not requiring 16-aligned p
COREARRAY_DLL_DEFAULT C_UInt8* vec_u8_geno_count(C_UInt8 *p,
	size_t n, C_Int32 &out_sum, C_Int32 &out_num)
{
	C_Int32 sum=0, num=0;

#if defined(COREARRAY_SIMD_AVX2)

	const __m256i three = _mm256_set1_epi8(3);
	const __m256i zero = _mm256_setzero_si256();
	__m256i sum32 = zero, num32 = zero;
	size_t limit_by_U8 = 0;

	for (; n >= 32; )
	{
		__m256i v = _mm256_loadu_si256((__m256i const*)p);
		p += 32;
		__m256i m = _mm256_cmpgt_epi8(three, _mm256_min_epu8(v, three));
		sum32 = _mm256_add_epi8(sum32, _mm256_and_si256(v, m));
		num32 = _mm256_sub_epi8(num32, m);
		n -= 32;
		limit_by_U8 ++;
		if ((limit_by_U8 >= 127) || (n < 32))
		{
			// add to sum
			sum32 = _mm256_sad_epu8(sum32, zero);
			sum32 = _mm256_add_epi32(sum32,
				_mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(1,0,3,2)));
			sum32 = _mm256_add_epi32(sum32,
				_mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(0,0,0,1)));
			sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(sum32));
			// add to num
			num32 = _mm256_sad_epu8(num32, zero);
			num32 = _mm256_add_epi32(num32,
				_mm256_permute4x64_epi64(num32, _MM_SHUFFLE(1,0,3,2)));
			num32 = _mm256_add_epi32(num32,
				_mm256_permute4x64_epi64(num32, _MM_SHUFFLE(0,0,0,1)));
			num += _mm_cvtsi128_si32(_mm256_castsi256_si128(num32));
			// reset
			sum32 = num32 = zero;
			limit_by_U8 = 0;
		}
	}

#elif defined(COREARRAY_SIMD_SSE2)

	// header, 16-byte aligned
	size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p++)
		if (*p <= 2) { sum += *p; num++; }

	const __m128i three = _mm_set1_epi8(3);
	const __m128i zero = _mm_setzero_si128();
	__m128i sum16=zero, num16=zero;
	size_t limit_by_U8 = 0;

	for (; n >= 16; )
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		p += 16;
		__m128i m = _mm_cmpgt_epi8(three, _mm_min_epu8(v, three));
		sum16 = _mm_add_epi8(sum16, v & m);
		num16 = _mm_sub_epi8(num16, m);
		n -= 16;
		limit_by_U8 ++;
		if ((limit_by_U8 >= 127) || (n < 16))
		{
			// add to sum
			sum16 = _mm_sad_epu8(sum16, zero);
			sum += _mm_cvtsi128_si32(sum16);
			sum += _mm_cvtsi128_si32(_mm_shuffle_epi32(sum16, 2));
			// add to num
			num16 = _mm_sad_epu8(num16, zero);
			num += _mm_cvtsi128_si32(num16);
			num += _mm_cvtsi128_si32(_mm_shuffle_epi32(num16, 2));
			// reset
			sum16 = num16 = zero;
			limit_by_U8 = 0;
		}
	}

#endif

	for (; n > 0; n--, p++)
		if (*p <= 2) { sum += *p; num++; }
	out_sum = sum;
	out_num = num;
	return p;
}
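The SSE2 branch above folds the per-byte accumulators with _mm_sad_epu8 against zero: the sum of absolute differences with zero is simply the sum of the bytes in each 8-byte half, left in the low 16 bits of each 64-bit lane. A minimal standalone sketch of that horizontal byte sum (the helper name hsum_u8 is illustrative):

#include <emmintrin.h>
#include <stdint.h>

/* Sum of all 16 unsigned bytes of v. */
static inline uint32_t hsum_u8(__m128i v)
{
    const __m128i sad = _mm_sad_epu8(v, _mm_setzero_si128());
    return (uint32_t)_mm_cvtsi128_si32(sad)
         + (uint32_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(sad, 2));
}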
Example no. 17
 template <> SIMD_INLINE __m128i OperationBinary8u<SimdOperationBinary8uMinimum>(const __m128i & a, const __m128i & b)
 {
     return _mm_min_epu8(a, b);
 }
Example no. 18
int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
                             int mc_avg_y_stride, unsigned char *running_avg_y,
                             int avg_y_stride, unsigned char *sig,
                             int sig_stride, unsigned int motion_magnitude,
                             int increase_denoising) {
  unsigned char *running_avg_y_start = running_avg_y;
  unsigned char *sig_start = sig;
  unsigned int sum_diff_thresh;
  int r;
  int shift_inc =
      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
          ? 1
          : 0;
  __m128i acc_diff = _mm_setzero_si128();
  const __m128i k_0 = _mm_setzero_si128();
  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
  const __m128i k_8 = _mm_set1_epi8(8);
  const __m128i k_16 = _mm_set1_epi8(16);
  /* Modify each level's adjustment according to motion_magnitude. */
  const __m128i l3 = _mm_set1_epi8(
      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
  /* Difference between level 3 and level 2 is 2. */
  const __m128i l32 = _mm_set1_epi8(2);
  /* Difference between level 2 and level 1 is 1. */
  const __m128i l21 = _mm_set1_epi8(1);

  for (r = 0; r < 16; ++r) {
    /* Calculate differences */
    const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
    const __m128i v_mc_running_avg_y =
        _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0]));
    __m128i v_running_avg_y;
    const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
    const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
    /* Obtain the sign. FF if diff is negative. */
    const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
    /* Clamp absolute difference to 16 to be used to get mask. Doing this
     * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */
    const __m128i clamped_absdiff =
        _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_16);
    /* Get masks for l2 l1 and l0 adjustments */
    const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
    const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
    const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
    /* Get adjustments for l2, l1, and l0 */
    __m128i adj2 = _mm_and_si128(mask2, l32);
    const __m128i adj1 = _mm_and_si128(mask1, l21);
    const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
    __m128i adj, padj, nadj;

    /* Combine the adjustments and get absolute adjustments. */
    adj2 = _mm_add_epi8(adj2, adj1);
    adj = _mm_sub_epi8(l3, adj2);
    adj = _mm_andnot_si128(mask0, adj);
    adj = _mm_or_si128(adj, adj0);

    /* Restore the sign and get positive and negative adjustments. */
    padj = _mm_andnot_si128(diff_sign, adj);
    nadj = _mm_and_si128(diff_sign, adj);

    /* Calculate filtered value. */
    v_running_avg_y = _mm_adds_epu8(v_sig, padj);
    v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
    _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

    /* Adjustments <=7, and each element in acc_diff can fit in signed
     * char.
     */
    acc_diff = _mm_adds_epi8(acc_diff, padj);
    acc_diff = _mm_subs_epi8(acc_diff, nadj);

    /* Update pointers for next iteration. */
    sig += sig_stride;
    mc_running_avg_y += mc_avg_y_stride;
    running_avg_y += avg_y_stride;
  }

  {
    /* Compute the sum of all pixel differences of this MB. */
    unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
    sum_diff_thresh = SUM_DIFF_THRESHOLD;
    if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
    if (abs_sum_diff > sum_diff_thresh) {
      // Before returning to copy the block (i.e., apply no denoising),
      // check if we can still apply some (weaker) temporal filtering to
      // this block, that would otherwise not be denoised at all. Simplest
      // is to apply an additional adjustment to running_avg_y to bring it
      // closer to sig. The adjustment is capped by a maximum delta, and
      // chosen such that in most cases the resulting sum_diff will be
      // within the acceptable range given by sum_diff_thresh.

      // The delta is set by the excess of absolute pixel diff over the
      // threshold.
      int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
      // Only apply the adjustment for max delta up to 3.
      if (delta < 4) {
        const __m128i k_delta = _mm_set1_epi8(delta);
        sig -= sig_stride * 16;
        mc_running_avg_y -= mc_avg_y_stride * 16;
        running_avg_y -= avg_y_stride * 16;
        for (r = 0; r < 16; ++r) {
          __m128i v_running_avg_y =
              _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
          // Calculate differences.
          const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
          const __m128i v_mc_running_avg_y =
              _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0]));
          const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
          const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
          // Obtain the sign. FF if diff is negative.
          const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
          // Clamp absolute difference to delta to get the adjustment.
          const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
          // Restore the sign and get positive and negative adjustments.
          __m128i padj, nadj;
          padj = _mm_andnot_si128(diff_sign, adj);
          nadj = _mm_and_si128(diff_sign, adj);
          // Calculate filtered value.
          v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
          v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
          _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

          // Accumulate the adjustments.
          acc_diff = _mm_subs_epi8(acc_diff, padj);
          acc_diff = _mm_adds_epi8(acc_diff, nadj);

          // Update pointers for next iteration.
          sig += sig_stride;
          mc_running_avg_y += mc_avg_y_stride;
          running_avg_y += avg_y_stride;
        }
        abs_sum_diff = abs_sum_diff_16x1(acc_diff);
        if (abs_sum_diff > sum_diff_thresh) {
          return COPY_BLOCK;
        }
      } else {
        return COPY_BLOCK;
      }
    }
  }

  /* Copy the denoised running average back over the source block. */
  vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
  return FILTER_BLOCK;
}
Example no. 19
void GetMinMaxColors_Intrinsics( const byte *colorBlock, byte *minColor, byte *maxColor )
{
    __m128i t0, t1, t3, t4, t6, t7;

    // get bounding box
    // ----------------

    // load the first row
    t0 = _mm_load_si128 ( (__m128i*) colorBlock );
    t1 = _mm_load_si128 ( (__m128i*) colorBlock );

    __m128i t16 = _mm_load_si128 ( (__m128i*) (colorBlock+16) );
    // Minimum of Packed Unsigned Byte Integers
    t0 = _mm_min_epu8 ( t0, t16);
    // Maximum of Packed Unsigned Byte Integers
    t1 = _mm_max_epu8 ( t1, t16);

    __m128i t32 = _mm_load_si128 ( (__m128i*) (colorBlock+32) );
    t0 = _mm_min_epu8 ( t0, t32);
    t1 = _mm_max_epu8 ( t1, t32);

    __m128i t48 = _mm_load_si128 ( (__m128i*) (colorBlock+48) );
    t0 = _mm_min_epu8 ( t0, t48);
    t1 = _mm_max_epu8 ( t1, t48);

    // Shuffle Packed Doublewords
    t3 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) );
    t4 = _mm_shuffle_epi32( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) );

    t0 = _mm_min_epu8 ( t0, t3);
    t1 = _mm_max_epu8 ( t1, t4);

    // Shuffle Packed Low Words
    t6 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) );
    t7 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) );

    t0 = _mm_min_epu8 ( t0, t6);
    t1 = _mm_max_epu8 ( t1, t7);

    // inset the bounding box
    // ----------------------

    // Unpack Low Data
    //__m128i t66 = _mm_set1_epi8( 0 );
    __m128i t66 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_0 );
    t0 = _mm_unpacklo_epi8(t0, t66);
    t1 = _mm_unpacklo_epi8(t1, t66);

    // copy (movdqa)
    //__m128i t2 = _mm_load_si128 ( &t1 );
    __m128i t2 = t1;

    // Subtract Packed Integers
    t2 = _mm_sub_epi16(t2, t0);

    // Shift Packed Data Right Logical
    t2 = _mm_srli_epi16(t2, INSET_SHIFT);

    // Add Packed Integers
    t0 = _mm_add_epi16(t0, t2);

    t1 = _mm_sub_epi16(t1, t2);

    // Pack with Unsigned Saturation
    t0 = _mm_packus_epi16(t0, t0);
    t1 = _mm_packus_epi16(t1, t1);

    // store bounding box extents
    // --------------------------
    _mm_store_si128 ( (__m128i*) minColor, t0 );
    _mm_store_si128 ( (__m128i*) maxColor, t1 );
}
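The reduction above stops once each byte of the low doubleword holds the per-channel extreme, because whole RGBA pixels are the unit of interest here. The same halving pattern can be carried all the way down to a single scalar; a minimal sketch, assuming plain SSE2 (the helper name hmin_u8 is illustrative):

#include <emmintrin.h>
#include <stdint.h>

/* Minimum of all 16 unsigned bytes of v, halving the active width each
 * step; the zero bytes shifted in only land in lanes that are no longer
 * read when lane 0 is formed. */
static inline uint8_t hmin_u8(__m128i v)
{
    v = _mm_min_epu8(v, _mm_srli_si128(v, 8));
    v = _mm_min_epu8(v, _mm_srli_si128(v, 4));
    v = _mm_min_epu8(v, _mm_srli_si128(v, 2));
    v = _mm_min_epu8(v, _mm_srli_si128(v, 1));
    return (uint8_t)_mm_cvtsi128_si32(v);
}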
Example no. 20
int vp8_denoiser_filter_sse2(YV12_BUFFER_CONFIG *mc_running_avg,
                             YV12_BUFFER_CONFIG *running_avg,
                             MACROBLOCK *signal, unsigned int motion_magnitude,
                             int y_offset, int uv_offset)
{
    unsigned char *sig = signal->thismb;
    int sig_stride = 16;
    unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
    int mc_avg_y_stride = mc_running_avg->y_stride;
    unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
    int avg_y_stride = running_avg->y_stride;
    int r;
    (void)uv_offset;
    __m128i acc_diff = _mm_setzero_si128();
    const __m128i k_0 = _mm_setzero_si128();
    const __m128i k_4 = _mm_set1_epi8(4);
    const __m128i k_8 = _mm_set1_epi8(8);
    const __m128i k_16 = _mm_set1_epi8(16);
    /* Modify each level's adjustment according to motion_magnitude. */
    const __m128i l3 = _mm_set1_epi8(
                      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 : 6);
    /* Difference between level 3 and level 2 is 2. */
    const __m128i l32 = _mm_set1_epi8(2);
    /* Difference between level 2 and level 1 is 1. */
    const __m128i l21 = _mm_set1_epi8(1);

    for (r = 0; r < 16; ++r)
    {
        /* Calculate differences */
        const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
        const __m128i v_mc_running_avg_y = _mm_loadu_si128(
                                           (__m128i *)(&mc_running_avg_y[0]));
        __m128i v_running_avg_y;
        const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
        const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
        /* Obtain the sign. FF if diff is negative. */
        const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
        /* Clamp absolute difference to 16 to be used to get mask. Doing this
         * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */
        const __m128i clamped_absdiff = _mm_min_epu8(
                                        _mm_or_si128(pdiff, ndiff), k_16);
        /* Get masks for l2 l1 and l0 adjustments */
        const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
        const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
        const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
        /* Get adjustments for l2, l1, and l0 */
        __m128i adj2 = _mm_and_si128(mask2, l32);
        const __m128i adj1 = _mm_and_si128(mask1, l21);
        const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
        __m128i adj,  padj, nadj;

        /* Combine the adjustments and get absolute adjustments. */
        adj2 = _mm_add_epi8(adj2, adj1);
        adj = _mm_sub_epi8(l3, adj2);
        adj = _mm_andnot_si128(mask0, adj);
        adj = _mm_or_si128(adj, adj0);

        /* Restore the sign and get positive and negative adjustments. */
        padj = _mm_andnot_si128(diff_sign, adj);
        nadj = _mm_and_si128(diff_sign, adj);

        /* Calculate filtered value. */
        v_running_avg_y = _mm_adds_epu8(v_sig, padj);
        v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
        _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

        /* Adjustments <=7, and each element in acc_diff can fit in signed
         * char.
         */
        acc_diff = _mm_adds_epi8(acc_diff, padj);
        acc_diff = _mm_subs_epi8(acc_diff, nadj);

        /* Update pointers for next iteration. */
        sig += sig_stride;
        mc_running_avg_y += mc_avg_y_stride;
        running_avg_y += avg_y_stride;
    }

    {
        /* Compute the sum of all pixel differences of this MB. */
        union sum_union s;
        int sum_diff = 0;
        s.v = acc_diff;
        sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]
                 + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
                 + s.e[12] + s.e[13] + s.e[14] + s.e[15];

        if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
        {
            return COPY_BLOCK;
        }
    }

    vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride,
                      signal->thismb, sig_stride);
    return FILTER_BLOCK;
}
Example no. 21
void SGMStereo::calcPixelwiseSAD(const unsigned char* leftSobelRow, const unsigned char* rightSobelRow) {
	calcHalfPixelRight(rightSobelRow);

	for (int x = 0; x < 16; ++x) {
		int leftCenterValue = leftSobelRow[x];
		int leftHalfLeftValue = x > 0 ? (leftCenterValue + leftSobelRow[x - 1])/2 : leftCenterValue;
		int leftHalfRightValue = x < width_ - 1 ? (leftCenterValue + leftSobelRow[x + 1])/2 : leftCenterValue;
		int leftMinValue = std::min(leftHalfLeftValue, leftHalfRightValue);
		leftMinValue = std::min(leftMinValue, leftCenterValue);
		int leftMaxValue = std::max(leftHalfLeftValue, leftHalfRightValue);
		leftMaxValue = std::max(leftMaxValue, leftCenterValue);

		for (int d = 0; d <= x; ++d) {
			int rightCenterValue = rightSobelRow[width_ - 1 - x + d];
			int rightMinValue = halfPixelRightMin_[width_ - 1 - x + d];
			int rightMaxValue = halfPixelRightMax_[width_ - 1 - x + d];

			int costLtoR = std::max(0, leftCenterValue - rightMaxValue);
			costLtoR = std::max(costLtoR, rightMinValue - leftCenterValue);
			int costRtoL = std::max(0, rightCenterValue - leftMaxValue);
			costRtoL = std::max(costRtoL, leftMinValue - rightCenterValue);
			int costValue = std::min(costLtoR, costRtoL);

			pixelwiseCostRow_[disparityTotal_*x + d] = costValue;
		}
		for (int d = x + 1; d < disparityTotal_; ++d) {
			pixelwiseCostRow_[disparityTotal_*x + d] = pixelwiseCostRow_[disparityTotal_*x + d - 1];
		}
	}
	for (int x = 16; x < disparityTotal_; ++x) {
		int leftCenterValue = leftSobelRow[x];
		int leftHalfLeftValue = x > 0 ? (leftCenterValue + leftSobelRow[x - 1])/2 : leftCenterValue;
		int leftHalfRightValue = x < width_ - 1 ? (leftCenterValue + leftSobelRow[x + 1])/2 : leftCenterValue;
		int leftMinValue = std::min(leftHalfLeftValue, leftHalfRightValue);
		leftMinValue = std::min(leftMinValue, leftCenterValue);
		int leftMaxValue = std::max(leftHalfLeftValue, leftHalfRightValue);
		leftMaxValue = std::max(leftMaxValue, leftCenterValue);

		__m128i registerLeftCenterValue = _mm_set1_epi8(static_cast<char>(leftCenterValue));
		__m128i registerLeftMinValue = _mm_set1_epi8(static_cast<char>(leftMinValue));
		__m128i registerLeftMaxValue = _mm_set1_epi8(static_cast<char>(leftMaxValue));

		for (int d = 0; d < x/16; d += 16) {
			__m128i registerRightCenterValue = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rightSobelRow + width_ - 1 - x + d));
			__m128i registerRightMinValue = _mm_loadu_si128(reinterpret_cast<const __m128i*>(halfPixelRightMin_ + width_ - 1 - x + d));
			__m128i registerRightMaxValue = _mm_loadu_si128(reinterpret_cast<const __m128i*>(halfPixelRightMax_ + width_ - 1 - x + d));

			__m128i registerCostLtoR = _mm_max_epu8(_mm_subs_epu8(registerLeftCenterValue, registerRightMaxValue),
													_mm_subs_epu8(registerRightMinValue, registerLeftCenterValue));
			__m128i registerCostRtoL = _mm_max_epu8(_mm_subs_epu8(registerRightCenterValue, registerLeftMaxValue),
													_mm_subs_epu8(registerLeftMinValue, registerRightCenterValue));
			__m128i registerCost = _mm_min_epu8(registerCostLtoR, registerCostRtoL);

			_mm_store_si128(reinterpret_cast<__m128i*>(pixelwiseCostRow_ + disparityTotal_*x + d), registerCost);
		}
		for (int d = x/16; d <= x; ++d) {
			int rightCenterValue = rightSobelRow[width_ - 1 - x + d];
			int rightMinValue = halfPixelRightMin_[width_ - 1 - x + d];
			int rightMaxValue = halfPixelRightMax_[width_ - 1 - x + d];

			int costLtoR = std::max(0, leftCenterValue - rightMaxValue);
			costLtoR = std::max(costLtoR, rightMinValue - leftCenterValue);
			int costRtoL = std::max(0, rightCenterValue - leftMaxValue);
			costRtoL = std::max(costRtoL, leftMinValue - rightCenterValue);
			int costValue = std::min(costLtoR, costRtoL);

			pixelwiseCostRow_[disparityTotal_*x + d] = costValue;
		}
		for (int d = x + 1; d < disparityTotal_; ++d) {
			pixelwiseCostRow_[disparityTotal_*x + d] = pixelwiseCostRow_[disparityTotal_*x + d - 1];
		}
	}
	for (int x = disparityTotal_; x < width_; ++x) {
		int leftCenterValue = leftSobelRow[x];
		int leftHalfLeftValue = x > 0 ? (leftCenterValue + leftSobelRow[x - 1])/2 : leftCenterValue;
		int leftHalfRightValue = x < width_ - 1 ? (leftCenterValue + leftSobelRow[x + 1])/2 : leftCenterValue;
		int leftMinValue = std::min(leftHalfLeftValue, leftHalfRightValue);
		leftMinValue = std::min(leftMinValue, leftCenterValue);
		int leftMaxValue = std::max(leftHalfLeftValue, leftHalfRightValue);
		leftMaxValue = std::max(leftMaxValue, leftCenterValue);

		__m128i registerLeftCenterValue = _mm_set1_epi8(static_cast<char>(leftCenterValue));
		__m128i registerLeftMinValue = _mm_set1_epi8(static_cast<char>(leftMinValue));
		__m128i registerLeftMaxValue = _mm_set1_epi8(static_cast<char>(leftMaxValue));

		for (int d = 0; d < disparityTotal_; d += 16) {
			__m128i registerRightCenterValue = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rightSobelRow + width_ - 1 - x + d));
			__m128i registerRightMinValue = _mm_loadu_si128(reinterpret_cast<const __m128i*>(halfPixelRightMin_ + width_ - 1 - x + d));
			__m128i registerRightMaxValue = _mm_loadu_si128(reinterpret_cast<const __m128i*>(halfPixelRightMax_ + width_ - 1 - x + d));

			__m128i registerCostLtoR = _mm_max_epu8(_mm_subs_epu8(registerLeftCenterValue, registerRightMaxValue),
													_mm_subs_epu8(registerRightMinValue, registerLeftCenterValue));
			__m128i registerCostRtoL = _mm_max_epu8(_mm_subs_epu8(registerRightCenterValue, registerLeftMaxValue),
													_mm_subs_epu8(registerLeftMinValue, registerRightCenterValue));
			__m128i registerCost = _mm_min_epu8(registerCostLtoR, registerCostRtoL);

			_mm_store_si128(reinterpret_cast<__m128i*>(pixelwiseCostRow_ + disparityTotal_*x + d), registerCost);
		}
	}
}
Example no. 22
File: cgp_sse.c Project: kacer/coco
/**
 * Calculate output of given chromosome and inputs using SSE instructions
 * @param chr
 * @param inputs
 * @param outputs
 */
void cgp_get_output_sse(ga_chr_t chromosome,
    __m128i_aligned inputs[CGP_INPUTS], __m128i_aligned outputs[CGP_OUTPUTS])
{
#ifdef SSE2
    assert(CGP_OUTPUTS == 1);
    assert(CGP_ROWS == 4);
    assert(CGP_LBACK == 1);

    // previous and currently computed column
    register __m128i prev0, prev1, prev2, prev3;
    register __m128i current0, current1, current2, current3;

    // 0xFF constant
    static __m128i_aligned FF;
    FF = _mm_set1_epi8(0xFF);

    cgp_genome_t genome = (cgp_genome_t) chromosome->genome;

    /* if primary output is connected to primary input, skip evaluation

    This cannot happen - CGP does not generate circuits like that

    if (genome->outputs[0] < CGP_INPUTS) {
        int i = genome->outputs[0];
        _mm_store_si128(&outputs[0], inputs[i]);
        return;
    }
    */

#ifdef TEST_EVAL_SSE2
    for (int i = 0; i < CGP_INPUTS; i++) {
        unsigned char *_tmp = (unsigned char*) &inputs[i];
        printf("I: %2d = " UCFMT16 "\n", i, UCVAL16(0));
    }
#endif

    int offset = -CGP_ROWS;

    for (int x = 0; x < CGP_COLS; x++) {
        for (int y = 0; y < CGP_ROWS; y++) {
            int idx = cgp_node_index(x, y);
            cgp_node_t *n = &(genome->nodes[idx]);

            // skip inactive blocks
            if (!n->is_active) continue;

            register __m128i A;
            register __m128i B;
            register __m128i Y;
            register __m128i TMP;
            register __m128i mask;

            LOAD_INPUT(A, n->inputs[0]);
            LOAD_INPUT(B, n->inputs[1]);

            switch (n->function) {
                case c255:
                    Y = FF;
                    break;

                case identity:
                    Y = A;
                    break;

                case inversion:
                    Y = _mm_sub_epi8(FF, A);
                    break;

                case b_or:
                    Y = _mm_or_si128(A, B);
                    break;

                case b_not1or2:
                    // we don't have NOT instruction, we need to XOR with FF
                    Y = _mm_xor_si128(FF, A);
                    Y = _mm_or_si128(Y, B);
                    break;

                case b_and:
                    Y = _mm_and_si128(A, B);
                    break;

                case b_nand:
                    Y = _mm_and_si128(A, B);
                    Y = _mm_xor_si128(FF, Y);
                    break;

                case b_xor:
                    Y = _mm_xor_si128(A, B);
                    break;

                case rshift1:
                    // no SR instruction for 8bit data, we need to shift
                    // 16 bits and apply mask
                    // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H]
                    // SHR: [ 0 1 2 3 4 5 6 7 | 8 A B C D E F G]
                    // MSK: [ 0 1 2 3 4 5 6 7 | 0 A B C D E F G]
                    mask = _mm_set1_epi8(0x7F);
                    Y = _mm_srli_epi16(A, 1);
                    Y = _mm_and_si128(Y, mask);
                    break;

                case rshift2:
                    // similar to rshift1
                    // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H]
                    // SHR: [ 0 0 1 2 3 4 5 6 | 7 8 A B C D E F]
                    // MSK: [ 0 0 1 2 3 4 5 6 | 0 0 A B C D E F]
                    mask = _mm_set1_epi8(0x3F);
                    Y = _mm_srli_epi16(A, 2);
                    Y = _mm_and_si128(Y, mask);
                    break;

                case swap:
                    // SWAP(A, B) (((A & 0x0F) << 4) | ((B & 0x0F)))
                    // Shift A left by 4 bits
                    // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H]
                    // SHL: [ 5 6 7 8 A B C D | E F G H 0 0 0 0]
                    // MSK: [ 5 6 7 8 0 0 0 0 | E F G H 0 0 0 0]
                    mask = _mm_set1_epi8(0xF0);
                    TMP = _mm_slli_epi16(A, 4);
                    TMP = _mm_and_si128(TMP, mask);

                    // Mask B
                    // IN : [ 1 2 3 4 5 6 7 8 | A B C D E F G H]
                    // MSK: [ 0 0 0 0 5 6 7 8 | 0 0 0 0 E F G H]
                    mask = _mm_set1_epi8(0x0F);
                    Y = _mm_and_si128(B, mask);

                    // Combine
                    Y = _mm_or_si128(Y, TMP);
                    break;

                case add:
                    Y = _mm_add_epi8(A, B);
                    break;

                case add_sat:
                    Y = _mm_adds_epu8(A, B);
                    break;

                case avg:
                    // shift right first, then add, to avoid overflow
                    mask = _mm_set1_epi8(0x7F);
                    TMP = _mm_srli_epi16(A, 1);
                    TMP = _mm_and_si128(TMP, mask);

                    Y = _mm_srli_epi16(B, 1);
                    Y = _mm_and_si128(Y, mask);

                    Y = _mm_add_epi8(Y, TMP);
                    break;

                case max:
                    Y = _mm_max_epu8(A, B);
                    break;

                case min:
                    Y = _mm_min_epu8(A, B);
                    break;
            }


#ifdef TEST_EVAL_SSE2
            __m128i _tmpval = Y;
            unsigned char *_tmp = (unsigned char*) &_tmpval;
            printf("N: %2d = " UCFMT16 "\n", idx + CGP_INPUTS, UCVAL16(0));

            bool mismatch = false;
            for (int i = 1; i < 16; i++) {
                if (_tmp[i] != _tmp[0]) {
                    fprintf(stderr,
                        "Value mismatch on index %2d (%u instead of %u)\n",
                        i, _tmp[i], _tmp[0]);
                    mismatch = true;
                }
            }
            if (mismatch) {
                abort();
            }
#endif

            if (idx + CGP_INPUTS == genome->outputs[0]) {
                _mm_store_si128(&outputs[0], Y);
#ifndef TEST_EVAL_SSE2
                return;
#endif
            }

            ASSIGN_CURRENT(y, Y);

        } // end of column

        offset += CGP_ROWS;
        prev0 = current0;
        prev1 = current1;
        prev2 = current2;
        prev3 = current3;
    } // end of row

#ifdef TEST_EVAL_SSE2
    for (int i = 0; i < CGP_OUTPUTS; i++) {
        unsigned char *_tmp = (unsigned char*) &outputs[i];
        printf("O: %2d = " UCFMT16 "\n", i, UCVAL16(0));
    }
#endif


#endif
}
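The rshift1 and rshift2 cases above work around the missing SSE2 8-bit shift: the 16-bit shift pulls bits across byte boundaries, so they are masked off afterwards (0x7F for a shift of 1, 0x3F for a shift of 2). A minimal standalone sketch of the single-bit case, assuming plain SSE2 (the helper name is illustrative):

#include <emmintrin.h>

/* Logical right shift by one of each unsigned byte: shift the 16-bit
 * lanes, then clear the bit pulled in from the neighbouring byte. */
static inline __m128i srli1_epi8(__m128i v)
{
    return _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(0x7F));
}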
Example no. 23
/// Element-wise minimum.
/// @ingroup SIMD
inline xmm_u8 min(const xmm_u8 &a, const xmm_u8 &b) { return _mm_min_epu8(a, b); }
Example no. 24
void
encode_exp_blk_ch_sse2(uint8_t *exp, int ncoefs, int exp_strategy)
{
    int grpsize, ngrps, i, k, exp_min1, exp_min2;
    uint8_t v;

    ngrps = nexpgrptab[exp_strategy-1][ncoefs] * 3;
    grpsize = exp_strategy + (exp_strategy == EXP_D45);

    // for D15 strategy, there is no need to group/ungroup exponents
    switch (grpsize) {
    case 1: {
        // constraint for DC exponent
        exp[0] = MIN(exp[0], 15);

        // Decrease the delta between each groups to within 2
        // so that they can be differentially encoded
        for (i = 1; i <= ngrps; i++)
            exp[i] = MIN(exp[i], exp[i-1]+2);
        for (i = ngrps-1; i >= 0; i--)
            exp[i] = MIN(exp[i], exp[i+1]+2);

        return;
    }
    // for each group, compute the minimum exponent
    case 2: {
        ALIGN16(uint16_t) exp1[256];
        ALIGN16(const union __m128iui) vmask = {{0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff}};

        i=0; k=1;
        for(; i < (ngrps & ~7); i += 8, k += 16) {
            __m128i v1 = _mm_loadu_si128((__m128i*)&exp[k]);
            __m128i v2 = _mm_srli_si128(v1, 1);
            v1 = _mm_and_si128(v1, vmask.v);
            v1 = _mm_min_epu8(v1, v2);
            _mm_store_si128((__m128i*)&exp1[i], v1);
        }
        switch (ngrps & 7) {
        case 7:
            exp1[i] = MIN(exp[k], exp[k+1]);
            ++i;
            k += 2;
        case 6:
            exp1[i] = MIN(exp[k], exp[k+1]);
            ++i;
            k += 2;
        case 5:
            exp1[i] = MIN(exp[k], exp[k+1]);
            ++i;
            k += 2;
        case 4:
            exp1[i] = MIN(exp[k], exp[k+1]);
            ++i;
            k += 2;
        case 3:
            exp1[i] = MIN(exp[k], exp[k+1]);
            ++i;
            k += 2;
        case 2:
            exp1[i] = MIN(exp[k], exp[k+1]);
            ++i;
            k += 2;
        case 1:
            exp1[i] = MIN(exp[k], exp[k+1]);
        case 0:
            ;
        }
        // constraint for DC exponent
        exp[0] = MIN(exp[0], 15);
        // Decrease the delta between each groups to within 2
        // so that they can be differentially encoded
        exp1[0] = MIN(exp1[0], (uint16_t)exp[0]+2);
        for (i = 1; i < ngrps; i++)
            exp1[i] = MIN(exp1[i], exp1[i-1]+2);
        for (i = ngrps-2; i >= 0; i--)
            exp1[i] = MIN(exp1[i], exp1[i+1]+2);
        // now we have the exponent values the decoder will see
        exp[0] = MIN(exp[0], exp1[0]+2); // DC exponent is handled separately

        i=0; k=1;
        for (; i < (ngrps & ~7); i += 8, k += 16) {
            __m128i v1 = _mm_load_si128((__m128i*)&exp1[i]);
            __m128i v2 = _mm_slli_si128(v1, 1);
            v1 = _mm_or_si128(v1, v2);
            _mm_storeu_si128((__m128i*)&exp[k], v1);
        }
        switch (ngrps & 7) {
        case 7:
            v = (uint8_t)exp1[i];
            exp[k] = v;
            exp[k+1] = v;
            ++i;
            k += 2;
        case 6:
            v = (uint8_t)exp1[i];
            exp[k] = v;
            exp[k+1] = v;
            ++i;
            k += 2;
        case 5:
            v = (uint8_t)exp1[i];
            exp[k] = v;
            exp[k+1] = v;
            ++i;
            k += 2;
        case 4:
            v = (uint8_t)exp1[i];
            exp[k] = v;
            exp[k+1] = v;
            ++i;
            k += 2;
        case 3:
            v = (uint8_t)exp1[i];
            exp[k] = v;
            exp[k+1] = v;
            ++i;
            k += 2;
        case 2:
            v = (uint8_t)exp1[i];
            exp[k] = v;
            exp[k+1] = v;
            ++i;
            k += 2;
        case 1:
            v = (uint8_t)exp1[i];
            exp[k] = v;
            exp[k+1] = v;
        case 0:
            ;
        }
        return;
        }
    default: {
        ALIGN16(uint32_t) exp1[256];
        ALIGN16(const union __m128iui) vmask2 = {{0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff}};

        i=0; k=1;
        for (; i < (ngrps & ~3); i += 4, k += 16) {
            __m128i v1 = _mm_loadu_si128((__m128i*)&exp[k]);
            __m128i v2 = _mm_srli_si128(v1, 1);
            v1 = _mm_min_epu8(v1, v2);
            v2 = _mm_srli_si128(v1, 2);
            v1 = _mm_min_epu8(v1, v2);
            v1 = _mm_and_si128(v1, vmask2.v);
            _mm_store_si128((__m128i*)&exp1[i], v1);
        }
        switch (ngrps & 3) {
        case 3:
            exp_min1 = MIN(exp[k  ], exp[k+1]);
            exp_min2 = MIN(exp[k+2], exp[k+3]);
            exp1[i]  = MIN(exp_min1, exp_min2);
            ++i;
            k += 4;
        case 2:
            exp_min1 = MIN(exp[k  ], exp[k+1]);
            exp_min2 = MIN(exp[k+2], exp[k+3]);
            exp1[i]  = MIN(exp_min1, exp_min2);
            ++i;
            k += 4;
        case 1:
            exp_min1 = MIN(exp[k  ], exp[k+1]);
            exp_min2 = MIN(exp[k+2], exp[k+3]);
            exp1[i]  = MIN(exp_min1, exp_min2);
        case 0:
            ;
        }
        // constraint for DC exponent
        exp[0] = MIN(exp[0], 15);
        // Decrease the delta between each groups to within 2
        // so that they can be differentially encoded
        exp1[0] = MIN(exp1[0], (uint32_t)exp[0]+2);
        for (i = 1; i < ngrps; i++)
            exp1[i] = MIN(exp1[i], exp1[i-1]+2);
        for (i = ngrps-2; i >= 0; i--)
            exp1[i] = MIN(exp1[i], exp1[i+1]+2);
        // now we have the exponent values the decoder will see
        exp[0] = MIN(exp[0], exp1[0]+2); // DC exponent is handled separately

        i=0; k=1;
        for (; i < (ngrps & ~3); i += 4, k += 16) {
            __m128i v1 = _mm_load_si128((__m128i*)&exp1[i]);
            __m128i v2 = _mm_slli_si128(v1, 1);
            v1 = _mm_or_si128(v1, v2);
            v2 = _mm_slli_si128(v1, 2);
            v1 = _mm_or_si128(v1, v2);
            _mm_storeu_si128((__m128i*)&exp[k], v1);
        }
        switch (ngrps & 3) {
        case 3:
            v = exp1[i];
            exp[k] = v;
            exp[k+1] = v;
            exp[k+2] = v;
            exp[k+3] = v;
            ++i;
            k += 4;
        case 2:
            v = exp1[i];
            exp[k] = v;
            exp[k+1] = v;
            exp[k+2] = v;
            exp[k+3] = v;
            ++i;
            k += 4;
        case 1:
            v = exp1[i];
            exp[k] = v;
            exp[k+1] = v;
            exp[k+2] = v;
            exp[k+3] = v;
        case 0:
            ;
        }
        return;
    }
    }
}
Example no. 25
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
               uint8_t *dstp, const uint8_t *srcp, edge_t *eh,
               uint16_t plane_max)
{
    uint8_t* p0 = buff + 16;
    uint8_t* p1 = p0 + bstride;
    uint8_t* p2 = p1 + bstride;
    uint8_t* p3 = p2 + bstride;
    uint8_t* p4 = p3 + bstride;
    uint8_t* orig = p0;
    uint8_t* end = p4;

    line_copy8(p0, srcp + 2 * stride, width, 2);
    line_copy8(p1, srcp + stride, width, 2);
    line_copy8(p2, srcp, width, 2);
    srcp += stride;
    line_copy8(p3, srcp, width, 2);

    uint8_t th_min = eh->min > 0xFF ? 0xFF : (uint8_t)eh->min;
    uint8_t th_max = eh->max > 0xFF ? 0xFF : (uint8_t)eh->max;

    __m128i zero = _mm_setzero_si128();
    __m128i ab = _mm_set1_epi16(15);
    __m128i max = _mm_set1_epi8((int8_t)th_max);
    __m128i min = _mm_set1_epi8((int8_t)th_min);

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy8(p4, srcp, width, 2);
        uint8_t* posh[] = {p2 - 2, p2 - 1, p2 + 1, p2 + 2};
        uint8_t* posv[] = {p0, p1, p3, p4};

        for (int x = 0; x < width; x += 16) {
            __m128i sumx[2] = {zero, zero};
            __m128i sumy[2] = {zero, zero};

            for (int i = 0; i < 4; i++) {
                __m128i xmm0, xmm1, xmul;
                xmul = _mm_load_si128((__m128i *)ar_mulx[i]);
                xmm0 = _mm_loadu_si128((__m128i *)(posh[i] + x));
                xmm1 = _mm_unpackhi_epi8(xmm0, zero);
                xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                sumx[0] = _mm_add_epi16(sumx[0], _mm_mullo_epi16(xmm0, xmul));
                sumx[1] = _mm_add_epi16(sumx[1], _mm_mullo_epi16(xmm1, xmul));

                xmul = _mm_load_si128((__m128i *)ar_muly[i]);
                xmm0 = _mm_load_si128((__m128i *)(posv[i] + x));
                xmm1 = _mm_unpackhi_epi8(xmm0, zero);
                xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                sumy[0] = _mm_add_epi16(sumy[0], _mm_mullo_epi16(xmm0, xmul));
                sumy[1] = _mm_add_epi16(sumy[1], _mm_mullo_epi16(xmm1, xmul));
            }

            for (int i = 0; i < 2; i++) {
                __m128i xmax, xmin, mull, mulh;
                sumx[i] = mm_abs_epi16(sumx[i]);
                sumy[i] = mm_abs_epi16(sumy[i]);
                xmax = _mm_max_epi16(sumx[i], sumy[i]);
                xmin = _mm_min_epi16(sumx[i], sumy[i]);

                mull = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpacklo_epi16(xmax, zero)), 4);
                mulh = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpackhi_epi16(xmax, zero)), 4);
                xmax = mm_cast_epi32(mull, mulh);

                mull = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpacklo_epi16(xmin, zero)), 5);
                mulh = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpackhi_epi16(xmin, zero)), 5);
                xmin = mm_cast_epi32(mull, mulh);

                sumx[i] = _mm_adds_epu16(xmax, xmin);
                sumx[i] = _mm_srli_epi16(sumx[i], eh->rshift);
            }

            __m128i out = _mm_packus_epi16(sumx[0], sumx[1]);
            __m128i temp = _mm_min_epu8(out, max);
            temp = _mm_cmpeq_epi8(temp, max);
            out = _mm_or_si128(temp, out);

            temp = _mm_max_epu8(out, min);
            temp = _mm_cmpeq_epi8(temp, min);
            out = _mm_andnot_si128(temp, out);

            _mm_store_si128((__m128i*)(dstp + x), out);
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}