Example #1
void rfx_decode_YCbCr_to_RGB_NEON(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer)
{
    int16x8_t zero = vdupq_n_s16(0);
    int16x8_t max = vdupq_n_s16(255);
    int16x8_t y_add = vdupq_n_s16(128);

    int16x8_t* y_r_buf = (int16x8_t*)y_r_buffer;
    int16x8_t* cb_g_buf = (int16x8_t*)cb_g_buffer;
    int16x8_t* cr_b_buf = (int16x8_t*)cr_b_buffer;

    int i;
    for (i = 0; i < 4096 / 8; i++)
    {
        int16x8_t y = vld1q_s16((sint16*)&y_r_buf[i]);
        y = vaddq_s16(y, y_add);

        int16x8_t cr = vld1q_s16((sint16*)&cr_b_buf[i]);

        // r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)), 0, 255);
        int16x8_t r = vaddq_s16(y, cr);
        r = vaddq_s16(r, vshrq_n_s16(cr, 2));
        r = vaddq_s16(r, vshrq_n_s16(cr, 3));
        r = vaddq_s16(r, vshrq_n_s16(cr, 5));
        r = vminq_s16(vmaxq_s16(r, zero), max);
        vst1q_s16((sint16*)&y_r_buf[i], r);

        // cb = cb_g_buf[i];
        int16x8_t cb = vld1q_s16((sint16*)&cb_g_buf[i]);

        // g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255);
        int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2));
        g = vsubq_s16(g, vshrq_n_s16(cb, 4));
        g = vsubq_s16(g, vshrq_n_s16(cb, 5));
        g = vsubq_s16(g, vshrq_n_s16(cr, 1));
        g = vsubq_s16(g, vshrq_n_s16(cr, 3));
        g = vsubq_s16(g, vshrq_n_s16(cr, 4));
        g = vsubq_s16(g, vshrq_n_s16(cr, 5));
        g = vminq_s16(vmaxq_s16(g, zero), max);
        vst1q_s16((sint16*)&cb_g_buf[i], g);

        // b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)), 0, 255);
        int16x8_t b = vaddq_s16(y, cb);
        b = vaddq_s16(b, vshrq_n_s16(cb, 1));
        b = vaddq_s16(b, vshrq_n_s16(cb, 2));
        b = vaddq_s16(b, vshrq_n_s16(cb, 6));
        b = vminq_s16(vmaxq_s16(b, zero), max);
        vst1q_s16((sint16*)&cr_b_buf[i], b);
    }

}
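The shift chains above are fixed-point approximations of the YCbCr-to-RGB coefficients: 1 + 1/4 + 1/8 + 1/32 = 1.40625 ≈ 1.403 (Cr in R), 1/4 + 1/16 + 1/32 = 0.34375 ≈ 0.344 and 1/2 + 1/8 + 1/16 + 1/32 = 0.71875 ≈ 0.714 (Cb and Cr in G), and 1 + 1/2 + 1/4 + 1/64 = 1.765625 ≈ 1.770 (Cb in B). A minimal scalar sketch of one pixel, assuming `between` is the simple clamp the comments refer to (its definition is not shown in the source):

/* Scalar sketch of the loop body above; `between` is an assumed helper. */
static short between(int v, int lo, int hi)
{
    return (short)(v < lo ? lo : (v > hi ? hi : v));
}

static void decode_one_pixel(short *y_r, short *cb_g, short *cr_b)
{
    const int y = *y_r + 128;
    const int cb = *cb_g;
    const int cr = *cr_b;
    *y_r  = between(y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5), 0, 255);        /* R */
    *cb_g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5)
                      - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255); /* G */
    *cr_b = between(y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6), 0, 255);        /* B */
}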
Example #2
static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
                                     uint16_t* dst, int len) {
  int i;
  const int16x8_t zero = vdupq_n_s16(0);
  const int16x8_t max = vdupq_n_s16(MAX_Y);
  uint64x2_t sum = vdupq_n_u64(0);
  uint64_t diff;

  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
    const int16x8_t D = vsubq_s16(A, B);       // diff_y
    const int16x8_t F = vaddq_s16(C, D);       // new_y
    const uint16x8_t H =
        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
    const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
    vst1q_u16(dst + i, H);
    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
  }
  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
  for (; i < len; ++i) {
    const int diff_y = ref[i] - src[i];
    const int new_y = (int)(dst[i]) + diff_y;
    dst[i] = clip_y(new_y);
    diff += (uint64_t)(abs(diff_y));
  }
  return diff;
}
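Examples #2 and #3 depend on MAX_Y and clip_y() from the surrounding libwebp source, neither of which is shown here. A plausible sketch of both, assuming the 10-bit luma range that the 16-bit intermediate arithmetic suggests:

#define MAX_Y ((1 << 10) - 1)  /* assumed: 10-bit luma over 16-bit arithmetic */

static int clip_y(int v) {
  /* scalar counterpart of the vminq_s16/vmaxq_s16 clamp in the vector loop */
  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : v;
}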
Example #3
static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
                                   const uint16_t* best_y, uint16_t* out) {
  int i;
  const int16x8_t max = vdupq_n_s16(MAX_Y);
  const int16x8_t zero = vdupq_n_s16(0);
  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t a0 = vld1q_s16(A + i + 0);
    const int16x8_t a1 = vld1q_s16(A + i + 1);
    const int16x8_t b0 = vld1q_s16(B + i + 0);
    const int16x8_t b1 = vld1q_s16(B + i + 1);
    const int16x8_t a0b1 = vaddq_s16(a0, b1);
    const int16x8_t a1b0 = vaddq_s16(a1, b0);
    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
    const int16x8_t d0 = vaddq_s16(c1, a0);
    const int16x8_t d1 = vaddq_s16(c0, a1);
    const int16x8_t e0 = vrshrq_n_s16(d0, 1);
    const int16x8_t e1 = vrshrq_n_s16(d1, 1);
    const int16x8x2_t f = vzipq_s16(e0, e1);
    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
  }
  for (; i < len; ++i) {
    const int a0b1 = A[i + 0] + B[i + 1];
    const int a1b0 = A[i + 1] + B[i + 0];
    const int a0a1b0b1 = a0b1 + a1b0 + 8;
    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
  }
}
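The vector and scalar paths of Example #3 compute the same filter. The vector path forms c1 = (2*(A1+B0) + (A0+A1+B0+B1)) >> 3 and then e0 = (c1 + A0 + 1) >> 1 (vrshrq_n_s16 rounds), which expands to (8*A0 + 2*(A1+B0) + (A0+A1+B0+B1) + 8) >> 4, i.e. the scalar v0 (whose a0a1b0b1 already carries the +8 rounding term), up to the truncation introduced by the intermediate >> 3. vzipq_s16 then interleaves the even and odd results to match the out[2*i + 0] / out[2*i + 1] layout of the scalar tail.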
Example #4
static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest,
                                                     const int stride,
                                                     const int16x8_t res,
                                                     const int16x8_t max) {
  const uint16x8_t a0 = vld1q_u16(*dest);
  const uint16x8_t a1 = vld1q_u16(*dest + 8);
  const uint16x8_t a2 = vld1q_u16(*dest + 16);
  const uint16x8_t a3 = vld1q_u16(*dest + 24);
  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
  const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
  const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
  const int16x8_t c0 = vminq_s16(b0, max);
  const int16x8_t c1 = vminq_s16(b1, max);
  const int16x8_t c2 = vminq_s16(b2, max);
  const int16x8_t c3 = vminq_s16(b3, max);
  vst1q_u16(*dest, vreinterpretq_u16_s16(c0));
  vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
  vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2));
  vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3));
  *dest += stride;
}
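The _pos suffix marks this as the non-negative-DC path: res cannot drive the sums below zero, so only the upper clamp (vminq_s16 against max) is needed. A hypothetical driver, showing how the kernel's internal *dest += stride keeps the caller trivial (the function name and the bd parameter are illustrative, not from the source):

static void highbd_idct32x32_1_add_pos(uint16_t *dest, const int stride,
                                       const int16_t dc, const int bd) {
  const int16x8_t res = vdupq_n_s16(dc);             // non-negative DC residual
  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);  // e.g. 1023 for 10-bit
  int r;
  for (r = 0; r < 32; ++r) {
    // each call adds res to one 32-pixel row and advances dest by stride
    highbd_idct32x32_1_add_pos_kernel(&dest, stride, res, max);
  }
}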
Example #5
// res is in reverse row order
static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest,
                                                const int stride,
                                                const int16x8_t res,
                                                const int16x8_t max) {
  const uint16x4_t a0 = vld1_u16(*dest);
  const uint16x4_t a1 = vld1_u16(*dest + stride);
  const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a1, a0));
  // Note: In some profile tests, res is quite close to +/-32767.
  // We use saturating addition.
  const int16x8_t b = vqaddq_s16(res, a);
  const int16x8_t c = vminq_s16(b, max);
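  // vqshluq_n_s16 with a shift of 0 saturates negative lanes to 0 on the
  // signed-to-unsigned conversion, completing the [0, max] clamp without a
  // separate vmaxq_s16 against zero.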
  const uint16x8_t d = vqshluq_n_s16(c, 0);
  vst1_u16(*dest, vget_high_u16(d));
  *dest += stride;
  vst1_u16(*dest, vget_low_u16(d));
  *dest += stride;
}
Example #6
static void
thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
{
    int i, j;
    Size roi = _src.size();
    roi.width *= _src.channels();
    const short* src = _src.ptr<short>();
    short* dst = _dst.ptr<short>();
    size_t src_step = _src.step/sizeof(src[0]);
    size_t dst_step = _dst.step/sizeof(dst[0]);

#if CV_SSE2
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif

    if( _src.isContinuous() && _dst.isContinuous() )
    {
        roi.width *= roi.height;
        roi.height = 1;
        src_step = dst_step = roi.width;
    }

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::thresh_16s(_src, _dst, roi.width, roi.height, thresh, maxval, type))
        return;
#endif

#if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
        IppiSize sz = { roi.width, roi.height };
        CV_SUPPRESS_DEPRECATED_START
        switch( type )
        {
        case THRESH_TRUNC:
#ifndef HAVE_IPP_ICV_ONLY
            if (_src.data == _dst.data && ippiThreshold_GT_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
#endif
            if (ippiThreshold_GT_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO:
#ifndef HAVE_IPP_ICV_ONLY
            if (_src.data == _dst.data && ippiThreshold_LTVal_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh + 1, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
#endif
            if (ippiThreshold_LTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+1, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO_INV:
#ifndef HAVE_IPP_ICV_ONLY
            if (_src.data == _dst.data && ippiThreshold_GTVal_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
#endif
            if (ippiThreshold_GTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0)
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        }
        CV_SUPPRESS_DEPRECATED_END
    }
#endif

    switch( type )
    {
    case THRESH_BINARY:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_cmpgt_epi16( v0, thresh8 );
                    v1 = _mm_cmpgt_epi16( v1, thresh8 );
                    v0 = _mm_and_si128( v0, maxval8 );
                    v1 = _mm_and_si128( v1, maxval8 );
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval);

            for( ; j <= roi.width - 8; j += 8 )
            {
                uint16x8_t v_mask = vcgtq_s16(vld1q_s16(src + j), v_thresh);
                vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval));
            }
        #endif

            for( ; j < roi.width; j++ )
                dst[j] = src[j] > thresh ? maxval : 0;
        }
        break;

    case THRESH_BINARY_INV:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_cmpgt_epi16( v0, thresh8 );
                    v1 = _mm_cmpgt_epi16( v1, thresh8 );
                    v0 = _mm_andnot_si128( v0, maxval8 );
                    v1 = _mm_andnot_si128( v1, maxval8 );
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval);

            for( ; j <= roi.width - 8; j += 8 )
            {
                uint16x8_t v_mask = vcleq_s16(vld1q_s16(src + j), v_thresh);
                vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval));
            }
        #endif

            for( ; j < roi.width; j++ )
                dst[j] = src[j] <= thresh ? maxval : 0;
        }
        break;

    case THRESH_TRUNC:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_min_epi16( v0, thresh8 );
                    v1 = _mm_min_epi16( v1, thresh8 );
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh);

            for( ; j <= roi.width - 8; j += 8 )
                vst1q_s16(dst + j, vminq_s16(vld1q_s16(src + j), v_thresh));
        #endif

            for( ; j < roi.width; j++ )
                dst[j] = std::min(src[j], thresh);
        }
        break;

    case THRESH_TOZERO:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_and_si128(v0, _mm_cmpgt_epi16(v0, thresh8));
                    v1 = _mm_and_si128(v1, _mm_cmpgt_epi16(v1, thresh8));
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh);

            for( ; j <= roi.width - 8; j += 8 )
            {
                int16x8_t v_src = vld1q_s16(src + j);
                uint16x8_t v_mask = vcgtq_s16(v_src, v_thresh);
                vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src));
            }
        #endif

            for( ; j < roi.width; j++ )
            {
                short v = src[j];
                dst[j] = v > thresh ? v : 0;
            }
        }
        break;

    case THRESH_TOZERO_INV:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
        #if CV_SSE2
            if( useSIMD )
            {
                __m128i thresh8 = _mm_set1_epi16(thresh);
                for( ; j <= roi.width - 16; j += 16 )
                {
                    __m128i v0, v1;
                    v0 = _mm_loadu_si128( (const __m128i*)(src + j) );
                    v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) );
                    v0 = _mm_andnot_si128(_mm_cmpgt_epi16(v0, thresh8), v0);
                    v1 = _mm_andnot_si128(_mm_cmpgt_epi16(v1, thresh8), v1);
                    _mm_storeu_si128((__m128i*)(dst + j), v0 );
                    _mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
                }
            }
        #elif CV_NEON
            int16x8_t v_thresh = vdupq_n_s16(thresh);

            for( ; j <= roi.width - 8; j += 8 )
            {
                int16x8_t v_src = vld1q_s16(src + j);
                uint16x8_t v_mask = vcleq_s16(v_src, v_thresh);
                vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src));
            }
        #endif
            for( ; j < roi.width; j++ )
            {
                short v = src[j];
                dst[j] = v <= thresh ? v : 0;
            }
        }
        break;
    default:
        return CV_Error( CV_StsBadArg, "" );
    }
}
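Every SIMD branch above uses the same branch-free idiom: a packed 16-bit compare yields all-ones or all-zero lanes, which are then ANDed (THRESH_BINARY, THRESH_TOZERO) or AND-NOTed (the _INV variants) with the output value. A scalar sketch of the THRESH_BINARY case, for illustration only:

static inline short thresh_binary_scalar(short v, short thresh, short maxval)
{
    /* (v > thresh) is 0 or 1; negating it gives the 0x0000/0xFFFF lane mask
       that _mm_cmpgt_epi16 and vcgtq_s16 produce. */
    const short mask = (short)-(v > thresh);
    return (short)(mask & maxval);
}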
Example #7
PRIM_STATIC pstatus_t neon_yCbCrToRGB_16s16s_P3P3(
	const INT16 *pSrc[3],
	int srcStep,
	INT16 *pDst[3],
	int dstStep,
	const prim_size_t *roi)	/* region of interest */
{
	/* TODO: If necessary, check alignments and call the general version. */

	int16x8_t zero = vdupq_n_s16(0);
	int16x8_t max = vdupq_n_s16(255);
	int16x8_t y_add = vdupq_n_s16(128);

	int16x8_t* y_buf  = (int16x8_t*) pSrc[0];
	int16x8_t* cb_buf = (int16x8_t*) pSrc[1];
	int16x8_t* cr_buf = (int16x8_t*) pSrc[2];
	int16x8_t* r_buf  = (int16x8_t*) pDst[0];
	int16x8_t* g_buf  = (int16x8_t*) pDst[1];
	int16x8_t* b_buf  = (int16x8_t*) pDst[2];

	int srcbump = srcStep / sizeof(int16x8_t);
	int dstbump = dstStep / sizeof(int16x8_t);
	int yp;

	int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
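	/* Each int16x8_t holds 8 pixels, so imax is the vector count per row;
	 * the row width is implicitly assumed to be a multiple of 8.
	 */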
	for (yp=0; yp<roi->height; ++yp)
	{
		int i;
		for (i=0; i<imax; i++)
		{
			int16x8_t y = vld1q_s16((INT16*) (y_buf+i));
			y = vaddq_s16(y, y_add);

			int16x8_t cr = vld1q_s16((INT16*) (cr_buf+i));

			/* r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)),
			 *    0, 255);
			 */
			int16x8_t r = vaddq_s16(y, cr);
			r = vaddq_s16(r, vshrq_n_s16(cr, 2));
			r = vaddq_s16(r, vshrq_n_s16(cr, 3));
			r = vaddq_s16(r, vshrq_n_s16(cr, 5));
			r = vminq_s16(vmaxq_s16(r, zero), max);
			vst1q_s16((INT16*) (r_buf+i), r);

			/* cb = cb_g_buf[i]; */
			int16x8_t cb = vld1q_s16((INT16*) (cb_buf+i));

			/* g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1)
			 * - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255);
			 */
			int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2));
			g = vsubq_s16(g, vshrq_n_s16(cb, 4));
			g = vsubq_s16(g, vshrq_n_s16(cb, 5));
			g = vsubq_s16(g, vshrq_n_s16(cr, 1));
			g = vsubq_s16(g, vshrq_n_s16(cr, 3));
			g = vsubq_s16(g, vshrq_n_s16(cr, 4));
			g = vsubq_s16(g, vshrq_n_s16(cr, 5));
			g = vminq_s16(vmaxq_s16(g, zero), max);
			vst1q_s16((INT16*) (g_buf+i), g);

			/* b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)),
			 * 0, 255);
			 */
			int16x8_t b = vaddq_s16(y, cb);
			b = vaddq_s16(b, vshrq_n_s16(cb, 1));
			b = vaddq_s16(b, vshrq_n_s16(cb, 2));
			b = vaddq_s16(b, vshrq_n_s16(cb, 6));
			b = vminq_s16(vmaxq_s16(b, zero), max);
			vst1q_s16((INT16*) (b_buf+i), b);
		}
		y_buf  += srcbump;
		cb_buf += srcbump;
		cr_buf += srcbump;
		r_buf += dstbump;
		g_buf += dstbump;
		b_buf += dstbump;
	}
	return PRIMITIVES_SUCCESS;
}
Example #8
inline   int16x8_t vminq(const int16x8_t   & v0, const int16x8_t   & v1) { return vminq_s16(v0, v1); }
Example #9
int16x8_t
test_vminq_s16 (int16x8_t __a, int16x8_t __b)
{
  return vminq_s16(__a, __b);
}
Example #10
pstatus_t neon_yCbCrToRGB_16s16s_P3P3(
    const INT16 *pSrc[3],
    int srcStep,
    INT16 *pDst[3],
    int dstStep,
    const prim_size_t *roi)	/* region of interest */
{
    /* TODO: If necessary, check alignments and call the general version. */

    int16x8_t zero = vdupq_n_s16(0);
    int16x8_t max = vdupq_n_s16(255);

    int16x8_t r_cr = vdupq_n_s16(22986);	//  1.403 << 14
    int16x8_t g_cb = vdupq_n_s16(-5636);	// -0.344 << 14
    int16x8_t g_cr = vdupq_n_s16(-11698);	// -0.714 << 14
    int16x8_t b_cb = vdupq_n_s16(28999);	//  1.770 << 14
    int16x8_t c4096 = vdupq_n_s16(4096);

    int16x8_t* y_buf  = (int16x8_t*) pSrc[0];
    int16x8_t* cb_buf = (int16x8_t*) pSrc[1];
    int16x8_t* cr_buf = (int16x8_t*) pSrc[2];
    int16x8_t* r_buf  = (int16x8_t*) pDst[0];
    int16x8_t* g_buf  = (int16x8_t*) pDst[1];
    int16x8_t* b_buf  = (int16x8_t*) pDst[2];

    int srcbump = srcStep / sizeof(int16x8_t);
    int dstbump = dstStep / sizeof(int16x8_t);
    int yp;

    int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
    for (yp=0; yp<roi->height; ++yp)
    {
        int i;
        for (i=0; i<imax; i++)
        {
            /*
                In order to use NEON signed 16-bit integer multiplication we need
                to convert the floating-point factors to signed integers without
                losing information. The product of such a multiplication is 32 bits
                wide, and NEON offers vqdmulhq_s16, which returns the high 16 bits
                of the saturated, doubled product.
                Thus we multiply the factors by the largest possible 2^n, take the
                upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by
                a right shift by 1 to undo the doubling) and correct this result by
                multiplying it by 2^(16-n).
                For the given factors in the conversion matrix the best possible n
                is 14.

                Example for calculating r:
                r = (y>>5) + 128 + ((cr*1.403)>>5)                  // base formula
                r = (y>>5) + 128 + ((HIWORD(cr*(1.403<<14))<<2)>>5) // see above
                r = ((y+4096)>>5) + ((HIWORD(cr*22986)<<2)>>5)      // simplification
                r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
            */

            /* y = (y_buf[i] + 4096) >> 2 */
            int16x8_t y = vld1q_s16((INT16*) &y_buf[i]);
            y = vaddq_s16(y, c4096);
            y = vshrq_n_s16(y, 2);
            /* cb = cb_buf[i]; */
            int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
            /* cr = cr_buf[i]; */
            int16x8_t cr = vld1q_s16((INT16*) &cr_buf[i]);

            /* (y + HIWORD(cr*22986)) >> 3 */
            int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
            r = vshrq_n_s16(r, 3);
            /* r_buf[i] = MINMAX(r, 0, 255); */
            r = vminq_s16(vmaxq_s16(r, zero), max);
            vst1q_s16((INT16*)&r_buf[i], r);

            /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
            int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
            g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
            g = vshrq_n_s16(g, 3);
            /* g_buf[i] = MINMAX(g, 0, 255); */
            g = vminq_s16(vmaxq_s16(g, zero), max);
            vst1q_s16((INT16*)&g_buf[i], g);

            /* (y + HIWORD(cb*28999)) >> 3 */
            int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
            b = vshrq_n_s16(b, 3);
            /* b_buf[i] = MINMAX(b, 0, 255); */
            b = vminq_s16(vmaxq_s16(b, zero), max);
            vst1q_s16((INT16*)&b_buf[i], b);
        }

        y_buf  += srcbump;
        cb_buf += srcbump;
        cr_buf += srcbump;
        r_buf += dstbump;
        g_buf += dstbump;
        b_buf += dstbump;
    }
    return PRIMITIVES_SUCCESS;
}
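The comment reduces each channel to one scalar pattern. A scalar sketch of the red channel, reading HIWORD(x) as the upper 16 bits of the 32-bit product, which is what vqdmulhq_s16 followed by the >> 1 computes (apart from saturation):

/* Illustrative scalar version of the red-channel fixed-point path above. */
static short red_fixed_point(short y_raw, short cr)
{
    const int y = (y_raw + 4096) >> 2;
    const int hiword = (cr * 22986) >> 16;  /* 22986 = 1.403 scaled by 2^14 */
    int r = (y + hiword) >> 3;
    if (r < 0) r = 0; else if (r > 255) r = 255;  /* MINMAX(r, 0, 255) */
    return (short)r;
}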