void rfx_decode_YCbCr_to_RGB_NEON(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer) { int16x8_t zero = vdupq_n_s16(0); int16x8_t max = vdupq_n_s16(255); int16x8_t y_add = vdupq_n_s16(128); int16x8_t* y_r_buf = (int16x8_t*)y_r_buffer; int16x8_t* cb_g_buf = (int16x8_t*)cb_g_buffer; int16x8_t* cr_b_buf = (int16x8_t*)cr_b_buffer; int i; for (i = 0; i < 4096 / 8; i++) { int16x8_t y = vld1q_s16((sint16*)&y_r_buf[i]); y = vaddq_s16(y, y_add); int16x8_t cr = vld1q_s16((sint16*)&cr_b_buf[i]); // r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)), 0, 255); int16x8_t r = vaddq_s16(y, cr); r = vaddq_s16(r, vshrq_n_s16(cr, 2)); r = vaddq_s16(r, vshrq_n_s16(cr, 3)); r = vaddq_s16(r, vshrq_n_s16(cr, 5)); r = vminq_s16(vmaxq_s16(r, zero), max); vst1q_s16((sint16*)&y_r_buf[i], r); // cb = cb_g_buf[i]; int16x8_t cb = vld1q_s16((sint16*)&cb_g_buf[i]); // g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255); int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2)); g = vsubq_s16(g, vshrq_n_s16(cb, 4)); g = vsubq_s16(g, vshrq_n_s16(cb, 5)); g = vsubq_s16(g, vshrq_n_s16(cr, 1)); g = vsubq_s16(g, vshrq_n_s16(cr, 3)); g = vsubq_s16(g, vshrq_n_s16(cr, 4)); g = vsubq_s16(g, vshrq_n_s16(cr, 5)); g = vminq_s16(vmaxq_s16(g, zero), max); vst1q_s16((sint16*)&cb_g_buf[i], g); // b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)), 0, 255); int16x8_t b = vaddq_s16(y, cb); b = vaddq_s16(b, vshrq_n_s16(cb, 1)); b = vaddq_s16(b, vshrq_n_s16(cb, 2)); b = vaddq_s16(b, vshrq_n_s16(cb, 6)); b = vminq_s16(vmaxq_s16(b, zero), max); vst1q_s16((sint16*)&cr_b_buf[i], b); } }
static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src, uint16_t* dst, int len) { int i; const int16x8_t zero = vdupq_n_s16(0); const int16x8_t max = vdupq_n_s16(MAX_Y); uint64x2_t sum = vdupq_n_u64(0); uint64_t diff; for (i = 0; i + 8 <= len; i += 8) { const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i)); const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i)); const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i)); const int16x8_t D = vsubq_s16(A, B); // diff_y const int16x8_t F = vaddq_s16(C, D); // new_y const uint16x8_t H = vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero)); const int16x8_t I = vabsq_s16(D); // abs(diff_y) vst1q_u16(dst + i, H); sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I))); } diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1); for (; i < len; ++i) { const int diff_y = ref[i] - src[i]; const int new_y = (int)(dst[i]) + diff_y; dst[i] = clip_y(new_y); diff += (uint64_t)(abs(diff_y)); } return diff; }
void test_vmaxQs16 (void) { int16x8_t out_int16x8_t; int16x8_t arg0_int16x8_t; int16x8_t arg1_int16x8_t; out_int16x8_t = vmaxq_s16 (arg0_int16x8_t, arg1_int16x8_t); }
static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len, const uint16_t* best_y, uint16_t* out) { int i; const int16x8_t max = vdupq_n_s16(MAX_Y); const int16x8_t zero = vdupq_n_s16(0); for (i = 0; i + 8 <= len; i += 8) { const int16x8_t a0 = vld1q_s16(A + i + 0); const int16x8_t a1 = vld1q_s16(A + i + 1); const int16x8_t b0 = vld1q_s16(B + i + 0); const int16x8_t b1 = vld1q_s16(B + i + 1); const int16x8_t a0b1 = vaddq_s16(a0, b1); const int16x8_t a1b0 = vaddq_s16(a1, b0); const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1 const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1) const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0) const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3); const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3); const int16x8_t d0 = vaddq_s16(c1, a0); const int16x8_t d1 = vaddq_s16(c0, a1); const int16x8_t e0 = vrshrq_n_s16(d0, 1); const int16x8_t e1 = vrshrq_n_s16(d1, 1); const int16x8x2_t f = vzipq_s16(e0, e1); const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0)); const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8)); const int16x8_t h0 = vaddq_s16(g0, f.val[0]); const int16x8_t h1 = vaddq_s16(g1, f.val[1]); const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero); const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero); vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0)); vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1)); } for (; i < len; ++i) { const int a0b1 = A[i + 0] + B[i + 1]; const int a1b0 = A[i + 1] + B[i + 0]; const int a0a1b0b1 = a0b1 + a1b0 + 8; const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0); out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1); } }
int16x8_t test_vmaxq_s16(int16x8_t a1, int16x8_t a2) { // CHECK: test_vmaxq_s16 return vmaxq_s16(a1, a2); // CHECK llvm.aarch64.neon.smax.v8i16 }
void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { // TODO(jingning) Decide the need of these arguments after the // quantization process is completed. (void)zbin_ptr; (void)quant_shift_ptr; (void)scan; if (!skip_block) { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. int i; const int16x8_t v_zero = vdupq_n_s16(0); const int16x8_t v_one = vdupq_n_s16(1); int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); int16x8_t v_round = vmovq_n_s16(round_ptr[1]); int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); // adjust for dc v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); // process dc and the first seven ac coeffs { const int16x8_t v_iscan = vld1q_s16(&iscan[0]); const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff); v_round = vmovq_n_s16(round_ptr[1]); v_quant = vmovq_n_s16(quant_ptr[1]); v_dequant = vmovq_n_s16(dequant_ptr[1]); } // now process the rest of the ac coeffs for (i = 8; i < count; i += 8) { const int16x8_t v_iscan = vld1q_s16(&iscan[i]); const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i); const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff); store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff); } { const int16x4_t v_eobmax_3210 = vmax_s16( vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210)); const int64x1_t v_eobmax_xx32 = vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); const int16x4_t v_eobmax_tmp = vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); const int64x1_t v_eobmax_xxx3 = vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); const int16x4_t v_eobmax_final = vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); } } else { memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); *eob_ptr = 0; } }
PRIM_STATIC pstatus_t neon_yCbCrToRGB_16s16s_P3P3( const INT16 *pSrc[3], int srcStep, INT16 *pDst[3], int dstStep, const prim_size_t *roi) /* region of interest */ { /* TODO: If necessary, check alignments and call the general version. */ int16x8_t zero = vdupq_n_s16(0); int16x8_t max = vdupq_n_s16(255); int16x8_t y_add = vdupq_n_s16(128); int16x8_t* y_buf = (int16x8_t*) pSrc[0]; int16x8_t* cb_buf = (int16x8_t*) pSrc[1]; int16x8_t* cr_buf = (int16x8_t*) pSrc[2]; int16x8_t* r_buf = (int16x8_t*) pDst[0]; int16x8_t* g_buf = (int16x8_t*) pDst[1]; int16x8_t* b_buf = (int16x8_t*) pDst[2]; int srcbump = srcStep / sizeof(int16x8_t); int dstbump = dstStep / sizeof(int16x8_t); int yp; int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t); for (yp=0; yp<roi->height; ++yp) { int i; for (i=0; i<imax; i++) { int16x8_t y = vld1q_s16((INT16*) (y_buf+i)); y = vaddq_s16(y, y_add); int16x8_t cr = vld1q_s16((INT16*) (cr_buf+i)); /* r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)), * 0, 255); */ int16x8_t r = vaddq_s16(y, cr); r = vaddq_s16(r, vshrq_n_s16(cr, 2)); r = vaddq_s16(r, vshrq_n_s16(cr, 3)); r = vaddq_s16(r, vshrq_n_s16(cr, 5)); r = vminq_s16(vmaxq_s16(r, zero), max); vst1q_s16((INT16*) (r_buf+i), r); /* cb = cb_g_buf[i]; */ int16x8_t cb = vld1q_s16((INT16*) (cb_buf+i)); /* g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) * - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255); */ int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2)); g = vsubq_s16(g, vshrq_n_s16(cb, 4)); g = vsubq_s16(g, vshrq_n_s16(cb, 5)); g = vsubq_s16(g, vshrq_n_s16(cr, 1)); g = vsubq_s16(g, vshrq_n_s16(cr, 3)); g = vsubq_s16(g, vshrq_n_s16(cr, 4)); g = vsubq_s16(g, vshrq_n_s16(cr, 5)); g = vminq_s16(vmaxq_s16(g, zero), max); vst1q_s16((INT16*) (g_buf+i), g); /* b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)), * 0, 255); */ int16x8_t b = vaddq_s16(y, cb); b = vaddq_s16(b, vshrq_n_s16(cb, 1)); b = vaddq_s16(b, vshrq_n_s16(cb, 2)); b = vaddq_s16(b, vshrq_n_s16(cb, 6)); b = vminq_s16(vmaxq_s16(b, zero), max); vst1q_s16((INT16*) (b_buf+i), b); } y_buf += srcbump; cb_buf += srcbump; cr_buf += srcbump; r_buf += dstbump; g_buf += dstbump; b_buf += dstbump; } }
inline int16x8_t vmaxq(const int16x8_t & v0, const int16x8_t & v1) { return vmaxq_s16(v0, v1); }
int16x8_t test_vmaxq_s16 (int16x8_t __a, int16x8_t __b) { return vmaxq_s16(__a, __b); }
pstatus_t neon_yCbCrToRGB_16s16s_P3P3( const INT16 *pSrc[3], int srcStep, INT16 *pDst[3], int dstStep, const prim_size_t *roi) /* region of interest */ { /* TODO: If necessary, check alignments and call the general version. */ int16x8_t zero = vdupq_n_s16(0); int16x8_t max = vdupq_n_s16(255); int16x8_t r_cr = vdupq_n_s16(22986); // 1.403 << 14 int16x8_t g_cb = vdupq_n_s16(-5636); // -0.344 << 14 int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14 int16x8_t b_cb = vdupq_n_s16(28999); // 1.770 << 14 int16x8_t c4096 = vdupq_n_s16(4096); int16x8_t* y_buf = (int16x8_t*) pSrc[0]; int16x8_t* cb_buf = (int16x8_t*) pSrc[1]; int16x8_t* cr_buf = (int16x8_t*) pSrc[2]; int16x8_t* r_buf = (int16x8_t*) pDst[0]; int16x8_t* g_buf = (int16x8_t*) pDst[1]; int16x8_t* b_buf = (int16x8_t*) pDst[2]; int srcbump = srcStep / sizeof(int16x8_t); int dstbump = dstStep / sizeof(int16x8_t); int yp; int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t); for (yp=0; yp<roi->height; ++yp) { int i; for (i=0; i<imax; i++) { /* In order to use NEON signed 16-bit integer multiplication we need to convert the floating point factors to signed int without loosing information. The result of this multiplication is 32 bit and we have a NEON instruction that returns the hi word of the saturated double. Thus we will multiply the factors by the highest possible 2^n, take the upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right shift by 1 to reverse the doubling) and correct this result by multiplying it by 2^(16-n). For the given factors in the conversion matrix the best possible n is 14. Example for calculating r: r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 */ /* y = (y_buf[i] + 4096) >> 2 */ int16x8_t y = vld1q_s16((INT16*) &y_buf[i]); y = vaddq_s16(y, c4096); y = vshrq_n_s16(y, 2); /* cb = cb_buf[i]; */ int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]); /* cr = cr_buf[i]; */ int16x8_t cr = vld1q_s16((INT16*) &cr_buf[i]); /* (y + HIWORD(cr*22986)) >> 3 */ int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1)); r = vshrq_n_s16(r, 3); /* r_buf[i] = MINMAX(r, 0, 255); */ r = vminq_s16(vmaxq_s16(r, zero), max); vst1q_s16((INT16*)&r_buf[i], r); /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1)); g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1)); g = vshrq_n_s16(g, 3); /* g_buf[i] = MINMAX(g, 0, 255); */ g = vminq_s16(vmaxq_s16(g, zero), max); vst1q_s16((INT16*)&g_buf[i], g); /* (y + HIWORD(cb*28999)) >> 3 */ int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1)); b = vshrq_n_s16(b, 3); /* b_buf[i] = MINMAX(b, 0, 255); */ b = vminq_s16(vmaxq_s16(b, zero), max); vst1q_s16((INT16*)&b_buf[i], b); } y_buf += srcbump; cb_buf += srcbump; cr_buf += srcbump; r_buf += dstbump; g_buf += dstbump; b_buf += dstbump; } return PRIMITIVES_SUCCESS; }