static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src, uint16_t* dst, int len) { int i; const int16x8_t zero = vdupq_n_s16(0); const int16x8_t max = vdupq_n_s16(MAX_Y); uint64x2_t sum = vdupq_n_u64(0); uint64_t diff; for (i = 0; i + 8 <= len; i += 8) { const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i)); const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i)); const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i)); const int16x8_t D = vsubq_s16(A, B); // diff_y const int16x8_t F = vaddq_s16(C, D); // new_y const uint16x8_t H = vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero)); const int16x8_t I = vabsq_s16(D); // abs(diff_y) vst1q_u16(dst + i, H); sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I))); } diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1); for (; i < len; ++i) { const int diff_y = ref[i] - src[i]; const int new_y = (int)(dst[i]) + diff_y; dst[i] = clip_y(new_y); diff += (uint64_t)(abs(diff_y)); } return diff; }
unsigned int vpx_get4x4sse_cs_neon( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride) { int16x4_t d22s16, d24s16, d26s16, d28s16; int64x1_t d0s64; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; int32x4_t q7s32, q8s32, q9s32, q10s32; uint16x8_t q11u16, q12u16, q13u16, q14u16; int64x2_t q1s64; d0u8 = vld1_u8(src_ptr); src_ptr += source_stride; d4u8 = vld1_u8(ref_ptr); ref_ptr += recon_stride; d1u8 = vld1_u8(src_ptr); src_ptr += source_stride; d5u8 = vld1_u8(ref_ptr); ref_ptr += recon_stride; d2u8 = vld1_u8(src_ptr); src_ptr += source_stride; d6u8 = vld1_u8(ref_ptr); ref_ptr += recon_stride; d3u8 = vld1_u8(src_ptr); src_ptr += source_stride; d7u8 = vld1_u8(ref_ptr); ref_ptr += recon_stride; q11u16 = vsubl_u8(d0u8, d4u8); q12u16 = vsubl_u8(d1u8, d5u8); q13u16 = vsubl_u8(d2u8, d6u8); q14u16 = vsubl_u8(d3u8, d7u8); d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); q7s32 = vmull_s16(d22s16, d22s16); q8s32 = vmull_s16(d24s16, d24s16); q9s32 = vmull_s16(d26s16, d26s16); q10s32 = vmull_s16(d28s16, d28s16); q7s32 = vaddq_s32(q7s32, q8s32); q9s32 = vaddq_s32(q9s32, q10s32); q9s32 = vaddq_s32(q7s32, q9s32); q1s64 = vpaddlq_s32(q9s32); d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); }
void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, const int ref_stride, const int height) { int i; uint16x8_t vec_sum_lo = vdupq_n_u16(0); uint16x8_t vec_sum_hi = vdupq_n_u16(0); const int shift_factor = ((height >> 5) + 3) * -1; const int16x8_t vec_shift = vdupq_n_s16(shift_factor); for (i = 0; i < height; i += 8) { const uint8x16_t vec_row1 = vld1q_u8(ref); const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride); const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2); const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3); const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4); const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5); const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6); const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8)); ref += ref_stride * 8; } vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift); vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift); vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo)); hbuf += 8; vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi)); }
void idct_dequant_0_2x_neon( int16_t *q, int16_t dq, unsigned char *dst, int stride) { unsigned char *dst0; int i, a0, a1; int16x8x2_t q2Add; int32x2_t d2s32, d4s32; uint8x8_t d2u8, d4u8; uint16x8_t q1u16, q2u16; a0 = ((q[0] * dq) + 4) >> 3; a1 = ((q[16] * dq) + 4) >> 3; q[0] = q[16] = 0; q2Add.val[0] = vdupq_n_s16((int16_t)a0); q2Add.val[1] = vdupq_n_s16((int16_t)a1); for (i = 0; i < 2; i++, dst += 4) { dst0 = dst; d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0); dst0 += stride; d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1); dst0 += stride; d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0); dst0 += stride; d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1); q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), vreinterpret_u8_s32(d2s32)); q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), vreinterpret_u8_s32(d4s32)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); d2s32 = vreinterpret_s32_u8(d2u8); d4s32 = vreinterpret_s32_u8(d4u8); dst0 = dst; vst1_lane_s32((int32_t *)dst0, d2s32, 0); dst0 += stride; vst1_lane_s32((int32_t *)dst0, d2s32, 1); dst0 += stride; vst1_lane_s32((int32_t *)dst0, d4s32, 0); dst0 += stride; vst1_lane_s32((int32_t *)dst0, d4s32, 1); } return; }
static void inline ff_dct_unquantize_h263_neon(int qscale, int qadd, int nCoeffs, int16_t *block) { int16x8_t q0s16, q2s16, q3s16, q8s16, q10s16, q11s16, q13s16; int16x8_t q14s16, q15s16, qzs16; int16x4_t d0s16, d2s16, d3s16, dzs16; uint16x8_t q1u16, q9u16; uint16x4_t d1u16; dzs16 = vdup_n_s16(0); qzs16 = vdupq_n_s16(0); q15s16 = vdupq_n_s16(qscale << 1); q14s16 = vdupq_n_s16(qadd); q13s16 = vnegq_s16(q14s16); if (nCoeffs > 4) { for (; nCoeffs > 8; nCoeffs -= 16, block += 16) { q0s16 = vld1q_s16(block); q3s16 = vreinterpretq_s16_u16(vcltq_s16(q0s16, qzs16)); q8s16 = vld1q_s16(block + 8); q1u16 = vceqq_s16(q0s16, qzs16); q2s16 = vmulq_s16(q0s16, q15s16); q11s16 = vreinterpretq_s16_u16(vcltq_s16(q8s16, qzs16)); q10s16 = vmulq_s16(q8s16, q15s16); q3s16 = vbslq_s16(vreinterpretq_u16_s16(q3s16), q13s16, q14s16); q11s16 = vbslq_s16(vreinterpretq_u16_s16(q11s16), q13s16, q14s16); q2s16 = vaddq_s16(q2s16, q3s16); q9u16 = vceqq_s16(q8s16, qzs16); q10s16 = vaddq_s16(q10s16, q11s16); q0s16 = vbslq_s16(q1u16, q0s16, q2s16); q8s16 = vbslq_s16(q9u16, q8s16, q10s16); vst1q_s16(block, q0s16); vst1q_s16(block + 8, q8s16); } } if (nCoeffs <= 0) return; d0s16 = vld1_s16(block); d3s16 = vreinterpret_s16_u16(vclt_s16(d0s16, dzs16)); d1u16 = vceq_s16(d0s16, dzs16); d2s16 = vmul_s16(d0s16, vget_high_s16(q15s16)); d3s16 = vbsl_s16(vreinterpret_u16_s16(d3s16), vget_high_s16(q13s16), vget_high_s16(q14s16)); d2s16 = vadd_s16(d2s16, d3s16); d0s16 = vbsl_s16(d1u16, d0s16, d2s16); vst1_s16(block, d0s16); }
void vp9_idct8x8_1_add_neon( int16_t *input, uint8_t *dest, int dest_stride) { uint8x8_t d2u8, d3u8, d30u8, d31u8; uint64x1_t d2u64, d3u64, d4u64, d5u64; uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; int16x8_t q0s16; uint8_t *d1, *d2; int16_t i, a1, cospi_16_64 = 11585; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); a1 = ROUND_POWER_OF_TWO(out, 5); q0s16 = vdupq_n_s16(a1); q0u16 = vreinterpretq_u16_s16(q0s16); d1 = d2 = dest; for (i = 0; i < 2; i++) { d2u64 = vld1_u64((const uint64_t *)d1); d1 += dest_stride; d3u64 = vld1_u64((const uint64_t *)d1); d1 += dest_stride; d4u64 = vld1_u64((const uint64_t *)d1); d1 += dest_stride; d5u64 = vld1_u64((const uint64_t *)d1); d1 += dest_stride; q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8)); d2 += dest_stride; } return; }
static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int w, int h, unsigned int *sse, int *sum) { int i, j; int16x8_t v_sum = vdupq_n_s16(0); int32x4_t v_sse_lo = vdupq_n_s32(0); int32x4_t v_sse_hi = vdupq_n_s32(0); for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 8) { const uint8x8_t v_a = vld1_u8(&a[j]); const uint8x8_t v_b = vld1_u8(&b[j]); const uint16x8_t v_diff = vsubl_u8(v_a, v_b); const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); v_sum = vaddq_s16(v_sum, sv_diff); v_sse_lo = vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff)); v_sse_hi = vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff)); } a += a_stride; b += b_stride; } *sum = horizontal_add_s16x8(v_sum); *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); }
void test_vreinterpretQs16_u16 (void) { int16x8_t out_int16x8_t; uint16x8_t arg0_uint16x8_t; out_int16x8_t = vreinterpretq_s16_u16 (arg0_uint16x8_t); }
void vpx_idct4x4_1_add_neon( int16_t *input, uint8_t *dest, int dest_stride) { uint8x8_t d6u8; uint32x2_t d2u32 = vdup_n_u32(0); uint16x8_t q8u16; int16x8_t q0s16; uint8_t *d1, *d2; int16_t i, a1, cospi_16_64 = 11585; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); a1 = ROUND_POWER_OF_TWO(out, 4); q0s16 = vdupq_n_s16(a1); // dc_only_idct_add d1 = d2 = dest; for (i = 0; i < 2; i++) { d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); d1 += dest_stride; d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); d1 += dest_stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32)); d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); d2 += dest_stride; vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); d2 += dest_stride; } return; }
static inline void char_to_float_vectors(const unsigned char * sourcep, float32x4_t *mp0, float32x4_t * mp1) { uint8x8_t rawpixels; /* source pixels as {[YUYV]0 [YUYV]1} */ int16x8_t widerpixels; /* rawpixels promoted to shorts per component */ int16x4_t high16, low16; int32x4_t high32, low32; const int16x8_t uvbias = {0, 128, 0, 128, 0, 128, 0, 128}; rawpixels = vld1_u8(sourcep); widerpixels = vreinterpretq_s16_u16(vmovl_u8(rawpixels)); /* subtract uvbias from widerpixels */ widerpixels = vsubq_s16(widerpixels, uvbias); /* now take widerpixels apart into (low16, high16) and */ /* then expand those into (low32, high32) */ low16 = vget_low_s16(widerpixels); high16 = vget_high_s16(widerpixels); high32 = vmovl_s16(high16); low32 = vmovl_s16(low16); /* now convert low32 and high32 into floats and store them in */ /* *mp0, *mp1 */ *mp0 = vcvtq_f32_s32(low32); *mp1 = vcvtq_f32_s32(high32); }
void vp9_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int i; uint16x8_t q1u16, q3u16; int16x8_t q1s16; uint8x8_t d0u8 = vdup_n_u8(0); uint32x2_t d2u32 = vdup_n_u32(0); d0u8 = vld1_dup_u8(above - 1); d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); for (i = 0; i < 4; i++, dst += stride) { q1u16 = vdupq_n_u16((uint16_t)left[i]); q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16)); d0u8 = vqmovun_s16(q1s16); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); } }
static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len, const uint16_t* best_y, uint16_t* out) { int i; const int16x8_t max = vdupq_n_s16(MAX_Y); const int16x8_t zero = vdupq_n_s16(0); for (i = 0; i + 8 <= len; i += 8) { const int16x8_t a0 = vld1q_s16(A + i + 0); const int16x8_t a1 = vld1q_s16(A + i + 1); const int16x8_t b0 = vld1q_s16(B + i + 0); const int16x8_t b1 = vld1q_s16(B + i + 1); const int16x8_t a0b1 = vaddq_s16(a0, b1); const int16x8_t a1b0 = vaddq_s16(a1, b0); const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1 const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1) const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0) const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3); const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3); const int16x8_t d0 = vaddq_s16(c1, a0); const int16x8_t d1 = vaddq_s16(c0, a1); const int16x8_t e0 = vrshrq_n_s16(d0, 1); const int16x8_t e1 = vrshrq_n_s16(d1, 1); const int16x8x2_t f = vzipq_s16(e0, e1); const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0)); const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8)); const int16x8_t h0 = vaddq_s16(g0, f.val[0]); const int16x8_t h1 = vaddq_s16(g1, f.val[1]); const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero); const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero); vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0)); vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1)); } for (; i < len; ++i) { const int a0b1 = A[i + 0] + B[i + 1]; const int a1b0 = A[i + 1] + B[i + 0]; const int a0a1b0b1 = a0b1 + a1b0 + 8; const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0); out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1); } }
QT_BEGIN_NAMESPACE static inline int16x8_t qvdiv_255_s16(int16x8_t x, int16x8_t half) { // result = (x + (x >> 8) + 0x80) >> 8 const int16x8_t temp = vshrq_n_s16(x, 8); // x >> 8 const int16x8_t sum_part = vaddq_s16(x, half); // x + 0x80 const int16x8_t sum = vaddq_s16(temp, sum_part); return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(sum), 8)); }
static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest, const int stride, const int16x8_t res) { const uint16x8_t a0 = vld1q_u16(*dest); const uint16x8_t a1 = vld1q_u16(*dest + 8); const uint16x8_t a2 = vld1q_u16(*dest + 16); const uint16x8_t a3 = vld1q_u16(*dest + 24); const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2)); const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3)); const uint16x8_t c0 = vqshluq_n_s16(b0, 0); const uint16x8_t c1 = vqshluq_n_s16(b1, 0); const uint16x8_t c2 = vqshluq_n_s16(b2, 0); const uint16x8_t c3 = vqshluq_n_s16(b3, 0); vst1q_u16(*dest, c0); vst1q_u16(*dest + 8, c1); vst1q_u16(*dest + 16, c2); vst1q_u16(*dest + 24, c3); *dest += stride; }
static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest, const int stride, const int16x8_t res, const int16x8_t max) { const uint16x8_t a0 = vld1q_u16(*dest); const uint16x8_t a1 = vld1q_u16(*dest + 8); const uint16x8_t a2 = vld1q_u16(*dest + 16); const uint16x8_t a3 = vld1q_u16(*dest + 24); const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2)); const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3)); const int16x8_t c0 = vminq_s16(b0, max); const int16x8_t c1 = vminq_s16(b1, max); const int16x8_t c2 = vminq_s16(b2, max); const int16x8_t c3 = vminq_s16(b3, max); vst1q_u16(*dest, vreinterpretq_u16_s16(c0)); vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1)); vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2)); vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3)); *dest += stride; }
static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride, const int16x8_t res, uint32x2_t *const d) { uint16x8_t a; uint8x8_t b; *d = vld1_lane_u32((const uint32_t *)*dest, *d, 0); *d = vld1_lane_u32((const uint32_t *)(*dest + stride), *d, 1); a = vaddw_u8(vreinterpretq_u16_s16(res), vreinterpret_u8_u32(*d)); b = vqmovun_s16(vreinterpretq_s16_u16(a)); vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 0); *dest += stride; vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 1); *dest += stride; }
// res is in reverse row order static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, const int stride, const int16x8_t res, const int16x8_t max) { const uint16x4_t a0 = vld1_u16(*dest); const uint16x4_t a1 = vld1_u16(*dest + stride); const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a1, a0)); // Note: In some profile tests, res is quite close to +/-32767. // We use saturating addition. const int16x8_t b = vqaddq_s16(res, a); const int16x8_t c = vminq_s16(b, max); const uint16x8_t d = vqshluq_n_s16(c, 0); vst1_u16(*dest, vget_high_u16(d)); *dest += stride; vst1_u16(*dest, vget_low_u16(d)); *dest += stride; }
static INLINE void scaledconvolve_vert_w4( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const InterpKernel *const y_filters, const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; src -= src_stride * (SUBPEL_TAPS / 2 - 1); y = h; do { const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; if (y_q4 & SUBPEL_MASK) { const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t s[8], d; int16x4_t t[8], tt; load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0]))); t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1]))); t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2]))); t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3]))); t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4]))); t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5]))); t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6]))); t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7]))); tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters, filter3, filter4); d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); } else { memcpy(dst, &src_y[3 * src_stride], w); } dst += dst_stride; y_q4 += y_step_q4; } while (--y); }
void vp9_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int j; uint16x8_t q0u16, q3u16, q10u16; int16x8_t q0s16; uint16x4_t d20u16; uint8x8_t d0u8, d2u8, d30u8; d0u8 = vld1_dup_u8(above - 1); d30u8 = vld1_u8(left); d2u8 = vld1_u8(above); q10u16 = vmovl_u8(d30u8); q3u16 = vsubl_u8(d2u8, d0u8); d20u16 = vget_low_u16(q10u16); for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { q0u16 = vdupq_lane_u16(d20u16, 0); q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); d0u8 = vqmovun_s16(q0s16); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); dst += stride; q0u16 = vdupq_lane_u16(d20u16, 1); q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); d0u8 = vqmovun_s16(q0s16); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); dst += stride; q0u16 = vdupq_lane_u16(d20u16, 2); q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); d0u8 = vqmovun_s16(q0s16); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); dst += stride; q0u16 = vdupq_lane_u16(d20u16, 3); q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); d0u8 = vqmovun_s16(q0s16); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); dst += stride; } }
void ne10_img_hresize_4channels_linear_neon (const unsigned char** src, int** dst, int count, const int* xofs, const short* alpha, int swidth, int dwidth, int cn, int xmin, int xmax) { int dx, k; int dx0 = 0; int16x4x2_t alpha_vec; uint8x8_t dS0_vec, dS1_vec; int16x8_t qS0_vec, qS1_vec; int16x4_t dS0_0123, dS0_4567, dS1_0123, dS1_4567; int32x4_t qT0_vec, qT1_vec; int16x4_t dCoeff; dCoeff = vdup_n_s16 (INTER_RESIZE_COEF_SCALE); for (k = 0; k <= count - 2; k++) { const unsigned char *S0 = src[k], *S1 = src[k + 1]; int *D0 = dst[k], *D1 = dst[k + 1]; for (dx = dx0; dx < xmax; dx += 4) { int sx = xofs[dx]; alpha_vec = vld2_s16 (&alpha[dx * 2]); dS0_vec = vld1_u8 (&S0[sx]); dS1_vec = vld1_u8 (&S1[sx]); qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec)); qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec)); dS0_0123 = vget_low_s16 (qS0_vec); dS0_4567 = vget_high_s16 (qS0_vec); dS1_0123 = vget_low_s16 (qS1_vec); dS1_4567 = vget_high_s16 (qS1_vec); qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]); qT1_vec = vmull_s16 (dS1_0123, alpha_vec.val[0]); qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]); qT1_vec = vmlal_s16 (qT1_vec, dS1_4567, alpha_vec.val[1]); vst1q_s32 (&D0[dx], qT0_vec); vst1q_s32 (&D1[dx], qT1_vec); } for (; dx < dwidth; dx += 4) { int sx = xofs[dx]; dS0_vec = vld1_u8 (&S0[sx]); dS1_vec = vld1_u8 (&S1[sx]); qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec)); qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec)); dS0_0123 = vget_low_s16 (qS0_vec); dS1_0123 = vget_low_s16 (qS1_vec); qT0_vec = vmull_s16 (dS0_0123, dCoeff); qT1_vec = vmull_s16 (dS1_0123, dCoeff); vst1q_s32 (&D0[dx], qT0_vec); vst1q_s32 (&D1[dx], qT1_vec); } } for (; k < count; k++) { const unsigned char *S = src[k]; int *D = dst[k]; for (dx = 0; dx < xmax; dx += 4) { int sx = xofs[dx]; alpha_vec = vld2_s16 (&alpha[dx * 2]); dS0_vec = vld1_u8 (&S[sx]); qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec)); dS0_0123 = vget_low_s16 (qS0_vec); dS0_4567 = vget_high_s16 (qS0_vec); qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]); qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]); vst1q_s32 (&D[dx], qT0_vec); } for (; dx < dwidth; dx += 4) { int sx = xofs[dx]; dS0_vec = vld1_u8 (&S[sx]); qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec)); dS0_0123 = vget_low_s16 (qS0_vec); qT0_vec = vmull_s16 (dS0_0123, dCoeff); vst1q_s32 (&D[dx], qT0_vec); } } }
void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride) { int i; uint32x2_t d6u32 = vdup_n_u32(0); uint8x8_t d1u8; int16x4_t d2, d3, d4, d5, d10, d11, d12, d13; uint16x8_t q1u16; int16x8_t q1s16, q2s16, q3s16, q4s16; int32x2x2_t v2tmp0, v2tmp1; int16x4x2_t v2tmp2, v2tmp3; d2 = vld1_s16(input); d3 = vld1_s16(input + 4); d4 = vld1_s16(input + 8); d5 = vld1_s16(input + 12); // 1st for loop q1s16 = vcombine_s16(d2, d4); // Swap d3 d4 here q2s16 = vcombine_s16(d3, d5); q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2); q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1); d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 q3s16 = vshrq_n_s16(q3s16, 1); q4s16 = vshrq_n_s16(q4s16, 1); q3s16 = vqaddq_s16(q3s16, q2s16); q4s16 = vqaddq_s16(q4s16, q2s16); d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1 d2 = vqadd_s16(d12, d11); d3 = vqadd_s16(d13, d10); d4 = vqsub_s16(d13, d10); d5 = vqsub_s16(d12, d11); v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]), vreinterpret_s16_s32(v2tmp1.val[0])); v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]), vreinterpret_s16_s32(v2tmp1.val[1])); // 2nd for loop q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]); q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]); q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2); q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1); d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 q3s16 = vshrq_n_s16(q3s16, 1); q4s16 = vshrq_n_s16(q4s16, 1); q3s16 = vqaddq_s16(q3s16, q2s16); q4s16 = vqaddq_s16(q4s16, q2s16); d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1 d2 = vqadd_s16(d12, d11); d3 = vqadd_s16(d13, d10); d4 = vqsub_s16(d13, d10); d5 = vqsub_s16(d12, d11); d2 = vrshr_n_s16(d2, 3); d3 = vrshr_n_s16(d3, 3); d4 = vrshr_n_s16(d4, 3); d5 = vrshr_n_s16(d5, 3); v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]), vreinterpret_s16_s32(v2tmp1.val[0])); v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]), vreinterpret_s16_s32(v2tmp1.val[1])); q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]); q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]); // dc_only_idct_add for (i = 0; i < 2; i++, q1s16 = q2s16) { d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0); pred_ptr += pred_stride; d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1); pred_ptr += pred_stride; q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16), vreinterpret_u8_u32(d6u32)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0); dst_ptr += dst_stride; vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1); dst_ptr += dst_stride; } return; }
void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { uint8x8_t d26u8, d27u8; uint32x2_t d26u32, d27u32; uint16x8_t q8u16, q9u16; int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16; int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16; int16x8_t q8s16, q9s16, q13s16, q14s16; int32x4_t q1s32, q13s32, q14s32, q15s32; int16x4x2_t d0x2s16, d1x2s16; int32x4x2_t q0x2s32; uint8_t *d; d26u32 = d27u32 = vdup_n_u32(0); q8s16 = vld1q_s16(input); q9s16 = vld1q_s16(input + 8); d16s16 = vget_low_s16(q8s16); d17s16 = vget_high_s16(q8s16); d18s16 = vget_low_s16(q9s16); d19s16 = vget_high_s16(q9s16); d0x2s16 = vtrn_s16(d16s16, d17s16); d1x2s16 = vtrn_s16(d18s16, d19s16); q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); d20s16 = vdup_n_s16((int16_t)cospi_8_64); d21s16 = vdup_n_s16((int16_t)cospi_16_64); q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); d22s16 = vdup_n_s16((int16_t)cospi_24_64); // stage 1 d23s16 = vadd_s16(d16s16, d18s16); d24s16 = vsub_s16(d16s16, d18s16); q15s32 = vmull_s16(d17s16, d22s16); q1s32 = vmull_s16(d17s16, d20s16); q13s32 = vmull_s16(d23s16, d21s16); q14s32 = vmull_s16(d24s16, d21s16); q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); q1s32 = vmlal_s16(q1s32, d19s16, d22s16); d26s16 = vqrshrn_n_s32(q13s32, 14); d27s16 = vqrshrn_n_s32(q14s32, 14); d29s16 = vqrshrn_n_s32(q15s32, 14); d28s16 = vqrshrn_n_s32(q1s32, 14); q13s16 = vcombine_s16(d26s16, d27s16); q14s16 = vcombine_s16(d28s16, d29s16); // stage 2 q8s16 = vaddq_s16(q13s16, q14s16); q9s16 = vsubq_s16(q13s16, q14s16); d16s16 = vget_low_s16(q8s16); d17s16 = vget_high_s16(q8s16); d18s16 = vget_high_s16(q9s16); // vswp d18 d19 d19s16 = vget_low_s16(q9s16); d0x2s16 = vtrn_s16(d16s16, d17s16); d1x2s16 = vtrn_s16(d18s16, d19s16); q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); // do the transform on columns // stage 1 d23s16 = vadd_s16(d16s16, d18s16); d24s16 = vsub_s16(d16s16, d18s16); q15s32 = vmull_s16(d17s16, d22s16); q1s32 = vmull_s16(d17s16, d20s16); q13s32 = vmull_s16(d23s16, d21s16); q14s32 = vmull_s16(d24s16, d21s16); q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); q1s32 = vmlal_s16(q1s32, d19s16, d22s16); d26s16 = vqrshrn_n_s32(q13s32, 14); d27s16 = vqrshrn_n_s32(q14s32, 14); d29s16 = vqrshrn_n_s32(q15s32, 14); d28s16 = vqrshrn_n_s32(q1s32, 14); q13s16 = vcombine_s16(d26s16, d27s16); q14s16 = vcombine_s16(d28s16, d29s16); // stage 2 q8s16 = vaddq_s16(q13s16, q14s16); q9s16 = vsubq_s16(q13s16, q14s16); q8s16 = vrshrq_n_s16(q8s16, 4); q9s16 = vrshrq_n_s16(q9s16, 4); d = dest; d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); d += dest_stride; d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1); d += dest_stride; d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); d += dest_stride; d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d = dest; vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0); d += dest_stride; vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); d += dest_stride; vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); d += dest_stride; vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); return; }
static INLINE void scaledconvolve_horiz_w4( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const InterpKernel *const x_filters, const int x0_q4, const int x_step_q4, const int w, const int h) { DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); int x, y, z; src -= SUBPEL_TAPS / 2 - 1; y = h; do { int x_q4 = x0_q4; x = 0; do { // process 4 src_x steps for (z = 0; z < 4; ++z) { const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; if (x_q4 & SUBPEL_MASK) { const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t s[8], d; int16x8_t ss[4]; int16x4_t t[8], tt; load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]); transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]); ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); t[0] = vget_low_s16(ss[0]); t[1] = vget_low_s16(ss[1]); t[2] = vget_low_s16(ss[2]); t[3] = vget_low_s16(ss[3]); t[4] = vget_high_s16(ss[0]); t[5] = vget_high_s16(ss[1]); t[6] = vget_high_s16(ss[2]); t[7] = vget_high_s16(ss[3]); tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters, filter3, filter4); d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0); } else { int i; for (i = 0; i < 4; ++i) { temp[z * 4 + i] = src_x[i * src_stride + 3]; } } x_q4 += x_step_q4; } // transpose the 4x4 filters values back to dst { const uint8x8x4_t d4 = vld4_u8(temp); vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride], vreinterpret_u32_u8(d4.val[0]), 0); vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride], vreinterpret_u32_u8(d4.val[1]), 0); vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride], vreinterpret_u32_u8(d4.val[2]), 0); vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride], vreinterpret_u32_u8(d4.val[3]), 0); } x += 4; } while (x < w); src += src_stride * 4; dst += dst_stride * 4; y -= 4; } while (y > 0); }
void UpsampleRgbaLinePairNEON(const uint8_t *top_y, const uint8_t *bottom_y, const uint8_t *top_u, const uint8_t *top_v, const uint8_t *cur_u, const uint8_t *cur_v, uint8_t *top_dst, uint8_t *bottom_dst, int len) { int block; uint8_t uv_buf[2 * 32 + 15]; uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); const int uv_len = (len + 1) >> 1; const int num_blocks = (uv_len - 1) >> 3; const int leftover = uv_len - num_blocks * 8; const int last_pos = 1 + 16 * num_blocks; const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; const int16x4_t cf16 = vld1_s16(coef); const int32x2_t cf32 = vmov_n_s32(76283); const uint8x8_t u16 = vmov_n_u8(16); const uint8x8_t u128 = vmov_n_u8(128); for (block = 0; block < num_blocks; ++block) { { uint8x8_t a = vld1_u8(top_u); uint8x8_t b = vld1_u8(top_u + 1); uint8x8_t c = vld1_u8(cur_u); uint8x8_t d = vld1_u8(cur_u + 1); uint16x8_t al = vshll_n_u8(a, 1); uint16x8_t bl = vshll_n_u8(b, 1); uint16x8_t cl = vshll_n_u8(c, 1); uint16x8_t dl = vshll_n_u8(d, 1); uint8x8_t diag1, diag2; uint16x8_t sl; sl = vaddl_u8(a, b); sl = vaddw_u8(sl, c); sl = vaddw_u8(sl, d); al = vaddq_u16(sl, al); bl = vaddq_u16(sl, bl); al = vaddq_u16(al, dl); bl = vaddq_u16(bl, cl); diag2 = vshrn_n_u16(al, 3); diag1 = vshrn_n_u16(bl, 3); a = vrhadd_u8(a, diag1); b = vrhadd_u8(b, diag2); c = vrhadd_u8(c, diag2); d = vrhadd_u8(d, diag1); { const uint8x8x2_t a_b = {{ a, b }}; const uint8x8x2_t c_d = {{ c, d }}; vst2_u8(r_uv, a_b); vst2_u8(r_uv + 32, c_d); } } { uint8x8_t a = vld1_u8(top_v); uint8x8_t b = vld1_u8(top_v + 1); uint8x8_t c = vld1_u8(cur_v); uint8x8_t d = vld1_u8(cur_v + 1); uint16x8_t al = vshll_n_u8(a, 1); uint16x8_t bl = vshll_n_u8(b, 1); uint16x8_t cl = vshll_n_u8(c, 1); uint16x8_t dl = vshll_n_u8(d, 1); uint8x8_t diag1, diag2; uint16x8_t sl; sl = vaddl_u8(a, b); sl = vaddw_u8(sl, c); sl = vaddw_u8(sl, d); al = vaddq_u16(sl, al); bl = vaddq_u16(sl, bl); al = vaddq_u16(al, dl); bl = vaddq_u16(bl, cl); diag2 = vshrn_n_u16(al, 3); diag1 = vshrn_n_u16(bl, 3); a = vrhadd_u8(a, diag1); b = vrhadd_u8(b, diag2); c = vrhadd_u8(c, diag2); d = vrhadd_u8(d, diag1); { const uint8x8x2_t a_b = {{ a, b }}; const uint8x8x2_t c_d = {{ c, d }}; vst2_u8(r_uv + 16, a_b); vst2_u8(r_uv + 16 + 32, c_d); } } { if (top_y) { { int i; for (i = 0; i < 16; i += 8) { int off = ((16 * block + 1) + i) * 4; uint8x8_t y = vld1_u8(top_y + (16 * block + 1) + i); uint8x8_t u = vld1_u8((r_uv) + i); uint8x8_t v = vld1_u8((r_uv) + i + 16); int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16)); int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128)); int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128)); int16x8_t ud = vshlq_n_s16(uu, 1); int16x8_t vd = vshlq_n_s16(vv, 1); int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), vget_low_s16(vd), cf16, 0); int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), vget_high_s16(vd), cf16, 0); int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16)); int32x4_t vl = vmovl_s16(vget_low_s16(vv)); int32x4_t vh = vmovl_s16(vget_high_s16(vv)); int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1); int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1); int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2); int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2); int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16)); int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), vget_low_s16(ud), cf16, 3); int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), vget_high_s16(ud), cf16, 3); int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16)); int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr)); int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr)); int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc)); int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc)); int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub)); int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub)); rl = vmulq_lane_s32(rl, cf32, 0); rh = vmulq_lane_s32(rh, cf32, 0); gl = vmulq_lane_s32(gl, cf32, 0); gh = vmulq_lane_s32(gh, cf32, 0); bl = vmulq_lane_s32(bl, cf32, 0); bh = vmulq_lane_s32(bh, cf32, 0); y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16))); u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16))); v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16))); do { const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }}; vst4_u8(top_dst + off, r_g_b_v255); } while (0); } } } if (bottom_y) { { int i; for (i = 0; i < 16; i += 8) { int off = ((16 * block + 1) + i) * 4; uint8x8_t y = vld1_u8(bottom_y + (16 * block + 1) + i); uint8x8_t u = vld1_u8(((r_uv) + 32) + i); uint8x8_t v = vld1_u8(((r_uv) + 32) + i + 16); int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16)); int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128)); int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128)); int16x8_t ud = vshlq_n_s16(uu, 1); int16x8_t vd = vshlq_n_s16(vv, 1); int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), vget_low_s16(vd), cf16, 0); int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), vget_high_s16(vd), cf16, 0); int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16)); int32x4_t vl = vmovl_s16(vget_low_s16(vv)); int32x4_t vh = vmovl_s16(vget_high_s16(vv)); int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1); int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1); int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2); int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2); int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16)); int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), vget_low_s16(ud), cf16, 3); int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), vget_high_s16(ud), cf16, 3); int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16)); int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr)); int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr)); int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc)); int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc)); int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub)); int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub)); rl = vmulq_lane_s32(rl, cf32, 0); rh = vmulq_lane_s32(rh, cf32, 0); gl = vmulq_lane_s32(gl, cf32, 0); gh = vmulq_lane_s32(gh, cf32, 0); bl = vmulq_lane_s32(bl, cf32, 0); bh = vmulq_lane_s32(bh, cf32, 0); y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16))); u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16))); v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16))); do { const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }}; vst4_u8(bottom_dst + off, r_g_b_v255); } while (0); } } } } } }
/* * Notice: * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST * numbers of DD bits */ static inline uint16_t _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts, uint8_t *split_packet) { volatile union i40e_rx_desc *rxdp; struct i40e_rx_entry *sw_ring; uint16_t nb_pkts_recd; int pos; uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl; /* mask to shuffle from desc. to mbuf */ uint8x16_t shuf_msk = { 0xFF, 0xFF, /* pkt_type set as unknown */ 0xFF, 0xFF, /* pkt_type set as unknown */ 14, 15, /* octet 15~14, low 16 bits pkt_len */ 0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */ 14, 15, /* octet 15~14, 16 bits data_len */ 2, 3, /* octet 2~3, low 16 bits vlan_macip */ 4, 5, 6, 7 /* octet 4~7, 32bits rss */ }; uint8x16_t eop_check = { 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; uint16x8_t crc_adjust = { 0, 0, /* ignore pkt_type field */ rxq->crc_len, /* sub crc on pkt_len */ 0, /* ignore high-16bits of pkt_len */ rxq->crc_len, /* sub crc on data_len */ 0, 0, 0 /* ignore non-length fields */ }; /* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */ nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST); /* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP); /* Just the act of getting into the function from the application is * going to cost about 7 cycles */ rxdp = rxq->rx_ring + rxq->rx_tail; rte_prefetch_non_temporal(rxdp); /* See if we need to rearm the RX queue - gives the prefetch a bit * of time to act */ if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) i40e_rxq_rearm(rxq); /* Before we start moving massive data around, check to see if * there is actually a packet available */ if (!(rxdp->wb.qword1.status_error_len & rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT))) return 0; /* Cache is empty -> need to scan the buffer rings, but first move * the next 'n' mbufs into the cache */ sw_ring = &rxq->sw_ring[rxq->rx_tail]; /* A. load 4 packet in one loop * [A*. mask out 4 unused dirty field in desc] * B. copy 4 mbuf point from swring to rx_pkts * C. calc the number of DD bits among the 4 packets * [C*. extract the end-of-packet bit, if requested] * D. fill info. from desc to mbuf */ for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts; pos += RTE_I40E_DESCS_PER_LOOP, rxdp += RTE_I40E_DESCS_PER_LOOP) { uint64x2_t descs[RTE_I40E_DESCS_PER_LOOP]; uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4; uint16x8x2_t sterr_tmp1, sterr_tmp2; uint64x2_t mbp1, mbp2; uint16x8_t staterr; uint16x8_t tmp; uint64_t stat; int32x4_t len_shl = {0, 0, 0, PKTLEN_SHIFT}; /* B.1 load 1 mbuf point */ mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]); /* Read desc statuses backwards to avoid race condition */ /* A.1 load 4 pkts desc */ descs[3] = vld1q_u64((uint64_t *)(rxdp + 3)); rte_rmb(); /* B.2 copy 2 mbuf point into rx_pkts */ vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1); /* B.1 load 1 mbuf point */ mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]); descs[2] = vld1q_u64((uint64_t *)(rxdp + 2)); /* B.1 load 2 mbuf point */ descs[1] = vld1q_u64((uint64_t *)(rxdp + 1)); descs[0] = vld1q_u64((uint64_t *)(rxdp)); /* B.2 copy 2 mbuf point into rx_pkts */ vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2); if (split_packet) { rte_mbuf_prefetch_part2(rx_pkts[pos]); rte_mbuf_prefetch_part2(rx_pkts[pos + 1]); rte_mbuf_prefetch_part2(rx_pkts[pos + 2]); rte_mbuf_prefetch_part2(rx_pkts[pos + 3]); } /* avoid compiler reorder optimization */ rte_compiler_barrier(); /* pkt 3,4 shift the pktlen field to be 16-bit aligned*/ uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]), len_shl); descs[3] = vreinterpretq_u64_u32(len3); uint32x4_t len2 = vshlq_u32(vreinterpretq_u32_u64(descs[2]), len_shl); descs[2] = vreinterpretq_u64_u32(len2); /* D.1 pkt 3,4 convert format from desc to pktmbuf */ pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk); pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk); /* C.1 4=>2 filter staterr info only */ sterr_tmp2 = vzipq_u16(vreinterpretq_u16_u64(descs[1]), vreinterpretq_u16_u64(descs[3])); /* C.1 4=>2 filter staterr info only */ sterr_tmp1 = vzipq_u16(vreinterpretq_u16_u64(descs[0]), vreinterpretq_u16_u64(descs[2])); /* C.2 get 4 pkts staterr value */ staterr = vzipq_u16(sterr_tmp1.val[1], sterr_tmp2.val[1]).val[0]; desc_to_olflags_v(rxq, descs, &rx_pkts[pos]); /* D.2 pkt 3,4 set in_port/nb_seg and remove crc */ tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust); pkt_mb4 = vreinterpretq_u8_u16(tmp); tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust); pkt_mb3 = vreinterpretq_u8_u16(tmp); /* pkt 1,2 shift the pktlen field to be 16-bit aligned*/ uint32x4_t len1 = vshlq_u32(vreinterpretq_u32_u64(descs[1]), len_shl); descs[1] = vreinterpretq_u64_u32(len1); uint32x4_t len0 = vshlq_u32(vreinterpretq_u32_u64(descs[0]), len_shl); descs[0] = vreinterpretq_u64_u32(len0); /* D.1 pkt 1,2 convert format from desc to pktmbuf */ pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk); pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk); /* D.3 copy final 3,4 data to rx_pkts */ vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1, pkt_mb4); vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1, pkt_mb3); /* D.2 pkt 1,2 set in_port/nb_seg and remove crc */ tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust); pkt_mb2 = vreinterpretq_u8_u16(tmp); tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust); pkt_mb1 = vreinterpretq_u8_u16(tmp); /* C* extract and record EOP bit */ if (split_packet) { uint8x16_t eop_shuf_mask = { 0x00, 0x02, 0x04, 0x06, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}; uint8x16_t eop_bits; /* and with mask to extract bits, flipping 1-0 */ eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr)); eop_bits = vandq_u8(eop_bits, eop_check); /* the staterr values are not in order, as the count * count of dd bits doesn't care. However, for end of * packet tracking, we do care, so shuffle. This also * compresses the 32-bit values to 8-bit */ eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask); /* store the resulting 32-bit value */ vst1q_lane_u32((uint32_t *)split_packet, vreinterpretq_u32_u8(eop_bits), 0); split_packet += RTE_I40E_DESCS_PER_LOOP; /* zero-out next pointers */ rx_pkts[pos]->next = NULL; rx_pkts[pos + 1]->next = NULL; rx_pkts[pos + 2]->next = NULL; rx_pkts[pos + 3]->next = NULL; } staterr = vshlq_n_u16(staterr, I40E_UINT16_BIT - 1); staterr = vreinterpretq_u16_s16( vshrq_n_s16(vreinterpretq_s16_u16(staterr), I40E_UINT16_BIT - 1)); stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0); rte_prefetch_non_temporal(rxdp + RTE_I40E_DESCS_PER_LOOP); /* D.3 copy final 1,2 data to rx_pkts */ vst1q_u8((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1, pkt_mb2); vst1q_u8((void *)&rx_pkts[pos]->rx_descriptor_fields1, pkt_mb1); desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl); /* C.4 calc avaialbe number of desc */ if (unlikely(stat == 0)) { nb_pkts_recd += RTE_I40E_DESCS_PER_LOOP; } else { nb_pkts_recd += __builtin_ctzl(stat) / I40E_UINT16_BIT; break; } } /* Update our internal tail pointer */ rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd); rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1)); rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd); return nb_pkts_recd; }
void av1_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, const TxfmParam *txfm_param) { int i; uint8_t *d1, *d2; uint8x8_t d0u8, d1u8, d2u8, d3u8; uint64x1_t d0u64, d1u64, d2u64, d3u64; int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; uint16x8_t q8u16, q9u16, q10u16, q11u16; q8s16 = vld1q_s16(input); q9s16 = vld1q_s16(input + 8); q10s16 = vld1q_s16(input + 8 * 2); q11s16 = vld1q_s16(input + 8 * 3); q12s16 = vld1q_s16(input + 8 * 4); q13s16 = vld1q_s16(input + 8 * 5); q14s16 = vld1q_s16(input + 8 * 6); q15s16 = vld1q_s16(input + 8 * 7); TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); const TX_TYPE tx_type = txfm_param->tx_type; switch (tx_type) { case DCT_DCT: // idct_idct is not supported. Fall back to C av1_iht8x8_64_add_c(input, dest, dest_stride, txfm_param); return; break; case ADST_DCT: // iadst_idct // generate IDCT constants // GENERATE_IDCT_CONSTANTS // first transform rows IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); // transpose the matrix TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); // generate IADST constants // GENERATE_IADST_CONSTANTS // then transform columns IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); break; case DCT_ADST: // idct_iadst // generate IADST constants // GENERATE_IADST_CONSTANTS // first transform rows IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); // transpose the matrix TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); // generate IDCT constants // GENERATE_IDCT_CONSTANTS // then transform columns IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); break; case ADST_ADST: // iadst_iadst // generate IADST constants // GENERATE_IADST_CONSTANTS // first transform rows IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); // transpose the matrix TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); // then transform columns IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); break; default: // iadst_idct assert(0); break; } q8s16 = vrshrq_n_s16(q8s16, 5); q9s16 = vrshrq_n_s16(q9s16, 5); q10s16 = vrshrq_n_s16(q10s16, 5); q11s16 = vrshrq_n_s16(q11s16, 5); q12s16 = vrshrq_n_s16(q12s16, 5); q13s16 = vrshrq_n_s16(q13s16, 5); q14s16 = vrshrq_n_s16(q14s16, 5); q15s16 = vrshrq_n_s16(q15s16, 5); for (d1 = d2 = dest, i = 0; i < 2; i++) { if (i != 0) { q8s16 = q12s16; q9s16 = q13s16; q10s16 = q14s16; q11s16 = q15s16; } d0u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d1u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d2u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d3u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); d2 += dest_stride; } return; }
void vpx_idct8x8_64_add_neon( int16_t *input, uint8_t *dest, int dest_stride) { uint8_t *d1, *d2; uint8x8_t d0u8, d1u8, d2u8, d3u8; uint64x1_t d0u64, d1u64, d2u64, d3u64; int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; uint16x8_t q8u16, q9u16, q10u16, q11u16; q8s16 = vld1q_s16(input); q9s16 = vld1q_s16(input + 8); q10s16 = vld1q_s16(input + 16); q11s16 = vld1q_s16(input + 24); q12s16 = vld1q_s16(input + 32); q13s16 = vld1q_s16(input + 40); q14s16 = vld1q_s16(input + 48); q15s16 = vld1q_s16(input + 56); TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); q8s16 = vrshrq_n_s16(q8s16, 5); q9s16 = vrshrq_n_s16(q9s16, 5); q10s16 = vrshrq_n_s16(q10s16, 5); q11s16 = vrshrq_n_s16(q11s16, 5); q12s16 = vrshrq_n_s16(q12s16, 5); q13s16 = vrshrq_n_s16(q13s16, 5); q14s16 = vrshrq_n_s16(q14s16, 5); q15s16 = vrshrq_n_s16(q15s16, 5); d1 = d2 = dest; d0u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d1u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d2u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d3u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); d2 += dest_stride; q8s16 = q12s16; q9s16 = q13s16; q10s16 = q14s16; q11s16 = q15s16; d0u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d1u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d2u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d3u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); d2 += dest_stride; return; }
void yuv422rgb_neon_int(const unsigned char * sourcep, int source_byte_count, unsigned char * destp) { const unsigned char *source_endp; const unsigned char *vector_endp; int remainder; const int16x8_t u_coeff = {0, -22, 113, 0, 0, -22, 113, 0}; const int16x8_t v_coeff = {90, -46, 0, 0, 90, -46, 0, 0}; const uint8x8_t zeroalpha = {0x0, 0x0, 0x0, 0xFF, 0x0, 0x0, 0x0, 0xFF}; const int16x8_t uvbias = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; int16x8_t mp0_rgba; /* macropixel 0's resulting RGBA RGBA pixels */ int16x8_t mp1_rgba; /* macropixel 1's resulting RGBA RGBA pixels */ uint8x8_t rawpixels; /* source pixels as {[YUYV]0 [YUYV]1} */ uint8x8_t rgba0, rgba1; /* rgba values as bytes */ uint8x16_t bothrgba; uint8_t * destinationp; /* pointer into output buffer destp */ int16x8_t widerpixels; /* rawpixels promoted to shorts per component */ const uint8x8_t yselect = {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}; /* we're working with things in 4-byte macropixels */ remainder = source_byte_count % 4; source_endp = sourcep + source_byte_count; vector_endp = source_endp - remainder; destinationp = (uint8_t *)destp; while (sourcep < vector_endp) { /* pull YUYV from 2 four byte macropixels starting at sourcep. */ /* we'll increment sourcep as we go to save the array dereference */ /* and separate increment instruction at the end of the loop */ /* load rawpixels with {[YUYV]0 [YUYV]1 } with byte components */ rawpixels = vld1_u8(sourcep); sourcep += sizeof(rawpixels); widerpixels = vreinterpretq_s16_u16(vmovl_u8(rawpixels)); /* ---------- process macropixel 0 --------------- */ /* take macropixel zero ([YUYV]0) from rawpixels and */ /* compute the two RGBA pixels that come from it. store */ /* those two pixels in mp0_rgba */ { int16x8_t wider_yalpha; int16x8_t u_vec, v_vec, uv_vec; uint8x8_t narrow_yalpha; uint8x8_t y0_vec, y1_vec; int16x4_t yuyv; /* narrow_yalpha is drawn from [YUYV]0 and formed into */ /* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha} */ /* this would have been a nice place for vtbx1_u8, but i */ /* can't get it to work. so i'll have to use vbsl_u8 instead. */ y0_vec = vdup_lane_u8(rawpixels, MP0_Y0); y1_vec = vdup_lane_u8(rawpixels, MP0_Y1); narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec); /* store ALPHA in elements 3 and 7 (after the RGB components) */ narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 3); narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 7); /* use vmovl_u8 to go from being unsigned 8-bit to */ /* unsigned 16-bit, the use vreinterpretq_s16_u16 to */ /* change interpretation from unsigned 16-bit to signed */ /* 16-bit. */ wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha)); yuyv = vget_low_s16(widerpixels); /* form a vector of the U component from MP0 */ u_vec = vdupq_lane_s16(yuyv, MP0_U); /* subtract uvbias from u_vec */ u_vec = vsubq_s16(u_vec, uvbias); /* form a vector of the V component from MP0 */ v_vec = vdupq_lane_s16(yuyv, MP0_V); /* subtract uvbias from v_vec */ v_vec = vsubq_s16(v_vec, uvbias); /* Multiply eight 16-bit values in u_vec by eight 16-bit */ /* values in u_coeff and store the results in u_vec. */ u_vec = vmulq_s16(u_vec, u_coeff); /* likewise multiply eight 16-bit values in v_vec by */ /* v_coeff and store the results in v_vec */ v_vec = vmulq_s16(v_vec, v_coeff); /* form uv_vec as the sum of u_vec & v_vec, then shift 6 places */ /* (dividing by 64) */ uv_vec = vaddq_s16(u_vec, v_vec); uv_vec = vshrq_n_s16(uv_vec, 6); /* now mp0_rgba = y_vec + u_vec + v_vec */ mp0_rgba = vaddq_s16(wider_yalpha, uv_vec); } /* ---------- process macropixel 1 --------------- */ /* take macropixel one ([YUYV]1) from rawpixels and */ /* compute the two RGBA pixels that come from it. store */ /* those two pixels in mp1_rgba */ { int16x8_t wider_yalpha; int16x8_t u_vec, v_vec, uv_vec; uint8x8_t narrow_yalpha; uint8x8_t y0_vec, y1_vec; int16x4_t yuyv; /* narrow_yalpha is drawn from [YUYV]1 and formed into */ /* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha} */ /* this would have been a nice place for vtbx1_u8, but i */ /* can't get it to work. so i'll have to use vbsl_u8 instead. */ y0_vec = vdup_lane_u8(rawpixels, MP1_Y0); y1_vec = vdup_lane_u8(rawpixels, MP1_Y1); narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec); narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 3); narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 7); /* use vmovl_u8 to go from being unsigned 8-bit to */ /* unsigned 16-bit, the use vreinterpretq_s16_u16 to */ wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha)); yuyv = vget_high_s16(widerpixels); u_vec = vdupq_lane_s16(yuyv, 1); u_vec = vsubq_s16(u_vec, uvbias); v_vec = vdupq_lane_s16(yuyv, 3); v_vec = vsubq_s16(v_vec, uvbias); /* Multiply eight 16-bit values in u_vec by eight 16-bit */ /* values in u_coeff and store the results in u_vec. */ u_vec = vmulq_s16(u_vec, u_coeff); /* likewise multiply eight 16-bit values in v_vec by */ /* v_coeff and store the results in v_vec */ v_vec = vmulq_s16(v_vec, v_coeff); /* form uv_vec as the sum of u_vec & v_vec, then shift 6 places */ /* (dividing by 64) */ uv_vec = vaddq_s16(u_vec, v_vec); uv_vec = vshrq_n_s16(uv_vec, 6); /* now mp1_rgba = y_vec + u_vec + v_vec */ mp1_rgba = vaddq_s16(wider_yalpha, uv_vec); } /* turn mp0_rgba from a vector of shorts to a vector of */ /* unsigned unsigned chars. this will saturate: clipping */ /* the values between 0 and 255. */ rgba0 = vqmovun_s16(mp0_rgba); rgba1 = vqmovun_s16(mp1_rgba); /* make it faster to copy these back out of vector registers into */ /* memory by combining rgba0 and rgba1 into the larger bothrgba. */ /* then store that back into memory at destinationp. */ bothrgba = vcombine_u8(rgba0, rgba1); vst1q_u8(destinationp, bothrgba); destinationp += 16; } }
void vpx_idct8x8_12_add_neon( int16_t *input, uint8_t *dest, int dest_stride) { uint8_t *d1, *d2; uint8x8_t d0u8, d1u8, d2u8, d3u8; int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; int16x4_t d26s16, d27s16, d28s16, d29s16; uint64x1_t d0u64, d1u64, d2u64, d3u64; int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; uint16x8_t q8u16, q9u16, q10u16, q11u16; int32x4_t q9s32, q10s32, q11s32, q12s32; q8s16 = vld1q_s16(input); q9s16 = vld1q_s16(input + 8); q10s16 = vld1q_s16(input + 16); q11s16 = vld1q_s16(input + 24); q12s16 = vld1q_s16(input + 32); q13s16 = vld1q_s16(input + 40); q14s16 = vld1q_s16(input + 48); q15s16 = vld1q_s16(input + 56); TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); // First transform rows // stage 1 q0s16 = vdupq_n_s16(cospi_28_64 * 2); q1s16 = vdupq_n_s16(cospi_4_64 * 2); q4s16 = vqrdmulhq_s16(q9s16, q0s16); q0s16 = vdupq_n_s16(-cospi_20_64 * 2); q7s16 = vqrdmulhq_s16(q9s16, q1s16); q1s16 = vdupq_n_s16(cospi_12_64 * 2); q5s16 = vqrdmulhq_s16(q11s16, q0s16); q0s16 = vdupq_n_s16(cospi_16_64 * 2); q6s16 = vqrdmulhq_s16(q11s16, q1s16); // stage 2 & stage 3 - even half q1s16 = vdupq_n_s16(cospi_24_64 * 2); q9s16 = vqrdmulhq_s16(q8s16, q0s16); q0s16 = vdupq_n_s16(cospi_8_64 * 2); q13s16 = vqrdmulhq_s16(q10s16, q1s16); q15s16 = vqrdmulhq_s16(q10s16, q0s16); // stage 3 -odd half q0s16 = vaddq_s16(q9s16, q15s16); q1s16 = vaddq_s16(q9s16, q13s16); q2s16 = vsubq_s16(q9s16, q13s16); q3s16 = vsubq_s16(q9s16, q15s16); // stage 2 - odd half q13s16 = vsubq_s16(q4s16, q5s16); q4s16 = vaddq_s16(q4s16, q5s16); q14s16 = vsubq_s16(q7s16, q6s16); q7s16 = vaddq_s16(q7s16, q6s16); d26s16 = vget_low_s16(q13s16); d27s16 = vget_high_s16(q13s16); d28s16 = vget_low_s16(q14s16); d29s16 = vget_high_s16(q14s16); d16s16 = vdup_n_s16(cospi_16_64); q9s32 = vmull_s16(d28s16, d16s16); q10s32 = vmull_s16(d29s16, d16s16); q11s32 = vmull_s16(d28s16, d16s16); q12s32 = vmull_s16(d29s16, d16s16); q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); q11s32 = vmlal_s16(q11s32, d26s16, d16s16); q12s32 = vmlal_s16(q12s32, d27s16, d16s16); d10s16 = vqrshrn_n_s32(q9s32, 14); d11s16 = vqrshrn_n_s32(q10s32, 14); d12s16 = vqrshrn_n_s32(q11s32, 14); d13s16 = vqrshrn_n_s32(q12s32, 14); q5s16 = vcombine_s16(d10s16, d11s16); q6s16 = vcombine_s16(d12s16, d13s16); // stage 4 q8s16 = vaddq_s16(q0s16, q7s16); q9s16 = vaddq_s16(q1s16, q6s16); q10s16 = vaddq_s16(q2s16, q5s16); q11s16 = vaddq_s16(q3s16, q4s16); q12s16 = vsubq_s16(q3s16, q4s16); q13s16 = vsubq_s16(q2s16, q5s16); q14s16 = vsubq_s16(q1s16, q6s16); q15s16 = vsubq_s16(q0s16, q7s16); TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); q8s16 = vrshrq_n_s16(q8s16, 5); q9s16 = vrshrq_n_s16(q9s16, 5); q10s16 = vrshrq_n_s16(q10s16, 5); q11s16 = vrshrq_n_s16(q11s16, 5); q12s16 = vrshrq_n_s16(q12s16, 5); q13s16 = vrshrq_n_s16(q13s16, 5); q14s16 = vrshrq_n_s16(q14s16, 5); q15s16 = vrshrq_n_s16(q15s16, 5); d1 = d2 = dest; d0u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d1u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d2u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d3u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); d2 += dest_stride; q8s16 = q12s16; q9s16 = q13s16; q10s16 = q14s16; q11s16 = q15s16; d0u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d1u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d2u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d3u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); d2 += dest_stride; return; }
static void thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) { int i, j; Size roi = _src.size(); roi.width *= _src.channels(); const short* src = _src.ptr<short>(); short* dst = _dst.ptr<short>(); size_t src_step = _src.step/sizeof(src[0]); size_t dst_step = _dst.step/sizeof(dst[0]); #if CV_SSE2 volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE); #endif if( _src.isContinuous() && _dst.isContinuous() ) { roi.width *= roi.height; roi.height = 1; src_step = dst_step = roi.width; } #ifdef HAVE_TEGRA_OPTIMIZATION if (tegra::thresh_16s(_src, _dst, roi.width, roi.height, thresh, maxval, type)) return; #endif #if defined(HAVE_IPP) CV_IPP_CHECK() { IppiSize sz = { roi.width, roi.height }; CV_SUPPRESS_DEPRECATED_START switch( type ) { case THRESH_TRUNC: #ifndef HAVE_IPP_ICV_ONLY if (_src.data == _dst.data && ippiThreshold_GT_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } #endif if (ippiThreshold_GT_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); break; case THRESH_TOZERO: #ifndef HAVE_IPP_ICV_ONLY if (_src.data == _dst.data && ippiThreshold_LTVal_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh + 1, 0) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } #endif if (ippiThreshold_LTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+1, 0) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); break; case THRESH_TOZERO_INV: #ifndef HAVE_IPP_ICV_ONLY if (_src.data == _dst.data && ippiThreshold_GTVal_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } #endif if (ippiThreshold_GTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); break; } CV_SUPPRESS_DEPRECATED_END } #endif switch( type ) { case THRESH_BINARY: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_cmpgt_epi16( v0, thresh8 ); v1 = _mm_cmpgt_epi16( v1, thresh8 ); v0 = _mm_and_si128( v0, maxval8 ); v1 = _mm_and_si128( v1, maxval8 ); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #elif CV_NEON int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval); for( ; j <= roi.width - 8; j += 8 ) { uint16x8_t v_mask = vcgtq_s16(vld1q_s16(src + j), v_thresh); vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval)); } #endif for( ; j < roi.width; j++ ) dst[j] = src[j] > thresh ? maxval : 0; } break; case THRESH_BINARY_INV: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_cmpgt_epi16( v0, thresh8 ); v1 = _mm_cmpgt_epi16( v1, thresh8 ); v0 = _mm_andnot_si128( v0, maxval8 ); v1 = _mm_andnot_si128( v1, maxval8 ); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #elif CV_NEON int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval); for( ; j <= roi.width - 8; j += 8 ) { uint16x8_t v_mask = vcleq_s16(vld1q_s16(src + j), v_thresh); vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval)); } #endif for( ; j < roi.width; j++ ) dst[j] = src[j] <= thresh ? maxval : 0; } break; case THRESH_TRUNC: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_min_epi16( v0, thresh8 ); v1 = _mm_min_epi16( v1, thresh8 ); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #elif CV_NEON int16x8_t v_thresh = vdupq_n_s16(thresh); for( ; j <= roi.width - 8; j += 8 ) vst1q_s16(dst + j, vminq_s16(vld1q_s16(src + j), v_thresh)); #endif for( ; j < roi.width; j++ ) dst[j] = std::min(src[j], thresh); } break; case THRESH_TOZERO: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_and_si128(v0, _mm_cmpgt_epi16(v0, thresh8)); v1 = _mm_and_si128(v1, _mm_cmpgt_epi16(v1, thresh8)); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #elif CV_NEON int16x8_t v_thresh = vdupq_n_s16(thresh); for( ; j <= roi.width - 8; j += 8 ) { int16x8_t v_src = vld1q_s16(src + j); uint16x8_t v_mask = vcgtq_s16(v_src, v_thresh); vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src)); } #endif for( ; j < roi.width; j++ ) { short v = src[j]; dst[j] = v > thresh ? v : 0; } } break; case THRESH_TOZERO_INV: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_andnot_si128(_mm_cmpgt_epi16(v0, thresh8), v0); v1 = _mm_andnot_si128(_mm_cmpgt_epi16(v1, thresh8), v1); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #elif CV_NEON int16x8_t v_thresh = vdupq_n_s16(thresh); for( ; j <= roi.width - 8; j += 8 ) { int16x8_t v_src = vld1q_s16(src + j); uint16x8_t v_mask = vcleq_s16(v_src, v_thresh); vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src)); } #endif for( ; j < roi.width; j++ ) { short v = src[j]; dst[j] = v <= thresh ? v : 0; } } break; default: return CV_Error( CV_StsBadArg, "" ); } }