static void interpolate5LineNeon(uint16 *dst, const uint16 *srcA, const uint16 *srcB, int width, int k1, int k2) { uint16x4_t kRedBlueMask_4 = vdup_n_u16(ColorMask::kRedBlueMask); uint16x4_t kGreenMask_4 = vdup_n_u16(ColorMask::kGreenMask); uint16x4_t k1_4 = vdup_n_u16(k1); uint16x4_t k2_4 = vdup_n_u16(k2); while (width >= 4) { uint16x4_t srcA_4 = vld1_u16(srcA); uint16x4_t srcB_4 = vld1_u16(srcB); uint16x4_t p1_4 = srcB_4; uint16x4_t p2_4 = srcA_4; uint16x4_t p1_rb_4 = vand_u16(p1_4, kRedBlueMask_4); uint16x4_t p1_g_4 = vand_u16(p1_4, kGreenMask_4); uint16x4_t p2_rb_4 = vand_u16(p2_4, kRedBlueMask_4); uint16x4_t p2_g_4 = vand_u16(p2_4, kGreenMask_4); uint32x4_t tmp_rb_4 = vshrq_n_u32(vmlal_u16(vmull_u16(p2_rb_4, k2_4), p1_rb_4, k1_4), 3); uint32x4_t tmp_g_4 = vshrq_n_u32(vmlal_u16(vmull_u16(p2_g_4, k2_4), p1_g_4, k1_4), 3); uint16x4_t p_rb_4 = vmovn_u32(tmp_rb_4); p_rb_4 = vand_u16(p_rb_4, kRedBlueMask_4); uint16x4_t p_g_4 = vmovn_u32(tmp_g_4); p_g_4 = vand_u16(p_g_4, kGreenMask_4); uint16x4_t result_4 = p_rb_4 | p_g_4; vst1_u16(dst, result_4); dst += 4; srcA += 4; srcB += 4; width -= 4; } }
void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore* aecm, const uint16_t* far_spectrum, int32_t* echo_est) { assert((uintptr_t)echo_est % 32 == 0); assert((uintptr_t)(aecm->channelStored) % 16 == 0); assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0); // This is C code of following optimized code. // During startup we store the channel every block. // memcpy(aecm->channelStored, // aecm->channelAdapt16, // sizeof(int16_t) * PART_LEN1); // Recalculate echo estimate // for (i = 0; i < PART_LEN; i += 4) { // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], // far_spectrum[i]); // echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1], // far_spectrum[i + 1]); // echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2], // far_spectrum[i + 2]); // echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3], // far_spectrum[i + 3]); // } // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], // far_spectrum[i]); const uint16_t* far_spectrum_p = far_spectrum; int16_t* start_adapt_p = aecm->channelAdapt16; int16_t* start_stored_p = aecm->channelStored; const int16_t* end_stored_p = aecm->channelStored + PART_LEN; int32_t* echo_est_p = echo_est; uint16x8_t far_spectrum_v; int16x8_t adapt_v; uint32x4_t echo_est_v_low, echo_est_v_high; while (start_stored_p < end_stored_p) { far_spectrum_v = vld1q_u16(far_spectrum_p); adapt_v = vld1q_s16(start_adapt_p); vst1q_s16(start_stored_p, adapt_v); echo_est_v_low = vmull_u16(vget_low_u16(far_spectrum_v), vget_low_u16(vreinterpretq_u16_s16(adapt_v))); echo_est_v_high = vmull_u16(vget_high_u16(far_spectrum_v), vget_high_u16(vreinterpretq_u16_s16(adapt_v))); vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low)); vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high)); far_spectrum_p += 8; start_adapt_p += 8; start_stored_p += 8; echo_est_p += 8; } aecm->channelStored[PART_LEN] = aecm->channelAdapt16[PART_LEN]; echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN], far_spectrum[PART_LEN]); }
void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore* aecm, const uint16_t* far_spectrum, int32_t* echo_est, uint32_t* far_energy, uint32_t* echo_energy_adapt, uint32_t* echo_energy_stored) { int16_t* start_stored_p = aecm->channelStored; int16_t* start_adapt_p = aecm->channelAdapt16; int32_t* echo_est_p = echo_est; const int16_t* end_stored_p = aecm->channelStored + PART_LEN; const uint16_t* far_spectrum_p = far_spectrum; int16x8_t store_v, adapt_v; uint16x8_t spectrum_v; uint32x4_t echo_est_v_low, echo_est_v_high; uint32x4_t far_energy_v, echo_stored_v, echo_adapt_v; far_energy_v = vdupq_n_u32(0); echo_adapt_v = vdupq_n_u32(0); echo_stored_v = vdupq_n_u32(0); // Get energy for the delayed far end signal and estimated // echo using both stored and adapted channels. // The C code: // for (i = 0; i < PART_LEN1; i++) { // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], // far_spectrum[i]); // (*far_energy) += (uint32_t)(far_spectrum[i]); // *echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i]; // (*echo_energy_stored) += (uint32_t)echo_est[i]; // } while (start_stored_p < end_stored_p) { spectrum_v = vld1q_u16(far_spectrum_p); adapt_v = vld1q_s16(start_adapt_p); store_v = vld1q_s16(start_stored_p); far_energy_v = vaddw_u16(far_energy_v, vget_low_u16(spectrum_v)); far_energy_v = vaddw_u16(far_energy_v, vget_high_u16(spectrum_v)); echo_est_v_low = vmull_u16(vreinterpret_u16_s16(vget_low_s16(store_v)), vget_low_u16(spectrum_v)); echo_est_v_high = vmull_u16(vreinterpret_u16_s16(vget_high_s16(store_v)), vget_high_u16(spectrum_v)); vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low)); vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high)); echo_stored_v = vaddq_u32(echo_est_v_low, echo_stored_v); echo_stored_v = vaddq_u32(echo_est_v_high, echo_stored_v); echo_adapt_v = vmlal_u16(echo_adapt_v, vreinterpret_u16_s16(vget_low_s16(adapt_v)), vget_low_u16(spectrum_v)); echo_adapt_v = vmlal_u16(echo_adapt_v, vreinterpret_u16_s16(vget_high_s16(adapt_v)), vget_high_u16(spectrum_v)); start_stored_p += 8; start_adapt_p += 8; far_spectrum_p += 8; echo_est_p += 8; } AddLanes(far_energy, far_energy_v); AddLanes(echo_energy_stored, echo_stored_v); AddLanes(echo_energy_adapt, echo_adapt_v); echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN], far_spectrum[PART_LEN]); *echo_energy_stored += (uint32_t)echo_est[PART_LEN]; *far_energy += (uint32_t)far_spectrum[PART_LEN]; *echo_energy_adapt += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN]; }
inline uint32x4_t vmull(const uint16x4_t & v0, const uint16x4_t & v1) { return vmull_u16(v0, v1); }
inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, v_uint32x4& c, v_uint32x4& d) { c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val)); d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val)); }
// // box blur a square array of pixels (power of 2, actually) // if we insist on powers of 2, we don't need to special case some end-of-row/col conditions // to a specific blur width // // also, we're using NEON to vectorize our arithmetic. // we need to do a division along the way, but NEON doesn't support integer division. // so rather than divide by, say "w", we multiply by magic(w). // magic(w) is chosen so that the result of multiplying by it will be the same as // dividing by w, except that the result will be in the high half of the result. // yes, dorothy... this is what compilers do, too... void NEONboxBlur(pixel *src, pixel *dest, unsigned int size, unsigned int blurRad) { unsigned int wid = 2 * blurRad + 1; // because NEON doesn't have integer division, we use "magic constants" that will give // use the result of division by multiplication -- the upper half of the result will be // (more or less) the result of the division. // for this, we need to compute the magic numbers corresponding to a given divisor struct magicu_info minfo = compute_unsigned_magic_info(wid, 16); int16x8_t preshift = vdupq_n_s16(-minfo.pre_shift); // negative means shift right int32x4_t postshift = vdupq_n_s32(-(minfo.post_shift+16)); // negative means shift right uint16x4_t magic = vdup_n_u16(minfo.multiplier); // fprintf(stderr,"width %5d, preshift %d, postshift %d + 16, increment %d, magic %d\n", wid, // minfo.pre_shift, minfo.post_shift, minfo.increment, minfo.multiplier); // if (minfo.pre_shift > 0) fprintf(stderr,"hey, not an odd number!\n"); int i, j, k, ch; for (i = 0 ; i < size ; i+=8) { // first, initialize the sum so that we can loop from 0 to size-1 // we'll initialize boxsum for index -1, so that we can move into 0 as part of our loop uint16x8x4_t boxsum; uint8x8x4_t firstpixel = vld4_u8((uint8_t *)(src + 0 * size + i)); for (ch = 0 ; ch < 4 ; ch++) { // boxsum[ch] = blurRad * srcpixel[ch] boxsum.val[ch] = vmulq_n_u16(vmovl_u8(firstpixel.val[ch]),(blurRad+1)+1); } for ( k = 1 ; k < blurRad ; k++) { uint8x8x4_t srcpixel = vld4_u8((uint8_t *)(src + k * size + i)); for (ch = 0 ; ch < 4 ; ch++ ) { boxsum.val[ch] = vaddw_u8(boxsum.val[ch], srcpixel.val[ch]); } } int right = blurRad-1; int left = -blurRad-1; if (minfo.increment) { for ( k = 0 ; k < size ; k++) { // move to next pixel unsigned int l = (left < 0)?0:left; // take off the old left left++; right++; unsigned int r = (right < size)?right:(size-1); // but add the new right uint8x8x4_t addpixel = vld4_u8((uint8_t *)(src + r * size + i)); uint8x8x4_t subpixel = vld4_u8((uint8_t *)(src + l * size + i)); for (ch = 0 ; ch < 4 ; ch++ ) { // boxsum[ch] += addpixel[ch] - subpixel[ch]; boxsum.val[ch] = vsubw_u8(vaddw_u8(boxsum.val[ch], addpixel.val[ch]), subpixel.val[ch]); } uint8x8x4_t destpixel; for (ch = 0 ; ch < 4 ; ch++ ) { // compute: destpixel = boxsum / wid // since 16bit multiplication leads to 32bit results, we need to // split our task into two chunks, for the hi and low half of our vector // (because otherwise, it won't all fit into 128 bits) // this is the meat of the magic division algorithm (see the include file...) uint16x8_t bsum_preshifted = vshlq_u16(boxsum.val[ch],preshift); // multiply by the magic number uint32x4_t res_hi = vmull_u16(vget_high_u16(bsum_preshifted), magic); res_hi = vaddw_u16(res_hi, magic); // take the high half and post-shift uint16x4_t q_hi = vmovn_u32(vshlq_u32(res_hi, postshift)); // pre-shift and multiply by the magic number uint32x4_t res_lo = vmull_u16(vget_low_u16(bsum_preshifted), magic); res_lo = vaddw_u16(res_lo, magic); // take the high half and post-shift uint16x4_t q_lo = vmovn_u32(vshlq_u32(res_lo, postshift)); destpixel.val[ch] = vqmovn_u16(vcombine_u16(q_lo, q_hi)); } pixel block[8]; vst4_u8((uint8_t *)&block, destpixel); for (j = 0 ; j < 8 ; j++ ) { dest[(i + j)*size + k] = block[j]; } // vst4_u8((uint8_t *)(dest + k * size + i), destpixel); } } else { for ( k = 0 ; k < size ; k++) { // move to next pixel unsigned int l = (left < 0)?0:left; // take off the old left left++; right++; unsigned int r = (right < size)?right:(size-1); // but add the new right uint8x8x4_t addpixel = vld4_u8((uint8_t *)(src + r * size + i)); uint8x8x4_t subpixel = vld4_u8((uint8_t *)(src + l * size + i)); for (ch = 0 ; ch < 4 ; ch++ ) { // boxsum[ch] += addpixel[ch] - subpixel[ch]; boxsum.val[ch] = vsubw_u8(vaddw_u8(boxsum.val[ch], addpixel.val[ch]), subpixel.val[ch]); } uint8x8x4_t destpixel; for (ch = 0 ; ch < 4 ; ch++ ) { // compute: destpixel = boxsum / wid // since 16bit multiplication leads to 32bit results, we need to // split our task into two chunks, for the hi and low half of our vector // (because otherwise, it won't all fit into 128 bits) // this is the meat of the magic division algorithm (see the include file...) uint16x8_t bsum_preshifted = vshlq_u16(boxsum.val[ch],preshift); // multiply by the magic number // take the high half and post-shift uint32x4_t res_hi = vmull_u16(vget_high_u16(bsum_preshifted), magic); uint16x4_t q_hi = vmovn_u32(vshlq_u32(res_hi, postshift)); // multiply by the magic number // take the high half and post-shift uint32x4_t res_lo = vmull_u16(vget_low_u16(bsum_preshifted), magic); uint16x4_t q_lo = vmovn_u32(vshlq_u32(res_lo, postshift)); destpixel.val[ch] = vqmovn_u16(vcombine_u16(q_lo, q_hi)); } pixel block[8]; vst4_u8((uint8_t *)&block, destpixel); for (j = 0 ; j < 8 ; j++ ) { dest[(i + j)*size + k] = block[j]; } // vst4_u8((uint8_t *)(dest + k * size + i), destpixel); } } } }