/* Upsample two rows of 8 chroma samples into 2x16 interleaved samples
   using the "fancy" 2x2 diagonal filter; the top-row result goes to
   'out' and the bottom-row result to 'out + 32'. */
static void Upsample16Pixels(const uint8_t *top, const uint8_t *cur,
                             uint8_t *out) {
  uint8x8_t a = vld1_u8(top);      /* a b  (top row)    */
  uint8x8_t b = vld1_u8(top + 1);
  uint8x8_t c = vld1_u8(cur);      /* c d  (bottom row) */
  uint8x8_t d = vld1_u8(cur + 1);
  uint16x8_t al = vshll_n_u8(a, 1);
  uint16x8_t bl = vshll_n_u8(b, 1);
  uint16x8_t cl = vshll_n_u8(c, 1);
  uint16x8_t dl = vshll_n_u8(d, 1);
  uint8x8_t diag1, diag2;
  uint16x8_t sl;
  /* sl = a + b + c + d */
  sl = vaddl_u8(a, b);
  sl = vaddw_u8(sl, c);
  sl = vaddw_u8(sl, d);
  /* diag2 = (3a + b + c + 3d) / 8, diag1 = (a + 3b + 3c + d) / 8 */
  al = vaddq_u16(sl, al);
  bl = vaddq_u16(sl, bl);
  al = vaddq_u16(al, dl);
  bl = vaddq_u16(bl, cl);
  diag2 = vshrn_n_u16(al, 3);
  diag1 = vshrn_n_u16(bl, 3);
  /* rounded average of each source sample with the opposite diagonal */
  a = vrhadd_u8(a, diag1);
  b = vrhadd_u8(b, diag2);
  c = vrhadd_u8(c, diag2);
  d = vrhadd_u8(d, diag1);
  {
    const uint8x8x2_t a_b = {{ a, b }};
    const uint8x8x2_t c_d = {{ c, d }};
    vst2_u8(out, a_b);
    vst2_u8(out + 32, c_d);
  }
}

/* Convert 16 Y samples plus the matching upsampled chroma block (U at
   'uv', V at 'uv + 16') into 16 RGBA pixels at 'dst'. */
static void ConvertYuvToRgba16(const uint8_t *y_src, const uint8_t *uv,
                               uint8_t *dst, const int16x4_t cf16,
                               const int32x2_t cf32) {
  const uint8x8_t u16 = vmov_n_u8(16);
  const uint8x8_t u128 = vmov_n_u8(128);
  int i;
  for (i = 0; i < 16; i += 8) {
    uint8x8_t y = vld1_u8(y_src + i);
    uint8x8_t u = vld1_u8(uv + i);
    uint8x8_t v = vld1_u8(uv + i + 16);
    /* centre Y on 16 and U/V on 128, widening to signed 16-bit */
    int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));
    int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));
    int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));
    int16x8_t ud = vshlq_n_s16(uu, 1);
    int16x8_t vd = vshlq_n_s16(vv, 1);
    /* red contribution from V (coefficient in cf16 lane 0) */
    int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1),
                                     vget_low_s16(vd), cf16, 0);
    int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1),
                                     vget_high_s16(vd), cf16, 0);
    int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16));
    /* green contribution from U and V (cf16 lanes 1 and 2) */
    int32x4_t vl = vmovl_s16(vget_low_s16(vv));
    int32x4_t vh = vmovl_s16(vget_high_s16(vv));
    int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1);
    int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);
    int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2);
    int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);
    int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16));
    /* blue contribution from U (cf16 lane 3) */
    int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1),
                                     vget_low_s16(ud), cf16, 3);
    int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1),
                                     vget_high_s16(ud), cf16, 3);
    int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16));
    /* R = Y + vr, G = Y - gc, B = Y + ub, all scaled by the Y gain */
    int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr));
    int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));
    int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc));
    int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));
    int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub));
    int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));
    rl = vmulq_lane_s32(rl, cf32, 0);
    rh = vmulq_lane_s32(rh, cf32, 0);
    gl = vmulq_lane_s32(gl, cf32, 0);
    gh = vmulq_lane_s32(gh, cf32, 0);
    bl = vmulq_lane_s32(bl, cf32, 0);
    bh = vmulq_lane_s32(bh, cf32, 0);
    /* round, narrow and saturate each channel back to 8 bits */
    y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16)));
    u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16)));
    v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16)));
    {
      const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }};
      vst4_u8(dst + i * 4, r_g_b_v255);
    }
  }
}

void UpsampleRgbaLinePairNEON(const uint8_t *top_y, const uint8_t *bottom_y,
                              const uint8_t *top_u, const uint8_t *top_v,
                              const uint8_t *cur_u, const uint8_t *cur_v,
                              uint8_t *top_dst, uint8_t *bottom_dst, int len) {
  int block;
  /* 16-byte-aligned scratch for two rows of 32 interleaved U/V samples */
  uint8_t uv_buf[2 * 32 + 15];
  uint8_t *const r_uv = (uint8_t *)((uintptr_t)(uv_buf + 15) & ~15);
  const int uv_len = (len + 1) >> 1;
  const int num_blocks = (uv_len - 1) >> 3;
  /* edge/leftover bookkeeping for the scalar tail, which is not part of
     this excerpt */
  const int leftover = uv_len - num_blocks * 8;
  const int last_pos = 1 + 16 * num_blocks;
  const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;
  const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;
  /* 'coef' is an external table of colour coefficients: lane 0 V->R,
     lane 1 U->G, lane 2 V->G, lane 3 U->B; 76283 is the Y gain */
  const int16x4_t cf16 = vld1_s16(coef);
  const int32x2_t cf32 = vmov_n_s32(76283);

  for (block = 0; block < num_blocks; ++block) {
    const int pos = 16 * block + 1;
    Upsample16Pixels(top_u, cur_u, r_uv);       /* U -> r_uv, r_uv + 32 */
    Upsample16Pixels(top_v, cur_v, r_uv + 16);  /* V -> r_uv + 16, + 48 */
    if (top_y) {
      ConvertYuvToRgba16(top_y + pos, r_uv, top_dst + pos * 4, cf16, cf32);
    }
    if (bottom_y) {
      ConvertYuvToRgba16(bottom_y + pos, r_uv + 32, bottom_dst + pos * 4,
                         cf16, cf32);
    }
    /* advance one chroma block (8 samples) per 16 output pixels */
    top_u += 8;
    cur_u += 8;
    top_v += 8;
    cur_v += 8;
  }
}
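/*
 * For reference, a minimal scalar sketch of the diagonal filter applied by
 * Upsample16Pixels above (the helper name is illustrative): for source
 * samples a, b (top row) and c, d (bottom row), the output nearest 'a' is
 * the rounded average of 'a' with the opposite diagonal, i.e. roughly the
 * classic (9a + 3b + 3c + d) / 16 fancy-upsampling kernel.
 */
static uint8_t FancyUpsampleOneScalar(uint8_t a, uint8_t b, uint8_t c,
                                      uint8_t d) {
  const int diag = (a + 3 * b + 3 * c + d) >> 3; /* mirrors diag1 */
  return (uint8_t)((a + diag + 1) >> 1);         /* mirrors vrhadd_u8 */
}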
void yuv422rgb_neon_int(const unsigned char *sourcep, int source_byte_count,
                        unsigned char *destp) {
  const unsigned char *source_endp;
  const unsigned char *vector_endp;
  int remainder;
  /* per-channel fixed-point coefficients (Q6), in RGBA lane order */
  const int16x8_t u_coeff = {0, -22, 113, 0, 0, -22, 113, 0};
  const int16x8_t v_coeff = {90, -46, 0, 0, 90, -46, 0, 0};
  const uint8x8_t zeroalpha = {0x0, 0x0, 0x0, 0xFF,
                               0x0, 0x0, 0x0, 0xFF}; /* not used below */
  const int16x8_t uvbias = {0x80, 0x80, 0x80, 0x80,
                            0x80, 0x80, 0x80, 0x80};
  int16x8_t mp0_rgba;     /* macropixel 0's resulting RGBA RGBA pixels */
  int16x8_t mp1_rgba;     /* macropixel 1's resulting RGBA RGBA pixels */
  uint8x8_t rawpixels;    /* source pixels as {[YUYV]0 [YUYV]1} */
  uint8x8_t rgba0, rgba1; /* rgba values as bytes */
  uint8x16_t bothrgba;
  uint8_t *destinationp;  /* pointer into output buffer destp */
  int16x8_t widerpixels;  /* rawpixels promoted to shorts per component */
  const uint8x8_t yselect = {0xff, 0xff, 0xff, 0xff,
                             0x00, 0x00, 0x00, 0x00};

  /* we're working with 4-byte macropixels, and each loop iteration */
  /* consumes two of them (8 bytes), so trim the tail to a multiple */
  /* of 8 to avoid overrunning the source buffer */
  remainder = source_byte_count % 8;
  source_endp = sourcep + source_byte_count;
  vector_endp = source_endp - remainder;
  destinationp = (uint8_t *)destp;

  while (sourcep < vector_endp) {
    /* pull YUYV from 2 four-byte macropixels starting at sourcep. */
    /* we'll increment sourcep as we go to save the array dereference */
    /* and separate increment instruction at the end of the loop. */

    /* load rawpixels with {[YUYV]0 [YUYV]1} with byte components */
    rawpixels = vld1_u8(sourcep);
    sourcep += sizeof(rawpixels);

    widerpixels = vreinterpretq_s16_u16(vmovl_u8(rawpixels));

    /* ---------- process macropixel 0 --------------- */
    /* take macropixel zero ([YUYV]0) from rawpixels and compute the */
    /* two RGBA pixels that come from it. store those two pixels in */
    /* mp0_rgba */
    {
      int16x8_t wider_yalpha;
      int16x8_t u_vec, v_vec, uv_vec;
      uint8x8_t narrow_yalpha;
      uint8x8_t y0_vec, y1_vec;
      int16x4_t yuyv;

      /* narrow_yalpha is drawn from [YUYV]0 and formed into */
      /* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha} */
      /* this would have been a nice place for vtbx1_u8, but I can't */
      /* get it to work, so I'll use vbsl_u8 instead. */
      y0_vec = vdup_lane_u8(rawpixels, MP0_Y0);
      y1_vec = vdup_lane_u8(rawpixels, MP0_Y1);
      narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec);

      /* store ALPHA in elements 3 and 7 (after the RGB components) */
      narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 3);
      narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 7);

      /* use vmovl_u8 to go from unsigned 8-bit to unsigned 16-bit, */
      /* then use vreinterpretq_s16_u16 to change the interpretation */
      /* from unsigned 16-bit to signed 16-bit. */
      wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha));

      yuyv = vget_low_s16(widerpixels);

      /* form a vector of the U component from MP0, then subtract uvbias */
      u_vec = vdupq_lane_s16(yuyv, MP0_U);
      u_vec = vsubq_s16(u_vec, uvbias);

      /* form a vector of the V component from MP0, then subtract uvbias */
      v_vec = vdupq_lane_s16(yuyv, MP0_V);
      v_vec = vsubq_s16(v_vec, uvbias);

      /* multiply eight 16-bit values in u_vec by eight 16-bit values */
      /* in u_coeff and store the results in u_vec */
      u_vec = vmulq_s16(u_vec, u_coeff);

      /* likewise multiply eight 16-bit values in v_vec by v_coeff */
      /* and store the results in v_vec */
      v_vec = vmulq_s16(v_vec, v_coeff);

      /* form uv_vec as the sum of u_vec & v_vec, then shift right 6 */
      /* places (dividing by 64) */
      uv_vec = vaddq_s16(u_vec, v_vec);
      uv_vec = vshrq_n_s16(uv_vec, 6);

      /* now mp0_rgba = y_vec + u_vec + v_vec */
      mp0_rgba = vaddq_s16(wider_yalpha, uv_vec);
    }

    /* ---------- process macropixel 1 --------------- */
    /* take macropixel one ([YUYV]1) from rawpixels and compute the */
    /* two RGBA pixels that come from it. store those two pixels in */
    /* mp1_rgba */
    {
      int16x8_t wider_yalpha;
      int16x8_t u_vec, v_vec, uv_vec;
      uint8x8_t narrow_yalpha;
      uint8x8_t y0_vec, y1_vec;
      int16x4_t yuyv;

      /* narrow_yalpha is drawn from [YUYV]1 and formed into */
      /* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha} */
      y0_vec = vdup_lane_u8(rawpixels, MP1_Y0);
      y1_vec = vdup_lane_u8(rawpixels, MP1_Y1);
      narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec);
      narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 3);
      narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 7);

      /* use vmovl_u8 to go from unsigned 8-bit to unsigned 16-bit, */
      /* then use vreinterpretq_s16_u16 to reinterpret as signed. */
      wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha));

      yuyv = vget_high_s16(widerpixels);

      /* U is lane 1 and V is lane 3 of the high half */
      u_vec = vdupq_lane_s16(yuyv, 1);
      u_vec = vsubq_s16(u_vec, uvbias);
      v_vec = vdupq_lane_s16(yuyv, 3);
      v_vec = vsubq_s16(v_vec, uvbias);

      /* multiply eight 16-bit values in u_vec by eight 16-bit values */
      /* in u_coeff and store the results in u_vec */
      u_vec = vmulq_s16(u_vec, u_coeff);

      /* likewise multiply eight 16-bit values in v_vec by v_coeff */
      /* and store the results in v_vec */
      v_vec = vmulq_s16(v_vec, v_coeff);

      /* form uv_vec as the sum of u_vec & v_vec, then shift right 6 */
      /* places (dividing by 64) */
      uv_vec = vaddq_s16(u_vec, v_vec);
      uv_vec = vshrq_n_s16(uv_vec, 6);

      /* now mp1_rgba = y_vec + u_vec + v_vec */
      mp1_rgba = vaddq_s16(wider_yalpha, uv_vec);
    }

    /* turn mp0_rgba and mp1_rgba from vectors of shorts into vectors */
    /* of unsigned chars. this will saturate, clipping the values */
    /* between 0 and 255. */
    rgba0 = vqmovun_s16(mp0_rgba);
    rgba1 = vqmovun_s16(mp1_rgba);

    /* make it faster to copy these back out of vector registers into */
    /* memory by combining rgba0 and rgba1 into the larger bothrgba, */
    /* then store that back into memory at destinationp. */
    bothrgba = vcombine_u8(rgba0, rgba1);

    vst1q_u8(destinationp, bothrgba);
    destinationp += 16;
  }
}
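/*
 * For reference, a minimal scalar sketch of the arithmetic performed per
 * macropixel above, using the same Q6 coefficients; the helper name and
 * clamp are illustrative, and ALPHA is assumed to be 0xFF:
 */
static void yuyv_to_rgba_scalar(const unsigned char yuyv[4],
                                unsigned char rgba[8]) {
  const int u = yuyv[1] - 0x80;
  const int v = yuyv[3] - 0x80;
  const int r_uv = (90 * v) >> 6;           /* v_coeff lane 0 */
  const int g_uv = (-22 * u - 46 * v) >> 6; /* u_coeff/v_coeff lane 1 */
  const int b_uv = (113 * u) >> 6;          /* u_coeff lane 2 */
  int p;

  for (p = 0; p < 2; ++p) { /* the two pixels share one U/V pair */
    const int y = yuyv[2 * p];
    const int r = y + r_uv;
    const int g = y + g_uv;
    const int b = y + b_uv;
    rgba[4 * p + 0] = (unsigned char)(r < 0 ? 0 : r > 255 ? 255 : r);
    rgba[4 * p + 1] = (unsigned char)(g < 0 ? 0 : g > 255 ? 255 : g);
    rgba[4 * p + 2] = (unsigned char)(b < 0 ? 0 : b > 255 ? 255 : b);
    rgba[4 * p + 3] = 0xFF;
  }
}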
void av1_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
                            int dest_stride, const TxfmParam *txfm_param) {
  int i;
  uint8_t *d1, *d2;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  uint64x1_t d0u64, d1u64, d2u64, d3u64;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  uint16x8_t q8u16, q9u16, q10u16, q11u16;

  /* load the 8x8 coefficient block, one row per register */
  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);
  q10s16 = vld1q_s16(input + 8 * 2);
  q11s16 = vld1q_s16(input + 8 * 3);
  q12s16 = vld1q_s16(input + 8 * 4);
  q13s16 = vld1q_s16(input + 8 * 5);
  q14s16 = vld1q_s16(input + 8 * 6);
  q15s16 = vld1q_s16(input + 8 * 7);

  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
               &q15s16);

  const TX_TYPE tx_type = txfm_param->tx_type;
  switch (tx_type) {
    case DCT_DCT:  // idct_idct is not supported. Fall back to C
      av1_iht8x8_64_add_c(input, dest, dest_stride, txfm_param);
      return;
    case ADST_DCT:  // iadst_idct
      // generate IDCT constants
      // GENERATE_IDCT_CONSTANTS
      // first transform rows
      IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
                 &q15s16);
      // transpose the matrix
      TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
                   &q14s16, &q15s16);
      // generate IADST constants
      // GENERATE_IADST_CONSTANTS
      // then transform columns
      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
                  &q14s16, &q15s16);
      break;
    case DCT_ADST:  // idct_iadst
      // generate IADST constants
      // GENERATE_IADST_CONSTANTS
      // first transform rows
      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
                  &q14s16, &q15s16);
      // transpose the matrix
      TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
                   &q14s16, &q15s16);
      // generate IDCT constants
      // GENERATE_IDCT_CONSTANTS
      // then transform columns
      IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
                 &q15s16);
      break;
    case ADST_ADST:  // iadst_iadst
      // generate IADST constants
      // GENERATE_IADST_CONSTANTS
      // first transform rows
      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
                  &q14s16, &q15s16);
      // transpose the matrix
      TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
                   &q14s16, &q15s16);
      // then transform columns
      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
                  &q14s16, &q15s16);
      break;
    default:
      assert(0);  // unreachable
      break;
  }

  /* rounding divide by 32 before reconstruction */
  q8s16 = vrshrq_n_s16(q8s16, 5);
  q9s16 = vrshrq_n_s16(q9s16, 5);
  q10s16 = vrshrq_n_s16(q10s16, 5);
  q11s16 = vrshrq_n_s16(q11s16, 5);
  q12s16 = vrshrq_n_s16(q12s16, 5);
  q13s16 = vrshrq_n_s16(q13s16, 5);
  q14s16 = vrshrq_n_s16(q14s16, 5);
  q15s16 = vrshrq_n_s16(q15s16, 5);

  for (d1 = d2 = dest, i = 0; i < 2; i++) {
    if (i != 0) {
      q8s16 = q12s16;
      q9s16 = q13s16;
      q10s16 = q14s16;
      q11s16 = q15s16;
    }

    /* load four 8-pixel prediction rows */
    d0u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d1u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d2u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d3u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;

    /* add the residual to the prediction and saturate to 8 bits */
    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
    q10u16 =
        vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
    q11u16 =
        vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));

    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
    d2 += dest_stride;
  }
}
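/*
 * The reconstruction epilogue above is, per pixel, the scalar operation
 * below: vrshrq_n_s16(x, 5) is a rounding divide by 32, vaddw_u8 adds the
 * prediction, and vqmovun_s16 saturates to [0, 255]. The helper name is
 * illustrative.
 */
static uint8_t add_residual_clamp(uint8_t pred, int16_t resid) {
  const int v = pred + ((resid + 16) >> 5); /* vrshrq_n_s16(x, 5) */
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}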
void vp10_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
                             int dest_stride, int tx_type) {
  uint8x8_t d26u8, d27u8;
  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
  uint32x2_t d26u32, d27u32;
  int16x8_t q3s16, q8s16, q9s16;
  uint16x8_t q8u16, q9u16;

  d26u32 = d27u32 = vdup_n_u32(0);

  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);

  TRANSPOSE4X4(&q8s16, &q9s16);

  switch (tx_type) {
    case 0:  // idct_idct is not supported. Fall back to C
      vp10_iht4x4_16_add_c(input, dest, dest_stride, tx_type);
      return;
    case 1:  // iadst_idct
      // generate constants
      GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
      // first transform rows
      IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
      // transpose the matrix
      TRANSPOSE4X4(&q8s16, &q9s16);
      // then transform columns
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
      break;
    case 2:  // idct_iadst
      // generate constants
      GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
      // first transform rows
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
      // transpose the matrix
      TRANSPOSE4X4(&q8s16, &q9s16);
      // then transform columns
      IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
      break;
    case 3:  // iadst_iadst
      // generate constants
      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
      // first transform rows
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
      // transpose the matrix
      TRANSPOSE4X4(&q8s16, &q9s16);
      // then transform columns
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
      break;
    default:
      assert(0);  // unreachable
      break;
  }

  /* rounding divide by 16 before reconstruction */
  q8s16 = vrshrq_n_s16(q8s16, 4);
  q9s16 = vrshrq_n_s16(q9s16, 4);

  /* gather the four 4-pixel prediction rows */
  d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
  dest += dest_stride;
  d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
  dest += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
  dest += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));

  /* dest now points at the last row, so store walking backwards */
  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
  dest -= dest_stride;
  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
  dest -= dest_stride;
  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
  dest -= dest_stride;
  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
}
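/*
 * The 4x4 epilogue rounds by 1/16 where the 8x8 version rounds by 1/32;
 * per pixel it is the same clamp-and-add (again an illustrative helper,
 * cf. add_residual_clamp above):
 */
static uint8_t add_residual_clamp4(uint8_t pred, int16_t resid) {
  const int v = pred + ((resid + 8) >> 4); /* vrshrq_n_s16(x, 4) */
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}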
void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
                             int dest_stride) {
  uint8x8_t d26u8, d27u8;
  uint32x2_t d26u32, d27u32;
  uint16x8_t q8u16, q9u16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
  int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
  int16x8_t q8s16, q9s16, q13s16, q14s16;
  int32x4_t q1s32, q13s32, q14s32, q15s32;
  int16x4x2_t d0x2s16, d1x2s16;
  int32x4x2_t q0x2s32;
  uint8_t *d;

  d26u32 = d27u32 = vdup_n_u32(0);

  q8s16 = load_tran_low_to_s16(input);
  q9s16 = load_tran_low_to_s16(input + 8);

  // transpose the 4x4 block (16-bit then 32-bit trn)
  d16s16 = vget_low_s16(q8s16);
  d17s16 = vget_high_s16(q8s16);
  d18s16 = vget_low_s16(q9s16);
  d19s16 = vget_high_s16(q9s16);
  d0x2s16 = vtrn_s16(d16s16, d17s16);
  d1x2s16 = vtrn_s16(d18s16, d19s16);
  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);

  d20s16 = vdup_n_s16((int16_t)cospi_8_64);
  d21s16 = vdup_n_s16((int16_t)cospi_16_64);

  q0x2s32 =
      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));

  d22s16 = vdup_n_s16((int16_t)cospi_24_64);

  // do the transform on rows
  // stage 1
  d23s16 = vadd_s16(d16s16, d18s16);
  d24s16 = vsub_s16(d16s16, d18s16);
  q15s32 = vmull_s16(d17s16, d22s16);
  q1s32 = vmull_s16(d17s16, d20s16);
  q13s32 = vmull_s16(d23s16, d21s16);
  q14s32 = vmull_s16(d24s16, d21s16);
  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
  d26s16 = vqrshrn_n_s32(q13s32, 14);
  d27s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d28s16 = vqrshrn_n_s32(q1s32, 14);
  q13s16 = vcombine_s16(d26s16, d27s16);
  q14s16 = vcombine_s16(d28s16, d29s16);

  // stage 2
  q8s16 = vaddq_s16(q13s16, q14s16);
  q9s16 = vsubq_s16(q13s16, q14s16);

  // transpose again for the column pass
  d16s16 = vget_low_s16(q8s16);
  d17s16 = vget_high_s16(q8s16);
  d18s16 = vget_high_s16(q9s16);  // vswp d18 d19
  d19s16 = vget_low_s16(q9s16);
  d0x2s16 = vtrn_s16(d16s16, d17s16);
  d1x2s16 = vtrn_s16(d18s16, d19s16);
  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
  q0x2s32 =
      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));

  // do the transform on columns
  // stage 1
  d23s16 = vadd_s16(d16s16, d18s16);
  d24s16 = vsub_s16(d16s16, d18s16);
  q15s32 = vmull_s16(d17s16, d22s16);
  q1s32 = vmull_s16(d17s16, d20s16);
  q13s32 = vmull_s16(d23s16, d21s16);
  q14s32 = vmull_s16(d24s16, d21s16);
  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
  d26s16 = vqrshrn_n_s32(q13s32, 14);
  d27s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d28s16 = vqrshrn_n_s32(q1s32, 14);
  q13s16 = vcombine_s16(d26s16, d27s16);
  q14s16 = vcombine_s16(d28s16, d29s16);

  // stage 2
  q8s16 = vaddq_s16(q13s16, q14s16);
  q9s16 = vsubq_s16(q13s16, q14s16);

  // rounding divide by 16 before reconstruction
  q8s16 = vrshrq_n_s16(q8s16, 4);
  q9s16 = vrshrq_n_s16(q9s16, 4);

  d = dest;
  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
  d += dest_stride;
  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
  d += dest_stride;
  // rows 2 and 3 sit in swapped lanes, matching the vswp above
  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
  d += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));

  d = dest;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
  d += dest_stride;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
  d += dest_stride;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
  d += dest_stride;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
}
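/*
 * For reference, one pass of the butterfly above in scalar form, mirroring
 * libvpx's idct4 (modulo the saturation that vqrshrn_n_s32 also applies);
 * the function name is illustrative, and cospi_8_64/cospi_16_64/cospi_24_64
 * are the Q14 constants already used above:
 */
static void idct4_scalar(const int16_t in[4], int16_t out[4]) {
  const int c0 = ((in[0] + in[2]) * cospi_16_64 + (1 << 13)) >> 14;
  const int c1 = ((in[0] - in[2]) * cospi_16_64 + (1 << 13)) >> 14;
  const int c2 = (in[1] * cospi_24_64 - in[3] * cospi_8_64 + (1 << 13)) >> 14;
  const int c3 = (in[1] * cospi_8_64 + in[3] * cospi_24_64 + (1 << 13)) >> 14;
  out[0] = (int16_t)(c0 + c3);
  out[1] = (int16_t)(c1 + c2);
  out[2] = (int16_t)(c1 - c2);
  out[3] = (int16_t)(c0 - c3);
}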
void vp9_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
  uint8x16_t q0u8, q1u8, q2u8;
  int16x8_t q12s16, q13s16, q14s16, q15s16;
  uint16x4_t d6u16;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;

  /* widened differences above[c] - above[-1] for all 32 columns */
  q0u8 = vld1q_dup_u8(above - 1);
  q1u8 = vld1q_u8(above);
  q2u8 = vld1q_u8(above + 16);
  q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
  q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
  q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
  q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));

  for (k = 0; k < 4; k++, left += 8) {
    d26u8 = vld1_u8(left);
    q3u16 = vmovl_u8(d26u8);
    d6u16 = vget_low_u16(q3u16);
    for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
      /* NEON lane indices must be immediates, so the four rows per
         half of d6u16 are unrolled explicitly */
      q0u16 = vdupq_lane_u16(d6u16, 0);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d6u16, 1);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d6u16, 2);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d6u16, 3);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;
    }
  }
}
void vp9_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
  uint8x16_t q0u8, q1u8;
  int16x8_t q0s16, q1s16, q8s16, q11s16;
  uint16x4_t d20u16;
  uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;

  /* widened differences above[c] - above[-1] for all 16 columns */
  q0u8 = vld1q_dup_u8(above - 1);
  q1u8 = vld1q_u8(above);
  q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
  q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));

  for (k = 0; k < 2; k++, left += 8) {
    d18u8 = vld1_u8(left);
    q10u16 = vmovl_u8(d18u8);
    d20u16 = vget_low_u16(q10u16);
    for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
      /* two rows per stanza; lane indices must be immediates */
      q0u16 = vdupq_lane_u16(d20u16, 0);
      q8u16 = vdupq_lane_u16(d20u16, 1);
      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q2u16));
      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q3u16));
      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                         vreinterpretq_s16_u16(q2u16));
      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                        vreinterpretq_s16_u16(q3u16));
      d2u8 = vqmovun_s16(q1s16);
      d3u8 = vqmovun_s16(q0s16);
      d22u8 = vqmovun_s16(q11s16);
      d23u8 = vqmovun_s16(q8s16);
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
      dst += stride;
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d20u16, 2);
      q8u16 = vdupq_lane_u16(d20u16, 3);
      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q2u16));
      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q3u16));
      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                         vreinterpretq_s16_u16(q2u16));
      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                        vreinterpretq_s16_u16(q3u16));
      d2u8 = vqmovun_s16(q1s16);
      d3u8 = vqmovun_s16(q0s16);
      d22u8 = vqmovun_s16(q11s16);
      d23u8 = vqmovun_s16(q8s16);
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
      dst += stride;
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
      dst += stride;
    }
  }
}
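/*
 * Both TM predictors above implement, per pixel, the scalar recurrence
 * below: the signed difference above[c] - above[-1] is precomputed per
 * column (q8u16..q11u16, q2u16/q3u16) and added to each widened left[r],
 * with a final saturating narrow. Names here are illustrative.
 */
static void tm_predictor_scalar(uint8_t *dst, ptrdiff_t stride, int size,
                                const uint8_t *above, const uint8_t *left) {
  int r, c;
  for (r = 0; r < size; ++r) {
    for (c = 0; c < size; ++c) {
      const int v = left[r] + above[c] - above[-1];
      dst[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
    dst += stride;
  }
}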
void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq, unsigned char *dst,
                               int stride) {
  unsigned char *dst0, *dst1;
  int32x2_t d28, d29, d30, d31;
  int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
  int16x8_t qEmpty = vdupq_n_s16(0);
  int32x4x2_t q2tmp0, q2tmp1;
  int16x8x2_t q2tmp2, q2tmp3;
  int16x4_t dLow0, dLow1, dHigh0, dHigh1;

  d28 = d29 = d30 = d31 = vdup_n_s32(0);

  // load dq
  q0 = vld1q_s16(dq);
  dq += 8;
  q1 = vld1q_s16(dq);

  // load q, zeroing the coefficient block as we go
  q2 = vld1q_s16(q);
  vst1q_s16(q, qEmpty);
  q += 8;
  q3 = vld1q_s16(q);
  vst1q_s16(q, qEmpty);
  q += 8;
  q4 = vld1q_s16(q);
  vst1q_s16(q, qEmpty);
  q += 8;
  q5 = vld1q_s16(q);
  vst1q_s16(q, qEmpty);

  // load src from dst (two adjacent 4x4 blocks, 4 bytes each per row)
  dst0 = dst;
  dst1 = dst + 4;
  d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
  dst0 += stride;
  d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
  dst1 += stride;
  d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
  dst0 += stride;
  d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
  dst1 += stride;
  d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
  dst0 += stride;
  d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
  dst1 += stride;
  d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
  d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);

  // dequantize
  q2 = vmulq_s16(q2, q0);
  q3 = vmulq_s16(q3, q1);
  q4 = vmulq_s16(q4, q0);
  q5 = vmulq_s16(q5, q1);

  // vswp: interleave the halves of the two blocks
  dLow0 = vget_low_s16(q2);
  dHigh0 = vget_high_s16(q2);
  dLow1 = vget_low_s16(q4);
  dHigh1 = vget_high_s16(q4);
  q2 = vcombine_s16(dLow0, dLow1);
  q4 = vcombine_s16(dHigh0, dHigh1);
  dLow0 = vget_low_s16(q3);
  dHigh0 = vget_high_s16(q3);
  dLow1 = vget_low_s16(q5);
  dHigh1 = vget_high_s16(q5);
  q3 = vcombine_s16(dLow0, dLow1);
  q5 = vcombine_s16(dHigh0, dHigh1);

  // loop 1: columns
  q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
  q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
  q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
  q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);

  q10 = vqaddq_s16(q2, q3);  // a1
  q11 = vqsubq_s16(q2, q3);  // b1

  // vqdmulh doubles its product; the >> 1 undoes that for the cospi
  // term (the sinpi constant is presumably stored pre-halved)
  q8 = vshrq_n_s16(q8, 1);
  q9 = vshrq_n_s16(q9, 1);

  q4 = vqaddq_s16(q4, q8);
  q5 = vqaddq_s16(q5, q9);

  q2 = vqsubq_s16(q6, q5);  // c1
  q3 = vqaddq_s16(q7, q4);  // d1

  q4 = vqaddq_s16(q10, q3);
  q5 = vqaddq_s16(q11, q2);
  q6 = vqsubq_s16(q11, q2);
  q7 = vqsubq_s16(q10, q3);

  // transpose
  q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
  q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
  q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
                     vreinterpretq_s16_s32(q2tmp1.val[0]));
  q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
                     vreinterpretq_s16_s32(q2tmp1.val[1]));

  // loop 2: rows
  q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
  q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
  q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
  q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);

  q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
  q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);

  q10 = vshrq_n_s16(q10, 1);
  q11 = vshrq_n_s16(q11, 1);

  q10 = vqaddq_s16(q2tmp2.val[1], q10);
  q11 = vqaddq_s16(q2tmp3.val[1], q11);

  q8 = vqsubq_s16(q8, q11);
  q9 = vqaddq_s16(q9, q10);

  q4 = vqaddq_s16(q2, q9);
  q5 = vqaddq_s16(q3, q8);
  q6 = vqsubq_s16(q3, q8);
  q7 = vqsubq_s16(q2, q9);

  // rounding divide by 8
  q4 = vrshrq_n_s16(q4, 3);
  q5 = vrshrq_n_s16(q5, 3);
  q6 = vrshrq_n_s16(q6, 3);
  q7 = vrshrq_n_s16(q7, 3);

  // transpose back
  q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
  q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
  q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
                     vreinterpretq_s16_s32(q2tmp1.val[0]));
  q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
                     vreinterpretq_s16_s32(q2tmp1.val[1]));

  // add the prediction and saturate to 8 bits
  q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
                                      vreinterpret_u8_s32(d28)));
  q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
                                      vreinterpret_u8_s32(d29)));
  q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
                                      vreinterpret_u8_s32(d30)));
  q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
                                      vreinterpret_u8_s32(d31)));

  d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
  d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
  d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
  d31 = vreinterpret_s32_u8(vqmovun_s16(q7));

  dst0 = dst;
  dst1 = dst + 4;
  vst1_lane_s32((int32_t *)dst0, d28, 0);
  dst0 += stride;
  vst1_lane_s32((int32_t *)dst1, d28, 1);
  dst1 += stride;
  vst1_lane_s32((int32_t *)dst0, d29, 0);
  dst0 += stride;
  vst1_lane_s32((int32_t *)dst1, d29, 1);
  dst1 += stride;
  vst1_lane_s32((int32_t *)dst0, d30, 0);
  dst0 += stride;
  vst1_lane_s32((int32_t *)dst1, d30, 1);
  dst1 += stride;
  vst1_lane_s32((int32_t *)dst0, d31, 0);
  vst1_lane_s32((int32_t *)dst1, d31, 1);
}
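/*
 * A scalar sketch of one VP8 inverse-DCT butterfly as performed in each
 * "loop" above, assuming the usual libvpx constants (sinpi8sqrt2 stored
 * pre-halved as 17734 so the doubling vqdmulh restores 35468, and
 * cospi8sqrt2minus1 == 20091, with the doubling undone by the explicit
 * >> 1). ip/op index one column of a 4x4 block with stride 4; the name
 * is illustrative.
 */
static void vp8_idct4_col_scalar(const int16_t *ip, int16_t *op) {
  const int a1 = ip[0] + ip[8];
  const int b1 = ip[0] - ip[8];
  const int c1 =
      ((ip[4] * 35468) >> 16) - (ip[12] + ((ip[12] * 20091) >> 16));
  const int d1 =
      (ip[4] + ((ip[4] * 20091) >> 16)) + ((ip[12] * 35468) >> 16);
  op[0] = (int16_t)(a1 + d1);
  op[4] = (int16_t)(b1 + c1);
  op[8] = (int16_t)(b1 - c1);
  op[12] = (int16_t)(a1 - d1);
}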