void test_vget_lowu32 (void)
{
  register uint32x2_t out_uint32x2_t asm ("d18");
  uint32x4_t arg0_uint32x4_t;

  out_uint32x2_t = vget_low_u32 (arg0_uint32x4_t);
}
uint32x2_t FORCE_INLINE popcnt_neon_qreg(const uint8x16_t reg)
{
    const uint8x16_t pcnt = vcntq_u8(reg);    // per-byte popcount
    const uint16x8_t t0 = vpaddlq_u8(pcnt);   // pairwise widen u8 -> u16
    const uint32x4_t t1 = vpaddlq_u16(t0);    // pairwise widen u16 -> u32
    const uint32x2_t t2 = vadd_u32(vget_low_u32(t1), vget_high_u32(t1));
    return t2;                                // two lanes; their sum is the total
}
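/*
 * Usage sketch (not from the snippet above; the helper name is hypothetical):
 * folds the two remaining lanes of popcnt_neon_qreg's result into a scalar
 * bit count with one pairwise add.
 */
static inline uint32_t popcnt_neon_qreg_scalar(const uint8x16_t reg)
{
    const uint32x2_t pair = popcnt_neon_qreg(reg);
    return vget_lane_u32(vpadd_u32(pair, pair), 0); /* lane0 + lane1 */
}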
static inline void AddLanes(uint32_t* ptr, uint32x4_t v) {
#if defined(WEBRTC_ARCH_ARM64)
  *(ptr) = vaddvq_u32(v);
#else
  uint32x2_t tmp_v;
  tmp_v = vadd_u32(vget_low_u32(v), vget_high_u32(v));
  tmp_v = vpadd_u32(tmp_v, tmp_v);
  *(ptr) = vget_lane_u32(tmp_v, 0);
#endif
}
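/*
 * Minimal usage sketch for AddLanes (hypothetical values, not from the
 * original source): sums the four u32 lanes into the pointed-to scalar.
 */
void add_lanes_example(void)
{
  uint32_t total;
  const uint32x4_t v = vdupq_n_u32(3); /* lanes {3, 3, 3, 3} */
  AddLanes(&total, v);                 /* total == 12 */
  (void)total;
}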
/*
 * Salsa20 stream cipher on NEON, processing two 64-byte blocks in flight.
 * ROUNDS (20 for Salsa20) and sigma (the "expand 32-byte k" constant) are
 * defined elsewhere in the original source. Inside the round loop the
 * next_* statements for the second block are deliberately interleaved with
 * the first block's for instruction scheduling.
 */
int crypto_stream_xor(
  unsigned char *c,
  const unsigned char *m, unsigned long long mlen,
  const unsigned char *n,
  const unsigned char *k
)
{
  const uint32x4_t abab = {-1,0,-1,0};
  const uint64x1_t nextblock = {1};

  uint32x4_t k0k1k2k3 = (uint32x4_t) vld1q_u8((uint8_t *) k);
  uint32x4_t k4k5k6k7 = (uint32x4_t) vld1q_u8((uint8_t *) (k + 16));
  uint32x4_t start0 = (uint32x4_t) vld1q_u8((uint8_t *) sigma);
  uint32x2_t n0n1 = (uint32x2_t) vld1_u8((uint8_t *) n);
  uint32x2_t n2n3 = {0,0};

  uint32x2_t k0k1 = vget_low_u32(k0k1k2k3);
  uint32x2_t k2k3 = vget_high_u32(k0k1k2k3);
  uint32x2_t k4k5 = vget_low_u32(k4k5k6k7);
  uint32x2_t k6k7 = vget_high_u32(k4k5k6k7);
  uint32x2_t n1n0 = vext_u32(n0n1,n0n1,1);
  uint32x2_t n3n2;
  uint32x2_t n0k4 = vext_u32(n1n0,k4k5,1);
  uint32x2_t k5k0 = vext_u32(k4k5,k0k1,1);
  uint32x2_t k1n1 = vext_u32(k0k1,n1n0,1);
  uint32x2_t n2k6;
  uint32x2_t k7k2 = vext_u32(k6k7,k2k3,1);
  uint32x2_t k3n3;

  uint32x4_t start1 = vcombine_u32(k5k0,n0k4);
  uint32x4_t start2;
  uint32x4_t start3;

  register uint32x4_t diag0, diag1, diag2, diag3;
  uint32x4_t next_start2, next_start3;
  register uint32x4_t next_diag0, next_diag1, next_diag2, next_diag3;

  uint32x4_t x0x5x10x15, x12x1x6x11, x8x13x2x7, x4x9x14x3;
  uint32x4_t x0x1x10x11, x12x13x6x7, x8x9x2x3, x4x5x14x15;
  uint32x4_t x0x1x2x3, x4x5x6x7, x8x9x10x11, x12x13x14x15;
  uint32x4_t m0m1m2m3, m4m5m6m7, m8m9m10m11, m12m13m14m15;

  register uint32x4_t a0, a1, a2, a3;
  register uint32x4_t b0, b1, b2, b3;
  register uint32x4_t next_a0, next_a1, next_a2, next_a3;
  register uint32x4_t next_b0, next_b1, next_b2, next_b3;

  unsigned char block[64];
  unsigned char *savec;
  int i;
  int flagm = (m != 0); /* 0: generate keystream, nonzero: XOR with message */

  if (!mlen) return 0;
  if (mlen < 128) goto mlenatleast1;

mlenatleast128:

  /* Set up the first block's state. */
  n3n2 = vext_u32(n2n3,n2n3,1);
  n2k6 = vext_u32(n3n2,k6k7,1);
  k3n3 = vext_u32(k2k3,n3n2,1);
  start2 = vcombine_u32(n2k6,k1n1);
  start3 = vcombine_u32(k3n3,k7k2);
  n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3); /* bump block counter */
  diag0 = start0; diag1 = start1; diag2 = start2; diag3 = start3;

  /* Set up the second block's state. */
  n3n2 = vext_u32(n2n3,n2n3,1);
  n2k6 = vext_u32(n3n2,k6k7,1);
  k3n3 = vext_u32(k2k3,n3n2,1);
  next_start2 = vcombine_u32(n2k6,k1n1);
  next_start3 = vcombine_u32(k3n3,k7k2);
  n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3);
  next_diag0 = start0; next_diag1 = start1; next_diag2 = next_start2; next_diag3 = next_start3;

  for (i = ROUNDS;i > 0;i -= 2) {
    a0 = diag1 + diag0;
    b0 = vshlq_n_u32(a0,7);
    next_a0 = next_diag1 + next_diag0;
    a0 = vsriq_n_u32(b0,a0,25);
    next_b0 = vshlq_n_u32(next_a0,7);
    diag3 ^= a0;
    next_a0 = vsriq_n_u32(next_b0,next_a0,25);
    a1 = diag0 + diag3;
    next_diag3 ^= next_a0;
    b1 = vshlq_n_u32(a1,9);
    next_a1 = next_diag0 + next_diag3;
    a1 = vsriq_n_u32(b1,a1,23);
    next_b1 = vshlq_n_u32(next_a1,9);
    diag2 ^= a1;
    next_a1 = vsriq_n_u32(next_b1,next_a1,23);
    a2 = diag3 + diag2;
    diag3 = vextq_u32(diag3,diag3,3);
    next_diag2 ^= next_a1;
    b2 = vshlq_n_u32(a2,13);
    next_a2 = next_diag3 + next_diag2;
    next_diag3 = vextq_u32(next_diag3,next_diag3,3);
    a2 = vsriq_n_u32(b2,a2,19);
    next_b2 = vshlq_n_u32(next_a2,13);
    diag1 ^= a2;
    next_a2 = vsriq_n_u32(next_b2,next_a2,19);
    a3 = diag2 + diag1;
    diag2 = vextq_u32(diag2,diag2,2);
    next_diag1 ^= next_a2;
    b3 = vshlq_n_u32(a3,18);
    diag1 = vextq_u32(diag1,diag1,1);
    next_a3 = next_diag2 + next_diag1;
    next_diag2 = vextq_u32(next_diag2,next_diag2,2);
    a3 = vsriq_n_u32(b3,a3,14);
    next_b3 = vshlq_n_u32(next_a3,18);
    next_diag1 = vextq_u32(next_diag1,next_diag1,1);
    diag0 ^= a3;
    next_a3 = vsriq_n_u32(next_b3,next_a3,14);
    a0 = diag3 + diag0;
    next_diag0 ^= next_a3;
    b0 = vshlq_n_u32(a0,7);
    next_a0 = next_diag3 + next_diag0;
    a0 = vsriq_n_u32(b0,a0,25);
    next_b0 = vshlq_n_u32(next_a0,7);
    diag1 ^= a0;
    next_a0 = vsriq_n_u32(next_b0,next_a0,25);
    a1 = diag0 + diag1;
    next_diag1 ^= next_a0;
    b1 = vshlq_n_u32(a1,9);
    next_a1 = next_diag0 + next_diag1;
    a1 = vsriq_n_u32(b1,a1,23);
    next_b1 = vshlq_n_u32(next_a1,9);
    diag2 ^= a1;
    next_a1 = vsriq_n_u32(next_b1,next_a1,23);
    a2 = diag1 + diag2;
    diag1 = vextq_u32(diag1,diag1,3);
    next_diag2 ^= next_a1;
    b2 = vshlq_n_u32(a2,13);
    next_a2 = next_diag1 + next_diag2;
    next_diag1 = vextq_u32(next_diag1,next_diag1,3);
    a2 = vsriq_n_u32(b2,a2,19);
    next_b2 = vshlq_n_u32(next_a2,13);
    diag3 ^= a2;
    next_a2 = vsriq_n_u32(next_b2,next_a2,19);
    a3 = diag2 + diag3;
    diag2 = vextq_u32(diag2,diag2,2);
    next_diag3 ^= next_a2;
    b3 = vshlq_n_u32(a3,18);
    diag3 = vextq_u32(diag3,diag3,1);
    next_a3 = next_diag2 + next_diag3;
    next_diag2 = vextq_u32(next_diag2,next_diag2,2);
    a3 = vsriq_n_u32(b3,a3,14);
    next_b3 = vshlq_n_u32(next_a3,18);
    next_diag3 = vextq_u32(next_diag3,next_diag3,1);
    diag0 ^= a3;
    next_a3 = vsriq_n_u32(next_b3,next_a3,14);
    next_diag0 ^= next_a3;
  }

  /* First block: add the starting state back in. */
  x0x5x10x15 = diag0 + start0;
  x12x1x6x11 = diag1 + start1;
  x8x13x2x7 = diag2 + start2;
  x4x9x14x3 = diag3 + start3;

  if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8((uint8_t *) m);
  if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(16 + (uint8_t *) m);
  if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(32 + (uint8_t *) m);
  if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(48 + (uint8_t *) m);

  /* Un-diagonalize back to row order via abab blends and half swaps. */
  x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
  x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
  x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
  x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);

  x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
  x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
  x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
  x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));

  if (flagm) x0x1x2x3 ^= m0m1m2m3;
  if (flagm) x4x5x6x7 ^= m4m5m6m7;
  if (flagm) x8x9x10x11 ^= m8m9m10m11;
  if (flagm) x12x13x14x15 ^= m12m13m14m15;

  vst1q_u8((uint8_t *) c,(uint8x16_t) x0x1x2x3);
  vst1q_u8(16 + (uint8_t *) c,(uint8x16_t) x4x5x6x7);
  vst1q_u8(32 + (uint8_t *) c,(uint8x16_t) x8x9x10x11);
  vst1q_u8(48 + (uint8_t *) c,(uint8x16_t) x12x13x14x15);

  /* Second block: same epilogue against next_start2/next_start3. */
  x0x5x10x15 = next_diag0 + start0;
  x12x1x6x11 = next_diag1 + start1;
  x8x13x2x7 = next_diag2 + next_start2;
  x4x9x14x3 = next_diag3 + next_start3;

  if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8(64 + (uint8_t *) m);
  if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(80 + (uint8_t *) m);
  if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(96 + (uint8_t *) m);
  if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(112 + (uint8_t *) m);

  x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
  x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
  x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
  x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);

  x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
  x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
  x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
  x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));

  if (flagm) x0x1x2x3 ^= m0m1m2m3;
  if (flagm) x4x5x6x7 ^= m4m5m6m7;
  if (flagm) x8x9x10x11 ^= m8m9m10m11;
  if (flagm) x12x13x14x15 ^= m12m13m14m15;

  vst1q_u8(64 + (uint8_t *) c,(uint8x16_t) x0x1x2x3);
  vst1q_u8(80 + (uint8_t *) c,(uint8x16_t) x4x5x6x7);
  vst1q_u8(96 + (uint8_t *) c,(uint8x16_t) x8x9x10x11);
  vst1q_u8(112 + (uint8_t *) c,(uint8x16_t) x12x13x14x15);

  mlen -= 128;
  c += 128;
  if (flagm) m += 128;

  if (mlen >= 128) goto mlenatleast128;

mlenatleast1:

  /* Short tail: route the partial block through a stack buffer. */
  if (mlen < 64) {
    if (flagm) for (i = 0;i < 64;++i) block[i] = 0;
    if (flagm) for (i = 0;i < mlen;++i) block[i] = m[i];
    savec = c;
    c = block;
    if (flagm) m = block;
  }

  n3n2 = vext_u32(n2n3,n2n3,1);
  n2k6 = vext_u32(n3n2,k6k7,1);
  k3n3 = vext_u32(k2k3,n3n2,1);
  start2 = vcombine_u32(n2k6,k1n1);
  start3 = vcombine_u32(k3n3,k7k2);
  diag0 = start0; diag1 = start1; diag2 = start2; diag3 = start3;

  for (i = ROUNDS;i > 0;i -= 2) {
    a0 = diag1 + diag0;
    b0 = vshlq_n_u32(a0,7);
    a0 = vsriq_n_u32(b0,a0,25);
    diag3 ^= a0;
    a1 = diag0 + diag3;
    b1 = vshlq_n_u32(a1,9);
    a1 = vsriq_n_u32(b1,a1,23);
    diag2 ^= a1;
    a2 = diag3 + diag2;
    diag3 = vextq_u32(diag3,diag3,3);
    b2 = vshlq_n_u32(a2,13);
    a2 = vsriq_n_u32(b2,a2,19);
    diag1 ^= a2;
    a3 = diag2 + diag1;
    diag2 = vextq_u32(diag2,diag2,2);
    b3 = vshlq_n_u32(a3,18);
    diag1 = vextq_u32(diag1,diag1,1);
    a3 = vsriq_n_u32(b3,a3,14);
    diag0 ^= a3;
    a0 = diag3 + diag0;
    b0 = vshlq_n_u32(a0,7);
    a0 = vsriq_n_u32(b0,a0,25);
    diag1 ^= a0;
    a1 = diag0 + diag1;
    b1 = vshlq_n_u32(a1,9);
    a1 = vsriq_n_u32(b1,a1,23);
    diag2 ^= a1;
    a2 = diag1 + diag2;
    diag1 = vextq_u32(diag1,diag1,3);
    b2 = vshlq_n_u32(a2,13);
    a2 = vsriq_n_u32(b2,a2,19);
    diag3 ^= a2;
    a3 = diag2 + diag3;
    diag2 = vextq_u32(diag2,diag2,2);
    b3 = vshlq_n_u32(a3,18);
    diag3 = vextq_u32(diag3,diag3,1);
    a3 = vsriq_n_u32(b3,a3,14);
    diag0 ^= a3;
  }

  x0x5x10x15 = diag0 + start0;
  x12x1x6x11 = diag1 + start1;
  x8x13x2x7 = diag2 + start2;
  x4x9x14x3 = diag3 + start3;

  if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8((uint8_t *) m);
  if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(16 + (uint8_t *) m);
  if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(32 + (uint8_t *) m);
  if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(48 + (uint8_t *) m);

  x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
  x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
  x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
  x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);

  x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
  x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
  x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
  x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));

  if (flagm) x0x1x2x3 ^= m0m1m2m3;
  if (flagm) x4x5x6x7 ^= m4m5m6m7;
  if (flagm) x8x9x10x11 ^= m8m9m10m11;
  if (flagm) x12x13x14x15 ^= m12m13m14m15;

  vst1q_u8((uint8_t *) c,(uint8x16_t) x0x1x2x3);
  vst1q_u8(16 + (uint8_t *) c,(uint8x16_t) x4x5x6x7);
  vst1q_u8(32 + (uint8_t *) c,(uint8x16_t) x8x9x10x11);
  vst1q_u8(48 + (uint8_t *) c,(uint8x16_t) x12x13x14x15);

  if (mlen < 64) {
    for (i = 0;i < mlen;++i) savec[i] = c[i];
  }
  if (mlen <= 64) return 0;

  n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3);
  mlen -= 64;
  c += 64;
  if (flagm) m += 64;
  goto mlenatleast1;
}
uint32x2_t test_vget_low_u32(uint32x4_t a) {
  // CHECK-LABEL: test_vget_low_u32:
  return vget_low_u32(a);
  // CHECK-NEXT: ret
}
inline uint32x2_t vget_low(const uint32x4_t & v) { return vget_low_u32(v); }
KFR_SINTRIN bool bittestany(const u32neon& a)
{
    const uint32x2_t tmp = vorr_u32(vget_low_u32(*a), vget_high_u32(*a));
    return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0;
}
KFR_SINTRIN bool bittestall(const u32neon& a)
{
    const uint32x2_t tmp = vand_u32(vget_low_u32(*a), vget_high_u32(*a));
    return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu;
}
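/*
 * The same any/all tests without the KFR wrapper type -- a sketch over a raw
 * uint32x4_t (hypothetical helper names). vpmax/vpmin fold the last two
 * lanes so a single lane extraction decides the predicate.
 */
static inline int neon_any_u32(uint32x4_t v)
{
    const uint32x2_t tmp = vorr_u32(vget_low_u32(v), vget_high_u32(v));
    return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0;
}

static inline int neon_all_bits_u32(uint32x4_t v)
{
    const uint32x2_t tmp = vand_u32(vget_low_u32(v), vget_high_u32(v));
    return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu;
}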
inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
    d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
}
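/*
 * Usage sketch for v_mul_expand, assuming OpenCV's NEON universal-intrinsics
 * backend (v_setall_u32 and the .val members come from that API; the example
 * values are made up): a widening 32x32 -> 64-bit multiply of all four lanes.
 */
void v_mul_expand_example(void)
{
    const v_uint32x4 a = v_setall_u32(0x10000u);
    const v_uint32x4 b = v_setall_u32(0x10000u);
    v_uint64x2 c, d;
    v_mul_expand(a, b, c, d);
    /* every lane of c.val and d.val is 0x100000000, which needs 64 bits */
    uint64_t lane0 = vgetq_lane_u64(c.val, 0);
    (void)lane0;
}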
static inline int32_t TransformAndFindMaxNeon(int16_t* inre,
                                              int16_t* inim,
                                              int32_t* outre,
                                              int32_t* outim) {
  int k;
  int16_t* inre1 = inre;
  int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4];
  int16_t* inim1 = inim;
  int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4];
  int32_t* outre1 = outre;
  int32_t* outre2 = &outre[FRAMESAMPLES/2 - 4];
  int32_t* outim1 = outim;
  int32_t* outim2 = &outim[FRAMESAMPLES/2 - 4];
  const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0];
  const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 - 4];
  uint32x4_t max_r = vdupq_n_u32(0);
  uint32x4_t max_i = vdupq_n_u32(0);

  // Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code.
  for (k = 0; k < FRAMESAMPLES/4; k += 4) {
    int16x4_t tmpi = vld1_s16(kSinTab1);
    kSinTab1 += 4;
    int16x4_t tmpr = vld1_s16(kSinTab2);
    kSinTab2 -= 4;
    int16x4_t inre_0 = vld1_s16(inre1);
    inre1 += 4;
    int16x4_t inre_1 = vld1_s16(inre2);
    inre2 -= 4;
    int16x4_t inim_0 = vld1_s16(inim1);
    inim1 += 4;
    int16x4_t inim_1 = vld1_s16(inim2);
    inim2 -= 4;
    tmpr = vneg_s16(tmpr);
    inre_1 = vrev64_s16(inre_1);
    inim_1 = vrev64_s16(inim_1);
    tmpr = vrev64_s16(tmpr);

    int32x4_t xr = vmull_s16(tmpr, inre_0);
    int32x4_t xi = vmull_s16(tmpr, inim_0);
    int32x4_t yr = vmull_s16(tmpr, inim_1);
    int32x4_t yi = vmull_s16(tmpi, inim_1);
    xr = vmlal_s16(xr, tmpi, inim_0);
    xi = vmlsl_s16(xi, tmpi, inre_0);
    yr = vmlal_s16(yr, tmpi, inre_1);
    yi = vmlsl_s16(yi, tmpr, inre_1);
    yr = vnegq_s32(yr);

    xr = vshrq_n_s32(xr, 5);
    xi = vshrq_n_s32(xi, 5);
    yr = vshrq_n_s32(yr, 5);
    yi = vshrq_n_s32(yi, 5);

    int32x4_t outr0 = vsubq_s32(xr, yi);
    int32x4_t outr1 = vaddq_s32(xr, yi);
    int32x4_t outi0 = vaddq_s32(xi, yr);
    int32x4_t outi1 = vsubq_s32(yr, xi);

    // Find the absolute maximum in the vectors.
    int32x4_t tmp0 = vabsq_s32(outr0);
    int32x4_t tmp1 = vabsq_s32(outr1);
    int32x4_t tmp2 = vabsq_s32(outi0);
    int32x4_t tmp3 = vabsq_s32(outi1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));

    // Store the vectors.
    outr1 = vrev64q_s32(outr1);
    outi1 = vrev64q_s32(outi1);
    int32x4_t outr_1 = vcombine_s32(vget_high_s32(outr1), vget_low_s32(outr1));
    int32x4_t outi_1 = vcombine_s32(vget_high_s32(outi1), vget_low_s32(outi1));

    vst1q_s32(outre1, outr0);
    outre1 += 4;
    vst1q_s32(outim1, outi0);
    outim1 += 4;
    vst1q_s32(outre2, outr_1);
    outre2 -= 4;
    vst1q_s32(outim2, outi_1);
    outim2 -= 4;
  }

  max_r = vmaxq_u32(max_r, max_i);
#if defined(WEBRTC_ARCH_ARM64)
  uint32_t maximum = vmaxvq_u32(max_r);
#else
  uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
  max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
  uint32_t maximum = vget_lane_u32(max32x2_r, 0);
#endif

  return (int32_t)maximum;
}
static inline int32_t ComplexMulAndFindMaxNeon(int16_t* inre1Q9,
                                               int16_t* inre2Q9,
                                               int32_t* outreQ16,
                                               int32_t* outimQ16) {
  int k;
  const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0];
  const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0];
  // 0.5 / sqrt(240) in Q19 is round((.5 / sqrt(240)) * (2^19)) = 16921.
  // Use "16921 << 5" and vqdmulh, instead of ">> 26" as in the C code.
  int32_t fact = 16921 << 5;
  int32x4_t factq = vdupq_n_s32(fact);
  uint32x4_t max_r = vdupq_n_u32(0);
  uint32x4_t max_i = vdupq_n_u32(0);

  for (k = 0; k < FRAMESAMPLES/2; k += 8) {
    int16x8_t tmpr = vld1q_s16(kCosTab);
    int16x8_t tmpi = vld1q_s16(kSinTab);
    int16x8_t inre1 = vld1q_s16(inre1Q9);
    int16x8_t inre2 = vld1q_s16(inre2Q9);
    kCosTab += 8;
    kSinTab += 8;
    inre1Q9 += 8;
    inre2Q9 += 8;

    // Use ">> 26", instead of ">> 7", ">> 16" and then ">> 3" as in the C code.
    int32x4_t tmp0 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre1));
    int32x4_t tmp1 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre2));
    tmp0 = vmlal_s16(tmp0, vget_low_s16(tmpi), vget_low_s16(inre2));
    tmp1 = vmlsl_s16(tmp1, vget_low_s16(tmpi), vget_low_s16(inre1));
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t tmp2 = vmull_high_s16(tmpr, inre1);
    int32x4_t tmp3 = vmull_high_s16(tmpr, inre2);
    tmp2 = vmlal_high_s16(tmp2, tmpi, inre2);
    tmp3 = vmlsl_high_s16(tmp3, tmpi, inre1);
#else
    int32x4_t tmp2 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre1));
    int32x4_t tmp3 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre2));
    tmp2 = vmlal_s16(tmp2, vget_high_s16(tmpi), vget_high_s16(inre2));
    tmp3 = vmlsl_s16(tmp3, vget_high_s16(tmpi), vget_high_s16(inre1));
#endif

    int32x4_t outr_0 = vqdmulhq_s32(tmp0, factq);
    int32x4_t outr_1 = vqdmulhq_s32(tmp2, factq);
    int32x4_t outi_0 = vqdmulhq_s32(tmp1, factq);
    int32x4_t outi_1 = vqdmulhq_s32(tmp3, factq);
    vst1q_s32(outreQ16, outr_0);
    outreQ16 += 4;
    vst1q_s32(outreQ16, outr_1);
    outreQ16 += 4;
    vst1q_s32(outimQ16, outi_0);
    outimQ16 += 4;
    vst1q_s32(outimQ16, outi_1);
    outimQ16 += 4;

    // Find the absolute maximum in the vectors.
    tmp0 = vabsq_s32(outr_0);
    tmp1 = vabsq_s32(outr_1);
    tmp2 = vabsq_s32(outi_0);
    tmp3 = vabsq_s32(outi_1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));
  }

  max_r = vmaxq_u32(max_r, max_i);
#if defined(WEBRTC_ARCH_ARM64)
  uint32_t maximum = vmaxvq_u32(max_r);
#else
  uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
  max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
  uint32_t maximum = vget_lane_u32(max32x2_r, 0);
#endif

  return (int32_t)maximum;
}
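/*
 * The horizontal-max reduction both WebRTC routines above end with, factored
 * into a standalone helper (a sketch; the helper name is hypothetical, the
 * WEBRTC_ARCH_ARM64 guard is the one the snippets themselves use).
 */
static inline uint32_t HorizontalMaxU32(uint32x4_t v) {
#if defined(WEBRTC_ARCH_ARM64)
  return vmaxvq_u32(v);  // single-instruction across-lane reduce
#else
  uint32x2_t m = vmax_u32(vget_low_u32(v), vget_high_u32(v));
  m = vpmax_u32(m, m);   // fold the remaining two lanes
  return vget_lane_u32(m, 0);
#endif
}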
size_t mempopcnt(const void *s, size_t len)
{
	uint8x16_t v_0;
	uint8x16_t c;
	uint32x4_t v_sum;
	uint32x2_t v_tsum;
	unsigned char *p;
	size_t r;
	unsigned shift;

	prefetch(s);

	// TODO: do this in 64 bit? the mem model seems more that way...
	v_0 = (uint8x16_t){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
	v_sum = (uint32x4_t)v_0;
	p = (unsigned char *)ALIGN_DOWN(s, SOVUCQ);
	shift = ALIGN_DOWN_DIFF(s, SOVUCQ);
	c = *(const uint8x16_t *)p;
	if(HOST_IS_BIGENDIAN)
		c = neon_simple_alignq(v_0, c, SOVUCQ - shift);
	else
		c = neon_simple_alignq(c, v_0, shift);

	if(len >= SOVUCQ || len + shift >= SOVUCQ)
	{
		p += SOVUCQ;
		len -= SOVUCQ - shift;
		v_sum = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));

		while(len >= SOVUCQ * 2)
		{
			uint8x16_t v_sumb = v_0;

			r = len / (SOVUCQ * 2);
			r = r > 15 ? 15 : r;
			len -= r * SOVUCQ * 2;
			/*
			 * NEON has a vector popcnt instruction, so no compression.
			 * We trust the speed given in the handbook (adding more
			 * instructions would not make it faster), 1-2 cycles.
			 */
			for(; r; r--, p += SOVUCQ * 2) {
				c = *(const uint8x16_t *)p;
				v_sumb = vaddq_u8(v_sumb, vcntq_u8(c));
				c = *((const uint8x16_t *)(p + SOVUCQ));
				v_sumb = vaddq_u8(v_sumb, vcntq_u8(c));
			}
			v_sum = vpadalq_u16(v_sum, vpaddlq_u8(v_sumb));
		}
		if(len >= SOVUCQ) {
			c = *(const uint8x16_t *)p;
			p += SOVUCQ;
			v_sum = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));
			len -= SOVUCQ;
		}
		if(len)
			c = *(const uint8x16_t *)p;
	}
	if(len) {
		if(HOST_IS_BIGENDIAN)
			c = neon_simple_alignq(c, v_0, SOVUCQ - len);
		else
			c = neon_simple_alignq(v_0, c, len);
		v_sum = vpadalq_u16(v_sum, vpaddlq_u8(vcntq_u8(c)));
	}

	v_tsum = vpadd_u32(vget_high_u32(v_sum), vget_low_u32(v_sum));
	v_tsum = vpadd_u32(v_tsum, v_tsum);
	return vget_lane_u32(v_tsum, 0);
}
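/*
 * A plain scalar reference for cross-checking mempopcnt (a sketch; it relies
 * on the GCC/Clang __builtin_popcount builtin, not on anything above).
 */
static size_t mempopcnt_ref(const void *s, size_t len)
{
	const unsigned char *p = (const unsigned char *)s;
	size_t r = 0;
	while (len--)
		r += (size_t)__builtin_popcount(*p++);
	return r;
}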
void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y,  // unused
                              int y_step_q4,            // unused
                              int w, int h) {
  int width;
  const uint8_t *s, *psrc;
  uint8_t *d, *pdst;
  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
  uint8x16_t q12u8, q13u8, q14u8, q15u8;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;
  uint16x8x2_t q0x2u16;
  uint8x8x2_t d0x2u8, d1x2u8;
  uint32x2x2_t d0x2u32;
  uint16x4x2_t d0x2u16, d1x2u16;
  uint32x4x2_t q0x2u32;

  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_y;

  q0s16 = vld1q_s16(filter_x);

  src -= 3;  // adjust for taps

  for (; h > 0; h -= 4, src += src_stride * 4, dst += dst_stride * 4) {
    // loop_horiz_v
    s = src;
    d24u8 = vld1_u8(s);
    s += src_stride;
    d25u8 = vld1_u8(s);
    s += src_stride;
    d26u8 = vld1_u8(s);
    s += src_stride;
    d27u8 = vld1_u8(s);

    q12u8 = vcombine_u8(d24u8, d25u8);
    q13u8 = vcombine_u8(d26u8, d27u8);

    q0x2u16 =
        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
    d0x2u8 = vtrn_u8(d24u8, d25u8);
    d1x2u8 = vtrn_u8(d26u8, d27u8);

    __builtin_prefetch(src + src_stride * 4);
    __builtin_prefetch(src + src_stride * 5);
    __builtin_prefetch(src + src_stride * 6);

    q8u16 = vmovl_u8(d0x2u8.val[0]);
    q9u16 = vmovl_u8(d0x2u8.val[1]);
    q10u16 = vmovl_u8(d1x2u8.val[0]);
    q11u16 = vmovl_u8(d1x2u8.val[1]);
    d16u16 = vget_low_u16(q8u16);
    d17u16 = vget_high_u16(q8u16);
    d18u16 = vget_low_u16(q9u16);
    d19u16 = vget_high_u16(q9u16);
    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
    q9u16 = vcombine_u16(d17u16, d19u16);

    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21

    for (width = w, psrc = src + 7, pdst = dst; width > 0;
         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz
      s = psrc;
      d28u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d29u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d31u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d30u32 = vld1_dup_u32((const uint32_t *)s);

      __builtin_prefetch(psrc + 64);

      d0x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
      d1x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30

      __builtin_prefetch(psrc + 64 + src_stride);

      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
      q0x2u32 =
          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));

      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
      q12u16 = vmovl_u8(d28u8);
      q13u16 = vmovl_u8(d29u8);

      __builtin_prefetch(psrc + 64 + src_stride * 2);

      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
                             d23s16, d24s16, q0s16);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
                             d24s16, d26s16, q0s16);
      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
                              d26s16, d27s16, q0s16);
      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      __builtin_prefetch(psrc + 60 + src_stride * 3);

      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);

      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
                         vreinterpret_u32_u16(d0x2u16.val[1]));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
                       vreinterpret_u8_u32(d0x2u32.val[1]));

      d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
      d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);

      d = pdst;
      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);

      q8u16 = q9u16;
      d20s16 = d23s16;
      q11u16 = q12u16;
      q9u16 = q13u16;
      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    }
  }
  return;
}
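/*
 * MULTIPLY_BY_Q0 is defined elsewhere in the original file. A plausible
 * reconstruction (an assumption based on the libvpx/libaom convolve code,
 * not taken from the snippet above): an 8-tap dot product where each dsrcN
 * holds four pixels of tap N and q0s16 holds the eight filter coefficients.
 */
static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
                                       int16x4_t dsrc2, int16x4_t dsrc3,
                                       int16x4_t dsrc4, int16x4_t dsrc5,
                                       int16x4_t dsrc6, int16x4_t dsrc7,
                                       int16x8_t q0s16) {
  int32x4_t qdst;
  int16x4_t d0s16 = vget_low_s16(q0s16);   // taps 0..3
  int16x4_t d1s16 = vget_high_s16(q0s16);  // taps 4..7

  qdst = vmull_lane_s16(dsrc0, d0s16, 0);
  qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
  qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
  qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
  qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
  qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
  qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
  qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
  return qdst;
}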
// CHECK-LABEL: define <2 x i32> @test_vget_low_u32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vget_low_u32(uint32x4_t a) {
  return vget_low_u32(a);
}
/*
 * Salsa20 core on NEON: one 64-byte output block from nonce/counter (in),
 * key (k), and the diagonal constants (c). ROUNDS (20 for Salsa20) is
 * defined elsewhere in the original source.
 */
int crypto_core(
  unsigned char *out,
  const unsigned char *in,
  const unsigned char *k,
  const unsigned char *c
)
{
  int i;

  const uint32x4_t abab = {-1,0,-1,0};

  uint32x4_t k0k1k2k3 = (uint32x4_t) vld1q_u8((uint8_t *) k);
  uint32x4_t k4k5k6k7 = (uint32x4_t) vld1q_u8((uint8_t *) (k + 16));
  uint32x4_t c0c1c2c3 = (uint32x4_t) vld1q_u8((uint8_t *) c);
  uint32x4_t n0n1n2n3 = (uint32x4_t) vld1q_u8((uint8_t *) in);
  uint32x4_t n1n2n3n0 = vextq_u32(n0n1n2n3,n0n1n2n3,1);

  uint32x2_t n1n2 = vget_low_u32(n1n2n3n0);
  uint32x2_t n3n0 = vget_high_u32(n1n2n3n0);
  uint32x2_t k0k1 = vget_low_u32(k0k1k2k3);
  uint32x2_t k2k3 = vget_high_u32(k0k1k2k3);
  uint32x2_t k4k5 = vget_low_u32(k4k5k6k7);
  uint32x2_t k6k7 = vget_high_u32(k4k5k6k7);

  uint32x2_t n0k0 = vext_u32(n3n0,k0k1,1);
  uint32x2_t k0n0 = vext_u32(n0k0,n0k0,1);
  uint32x4_t k4k5k0n0 = vcombine_u32(k4k5,k0n0);

  uint32x2_t k1k6 = vext_u32(k0k1,k6k7,1);
  uint32x2_t k6k1 = vext_u32(k1k6,k1k6,1);
  uint32x4_t n1n2k6k1 = vcombine_u32(n1n2,k6k1);

  uint32x2_t k7n3 = vext_u32(k6k7,n3n0,1);
  uint32x2_t n3k7 = vext_u32(k7n3,k7n3,1);
  uint32x4_t k2k3n3k7 = vcombine_u32(k2k3,n3k7);

  uint32x4_t start0 = c0c1c2c3;
  uint32x4_t start1 = vextq_u32(k4k5k0n0,k4k5k0n0,1);
  uint32x4_t start2 = vextq_u32(n1n2k6k1,n1n2k6k1,1);
  uint32x4_t start3 = vextq_u32(k2k3n3k7,k2k3n3k7,1);

  uint32x4_t diag0 = start0;
  uint32x4_t diag1 = start1;
  uint32x4_t diag2 = start2;
  uint32x4_t diag3 = start3;

  uint32x4_t x0x5x10x15, x12x1x6x11, x8x13x2x7, x4x9x14x3;
  uint32x4_t x0x1x10x11, x12x13x6x7, x8x9x2x3, x4x5x14x15;
  uint32x4_t x0x1x2x3, x4x5x6x7, x8x9x10x11, x12x13x14x15;
  uint32x4_t a0, a1, a2, a3;

  for (i = ROUNDS; i > 0; i -= 2) {
    /* Column round: vshl/vsri pairs implement the 32-bit rotates. */
    a0 = diag1 + diag0;
    diag3 ^= vsriq_n_u32(vshlq_n_u32(a0,7),a0,25);
    a1 = diag0 + diag3;
    diag2 ^= vsriq_n_u32(vshlq_n_u32(a1,9),a1,23);
    a2 = diag3 + diag2;
    diag1 ^= vsriq_n_u32(vshlq_n_u32(a2,13),a2,19);
    a3 = diag2 + diag1;
    diag0 ^= vsriq_n_u32(vshlq_n_u32(a3,18),a3,14);

    diag3 = vextq_u32(diag3,diag3,3);
    diag2 = vextq_u32(diag2,diag2,2);
    diag1 = vextq_u32(diag1,diag1,1);

    /* Row round on the rotated registers. */
    a0 = diag3 + diag0;
    diag1 ^= vsriq_n_u32(vshlq_n_u32(a0,7),a0,25);
    a1 = diag0 + diag1;
    diag2 ^= vsriq_n_u32(vshlq_n_u32(a1,9),a1,23);
    a2 = diag1 + diag2;
    diag3 ^= vsriq_n_u32(vshlq_n_u32(a2,13),a2,19);
    a3 = diag2 + diag3;
    diag0 ^= vsriq_n_u32(vshlq_n_u32(a3,18),a3,14);

    diag1 = vextq_u32(diag1,diag1,3);
    diag2 = vextq_u32(diag2,diag2,2);
    diag3 = vextq_u32(diag3,diag3,1);
  }

  x0x5x10x15 = diag0 + start0;
  x12x1x6x11 = diag1 + start1;
  x8x13x2x7 = diag2 + start2;
  x4x9x14x3 = diag3 + start3;

  /* Un-diagonalize back to row order via abab blends and half swaps. */
  x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
  x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
  x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
  x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);

  x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
  x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
  x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
  x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));

  vst1q_u8((uint8_t *) out,(uint8x16_t) x0x1x2x3);
  vst1q_u8(16 + (uint8_t *) out,(uint8x16_t) x4x5x6x7);
  vst1q_u8(32 + (uint8_t *) out,(uint8x16_t) x8x9x10x11);
  vst1q_u8(48 + (uint8_t *) out,(uint8x16_t) x12x13x14x15);

  return 0;
}
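/*
 * The abab/vbslq_u32 lane blend that both Salsa20 routines rely on, in
 * isolation (a sketch with made-up lane values): with mask lanes {~0,0,~0,0},
 * vbslq_u32 takes lanes 0 and 2 from its second operand and lanes 1 and 3
 * from its third.
 */
void blend_example(void)
{
  const uint32x4_t abab = {0xFFFFFFFFu, 0, 0xFFFFFFFFu, 0};
  const uint32x4_t x = {0, 5, 10, 15};  /* x0 x5 x10 x15 */
  const uint32x4_t y = {12, 1, 6, 11};  /* x12 x1 x6 x11 */
  uint32x4_t r = vbslq_u32(abab, x, y); /* r = {0, 1, 10, 11} */
  (void)r;
}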