void test_vqmovns32 (void)
{
  int16x4_t out_int16x4_t;
  int32x4_t arg0_int32x4_t;

  out_int16x4_t = vqmovn_s32 (arg0_int32x4_t);
}
static inline uint8x16_t condense_float_rgbas(float32x4_t rgba0,
					      float32x4_t rgba1,  
					      float32x4_t rgba2,
					      float32x4_t rgba3)
{
  uint8x16_t retval = {0};  /* 16 bytes as 4 4-byte RGBAs  */
  int32x4_t i32pixels0, i32pixels1, i32pixels2, i32pixels3;
  int16x4_t i16pixels0, i16pixels1, i16pixels2, i16pixels3;
  int16x8_t i16pixels01, i16pixels23;
  uint8x8_t u8pixels0, u8pixels1;
  
  /* the choice of saturating conversions here will turn the elements  */
  /* of the rgbaN vectors into unsigned chars (0 - 255), so no max/min  */
  /* is required here.   */


  /* first float to int  */
  i32pixels0 = vcvtq_s32_f32(rgba0);
  i32pixels1 = vcvtq_s32_f32(rgba1);
  i32pixels2 = vcvtq_s32_f32(rgba2);
  i32pixels3 = vcvtq_s32_f32(rgba3);

  /* then int to short  */
  i16pixels0 = vqmovn_s32(i32pixels0);
  i16pixels1 = vqmovn_s32(i32pixels1);
  i16pixels2 = vqmovn_s32(i32pixels2);
  i16pixels3 = vqmovn_s32(i32pixels3);
  
  i16pixels01 = vcombine_s16(i16pixels0, i16pixels1);
  i16pixels23 = vcombine_s16(i16pixels2, i16pixels3);

  /* now short to unsigned int. saturation takes care of the boundary cases  */
  u8pixels0 = vqmovun_s16(i16pixels01);
  u8pixels1 = vqmovun_s16(i16pixels23);
  
  retval = vcombine_u8(u8pixels0, u8pixels1);

  return(retval);
}
Beispiel #3
0
inline void ClampBufferToS16(s16 *out, const s32 *in, size_t size, s8 volShift) {
#ifdef _M_SSE
	// Size will always be 16-byte aligned as the hwBlockSize is.
	while (size >= 8) {
		__m128i in1 = _mm_loadu_si128((__m128i *)in);
		__m128i in2 = _mm_loadu_si128((__m128i *)(in + 4));
		__m128i packed = _mm_packs_epi32(in1, in2);
		if (useShift) {
			packed = _mm_srai_epi16(packed, volShift);
		}
		_mm_storeu_si128((__m128i *)out, packed);
		out += 8;
		in += 8;
		size -= 8;
	}
#elif PPSSPP_ARCH(ARM_NEON)
	int16x4_t signedVolShift = vdup_n_s16 (-volShift); // Can only dynamic-shift right, but by a signed integer
	while (size >= 8) {
		int32x4_t in1 = vld1q_s32(in);
		int32x4_t in2 = vld1q_s32(in + 4);
		int16x4_t packed1 = vqmovn_s32(in1);
		int16x4_t packed2 = vqmovn_s32(in2);
		if (useShift) {
			packed1 = vshl_s16(packed1, signedVolShift);
			packed2 = vshl_s16(packed2, signedVolShift);
		}
		vst1_s16(out, packed1);
		vst1_s16(out + 4, packed2);
		out += 8;
		in += 8;
		size -= 8;
	}
#endif
	// This does the remainder if SIMD was used, otherwise it does it all.
	for (size_t i = 0; i < size; i++) {
		out[i] = clamp_s16(useShift ? (in[i] >> volShift) : in[i]);
	}
}
Beispiel #4
0
// Update the noise estimation information.
static void UpdateNoiseEstimateNeon(NoiseSuppressionFixedC* inst, int offset) {
  const int16_t kExp2Const = 11819; // Q13
  int16_t* ptr_noiseEstLogQuantile = NULL;
  int16_t* ptr_noiseEstQuantile = NULL;
  int16x4_t kExp2Const16x4 = vdup_n_s16(kExp2Const);
  int32x4_t twentyOne32x4 = vdupq_n_s32(21);
  int32x4_t constA32x4 = vdupq_n_s32(0x1fffff);
  int32x4_t constB32x4 = vdupq_n_s32(0x200000);

  int16_t tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
                                        inst->magnLen);

  // Guarantee a Q-domain as high as possible and still fit in int16
  inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(kExp2Const,
                                                                 tmp16,
                                                                 21);

  int32x4_t qNoise32x4 = vdupq_n_s32(inst->qNoise);

  for (ptr_noiseEstLogQuantile = &inst->noiseEstLogQuantile[offset],
       ptr_noiseEstQuantile = &inst->noiseEstQuantile[0];
       ptr_noiseEstQuantile < &inst->noiseEstQuantile[inst->magnLen - 3];
       ptr_noiseEstQuantile += 4, ptr_noiseEstLogQuantile += 4) {

    // tmp32no2 = kExp2Const * inst->noiseEstLogQuantile[offset + i];
    int16x4_t v16x4 = vld1_s16(ptr_noiseEstLogQuantile);
    int32x4_t v32x4B = vmull_s16(v16x4, kExp2Const16x4);

    // tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
    int32x4_t v32x4A = vandq_s32(v32x4B, constA32x4);
    v32x4A = vorrq_s32(v32x4A, constB32x4);

    // tmp16 = (int16_t)(tmp32no2 >> 21);
    v32x4B = vshrq_n_s32(v32x4B, 21);

    // tmp16 -= 21;// shift 21 to get result in Q0
    v32x4B = vsubq_s32(v32x4B, twentyOne32x4);

    // tmp16 += (int16_t) inst->qNoise;
    // shift to get result in Q(qNoise)
    v32x4B = vaddq_s32(v32x4B, qNoise32x4);

    // if (tmp16 < 0) {
    //   tmp32no1 >>= -tmp16;
    // } else {
    //   tmp32no1 <<= tmp16;
    // }
    v32x4B = vshlq_s32(v32x4A, v32x4B);

    // tmp16 = WebRtcSpl_SatW32ToW16(tmp32no1);
    v16x4 = vqmovn_s32(v32x4B);

    //inst->noiseEstQuantile[i] = tmp16;
    vst1_s16(ptr_noiseEstQuantile, v16x4);
  }

  // Last iteration:

  // inst->quantile[i]=exp(inst->lquantile[offset+i]);
  // in Q21
  int32_t tmp32no2 = kExp2Const * *ptr_noiseEstLogQuantile;
  int32_t tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac

  tmp16 = (int16_t)(tmp32no2 >> 21);
  tmp16 -= 21;// shift 21 to get result in Q0
  tmp16 += (int16_t) inst->qNoise; //shift to get result in Q(qNoise)
  if (tmp16 < 0) {
    tmp32no1 >>= -tmp16;
  } else {
Beispiel #5
0
inline int16x4_t   vqmovn(const int32x4_t   & v) { return vqmovn_s32(v); }