C++ (Cpp) __SMUAD Exemples

Exemple #1

0

Afficher le fichier

Fichier : dsp_decimate.cpp Projet : CCrashBandicot/portapack-hackrf

buffer_c16_t Complex8DecimateBy2CIC3::execute(const buffer_c8_t& src, const buffer_c16_t& dst) {
	/* Decimates by two using a non-recursive third-order CIC filter.
	 */

	/* CIC filter (decimating by two):
	 * 	D_I0 = i3 * 1 + i2 * 3 + i1 * 3 + i0 * 1
	 * 	D_Q0 = q3 * 1 + q2 * 3 + q1 * 3 + q0 * 1
	 *
	 * 	D_I1 = i5 * 1 + i4 * 3 + i3 * 3 + i2 * 1
	 * 	D_Q1 = q5 * 1 + q4 * 3 + q3 * 3 + q2 * 1
	 */

	uint32_t i1_i0 = _i1_i0;
	uint32_t q1_q0 = _q1_q0;

	/* 3:1 Scaled by 32 to normalize output to +/-32768-ish. */
	constexpr uint32_t scale_factor = 32;
	constexpr uint32_t k_3_1 = 0x00030001 * scale_factor;
	uint32_t* src_p = reinterpret_cast<uint32_t*>(&src.p[0]);
	uint32_t* const src_end = reinterpret_cast<uint32_t*>(&src.p[src.count]);
	uint32_t* dst_p = reinterpret_cast<uint32_t*>(&dst.p[0]);
	while(src_p < src_end) {
		const uint32_t q3_i3_q2_i2 = *(src_p++);						// 3
		const uint32_t q5_i5_q4_i4 = *(src_p++);

		const uint32_t d_i0_partial = __SMUAD(k_3_1, i1_i0);			// 1: = 3 * i1 + 1 * i0
		const uint32_t i3_i2 = __SXTB16(q3_i3_q2_i2,  0);				// 1: (q3_i3_q2_i2 ror  0)[23:16]:(q3_i3_q2_i2 ror  0)[7:0]
		const uint32_t d_i0 = __SMLADX(k_3_1, i3_i2, d_i0_partial);		// 1: + 3 * i2 + 1 * i3

		const uint32_t d_q0_partial = __SMUAD(k_3_1, q1_q0);			// 1: = 3 * q1 * 1 * q0
		const uint32_t q3_q2 = __SXTB16(q3_i3_q2_i2,  8);				// 1: (q3_i3_q2_i2 ror  8)[23:16]:(q3_i3_q2_i2 ror  8)[7:0]
		const uint32_t d_q0 = __SMLADX(k_3_1, q3_q2, d_q0_partial);		// 1: + 3 * q2 + 1 * q3 

		const uint32_t d_q0_i0 = __PKHBT(d_i0, d_q0, 16);				// 1: (Rm<<16)[31:16]:Rn[15:0]

		const uint32_t d_i1_partial = __SMUAD(k_3_1, i3_i2);			// 1: = 3 * i3 + 1 * i2
		const uint32_t i5_i4 = __SXTB16(q5_i5_q4_i4,  0);				// 1: (q5_i5_q4_i4 ror  0)[23:16]:(q5_i5_q4_i4 ror  0)[7:0]
		const uint32_t d_i1 = __SMLADX(k_3_1, i5_i4, d_i1_partial);		// 1: + 1 * i5 + 3 * i4

		const uint32_t d_q1_partial = __SMUAD(k_3_1, q3_q2);			// 1: = 3 * q3 * 1 * q2
		const uint32_t q5_q4 = __SXTB16(q5_i5_q4_i4,  8);				// 1: (q5_i5_q4_i4 ror  8)[23:16]:(q5_i5_q4_i4 ror  8)[7:0]
		const uint32_t d_q1 = __SMLADX(k_3_1, q5_q4, d_q1_partial);		// 1: + 1 * q5 + 3 * q4 

		const uint32_t d_q1_i1 = __PKHBT(d_i1, d_q1, 16);				// 1: (Rm<<16)[31:16]:Rn[15:0]

		*(dst_p++) = d_q0_i0;											// 3
		*(dst_p++) = d_q1_i1;

		i1_i0 = i5_i4;
		q1_q0 = q5_q4;
	}
	_i1_i0 = i1_i0;
	_q1_q0 = q1_q0;

	return { dst.p, src.count / 2, src.sampling_rate / 2 };
}

Exemple #2

0

Afficher le fichier

Fichier : channel_stats_collector.hpp Projet : fengmushu/portapack-hackrf

	void feed(buffer_c16_t src, Callback callback) {
		auto src_p = src.p;
		while(src_p < &src.p[src.count]) {
			const uint32_t sample = *__SIMD32(src_p)++;
			const uint32_t mag_sq = __SMUAD(sample, sample);
			if( mag_sq > max_squared ) {
				max_squared = mag_sq;
			}
		}
		count += src.count;

		const size_t samples_per_update = src.sampling_rate * update_interval;

		if( count >= samples_per_update ) {
			const float max_squared_f = max_squared;
			const float max_db_f = complex16_mag_squared_to_dbv_norm(max_squared_f);
			const int32_t max_db = max_db_f;
			const ChannelStatistics statistics {
				.max_db = max_db,
				.count = count,
			};
			callback(statistics);

			max_squared = 0;
			count = 0;
		}
	}

Exemple #3

0

Afficher le fichier

Fichier : dsp_demodulate.cpp Projet : CCrashBandicot/portapack-hackrf

buffer_f32_t AM::execute(
	const buffer_c16_t& src,
	const buffer_f32_t& dst
) {
	const auto src_p = src.p;
	const auto src_end = &src.p[src.count];
	auto dst_p = dst.p;
	while(src_p < src_end) {
		const uint32_t sample0 = *__SIMD32(src_p)++;
		const uint32_t sample1 = *__SIMD32(src_p)++;
		const uint32_t mag_sq0 = __SMUAD(sample0, sample0);
		const uint32_t mag_sq1 = __SMUAD(sample1, sample1);
		*(dst_p++) = __builtin_sqrtf(mag_sq0) * k;
		*(dst_p++) = __builtin_sqrtf(mag_sq1) * k;
	}

	return { dst.p, src.count, src.sampling_rate };
}

Exemple #4

0

Afficher le fichier

Fichier : channel_stats_collector.hpp Projet : CCrashBandicot/portapack-hackrf

	void feed(const buffer_c16_t& src, Callback callback) {
		auto src_p = src.p;
		while(src_p < &src.p[src.count]) {
			const uint32_t sample = *__SIMD32(src_p)++;
			const uint32_t mag_sq = __SMUAD(sample, sample);
			if( mag_sq > max_squared ) {
				max_squared = mag_sq;
			}
		}
		count += src.count;

		const size_t samples_per_update = src.sampling_rate * update_interval;

		if( count >= samples_per_update ) {
			const float max_squared_f = max_squared;
			const int32_t max_db = mag2_to_dbv_norm(max_squared_f * (1.0f / (32768.0f * 32768.0f)));
			callback({ max_db, count });

			max_squared = 0;
			count = 0;
		}
	}

Exemple #5

0

Afficher le fichier

Fichier : arm_biquad_cascade_df1_fast_q15.c Projet : imhoffj/RaceCapture-Pro_firmware

void arm_biquad_cascade_df1_fast_q15(
    const arm_biquad_casd_df1_inst_q15 * S,
    q15_t * pSrc,
    q15_t * pDst,
    uint32_t blockSize)
{
    q15_t *pIn = pSrc;                             /*  Source pointer                               */
    q15_t *pOut = pDst;                            /*  Destination pointer                          */
    q31_t in;                                      /*  Temporary variable to hold input value       */
    q31_t out;                                     /*  Temporary variable to hold output value      */
    q31_t b0;                                      /*  Temporary variable to hold bo value          */
    q31_t b1, a1;                                  /*  Filter coefficients                          */
    q31_t state_in, state_out;                     /*  Filter state variables                       */
    q31_t acc0;                                    /*  Accumulator                                  */
    int32_t shift = (int32_t) (15 - S->postShift); /*  Post shift                                   */
    q15_t *pState = S->pState;                     /*  State pointer                                */
    q15_t *pCoeffs = S->pCoeffs;                   /*  Coefficient pointer                          */
    q31_t *pState_q31;                             /*  32-bit state pointer for SIMD implementation */
    uint32_t sample, stage = S->numStages;         /*  Stage loop counter                           */



    do {
        /* Initialize state pointer of type q31 */
        pState_q31 = (q31_t *) (pState);

        /* Read the b0 and 0 coefficients using SIMD  */
        b0 = *__SIMD32(pCoeffs)++;

        /* Read the b1 and b2 coefficients using SIMD */
        b1 = *__SIMD32(pCoeffs)++;

        /* Read the a1 and a2 coefficients using SIMD */
        a1 = *__SIMD32(pCoeffs)++;

        /* Read the input state values from the state buffer:  x[n-1], x[n-2] */
        state_in = (q31_t) (*pState_q31++);

        /* Read the output state values from the state buffer:  y[n-1], y[n-2] */
        state_out = (q31_t) (*pState_q31);

        /* Apply loop unrolling and compute 2 output values simultaneously. */
        /*      The variables acc0 ... acc3 hold output values that are being computed:
         *
         *    acc0 =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
         *    acc0 =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
         */
        sample = blockSize >> 1u;

        /* First part of the processing with loop unrolling.  Compute 2 outputs at a time.
         ** a second loop below computes the remaining 1 sample. */
        while(sample > 0u) {

            /* Read the input */
            in = *__SIMD32(pIn)++;

            /* out =  b0 * x[n] + 0 * 0 */
            out = __SMUAD(b0, in);
            /* acc0 =  b1 * x[n-1] + acc0 +=  b2 * x[n-2] + out */
            acc0 = __SMLAD(b1, state_in, out);
            /* acc0 +=  a1 * y[n-1] + acc0 +=  a2 * y[n-2] */
            acc0 = __SMLAD(a1, state_out, acc0);

            /* The result is converted from 3.29 to 1.31 and then saturation is applied */
            out = __SSAT((acc0 >> shift), 16);

            /* Every time after the output is computed state should be updated. */
            /* The states should be updated as:  */
            /* Xn2 = Xn1    */
            /* Xn1 = Xn     */
            /* Yn2 = Yn1    */
            /* Yn1 = acc0   */
            /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
            /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */

#ifndef  ARM_MATH_BIG_ENDIAN

            state_in = __PKHBT(in, state_in, 16);
            state_out = __PKHBT(out, state_out, 16);

#else

            state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
            state_out = __PKHBT(state_out >> 16, (out), 16);

#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */

            /* out =  b0 * x[n] + 0 * 0 */
            out = __SMUADX(b0, in);
            /* acc0 =  b1 * x[n-1] + acc0 +=  b2 * x[n-2] + out */
            acc0 = __SMLAD(b1, state_in, out);
            /* acc0 +=  a1 * y[n-1] + acc0 +=  a2 * y[n-2] */
            acc0 = __SMLAD(a1, state_out, acc0);

            /* The result is converted from 3.29 to 1.31 and then saturation is applied */
            out = __SSAT((acc0 >> shift), 16);


            /* Store the output in the destination buffer. */

#ifndef  ARM_MATH_BIG_ENDIAN

            *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16);

#else

            *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16);

#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */

            /* Every time after the output is computed state should be updated. */
            /* The states should be updated as:  */
            /* Xn2 = Xn1    */
            /* Xn1 = Xn     */
            /* Yn2 = Yn1    */
            /* Yn1 = acc0   */
            /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
            /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */

#ifndef  ARM_MATH_BIG_ENDIAN

            state_in = __PKHBT(in >> 16, state_in, 16);
            state_out = __PKHBT(out, state_out, 16);

#else

            state_in = __PKHBT(state_in >> 16, in, 16);
            state_out = __PKHBT(state_out >> 16, out, 16);

#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */


            /* Decrement the loop counter */
            sample--;

        }

        /* If the blockSize is not a multiple of 2, compute any remaining output samples here.
         ** No loop unrolling is used. */

        if((blockSize & 0x1u) != 0u) {
            /* Read the input */
            in = *pIn++;

            /* out =  b0 * x[n] + 0 * 0 */

#ifndef  ARM_MATH_BIG_ENDIAN

            out = __SMUAD(b0, in);

#else

            out = __SMUADX(b0, in);

#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */

            /* acc0 =  b1 * x[n-1] + acc0 +=  b2 * x[n-2] + out */
            acc0 = __SMLAD(b1, state_in, out);
            /* acc0 +=  a1 * y[n-1] + acc0 +=  a2 * y[n-2] */
            acc0 = __SMLAD(a1, state_out, acc0);

            /* The result is converted from 3.29 to 1.31 and then saturation is applied */
            out = __SSAT((acc0 >> shift), 16);

            /* Store the output in the destination buffer. */
            *pOut++ = (q15_t) out;

            /* Every time after the output is computed state should be updated. */
            /* The states should be updated as:  */
            /* Xn2 = Xn1    */
            /* Xn1 = Xn     */
            /* Yn2 = Yn1    */
            /* Yn1 = acc0   */
            /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
            /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */

#ifndef  ARM_MATH_BIG_ENDIAN

            state_in = __PKHBT(in, state_in, 16);
            state_out = __PKHBT(out, state_out, 16);

#else

            state_in = __PKHBT(state_in >> 16, in, 16);
            state_out = __PKHBT(state_out >> 16, out, 16);

#endif /*   #ifndef  ARM_MATH_BIG_ENDIAN    */

        }

        /*  The first stage goes from the input buffer to the output buffer.  */
        /*  Subsequent (numStages - 1) occur in-place in the output buffer  */
        pIn = pDst;

        /* Reset the output pointer */
        pOut = pDst;

        /*  Store the updated state variables back into the state array */
        *__SIMD32(pState)++ = state_in;
        *__SIMD32(pState)++ = state_out;


        /* Decrement the loop counter */
        stage--;

    } while(stage > 0u);

Exemple #6

0

Afficher le fichier

Fichier : dsp_decimate.cpp Projet : CCrashBandicot/portapack-hackrf

buffer_c16_t TranslateByFSOver4AndDecimateBy2CIC3::execute(const buffer_c8_t& src, const buffer_c16_t& dst) {
	/* Translates incoming complex<int8_t> samples by -fs/4,
	 * decimates by two using a non-recursive third-order CIC filter.
	 */

	/* Derivation of algorithm:
	 * Original CIC filter (decimating by two):
	 * 	D_I0 = i3 * 1 + i2 * 3 + i1 * 3 + i0 * 1
	 * 	D_Q0 = q3 * 1 + q2 * 3 + q1 * 3 + q0 * 1
	 *
	 * 	D_I1 = i5 * 1 + i4 * 3 + i3 * 3 + i2 * 1
	 * 	D_Q1 = q5 * 1 + q4 * 3 + q3 * 3 + q2 * 1
	 *
	 * Translate -fs/4, phased 180 degrees, accomplished by complex multiplication
	 * of complex length-4 sequence:
	 *
	 * Substitute:
	 *	i0 = -i0, q0 = -q0
	 *	i1 = -q1, q1 =  i1
	 *	i2 =  i2, q2 =  q2
	 *	i3 =  q3, q3 = -i3
	 *	i4 = -i4, q4 = -q4
	 *	i5 = -q5, q5 =  i5
	 *
	 * Resulting taps (with decimation by 2, four samples in, two samples out):
	 *	D_I0 =  q3 * 1 +  i2 * 3 + -q1 * 3 + -i0 * 1
	 *	D_Q0 = -i3 * 1 +  q2 * 3 +  i1 * 3 + -q0 * 1
 	 *
	 *	D_I1 = -q5 * 1 + -i4 * 3 +  q3 * 3 +  i2 * 1
	 *	D_Q1 =  i5 * 1 + -q4 * 3 + -i3 * 3 +  q2 * 1
	 */

	// 6 cycles per complex input sample, not including loop overhead.
	uint32_t q1_i0 = _q1_i0;
	uint32_t q0_i1 = _q0_i1;
	/* 3:1 Scaled by 32 to normalize output to +/-32768-ish. */
	constexpr uint32_t scale_factor = 32;
	const uint32_t k_3_1 = 0x00030001 * scale_factor;
	uint32_t* src_p = reinterpret_cast<uint32_t*>(&src.p[0]);
	uint32_t* const src_end = reinterpret_cast<uint32_t*>(&src.p[src.count]);
	uint32_t* dst_p = reinterpret_cast<uint32_t*>(&dst.p[0]);
	while(src_p < src_end) {
		const uint32_t q3_i3_q2_i2 = *(src_p++);			// 3
		const uint32_t q5_i5_q4_i4 = *(src_p++);

		const uint32_t i2_i3 = __SXTB16(q3_i3_q2_i2, 16);			// 1: (q3_i3_q2_i2 ror 16)[23:16]:(q3_i3_q2_i2 ror 16)[7:0]
		const uint32_t q3_q2 = __SXTB16(q3_i3_q2_i2,  8);			// 1: (q3_i3_q2_i2 ror  8)[23:16]:(q3_i3_q2_i2 ror  8)[7:0]
		const uint32_t i2_q3 = __PKHTB(i2_i3, q3_q2, 16);			// 1: Rn[31:16]:(Rm>>16)[15:0]
		const uint32_t i3_q2 = __PKHBT(q3_q2, i2_i3, 16);			// 1:(Rm<<16)[31:16]:Rn[15:0]

		// D_I0 = 3 * (i2 - q1) + (q3 - i0)
		const uint32_t i2_m_q1_q3_m_i0 = __QSUB16(i2_q3, q1_i0);	// 1: Rn[31:16]-Rm[31:16]:Rn[15:0]-Rm[15:0]
		const uint32_t d_i0 = __SMUAD(k_3_1, i2_m_q1_q3_m_i0);		// 1: Rm[15:0]*Rs[15:0]+Rm[31:16]*Rs[31:16]

		// D_Q0 = 3 * (q2 + i1) - (i3 + q0)
		const uint32_t i3_p_q0_q2_p_i1 = __QADD16(i3_q2, q0_i1);	// 1: Rn[31:16]+Rm[31:16]:Rn[15:0]+Rm[15:0]
		const uint32_t d_q0 = __SMUSDX(i3_p_q0_q2_p_i1, k_3_1);		// 1: Rm[15:0]*Rs[31:16]–Rm[31:16]*RsX[15:0]
		const uint32_t d_q0_i0 = __PKHBT(d_i0, d_q0, 16);			// 1: (Rm<<16)[31:16]:Rn[15:0]

		const uint32_t i5_i4 = __SXTB16(q5_i5_q4_i4,  0);			// 1: (q5_i5_q4_i4 ror  0)[23:16]:(q5_i5_q4_i4 ror  0)[7:0]
		const uint32_t q4_q5 = __SXTB16(q5_i5_q4_i4, 24);			// 1: (q5_i5_q4_i4 ror 24)[23:16]:(q5_i5_q4_i4 ror 24)[7:0]
		const uint32_t q4_i5 = __PKHTB(q4_q5, i5_i4, 16);			// 1: Rn[31:16]:(Rm>>16)[15:0]
		const uint32_t q5_i4 = __PKHBT(i5_i4, q4_q5, 16);			// 1: (Rm<<16)[31:16]:Rn[15:0]

		// D_I1 = (i2 - q5) + 3 * (q3 - i4)
		const uint32_t i2_m_q5_q3_m_i4 = __QSUB16(i2_q3, q5_i4);	// 1: Rn[31:16]-Rm[31:16]:Rn[15:0]-Rm[15:0]
		const uint32_t d_i1 = __SMUADX(i2_m_q5_q3_m_i4, k_3_1);		// 1: Rm[15:0]*Rs[31:16]+Rm[31:16]*Rs[15:0]

		// D_Q1 = (i5 + q2) - 3 * (q4 + i3)
		const uint32_t q4_p_i3_i5_p_q2 = __QADD16(q4_i5, i3_q2);	// 1: Rn[31:16]+Rm[31:16]:Rn[15:0]+Rm[15:0]
		const uint32_t d_q1 = __SMUSD(k_3_1, q4_p_i3_i5_p_q2);		// 1: Rm[15:0]*Rs[15:0]–Rm[31:16]*Rs[31:16]
		const uint32_t d_q1_i1 = __PKHBT(d_i1, d_q1, 16);			// 1: (Rm<<16)[31:16]:Rn[15:0]
		*(dst_p++) = d_q0_i0;							// 3
		*(dst_p++) = d_q1_i1;

		q1_i0 = q5_i4;
		q0_i1 = q4_i5;
	}
	_q1_i0 = q1_i0;
	_q0_i1 = q0_i1;

	return { dst.p, src.count / 2, src.sampling_rate / 2 };
}

Exemple #7

0

Afficher le fichier

Fichier : CV_CoreSimd.c Projet : ARM-software/CMSIS_5

/**
\brief Test case: TC_CoreSimd_ParMul16
\details
- Check Parallel 16-bit multiplication:
  __SMLAD
  __SMLADX
  __SMLALD
  __SMLALDX
  __SMLSD
  __SMLSDX
  __SMLSLD
  __SMLSLDX
  __SMUAD
  __SMUADX
  __SMUSD
  __SMUSDX
*/
void TC_CoreSimd_ParMul16 (void) {
#if ((defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__  == 1)) || \
     (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))     )
  volatile int32_t op1_s32, op2_s32, op3_s32;
  volatile int32_t res_s32;

  volatile int64_t op1_s64;
  volatile int64_t res_s64;

  /* --- __SMLAD Test ---------------------------------------------- */
  op1_s32 = 0x00030002;
  op2_s32 = 0x00050004;
  op3_s32 = 0x20000000;
  res_s32 = __SMLAD(op1_s32, op2_s32, op3_s32);
  ASSERT_TRUE(res_s32 == 0x20000017);

  /* --- __SMLADX Test ---------------------------------------------- */
  op1_s32 = 0x00030002;
  op2_s32 = 0x00050004;
  op3_s32 = 0x00000800;
  res_s32 = __SMLADX(op1_s32, op2_s32, op3_s32);
  ASSERT_TRUE(res_s32 == 0x00000816);

  /* --- __SMLALD Test ---------------------------------------------- */
  op1_s32 = 0x00030002;
  op2_s32 = 0x00050004;
  op1_s64 = 0x00000000200000000LL;
  res_s64 = __SMLALD(op1_s32, op2_s32, op1_s64);
  ASSERT_TRUE(res_s64 == 0x0000000200000017LL);

  /* --- __SMLALDX Test ---------------------------------------------- */
  op1_s32 = 0x00030002;
  op2_s32 = 0x00050004;
  op1_s64 = 0x00000000200000000LL;
  res_s64 = __SMLALDX(op1_s32, op2_s32, op1_s64);
  ASSERT_TRUE(res_s64 == 0x0000000200000016LL);

  /* --- __SMLSD Test ---------------------------------------------- */
  op1_s32 = 0x00030006;
  op2_s32 = 0x00050004;
  op3_s32 = 0x00000800;
  res_s32 = __SMLSD(op1_s32, op2_s32, op3_s32);
  ASSERT_TRUE(res_s32 == 0x00000809);

  /* --- __SMLSDX Test ---------------------------------------------- */
  op1_s32 = 0x00030002;
  op2_s32 = 0x00050004;
  op3_s32 = 0x00000800;
  res_s32 = __SMLSDX(op1_s32, op2_s32, op3_s32);
  ASSERT_TRUE(res_s32 == 0x000007FE);

  /* --- __SMLSLD Test ---------------------------------------------- */
  op1_s32 = 0x00030006;
  op2_s32 = 0x00050004;
  op1_s64 = 0x00000000200000000LL;
  res_s64 = __SMLSLD(op1_s32, op2_s32, op1_s64);
  ASSERT_TRUE(res_s64 == 0x0000000200000009LL);

  /* --- __SMLSLDX Test ---------------------------------------------- */
  op1_s32 = 0x00030006;
  op2_s32 = 0x00050004;
  op1_s64 = 0x00000000200000000LL;
  res_s64 = __SMLSLDX(op1_s32, op2_s32, op1_s64);
  ASSERT_TRUE(res_s64 == 0x0000000200000012LL);

  /* --- __SMUAD Test ---------------------------------------------- */
  op1_s32 = 0x00030001;
  op2_s32 = 0x00040002;
  res_s32 = __SMUAD(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == 0x0000000E);

  op1_s32 = (int32_t)0xFFFDFFFF;
  op2_s32 = (int32_t)0x00040002;
  res_s32 = __SMUAD(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0xFFFFFFF2);

  /* --- __SMUADX Test ---------------------------------------------- */
  op1_s32 = 0x00030001;
  op2_s32 = 0x00040002;
  res_s32 = __SMUADX(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == 0x0000000A);

  op1_s32 = (int32_t)0xFFFDFFFF;
  op2_s32 = (int32_t)0x00040002;
  res_s32 = __SMUADX(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0xFFFFFFF6);

  /* --- __SMUSD Test ---------------------------------------------- */
  op1_s32 = (int32_t)0x00030001;
  op2_s32 = (int32_t)0x00040002;
  res_s32 = __SMUSD(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0xFFFFFFF6);

  op1_s32 = (int32_t)0xFFFDFFFF;
  op2_s32 = (int32_t)0x00040002;
  res_s32 = __SMUSD(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == 0x0000000A);

  /* --- __SMUSDX Test ---------------------------------------------- */
  op1_s32 = 0x00030001;
  op2_s32 = 0x00040002;
  res_s32 = __SMUSDX(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0xFFFFFFFE);

  op1_s32 = (int32_t)0xFFFDFFFF;
  op2_s32 = (int32_t)0x00040002;
  res_s32 = __SMUSDX(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0x00000002);
#endif
}