//--------------------------------------------------- void HiHat_calcSyncBlock(int16_t* buf, const uint8_t size) { //2 buffers for the mod oscs int16_t mod1[size],mod2[size]; //calc next mod osc samples, scaled with mod amount calcNextOscSampleBlock(&hatVoice.modOsc,mod1,size, hatVoice.fmModAmount1); calcNextOscSampleBlock(&hatVoice.modOsc2,mod2,size, hatVoice.fmModAmount2); //combine both mod oscs to 1 modulation signal bufferTool_addBuffersSaturating(mod1,mod2,size); calcNextOscSampleFmBlock(&hatVoice.osc,mod1,buf,size,0.5f) ; SVF_calcBlockZDF(&hatVoice.filter,hatVoice.filterType,buf,size); //calc transient sample transient_calcBlock(&hatVoice.transGen,mod1,size); uint8_t j; if(hatVoice.volumeMod) { for(j=0;j<size;j++) { //add filter to buffer buf[j] = __QADD16(buf[j],mod1[j]); buf[j] *= hatVoice.velo * hatVoice.vol * hatVoice.egValueOscVol; } } else { for(j=0;j<size;j++) { //add filter to buffer buf[j] = __QADD16(buf[j],mod1[j]); buf[j] *= hatVoice.vol * hatVoice.egValueOscVol; } } calcDistBlock(&hatVoice.distortion,buf,size); }
/**
 * Element-wise addition of two Q15 vectors: pDst[n] = pSrcA[n] + pSrcB[n].
 *
 * The sums are formed with __QADD16, i.e. per 16-bit lane.
 *
 * @param pSrcA      first input vector
 * @param pSrcB      second input vector
 * @param pDst       output vector
 * @param blockSize  number of samples in each vector
 */
void arm_add_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples (or groups) still to process */

  /* Unrolled part: every pass handles four samples as two packed
   * 32-bit words, each carrying a pair of Q15 values. */
  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
  }

  /* Leftover 0 to 3 samples when blockSize is not a multiple of four,
   * handled one sample at a time. */
  for (remaining = blockSize & 0x3u; remaining > 0u; remaining--)
  {
    *pDst++ = (q15_t) __QADD16(*pSrcA++, *pSrcB++);
  }
}
/**
 * Derive the internal coefficients of a Q15 PID instance from Kp, Ki and Kd
 * and optionally clear its state buffer.
 *
 *   A0 = Kp + Ki + Kd
 *   A1 = -(Kp + 2 * Kd)
 *   A2 = Kd
 *
 * On the non-Cortex-M0 path A1 holds both -(Kp + 2 * Kd) and Kd packed into
 * one word (halfword order depends on ARM_MATH_BIG_ENDIAN) and A2 is not
 * written.
 *
 * @param S               PID instance; Kp, Ki and Kd are read from it
 * @param resetStateFlag  non-zero clears the 3-sample state buffer
 */
void arm_pid_init_q15(
  arm_pid_instance_q15 * S,
  int32_t resetStateFlag)
{
#ifdef ARM_MATH_CM0_FAMILY

  /* Cortex-M0: accumulate in 32 bit and saturate to 16 bit explicitly */
  q31_t acc;

  acc = S->Kp + S->Ki + S->Kd;
  S->A0 = (q15_t) __SSAT(acc, 16);

  acc = -(S->Kd + S->Kd + S->Kp);
  S->A1 = (q15_t) __SSAT(acc, 16);

  S->A2 = S->Kd;

#else

  /* Cortex-M4 and Cortex-M3: use the 16-bit add intrinsic */
  S->A0 = __QADD16(__QADD16(S->Kp, S->Ki), S->Kd);

  /* A1 carries two coefficients in one packed word */
#ifdef ARM_MATH_BIG_ENDIAN
  S->A1 = __PKHBT(S->Kd, -__QADD16(__QADD16(S->Kd, S->Kd), S->Kp), 16);
#else
  S->A1 = __PKHBT(-__QADD16(__QADD16(S->Kd, S->Kd), S->Kp), S->Kd, 16);
#endif /* #ifdef ARM_MATH_BIG_ENDIAN */

#endif /* #ifdef ARM_MATH_CM0_FAMILY */

  /* Both variants reset the state the same way: three Q15 samples */
  if (resetStateFlag != 0)
  {
    memset(S->state, 0, 3u * sizeof(q15_t));
  }
}
//--------------------------------------------------- void Snare_calcSyncBlock(int16_t* buf, const uint8_t size) { int16_t transBuf[size]; calcNoiseBlock(&snareVoice.noiseOsc,buf,size,0.9f); SVF_calcBlockZDF(&snareVoice.filter,snareVoice.filterType,buf,size); //calc transient sample transient_calcBlock(&snareVoice.transGen,transBuf,size); bufferTool_addBuffersSaturating(buf,transBuf,size); //calc next osc sample calcNextOscSampleBlock(&snareVoice.osc,transBuf,size,(1.f-snareVoice.mix)); uint8_t j; if(snareVoice.volumeMod) { for(j=0;j<size;j++) { //add filter to buffer buf[j] *= snareVoice.mix; buf[j] = (__QADD16(buf[j],transBuf[j])); buf[j] *= snareVoice.velo * snareVoice.vol * snareVoice.egValueOscVol; } } else { for(j=0;j<size;j++) { //add filter to buffer buf[j] *= snareVoice.mix; buf[j] = (__QADD16(buf[j],transBuf[j])); buf[j] *= snareVoice.vol * snareVoice.egValueOscVol; } } calcDistBlock(&snareVoice.distortion,buf,size); }
//--------------------------------------------------- void Cymbal_calcSyncBlock(int16_t* buf, const uint8_t size) { int16_t mod[size]; int16_t mod2[size]; //calc next mod osc sample calcNextOscSampleBlock(&cymbalVoice.modOsc,mod,size,cymbalVoice.fmModAmount1); calcNextOscSampleBlock(&cymbalVoice.modOsc2,mod2,size,cymbalVoice.fmModAmount2); //combine both mod oscs to 1 modulation signal bufferTool_addBuffersSaturating(mod,mod2,size); calcNextOscSampleFmBlock(&cymbalVoice.osc,mod,buf,size,1.f) ; SVF_calcBlockZDF(&cymbalVoice.filter,cymbalVoice.filterType,buf,size); //calc transient sample transient_calcBlock(&cymbalVoice.transGen,mod,size); uint8_t j; if(cymbalVoice.volumeMod) { for(j=0;j<size;j++) { //add filter to buffer buf[j] = (__QADD16(buf[j],mod[j])) ; buf[j] *= cymbalVoice.velo * cymbalVoice.vol * cymbalVoice.egValueOscVol; } } else { for(j=0;j<size;j++) { //add filter to buffer buf[j] = (__QADD16(buf[j],mod[j])) ; buf[j] *= cymbalVoice.vol * cymbalVoice.egValueOscVol; } } calcDistBlock(&cymbalVoice.distortion,buf,size); }
/**
 * Add a constant offset to every element of a Q15 vector:
 * pDst[n] = pSrc[n] + offset.
 *
 * @param pSrc       input vector
 * @param offset     value added to each sample
 * @param pDst       output vector
 * @param blockSize  number of samples in the vector
 */
void arm_offset_q15(
  q15_t * pSrc,
  q15_t offset,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples (or groups) still to process */

#ifdef ARM_MATH_CM0_FAMILY

  /* Cortex-M0: one sample per pass, widened to 32 bit and saturated
   * back to 16 bit. */
  for (remaining = blockSize; remaining > 0u; remaining--)
  {
    *pDst++ = (q15_t) __SSAT(((q31_t) * pSrc++ + offset), 16);
  }

#else

  /* Cortex-M4 and Cortex-M3 */

  /* The offset duplicated into both halfwords, so that one packed add
   * applies it to two samples at once. */
  q31_t offsetPair = __PKHBT(offset, offset, 16);

  /* Unrolled part: four samples per pass, as two packed 32-bit words. */
  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offsetPair);
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offsetPair);
  }

  /* Leftover 0 to 3 samples when blockSize is not a multiple of four. */
  for (remaining = blockSize & 0x3u; remaining > 0u; remaining--)
  {
    *pDst++ = (q15_t) __QADD16(*pSrc++, offset);
  }

#endif /* #ifdef ARM_MATH_CM0_FAMILY */
}
/**
 * Element-wise addition of two Q15 vectors: pDst[n] = pSrcA[n] + pSrcB[n].
 *
 * @param pSrcA      first input vector
 * @param pSrcB      second input vector
 * @param pDst       output vector
 * @param blockSize  number of samples in each vector
 */
void arm_add_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples (or groups) still to process */

#ifdef ARM_MATH_CM0

  /* Cortex-M0: one sample per pass, summed in 32 bit and saturated
   * back to 16 bit. */
  for (remaining = blockSize; remaining > 0u; remaining--)
  {
    *pDst++ = (q15_t) __SSAT(((q31_t) * pSrcA++ + *pSrcB++), 16);
  }

#else

  /* Cortex-M4 and Cortex-M3 */

  /* Unrolled part: every pass handles four samples as two packed
   * 32-bit words, each carrying a pair of Q15 values. */
  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
  }

  /* Leftover 0 to 3 samples when blockSize is not a multiple of four. */
  for (remaining = blockSize & 0x3u; remaining > 0u; remaining--)
  {
    *pDst++ = (q15_t) __QADD16(*pSrcA++, *pSrcB++);
  }

#endif /* #ifdef ARM_MATH_CM0 */
}
/* Frequency-translate the samples in src, decimate them by two and write the
 * result to dst.
 *
 * The last packed sample pairs of a call are kept in the members _q1_i0 and
 * _q0_i1 and used as filter history by the next call.
 *
 * Returns a buffer descriptor that refers to dst.p, with half of src's
 * sample count and half of src's sampling rate.
 *
 * NOTE(review): each loop pass reads two 32-bit words (four complex input
 * samples) and only tests src_p < src_end before the pass, so src.count is
 * presumably expected to be a multiple of 4 - confirm against callers.
 */
buffer_c16_t TranslateByFSOver4AndDecimateBy2CIC3::execute(const buffer_c8_t& src, const buffer_c16_t& dst) {
	/* Translates incoming complex<int8_t> samples by -fs/4,
	 * decimates by two using a non-recursive third-order CIC filter.
	 */

	/* Derivation of algorithm:
	 * Original CIC filter (decimating by two):
	 * 	D_I0 = i3 * 1 + i2 * 3 + i1 * 3 + i0 * 1
	 * 	D_Q0 = q3 * 1 + q2 * 3 + q1 * 3 + q0 * 1
	 *
	 * 	D_I1 = i5 * 1 + i4 * 3 + i3 * 3 + i2 * 1
	 * 	D_Q1 = q5 * 1 + q4 * 3 + q3 * 3 + q2 * 1
	 *
	 * Translate -fs/4, phased 180 degrees, accomplished by complex multiplication
	 * of complex length-4 sequence:
	 *
	 * Substitute:
	 *	i0 = -i0, q0 = -q0
	 *	i1 = -q1, q1 =  i1
	 *	i2 =  i2, q2 =  q2
	 *	i3 =  q3, q3 = -i3
	 *	i4 = -i4, q4 = -q4
	 *	i5 = -q5, q5 =  i5
	 *
	 * Resulting taps (with decimation by 2, four samples in, two samples out):
	 *	D_I0 =  q3 * 1 +  i2 * 3 + -q1 * 3 + -i0 * 1
	 *	D_Q0 = -i3 * 1 +  q2 * 3 +  i1 * 3 + -q0 * 1
	 *
	 *	D_I1 = -q5 * 1 + -i4 * 3 +  q3 * 3 +  i2 * 1
	 *	D_Q1 =  i5 * 1 + -q4 * 3 + -i3 * 3 +  q2 * 1
	 */

	// 6 cycles per complex input sample, not including loop overhead.

	// Filter history from the previous call, as packed 16-bit pairs.
	uint32_t q1_i0 = _q1_i0;
	uint32_t q0_i1 = _q0_i1;

	/* 3:1 Scaled by 32 to normalize output to +/-32768-ish. */
	constexpr uint32_t scale_factor = 32;
	// Packed tap weights: 3 * scale_factor in the upper halfword, 1 * scale_factor in the lower.
	const uint32_t k_3_1 = 0x00030001 * scale_factor;

	// Input and output are walked as 32-bit words.
	uint32_t* src_p = reinterpret_cast<uint32_t*>(&src.p[0]);
	uint32_t* const src_end = reinterpret_cast<uint32_t*>(&src.p[src.count]);
	uint32_t* dst_p = reinterpret_cast<uint32_t*>(&dst.p[0]);
	while(src_p < src_end) {
		// Two input words per pass; the variable names spell out the byte
		// layout from most to least significant byte.
		const uint32_t q3_i3_q2_i2 = *(src_p++);	// 3
		const uint32_t q5_i5_q4_i4 = *(src_p++);

		// Sign-extend the bytes to halfwords and regroup them into the pairs
		// needed by the tap equations below.
		const uint32_t i2_i3 = __SXTB16(q3_i3_q2_i2, 16);	// 1: (q3_i3_q2_i2 ror 16)[23:16]:(q3_i3_q2_i2 ror 16)[7:0]
		const uint32_t q3_q2 = __SXTB16(q3_i3_q2_i2,  8);	// 1: (q3_i3_q2_i2 ror 8)[23:16]:(q3_i3_q2_i2 ror 8)[7:0]
		const uint32_t i2_q3 = __PKHTB(i2_i3, q3_q2, 16);	// 1: Rn[31:16]:(Rm>>16)[15:0]
		const uint32_t i3_q2 = __PKHBT(q3_q2, i2_i3, 16);	// 1:(Rm<<16)[31:16]:Rn[15:0]

		// D_I0 = 3 * (i2 - q1) + (q3 - i0)
		const uint32_t i2_m_q1_q3_m_i0 = __QSUB16(i2_q3, q1_i0);	// 1: Rn[31:16]-Rm[31:16]:Rn[15:0]-Rm[15:0]
		const uint32_t d_i0 = __SMUAD(k_3_1, i2_m_q1_q3_m_i0);	// 1: Rm[15:0]*Rs[15:0]+Rm[31:16]*Rs[31:16]

		// D_Q0 = 3 * (q2 + i1) - (i3 + q0)
		const uint32_t i3_p_q0_q2_p_i1 = __QADD16(i3_q2, q0_i1);	// 1: Rn[31:16]+Rm[31:16]:Rn[15:0]+Rm[15:0]
		const uint32_t d_q0 = __SMUSDX(i3_p_q0_q2_p_i1, k_3_1);	// 1: Rm[15:0]*Rs[31:16]-Rm[31:16]*RsX[15:0]
		// First output sample: Q in the upper halfword, I in the lower.
		const uint32_t d_q0_i0 = __PKHBT(d_i0, d_q0, 16);	// 1: (Rm<<16)[31:16]:Rn[15:0]

		// Same unpacking for the second input word.
		const uint32_t i5_i4 = __SXTB16(q5_i5_q4_i4,  0);	// 1: (q5_i5_q4_i4 ror 0)[23:16]:(q5_i5_q4_i4 ror 0)[7:0]
		const uint32_t q4_q5 = __SXTB16(q5_i5_q4_i4, 24);	// 1: (q5_i5_q4_i4 ror 24)[23:16]:(q5_i5_q4_i4 ror 24)[7:0]
		const uint32_t q4_i5 = __PKHTB(q4_q5, i5_i4, 16);	// 1: Rn[31:16]:(Rm>>16)[15:0]
		const uint32_t q5_i4 = __PKHBT(i5_i4, q4_q5, 16);	// 1: (Rm<<16)[31:16]:Rn[15:0]

		// D_I1 = (i2 - q5) + 3 * (q3 - i4)
		const uint32_t i2_m_q5_q3_m_i4 = __QSUB16(i2_q3, q5_i4);	// 1: Rn[31:16]-Rm[31:16]:Rn[15:0]-Rm[15:0]
		const uint32_t d_i1 = __SMUADX(i2_m_q5_q3_m_i4, k_3_1);	// 1: Rm[15:0]*Rs[31:16]+Rm[31:16]*Rs[15:0]

		// D_Q1 = (i5 + q2) - 3 * (q4 + i3)
		const uint32_t q4_p_i3_i5_p_q2 = __QADD16(q4_i5, i3_q2);	// 1: Rn[31:16]+Rm[31:16]:Rn[15:0]+Rm[15:0]
		const uint32_t d_q1 = __SMUSD(k_3_1, q4_p_i3_i5_p_q2);	// 1: Rm[15:0]*Rs[15:0]-Rm[31:16]*Rs[31:16]
		// Second output sample: Q in the upper halfword, I in the lower.
		const uint32_t d_q1_i1 = __PKHBT(d_i1, d_q1, 16);	// 1: (Rm<<16)[31:16]:Rn[15:0]

		*(dst_p++) = d_q0_i0;	// 3
		*(dst_p++) = d_q1_i1;

		// The pairs built from the second input word become the history
		// for the next pass.
		q1_i0 = q5_i4;
		q0_i1 = q4_i5;
	}
	// Keep the history for the next call.
	_q1_i0 = q1_i0;
	_q0_i1 = q0_i1;

	return { dst.p, src.count / 2, src.sampling_rate / 2 };
}
/**
\brief Test case: TC_CoreSimd_ParAddSub16
\details
- Check the parallel 16-bit addition and subtraction intrinsics:
  signed:             __SADD16  __SSUB16  __SASX  __SSAX
  signed halving:     __SHADD16 __SHSUB16 __SHASX __SHSAX
  signed saturating:  __QADD16  __QSUB16  __QASX  __QSAX
  unsigned:           __UADD16  __USUB16  __UASX  __USAX
  unsigned halving:   __UHADD16 __UHSUB16 __UHASX __UHSAX
  unsigned saturating: __UQADD16 __UQSUB16 __UQASX __UQSAX
- Every intrinsic is applied to one fixed operand pair and the packed
  32-bit result is compared with a precomputed value.
- The body is only compiled when the target reports the DSP extension.
*/
void TC_CoreSimd_ParAddSub16 (void)
{
#if ((defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__  == 1)) || \
     (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))     )
  /* Operands and results go through volatile objects so that each
     intrinsic is applied to values read back from them. */
  volatile uint32_t op1_u32, op2_u32;
  volatile uint32_t res_u32;

  volatile int32_t  op1_s32, op2_s32;
  volatile int32_t  res_s32;

/* Apply a signed intrinsic to (a, b) and check the packed result. */
#define PAS16_CHECK_S32(intrinsic, a, b, expected)     \
  do {                                                 \
    op1_s32 = (int32_t)(a);                            \
    op2_s32 = (int32_t)(b);                            \
    res_s32 = intrinsic(op1_s32, op2_s32);             \
    ASSERT_TRUE(res_s32 == (int32_t)(expected));       \
  } while (0)

/* Apply an unsigned intrinsic to (a, b) and check the packed result. */
#define PAS16_CHECK_U32(intrinsic, a, b, expected)     \
  do {                                                 \
    op1_u32 = (a);                                     \
    op2_u32 = (b);                                     \
    res_u32 = intrinsic(op1_u32, op2_u32);             \
    ASSERT_TRUE(res_u32 == (expected));                \
  } while (0)

  /* --- signed, wrapping ------------------------------------------- */
  PAS16_CHECK_S32(__SADD16,  0x80038001, 0x00040002, 0x80078003);
  PAS16_CHECK_S32(__SSUB16,  0x80078003, 0x00040002, 0x80038001);
  PAS16_CHECK_S32(__SASX,    0x80078003, 0x00040002, 0x80097FFF);
  PAS16_CHECK_S32(__SSAX,    0x80038007, 0x00020004, 0x7FFF8009);

  /* --- signed, halving -------------------------------------------- */
  PAS16_CHECK_S32(__SHADD16, 0x80038001, 0x00040002, 0xC003C001);
  PAS16_CHECK_S32(__SHSUB16, 0x80078003, 0x00040002, 0xC001C000);
  PAS16_CHECK_S32(__SHASX,   0x80078003, 0x00040002, 0xC004BFFF);
  PAS16_CHECK_S32(__SHSAX,   0x80038007, 0x00020004, 0xBFFFC004);

  /* --- signed, saturating ----------------------------------------- */
  PAS16_CHECK_S32(__QADD16,  0x80038000, 0x00048002, 0x80078000);
  PAS16_CHECK_S32(__QSUB16,  0x80038003, 0x00040002, 0x80008001);
  PAS16_CHECK_S32(__QASX,    0x80078003, 0x00040002, 0x80098000);
  PAS16_CHECK_S32(__QSAX,    0x80038007, 0x00020004, 0x80008009);

  /* --- unsigned, wrapping ----------------------------------------- */
  PAS16_CHECK_U32(__UADD16,  0x00010002, 0x00020004, 0x00030006);
  PAS16_CHECK_U32(__USUB16,  0x00030006, 0x00020004, 0x00010002);
  PAS16_CHECK_U32(__UASX,    0x80078003, 0x00040002, 0x80097FFF);
  PAS16_CHECK_U32(__USAX,    0x80038007, 0x00020004, 0x7FFF8009);

  /* --- unsigned, halving ------------------------------------------ */
  PAS16_CHECK_U32(__UHADD16, 0x00010002, 0x00020004, 0x00010003);
  PAS16_CHECK_U32(__UHSUB16, 0x00030006, 0x00020004, 0x00000001);
  PAS16_CHECK_U32(__UHASX,   0x80078003, 0x00040002, 0x40043FFF);
  PAS16_CHECK_U32(__UHSAX,   0x80038007, 0x00020004, 0x3FFF4004);

  /* --- unsigned, saturating --------------------------------------- */
  PAS16_CHECK_U32(__UQADD16, 0xFFFE0002, 0x00020004, 0xFFFF0006);
  PAS16_CHECK_U32(__UQSUB16, 0x00020006, 0x00030004, 0x00000002);
  PAS16_CHECK_U32(__UQASX,   0xFFF80003, 0x00040009, 0xFFFF0000);
  PAS16_CHECK_U32(__UQSAX,   0x0003FFF8, 0x00090004, 0x0000FFFF);

#undef PAS16_CHECK_U32
#undef PAS16_CHECK_S32
#endif
}