int main () { int v; long long vll; v = _add2 (a, b); if (v != 0x1000f000) abort (); v = _sub2 (a, b); if (v != 0x9000b000) abort (); v = _sub2 (b, a); if (v != 0x70005000) abort (); v = _add4 (a4, b4); if (v != 0x10f02000) abort (); v = _sub4 (a4, b4); if (v != 0x90b04000) abort (); v = _saddu4 (a4, c4); if (v != 0xfff050ff) abort (); v = _sadd2 (a, b); if (v != 0x1000f000) abort (); v = _sadd2 (a, c); if (v != 0x7fff8000) abort (); v = _ssub2 (a, b); if (v != 0x7fffb000) abort (); v = _ssub2 (b, a); if (v != 0x80005000) abort (); vll = _smpy2ll (a, b); if (vll != 0xd8000000f4000000ll) abort (); vll = _smpy2ll (d, d); if (vll != 0x7fffffff00000002ll) abort (); v = _avg2 (b, e); if (v != 0x08002001) abort (); v = _avgu4 (d4, e4); if (v != 0x88102980) abort (); v = _abs2 (a); if (v != 0x50003000) abort (); v = _abs2 (f); if (v != 0x40007fff) abort (); return 0; }
/*
 * DSP_QMFA_process - run one input block through the two-band analysis bank.
 *
 * Every 32-bit word is handled as two packed 16-bit lanes.  For each output
 * index i the routine walks the input backwards from word 2*i+1, multiplying
 * the even-indexed words by the high-half filter coefficient and the
 * odd-indexed words by the low-half coefficient of each filter word, and
 * accumulates the two lanes separately in 40-bit accumulators.
 *
 *   - Iterations 0 .. L-1 (L = blk_len/2) add the stored history and write
 *     the sum of the two polyphase branches to data_out_LO_ptr[i] and their
 *     difference to data_out_HI_ptr[i].
 *   - Iterations L .. L+M-2 (M = FILT_LEN/2) compute the tail of the
 *     convolution and store it in history[0]/history[1] for the next call.
 *
 * The history buffers are read at indices up to L+M-2 and rewritten in place
 * at indices 0 .. M-2.  The write index (i-L) never exceeds the read index
 * (i), so every slot is consumed before it is overwritten.  Because the same
 * buffer is both read and written, the history pointers are deliberately NOT
 * restrict-qualified: two restrict pointers to one object that is modified
 * through one of them is undefined behaviour.
 *
 * QMFA_bank_obj->iter_count[i] gives the number of taps for iteration i.
 * MUST_ITERATE(1) tells the compiler each inner loop runs at least once, so
 * every iter_count entry used here must be >= 1.
 *
 * NOTE(review): _sat() clamps to 32 bits and _pack2() then keeps only the low
 * 16 bits of each value, and _add2/_sub2 wrap; confirm the filter gain keeps
 * results inside the 16-bit range.
 */
void DSP_QMFA_process(DSP_QMFA_bank_t *QMFA_bank_obj)
{
    sint32 i, j;
    uint32 * restrict filter   = (uint32 *)QMFA_bank_obj->flt_ptr;
    uint32 * restrict data_ptr = (uint32 *)(QMFA_bank_obj->data_in_buffer + 0);   /* input */
    uint32 L = QMFA_bank_obj->blk_len / 2;
    uint32 M = FILT_LEN / 2;
    /* One pointer per history buffer, used for both reading and writing. */
    uint32 *hist0 = (uint32 *)QMFA_bank_obj->history[0];
    uint32 *hist1 = (uint32 *)QMFA_bank_obj->history[1];
    uint32 * restrict data_out_lo_ptr = (uint32 *)QMFA_bank_obj->data_out_LO_ptr;
    uint32 * restrict data_out_hi_ptr = (uint32 *)QMFA_bank_obj->data_out_HI_ptr;

    /* Iterations 0 .. L-1 produce the current block of output. */
    for (i = 0; i < L; ++i) {
        register uint32 o0, o1, out0, out1;
        /* Per-lane accumulators for polyphase components 0 and 1. */
        register __int40_t accum0l, accum1l, accum0r, accum1r;
        register long long temp;
        uint32 *dptr = data_ptr + 2 * i + 1;   /* newest input word for this output */
        uint32 *fptr = filter;

        accum0l = 0; accum0r = 0;
        accum1l = 0; accum1r = 0;

        #pragma MUST_ITERATE( 1)
        for (j = 0; j < QMFA_bank_obj->iter_count[i]; ++j) {
            register uint32 d0, d1;
            register uint32 filt_coef10 = *fptr++;                            /* two packed coefficients */
            register uint32 filt_coef0  = _packh2(filt_coef10, filt_coef10);  /* high half in both lanes */
            register uint32 filt_coef1  = _pack2(filt_coef10, filt_coef10);   /* low half in both lanes  */

            /* Walk the input right to left, splitting it 1:2 between the branches. */
            d1 = *dptr--;
            d0 = *dptr--;

            temp    = _mpy2ll(d0, filt_coef0);
            accum0l = _lsadd(_loll(temp), accum0l);
            accum0r = _lsadd(_hill(temp), accum0r);

            temp    = _mpy2ll(d1, filt_coef1);
            accum1l = _lsadd(_loll(temp), accum1l);
            accum1r = _lsadd(_hill(temp), accum1r);
        }

        /* Round (add 2^14, shift by 15) and repack the two lanes. */
        o0 = _pack2(_sat((accum0r + 16384) >> 15), _sat((accum0l + 16384) >> 15));
        o1 = _pack2(_sat((accum1r + 16384) >> 15), _sat((accum1l + 16384) >> 15));

        /* Overlap-add with the tail saved by the previous call. */
        o0 = _add2(o0, hist0[i]);
        o1 = _add2(o1, hist1[i]);

        out0 = _add2(o0, o1);
        data_out_lo_ptr[i] = out0;   /* LO band: branch 0 + branch 1 */
        out1 = _sub2(o0, o1);
        data_out_hi_ptr[i] = out1;   /* HI band: branch 0 - branch 1 */
    }

    /* Iterations L .. L+M-2 produce the overlap-add history for the next block. */
    for (i = L; i < L + M - 1; ++i) {
        register uint32 o0, o1;
        register __int40_t accum0l, accum1l, accum0r, accum1r;
        register long long temp;
        uint32 *dptr = data_ptr + 2 * (L - 1) + 1;   /* always start at the last input word */
        uint32 *fptr = filter + (i - L + 1);         /* skip the taps that ran off the block */

        accum0l = 0; accum0r = 0;
        accum1l = 0; accum1r = 0;

        #pragma MUST_ITERATE( 1)
        for (j = 0; j < QMFA_bank_obj->iter_count[i]; ++j) {
            register uint32 d0, d1;
            register uint32 filt_coef10 = *fptr++;
            register uint32 filt_coef0  = _packh2(filt_coef10, filt_coef10);
            register uint32 filt_coef1  = _pack2(filt_coef10, filt_coef10);

            d1 = *dptr--;
            d0 = *dptr--;

            temp    = _mpy2ll(d0, filt_coef0);
            accum0l = _lsadd(_loll(temp), accum0l);
            accum0r = _lsadd(_hill(temp), accum0r);

            temp    = _mpy2ll(d1, filt_coef1);
            accum1l = _lsadd(_loll(temp), accum1l);
            accum1r = _lsadd(_hill(temp), accum1r);
        }

        o0 = _pack2(_sat((accum0r + 16384) >> 15), _sat((accum0l + 16384) >> 15));
        o1 = _pack2(_sat((accum1r + 16384) >> 15), _sat((accum1l + 16384) >> 15));

        /* Fold in whatever history still extends past this block ... */
        o0 = _add2(o0, hist0[i]);
        o1 = _add2(o1, hist1[i]);

        /* ... and store the result as the new history (slot i-L was already consumed). */
        hist0[i - L] = o0;
        hist1[i - L] = o1;
    }
}
void chroma_sample_interpolation_TI(unsigned char image_Cb [RESTRICT], unsigned char image_Cr [RESTRICT] , unsigned char refPicLXCb[RESTRICT], unsigned char refPicLXCr[RESTRICT] , const short xFracl, const short yFracl, const short PicWidthSamples,const short stride) { unsigned char* pucCbPtrA = refPicLXCb; unsigned char* pucCbPtrB = refPicLXCb + 1; unsigned char* pucCbPtrC = refPicLXCb + PicWidthSamples; unsigned char* pucCbPtrD = refPicLXCb + PicWidthSamples + 1; unsigned char* pucCrPtrE = refPicLXCr; unsigned char* pucCrPtrF = refPicLXCr + 1; unsigned char* pucCrPtrG = refPicLXCr + PicWidthSamples; unsigned char* pucCrPtrH = refPicLXCr + PicWidthSamples + 1; unsigned char* pucOutputCbPtr = image_Cb; unsigned char* pucOutputCrPtr = image_Cr; unsigned int uiTmp1,uiTmp2; unsigned int ui1_1,ui1_2,ui2_1,ui2_2,res_1,res_2,res_3,res_4; unsigned int tmpend1_1,tmpend1_2,tmpend2_1,tmpend2_2; unsigned int uiA,uiB,uiC,uiD; unsigned int uiE,uiF,uiG,uiH; unsigned int uicst = xFracl * yFracl; uiTmp1 = _pack2(uicst,uicst); uiTmp2 = (_pack2(xFracl,yFracl)) << 3; uiTmp2 = _sub2(uiTmp2,uiTmp1); uiTmp1 = (uicst) + ((uicst - ((xFracl + yFracl) <<3) + 64) << 16); uicst = _packh2(uiTmp1,uiTmp2); // cst2 cst3 uiTmp1 = _pack2(uiTmp2,uiTmp1); // cst4 cst1 uicst = _spacku4(uicst,uiTmp1); uiA = _mem2(pucCbPtrA); uiB = _mem2(pucCbPtrB); uiC = _mem2(pucCbPtrC); uiD = _mem2(pucCbPtrD); uiE = _mem2(pucCrPtrE); uiF = _mem2(pucCrPtrF); uiG = _mem2(pucCrPtrG); uiH = _mem2(pucCrPtrH); pucCbPtrA += PicWidthSamples; pucCbPtrB += PicWidthSamples; pucCbPtrC += PicWidthSamples; pucCbPtrD += PicWidthSamples; pucCrPtrE += PicWidthSamples; pucCrPtrF += PicWidthSamples; pucCrPtrG += PicWidthSamples; pucCrPtrH += PicWidthSamples; uiA += (_mem2(pucCbPtrA) << 16); uiB += (_mem2(pucCbPtrB) << 16); uiC += (_mem2(pucCbPtrC) << 16); uiD += (_mem2(pucCbPtrD) << 16); uiE += (_mem2(pucCrPtrE) << 16); uiF += (_mem2(pucCrPtrF) << 16); uiG += (_mem2(pucCrPtrG) << 16); uiH += (_mem2(pucCrPtrH) << 16); uiTmp1 = 
_packh4(uiA,uiB); uiTmp2 = _packh4(uiC,uiD); ui1_1 = _packh4(uiTmp1,uiTmp2); ui2_1 = _packl4(uiTmp1,uiTmp2); uiTmp1 = _packl4(uiA,uiB); uiTmp2 = _packl4(uiC,uiD); ui1_2 = _packh4(uiTmp1,uiTmp2); ui2_2 = _packl4(uiTmp1,uiTmp2); tmpend1_1 = _dotpu4(uicst,ui1_1); tmpend1_2 = _dotpu4(uicst,ui1_2); tmpend2_1 = _dotpu4(uicst,ui2_1); tmpend2_2 = _dotpu4(uicst,ui2_2); res_1 = _pack2(tmpend1_1,tmpend1_2); res_2 = _pack2(tmpend2_1,tmpend2_2); res_1 = _shr2(_sadd2(res_1,0x00200020),6); res_2 = _shr2(_sadd2(res_2,0x00200020),6); res_1 = _spacku4(0x00000000,res_1); res_2 = _spacku4(0x00000000,res_2); _mem2(pucOutputCbPtr) = res_2; pucOutputCbPtr += stride; _mem2(pucOutputCbPtr) = res_1; uiTmp1 = _packh4(uiE,uiF); uiTmp2 = _packh4(uiG,uiH); ui1_1 = _packh4(uiTmp1,uiTmp2); ui2_1 = _packl4(uiTmp1,uiTmp2); uiTmp1 = _packl4(uiE,uiF); uiTmp2 = _packl4(uiG,uiH); ui1_2 = _packh4(uiTmp1,uiTmp2); ui2_2 = _packl4(uiTmp1,uiTmp2); tmpend1_1 = _dotpu4(uicst,ui1_1); tmpend1_2 = _dotpu4(uicst,ui1_2); tmpend2_1 = _dotpu4(uicst,ui2_1); tmpend2_2 = _dotpu4(uicst,ui2_2); res_3 = _pack2(tmpend1_1,tmpend1_2); res_4 = _pack2(tmpend2_1,tmpend2_2); res_3 = _shr2(_sadd2(res_3,0x00200020),6); res_4 = _shr2(_sadd2(res_4,0x00200020),6); res_3 = _spacku4(0x00000000,res_3); res_4 = _spacku4(0x00000000,res_4); _mem2(pucOutputCrPtr) = res_4; pucOutputCrPtr += stride; _mem2(pucOutputCrPtr) = res_3; }
/*
 * DSP_QMFS_process - run one block through the two-band synthesis bank.
 *
 * Every 32-bit word is handled as two packed 16-bit lanes.  For each index i
 * the LO and HI band inputs are walked backwards from word i; their lane-wise
 * sum is filtered with the low-half coefficient and their lane-wise
 * difference with the high-half coefficient of each filter word.  The two
 * lanes are accumulated separately in 40-bit accumulators.
 *
 *   - Iterations 0 .. L-1 (L = blk_len) add the stored history and write two
 *     output words per iteration: branch 0 to data_out_ptr[2*i] and branch 1
 *     to data_out_ptr[2*i + 1].
 *   - Iterations L .. L+M-2 (M = FILT_LEN/2) compute the tail of the
 *     convolution and store it in history[0]/history[1] for the next call.
 *
 * The packed results take their upper lane from the "r" accumulators and
 * their lower lane from the "l" accumulators, matching DSP_QMFA_process.
 * (Previously both lanes were built from the "l" accumulator, so the upper
 * lane of every output was a copy of the lower lane.)
 *
 * The history buffers are read at indices up to L+M-2 and rewritten in place
 * at indices 0 .. M-2.  The write index (i-L) never exceeds the read index
 * (i), so every slot is consumed before it is overwritten.  Because the same
 * buffer is both read and written, the history pointers are deliberately NOT
 * restrict-qualified.
 *
 * QMFS_bank_obj->iter_count[i] gives the number of taps for iteration i.
 * MUST_ITERATE(1) tells the compiler each inner loop runs at least once, so
 * every iter_count entry used here must be >= 1.
 *
 * NOTE(review): _sat() clamps to 32 bits and _pack2() then keeps only the low
 * 16 bits of each value, and _add2/_sub2 wrap; confirm the filter gain keeps
 * results inside the 16-bit range.
 */
void DSP_QMFS_process(DSP_QMFS_bank_t *QMFS_bank_obj)
{
    sint32 i, j;
    uint32 * restrict filter    = (uint32 *)QMFS_bank_obj->flt_ptr;
    uint32 * restrict data_ptr0 = (uint32 *)QMFS_bank_obj->data_in_buffer_LO;   /* LO band input */
    uint32 * restrict data_ptr1 = (uint32 *)QMFS_bank_obj->data_in_buffer_HI;   /* HI band input */
    sint32 L = QMFS_bank_obj->blk_len;
    sint32 M = FILT_LEN / 2;
    /* One pointer per history buffer, used for both reading and writing. */
    uint32 *hist0 = (uint32 *)QMFS_bank_obj->history[0];
    uint32 *hist1 = (uint32 *)QMFS_bank_obj->history[1];
    uint32 * restrict dataout_ptr = (uint32 *)QMFS_bank_obj->data_out_ptr;

    /* Iterations 0 .. L-1 produce the outputs of the current block. */
    for (i = 0; i < L; ++i) {
        register uint32 t0, t1, out;
        register __int40_t accum0l, accum1l, accum0r, accum1r;
        register long long temp;
        uint32 *dptr0 = data_ptr0 + i;
        uint32 *dptr1 = data_ptr1 + i;
        uint32 *fptr  = filter;

        accum0l = 0; accum0r = 0;
        accum1l = 0; accum1r = 0;

        #pragma MUST_ITERATE( 1)
        for (j = 0; j < QMFS_bank_obj->iter_count[i]; ++j) {
            register uint32 d0, d1;
            register uint32 filt_coef10 = *fptr++;                            /* two packed coefficients */
            register uint32 filt_coef0  = _pack2(filt_coef10, filt_coef10);   /* low half in both lanes  */
            register uint32 filt_coef1  = _packh2(filt_coef10, filt_coef10);  /* high half in both lanes */

            /* Walk both band inputs right to left. */
            d1 = *dptr1--;
            d0 = *dptr0--;

            t0 = _add2(d0, d1);   /* LO + HI feeds branch 0 */
            t1 = _sub2(d0, d1);   /* LO - HI feeds branch 1 */

            temp    = _mpy2ll(t0, filt_coef0);
            accum0l = _lsadd(_loll(temp), accum0l);
            accum0r = _lsadd(_hill(temp), accum0r);

            temp    = _mpy2ll(t1, filt_coef1);
            accum1l = _lsadd(_loll(temp), accum1l);
            accum1r = _lsadd(_hill(temp), accum1r);
        }

        /* Round (add 2^14, shift by 15) and repack: "r" lane high, "l" lane low. */
        t0 = _pack2(_sat((accum0r + 16384) >> 15), _sat((accum0l + 16384) >> 15));
        t1 = _pack2(_sat((accum1r + 16384) >> 15), _sat((accum1l + 16384) >> 15));

        /* Overlap-add with the saved tail and interleave the two branches. */
        out = _add2(t0, hist0[i]);
        dataout_ptr[2 * i + 0] = out;   /* branch 0 -> word 2*i     */
        out = _add2(t1, hist1[i]);
        dataout_ptr[2 * i + 1] = out;   /* branch 1 -> word 2*i + 1 */
    }

    /* Iterations L .. L+M-2 produce the overlap-add history for the next block. */
    for (i = L; i < L + M - 1; ++i) {
        register uint32 t0, t1, out;
        register __int40_t accum0l, accum1l, accum0r, accum1r;
        register long long temp;
        uint32 *dptr0 = data_ptr0 + (L - 1);   /* always start at the last input word */
        uint32 *dptr1 = data_ptr1 + (L - 1);
        uint32 *fptr  = filter + (i - L + 1);  /* skip the taps that ran off the block */

        accum0l = 0; accum0r = 0;
        accum1l = 0; accum1r = 0;

        #pragma MUST_ITERATE( 1)
        for (j = 0; j < QMFS_bank_obj->iter_count[i]; ++j) {
            register uint32 d0, d1;
            register uint32 filt_coef10 = *fptr++;
            register uint32 filt_coef0  = _pack2(filt_coef10, filt_coef10);
            register uint32 filt_coef1  = _packh2(filt_coef10, filt_coef10);

            d1 = *dptr1--;
            d0 = *dptr0--;

            t0 = _add2(d0, d1);
            t1 = _sub2(d0, d1);

            temp    = _mpy2ll(t0, filt_coef0);
            accum0l = _lsadd(_loll(temp), accum0l);
            accum0r = _lsadd(_hill(temp), accum0r);

            temp    = _mpy2ll(t1, filt_coef1);
            accum1l = _lsadd(_loll(temp), accum1l);
            accum1r = _lsadd(_hill(temp), accum1r);
        }

        t0 = _pack2(_sat((accum0r + 16384) >> 15), _sat((accum0l + 16384) >> 15));
        t1 = _pack2(_sat((accum1r + 16384) >> 15), _sat((accum1l + 16384) >> 15));

        /* Fold in the remaining old history and store the new one in place. */
        out = _add2(t0, hist0[i]);
        hist0[i - L] = out;
        out = _add2(t1, hist1[i]);
        hist1[i - L] = out;
    }
}