/*
 * Computes the motion-related boundary-strength flag for the edge between
 * block b_idx ("p") and its neighbour bn_idx ("q").
 *
 * mv_cache_l0/l1  - per-block motion vectors for list 0 / list 1
 * ref_cache_l0/l1 - per-block reference indices; a negative entry means
 *                   "no reference" and is mapped to -1
 * poc_l0/l1       - tables indexed by reference index that yield the value
 *                   used to identify the referenced picture
 *                   (presumably its picture order count - confirm with caller)
 * slice_type      - list 1 is consulted only when this is SLICE_TYPE_B
 * mb_type/mbm_type- macroblock types of the two sides, tested with IS_P()
 *
 * Returns 1 when the two blocks do not use the same pair of reference
 * pictures. Otherwise returns the result of the motion-vector comparison
 * done with bSCheckMvShort() (presumably non-zero when two vectors differ by
 * more than the filter threshold - confirm against its definition).
 */
char GetBoudaryStrenght_TI(short mv_cache_l0[][2], short mv_cache_l1[][2],
                           short *ref_cache_l0, short *ref_cache_l1,
                           short poc_l0[], short poc_l1[],
                           int b_idx, int bn_idx,
                           int slice_type, int mb_type, int mbm_type)
{
    short ref_p0 = ref_cache_l0[b_idx]  < 0 ? -1 : poc_l0[ref_cache_l0[b_idx]];
    short ref_q0 = ref_cache_l0[bn_idx] < 0 ? -1 : poc_l0[ref_cache_l0[bn_idx]];
    short ref_p1 = -1;
    short ref_q1 = -1;
    unsigned int refs_p, refs_q;
    char v = 1;

    if (slice_type == SLICE_TYPE_B) {
        ref_p1 = ref_cache_l1[b_idx]  < 0 ? -1 : poc_l1[ref_cache_l1[b_idx]];
        ref_q1 = ref_cache_l1[bn_idx] < 0 ? -1 : poc_l1[ref_cache_l1[bn_idx]];
    }

    /* Pack each side as (list1 : list0) so both references compare at once. */
    refs_q = _pack2(ref_q1, ref_q0);
    refs_p = _pack2(ref_p1, ref_p0);

    /*
     * The blocks share their references only if BOTH halfwords match in the
     * same order, or BOTH match after swapping p's halves, i.e.
     *   (p0 == q0 && p1 == q1) || (p0 == q1 && p1 == q0).
     * Each _cmpeq2 result must therefore equal 3 on its own; OR-ing the two
     * bit masks together before testing would also accept mixed matches such
     * as p = (1,2), q = (1,1).
     */
    if (_cmpeq2(refs_q, refs_p) == 3 ||
        _cmpeq2(refs_q, _rotl(refs_p, 16)) == 3) {
        char l0b_l0bn = bSCheckMvShort(mv_cache_l0[b_idx], mv_cache_l0[bn_idx]);
        char l1b_l1bn = bSCheckMvShort(mv_cache_l1[b_idx], mv_cache_l1[bn_idx]);
        char l0b_l1bn = bSCheckMvShort(mv_cache_l0[b_idx], mv_cache_l1[bn_idx]);
        char l1b_l0bn = bSCheckMvShort(mv_cache_l1[b_idx], mv_cache_l0[bn_idx]);

        if (ref_p0 != ref_p1) {
            /* Compare the vectors that point into the same reference picture. */
            if (ref_p0 == ref_q0) {
                /* List 1 is ignored when either macroblock is a P macroblock. */
                v = l0b_l0bn || (l1b_l1bn && (!(IS_P(mb_type) || IS_P(mbm_type))));
            } else {
                v = (l0b_l1bn) || (l1b_l0bn);
            }
        } else {
            /* p uses one picture for both lists (and so does q): the edge is
             * flagged only if neither pairing of the vectors matches. */
            v = ((l0b_l0bn) || (l1b_l1bn)) && ((l0b_l1bn) || (l1b_l0bn));
        }
    }

    return v;
}
_CODE_ACCESS void *memset(void *dst, int fill, size_t len) { char *restrict dst1, *restrict dst2; int pre_bytes, post_bytes, wfill, i; double dfill1, dfill2; dst1 = (char *)dst; /*--------------------------------------------------------------------*/ /* Replicate the 8-bit value in fill into all 4 bytes of wfill */ /*--------------------------------------------------------------------*/ wfill = _pack2 (fill, fill); wfill = _packl4(wfill, wfill); dfill1 = _itod (wfill, wfill); dfill2 = _itod (wfill, wfill); /*--------------------------------------------------------------------*/ /* Calculate number of bytes to pre-copy to get to an alignment of 8 */ /*--------------------------------------------------------------------*/ pre_bytes = (8 - (int) dst) & 7; if (len > pre_bytes) { len -= pre_bytes; if (pre_bytes & 1) { *dst1 = fill; dst1 += 1; } if (pre_bytes & 2) { _amem2(dst1) = wfill; dst1 += 2; } if (pre_bytes & 4) { _amem4(dst1) = wfill; dst1 += 4; } } /*--------------------------------------------------------------------*/ /* Double word fills */ /*--------------------------------------------------------------------*/ post_bytes = len > 0 ? len : 0; dst2 = dst1 + 8; if (len > 15) for (i = 0; i < len >> 4; i++) { _amemd8(dst1) = dfill1; dst1 += 16; _amemd8(dst2) = dfill2; dst2 += 16; post_bytes -= 16; } /*--------------------------------------------------------------------*/ /* Finish transfer with 8, 4, 2 and/or 1-byte writes */ /*--------------------------------------------------------------------*/ if (post_bytes & 8) { _memd8(dst1) = dfill1; dst1 += 8; } if (post_bytes & 4) { _mem4 (dst1) = wfill; dst1 += 4; } if (post_bytes & 2) { dst1[0] = wfill; dst1[1] = wfill; dst1 += 2; } if (post_bytes & 1) { *dst1 = fill; dst1 += 1; } return dst; }
_packlh2(_hi(mask_tmp),_hi(mask_tmp))); mask_tmp = _memd8_const((void *) &mask_ptr[1]); mask7_4 = _itod(_packlh2(_lo(mask_tmp),_lo(mask_tmp)), _packlh2(_hi(mask_tmp),_hi(mask_tmp))); /* -------------------------------------------------------------------- */ /* The last mask values of each row are loaded into an int */ /* -------------------------------------------------------------------- */ mask8 = mask_ptr[0]; mask43 = _packlh2(_lo(mask7_4),_hi(mask3_0)); /* -------------------------------------------------------------------- */ /* mask2_88 contains the last mask of row 3 in its half words */ /* -------------------------------------------------------------------- */ mask88 = _pack2(mask8,mask8); /* -------------------------------------------------------------------- */ /* Pack the last mask of row 1 & 2 into a single int variable */ /* -------------------------------------------------------------------- */ mask52 = _packhl2(_lo(mask7_4),_hi(mask3_0)); for (i=0; i<width; i+=2) { /* ------------------------------------------------------------------ */ /* Load 4 pixels at a time from each of the 3 rows using double */ /* word memory accesses. */ /* ------------------------------------------------------------------ */ pix0_3210 = _mem8_const((void *) &imgin_ptr[i]); pix1_3210 = _mem8_const((void *) &imgin_ptr[i + pitch]); pix2_3210 = _mem8_const((void *) &imgin_ptr[i + 2 * pitch]);
void chroma_sample_interpolation_TI(unsigned char image_Cb [RESTRICT], unsigned char image_Cr [RESTRICT] , unsigned char refPicLXCb[RESTRICT], unsigned char refPicLXCr[RESTRICT] , const short xFracl, const short yFracl, const short PicWidthSamples,const short stride) { unsigned char* pucCbPtrA = refPicLXCb; unsigned char* pucCbPtrB = refPicLXCb + 1; unsigned char* pucCbPtrC = refPicLXCb + PicWidthSamples; unsigned char* pucCbPtrD = refPicLXCb + PicWidthSamples + 1; unsigned char* pucCrPtrE = refPicLXCr; unsigned char* pucCrPtrF = refPicLXCr + 1; unsigned char* pucCrPtrG = refPicLXCr + PicWidthSamples; unsigned char* pucCrPtrH = refPicLXCr + PicWidthSamples + 1; unsigned char* pucOutputCbPtr = image_Cb; unsigned char* pucOutputCrPtr = image_Cr; unsigned int uiTmp1,uiTmp2; unsigned int ui1_1,ui1_2,ui2_1,ui2_2,res_1,res_2,res_3,res_4; unsigned int tmpend1_1,tmpend1_2,tmpend2_1,tmpend2_2; unsigned int uiA,uiB,uiC,uiD; unsigned int uiE,uiF,uiG,uiH; unsigned int uicst = xFracl * yFracl; uiTmp1 = _pack2(uicst,uicst); uiTmp2 = (_pack2(xFracl,yFracl)) << 3; uiTmp2 = _sub2(uiTmp2,uiTmp1); uiTmp1 = (uicst) + ((uicst - ((xFracl + yFracl) <<3) + 64) << 16); uicst = _packh2(uiTmp1,uiTmp2); // cst2 cst3 uiTmp1 = _pack2(uiTmp2,uiTmp1); // cst4 cst1 uicst = _spacku4(uicst,uiTmp1); uiA = _mem2(pucCbPtrA); uiB = _mem2(pucCbPtrB); uiC = _mem2(pucCbPtrC); uiD = _mem2(pucCbPtrD); uiE = _mem2(pucCrPtrE); uiF = _mem2(pucCrPtrF); uiG = _mem2(pucCrPtrG); uiH = _mem2(pucCrPtrH); pucCbPtrA += PicWidthSamples; pucCbPtrB += PicWidthSamples; pucCbPtrC += PicWidthSamples; pucCbPtrD += PicWidthSamples; pucCrPtrE += PicWidthSamples; pucCrPtrF += PicWidthSamples; pucCrPtrG += PicWidthSamples; pucCrPtrH += PicWidthSamples; uiA += (_mem2(pucCbPtrA) << 16); uiB += (_mem2(pucCbPtrB) << 16); uiC += (_mem2(pucCbPtrC) << 16); uiD += (_mem2(pucCbPtrD) << 16); uiE += (_mem2(pucCrPtrE) << 16); uiF += (_mem2(pucCrPtrF) << 16); uiG += (_mem2(pucCrPtrG) << 16); uiH += (_mem2(pucCrPtrH) << 16); uiTmp1 = 
_packh4(uiA,uiB); uiTmp2 = _packh4(uiC,uiD); ui1_1 = _packh4(uiTmp1,uiTmp2); ui2_1 = _packl4(uiTmp1,uiTmp2); uiTmp1 = _packl4(uiA,uiB); uiTmp2 = _packl4(uiC,uiD); ui1_2 = _packh4(uiTmp1,uiTmp2); ui2_2 = _packl4(uiTmp1,uiTmp2); tmpend1_1 = _dotpu4(uicst,ui1_1); tmpend1_2 = _dotpu4(uicst,ui1_2); tmpend2_1 = _dotpu4(uicst,ui2_1); tmpend2_2 = _dotpu4(uicst,ui2_2); res_1 = _pack2(tmpend1_1,tmpend1_2); res_2 = _pack2(tmpend2_1,tmpend2_2); res_1 = _shr2(_sadd2(res_1,0x00200020),6); res_2 = _shr2(_sadd2(res_2,0x00200020),6); res_1 = _spacku4(0x00000000,res_1); res_2 = _spacku4(0x00000000,res_2); _mem2(pucOutputCbPtr) = res_2; pucOutputCbPtr += stride; _mem2(pucOutputCbPtr) = res_1; uiTmp1 = _packh4(uiE,uiF); uiTmp2 = _packh4(uiG,uiH); ui1_1 = _packh4(uiTmp1,uiTmp2); ui2_1 = _packl4(uiTmp1,uiTmp2); uiTmp1 = _packl4(uiE,uiF); uiTmp2 = _packl4(uiG,uiH); ui1_2 = _packh4(uiTmp1,uiTmp2); ui2_2 = _packl4(uiTmp1,uiTmp2); tmpend1_1 = _dotpu4(uicst,ui1_1); tmpend1_2 = _dotpu4(uicst,ui1_2); tmpend2_1 = _dotpu4(uicst,ui2_1); tmpend2_2 = _dotpu4(uicst,ui2_2); res_3 = _pack2(tmpend1_1,tmpend1_2); res_4 = _pack2(tmpend2_1,tmpend2_2); res_3 = _shr2(_sadd2(res_3,0x00200020),6); res_4 = _shr2(_sadd2(res_4,0x00200020),6); res_3 = _spacku4(0x00000000,res_3); res_4 = _spacku4(0x00000000,res_4); _mem2(pucOutputCrPtr) = res_4; pucOutputCrPtr += stride; _mem2(pucOutputCrPtr) = res_3; }
/*
 * DSP_QMFA_process - runs one block through the two-branch analysis filter
 * bank described by QMFA_bank_obj.
 *
 * Every 32-bit word carries two 16-bit lanes that are filtered independently
 * with the same coefficients (the "l" accumulators take the low halfwords,
 * the "r" accumulators the high halfwords).
 *
 * Reads:  flt_ptr (one word per tap: high halfword = branch-0 coefficient,
 *         low halfword = branch-1 coefficient), data_in_buffer, blk_len,
 *         iter_count[] (taps to evaluate per output; each entry must be >= 1
 *         because of MUST_ITERATE), history[0..1].
 * Writes: data_out_LO_ptr[i] = branch0 + branch1 and
 *         data_out_HI_ptr[i] = branch0 - branch1 for i in [0, blk_len/2);
 *         history[0..1][0 .. FILT_LEN/2 - 2] with the overlap for the next
 *         block.
 *
 * NOTE(review): _sat() saturates to 32 bits while _pack2() keeps only the low
 * 16 bits of each result - confirm the rounded sums always fit in 16 bits.
 */
void DSP_QMFA_process(DSP_QMFA_bank_t *QMFA_bank_obj)
{
    sint32 i, j;
    uint32 * restrict filter   = (uint32 *)QMFA_bank_obj->flt_ptr;
    uint32 * restrict data_ptr = (uint32 *)(QMFA_bank_obj->data_in_buffer + 0); /* input */
    uint32 L = QMFA_bank_obj->blk_len/2;
    uint32 M = FILT_LEN/2;
    /* The history buffers are both read and written below, so they are
     * accessed through a single, non-restrict pointer each: two restrict
     * pointers to the same array would promise the compiler they never
     * alias. */
    uint32 *hist0 = (uint32 *)QMFA_bank_obj->history[0];
    uint32 *hist1 = (uint32 *)QMFA_bank_obj->history[1];
    uint32 * restrict data_out_lo_ptr = (uint32 *)QMFA_bank_obj->data_out_LO_ptr;
    uint32 * restrict data_out_hi_ptr = (uint32 *)QMFA_bank_obj->data_out_HI_ptr;

    /* Iterations 0 .. L-1 produce the current block of output. */
    for (i = 0; i < L; ++i) {
        register uint32 o0, o1, out0, out1;
        register __int40_t accum0l, accum1l, accum0r, accum1r; /* branch 0 / 1, lane l / r */
        register long long temp;
        uint32 *dptr = data_ptr + 2*i + 1;   /* newest input word for this output */
        uint32 *fptr = filter;

        accum0l = 0; accum0r = 0;
        accum1l = 0; accum1r = 0;

        #pragma MUST_ITERATE(1)
        for (j = 0; j < QMFA_bank_obj->iter_count[i]; ++j) {
            register uint32 d0, d1;
            register uint32 filt_coef10 = *fptr++;                          /* next coefficient pair */
            register uint32 filt_coef0  = _packh2(filt_coef10, filt_coef10); /* high half in both lanes */
            register uint32 filt_coef1  = _pack2(filt_coef10, filt_coef10);  /* low half in both lanes */

            /* Walk the input backwards: the odd-offset word feeds branch 1,
             * the even-offset word before it feeds branch 0. */
            d1 = *dptr--;
            d0 = *dptr--;

            temp = _mpy2ll(d0, filt_coef0);
            accum0l = _lsadd(_loll(temp), accum0l);
            accum0r = _lsadd(_hill(temp), accum0r);

            temp = _mpy2ll(d1, filt_coef1);
            accum1l = _lsadd(_loll(temp), accum1l);
            accum1r = _lsadd(_hill(temp), accum1r);
        }

        /* Round (add 1<<14, shift right 15) and repack the two lanes. */
        o0 = _pack2(_sat((accum0r + 16384)>>15), _sat((accum0l + 16384)>>15));
        o1 = _pack2(_sat((accum1r + 16384)>>15), _sat((accum1l + 16384)>>15));

        o0 = _add2(o0, hist0[i]);   /* overlap-add, branch 0 */
        o1 = _add2(o1, hist1[i]);   /* overlap-add, branch 1 */

        out0 = _add2(o0, o1);
        data_out_lo_ptr[i] = out0;  /* LO band = branch0 + branch1 */
        out1 = _sub2(o0, o1);
        data_out_hi_ptr[i] = out1;  /* HI band = branch0 - branch1 */
    }

    /* Iterations L .. L+M-2 produce the overlap history for the next block. */
    for (i = L; i < L+M-1; ++i) {
        register uint32 o0, o1;
        register __int40_t accum0l, accum1l, accum0r, accum1r; /* branch 0 / 1, lane l / r */
        register long long temp;
        uint32 *dptr = data_ptr + 2*(L-1) + 1;  /* always start at the last input word */
        uint32 *fptr = filter + (i-L+1);        /* skip the taps that ran off the block */

        accum0l = 0; accum0r = 0;
        accum1l = 0; accum1r = 0;

        #pragma MUST_ITERATE(1)
        for (j = 0; j < QMFA_bank_obj->iter_count[i]; ++j) {
            register sint32 d0, d1;
            register sint32 filt_coef10 = *fptr++;                          /* next coefficient pair */
            register sint32 filt_coef0  = _packh2(filt_coef10, filt_coef10); /* high half in both lanes */
            register sint32 filt_coef1  = _pack2(filt_coef10, filt_coef10);  /* low half in both lanes */

            d1 = *dptr--;
            d0 = *dptr--;

            temp = _mpy2ll(d0, filt_coef0);
            accum0l = _lsadd(_loll(temp), accum0l);
            accum0r = _lsadd(_hill(temp), accum0r);

            temp = _mpy2ll(d1, filt_coef1);
            accum1l = _lsadd(_loll(temp), accum1l);
            accum1r = _lsadd(_hill(temp), accum1r);
        }

        o0 = _pack2(_sat((accum0r + 16384)>>15), _sat((accum0l + 16384)>>15));
        o1 = _pack2(_sat((accum1r + 16384)>>15), _sat((accum1l + 16384)>>15));

        o0 = _add2(o0, hist0[i]);   /* carry over history beyond this block */
        o1 = _add2(o1, hist1[i]);

        /* Entry i was read above before entry i-L (< i) is overwritten. */
        hist0[(i-L)] = o0;
        hist1[(i-L)] = o1;
    }

    return;
}
/*
 * DSP_QMFS_process - runs one block through the two-branch synthesis filter
 * bank described by QMFS_bank_obj.
 *
 * Every 32-bit word carries two 16-bit lanes that are filtered independently
 * with the same coefficients (the "l" accumulators take the low halfwords,
 * the "r" accumulators the high halfwords).
 *
 * Reads:  flt_ptr (one word per tap: low halfword = branch-0 coefficient,
 *         high halfword = branch-1 coefficient), data_in_buffer_LO,
 *         data_in_buffer_HI, blk_len, iter_count[] (taps to evaluate per
 *         output; each entry must be >= 1 because of MUST_ITERATE),
 *         history[0..1].
 * Writes: data_out_ptr[2*i] (branch 0) and data_out_ptr[2*i + 1] (branch 1)
 *         for i in [0, blk_len); history[0..1][0 .. FILT_LEN/2 - 2] with the
 *         overlap for the next block.
 *
 * NOTE(review): _sat() saturates to 32 bits while _pack2() keeps only the low
 * 16 bits of each result - confirm the rounded sums always fit in 16 bits.
 */
void DSP_QMFS_process(DSP_QMFS_bank_t *QMFS_bank_obj)
{
    sint32 i, j;
    uint32 * restrict filter    = (uint32 *)QMFS_bank_obj->flt_ptr;
    uint32 * restrict data_ptr0 = (uint32 *)QMFS_bank_obj->data_in_buffer_LO; /* LO-band input */
    uint32 * restrict data_ptr1 = (uint32 *)QMFS_bank_obj->data_in_buffer_HI; /* HI-band input */
    sint32 L = QMFS_bank_obj->blk_len;
    sint32 M = FILT_LEN/2;
    /* The history buffers are both read and written below, so they are
     * accessed through a single, non-restrict pointer each: two restrict
     * pointers to the same array would promise the compiler they never
     * alias. */
    uint32 *hist0 = (uint32 *)QMFS_bank_obj->history[0];
    uint32 *hist1 = (uint32 *)QMFS_bank_obj->history[1];
    uint32 * restrict dataout_ptr = (uint32 *)QMFS_bank_obj->data_out_ptr;

    /* Iterations 0 .. L-1 produce the outputs of the current block. */
    for (i = 0; i < L; ++i) {
        register uint32 t0, t1, out;
        register __int40_t accum0l, accum1l, accum0r, accum1r; /* branch 0 / 1, lane l / r */
        register long long temp;
        uint32 *dptr0 = data_ptr0 + i;   /* newest LO-band word for this output */
        uint32 *dptr1 = data_ptr1 + i;   /* newest HI-band word for this output */
        uint32 *fptr  = filter;

        accum0l = 0; accum0r = 0;
        accum1l = 0; accum1r = 0;

        #pragma MUST_ITERATE(1)
        for (j = 0; j < QMFS_bank_obj->iter_count[i]; ++j) {
            register uint32 d0, d1;
            register uint32 filt_coef10 = *fptr++;                          /* next coefficient pair */
            register uint32 filt_coef0  = _pack2(filt_coef10, filt_coef10);  /* low half in both lanes */
            register uint32 filt_coef1  = _packh2(filt_coef10, filt_coef10); /* high half in both lanes */

            /* Walk both band inputs backwards, one word per tap. */
            d1 = *dptr1--;
            d0 = *dptr0--;

            t0 = _add2(d0, d1);   /* LO + HI feeds branch 0 */
            t1 = _sub2(d0, d1);   /* LO - HI feeds branch 1 */

            temp = _mpy2ll(t0, filt_coef0);
            accum0l = _lsadd(_loll(temp), accum0l);
            accum0r = _lsadd(_hill(temp), accum0r);

            temp = _mpy2ll(t1, filt_coef1);
            accum1l = _lsadd(_loll(temp), accum1l);
            accum1r = _lsadd(_hill(temp), accum1r);
        }

        /* Round (add 1<<14, shift right 15) and repack the two lanes the way
         * _mpy2ll split them: r -> high halfword, l -> low halfword. */
        t0 = _pack2(_sat((accum0r + 16384)>>15), _sat((accum0l + 16384)>>15));
        t1 = _pack2(_sat((accum1r + 16384)>>15), _sat((accum1l + 16384)>>15));

        out = _add2(t0, hist0[i]);
        dataout_ptr[2*i + 0] = out;   /* branch 0 -> first word of the output pair */
        out = _add2(t1, hist1[i]);
        dataout_ptr[2*i + 1] = out;   /* branch 1 -> second word of the output pair */
    }

    /* Iterations L .. L+M-2 produce the overlap history for the next block. */
    for (i = L; i < L+M-1; ++i) {
        register uint32 t0, t1, out;
        register __int40_t accum0l, accum1l, accum0r, accum1r; /* branch 0 / 1, lane l / r */
        register long long temp;
        uint32 *dptr0 = data_ptr0 + (L-1);   /* always start at the last input word */
        uint32 *dptr1 = data_ptr1 + (L-1);
        uint32 *fptr  = filter + (i-L+1);    /* skip the taps that ran off the block */

        accum0l = 0; accum0r = 0;
        accum1l = 0; accum1r = 0;

        #pragma MUST_ITERATE(1)
        for (j = 0; j < QMFS_bank_obj->iter_count[i]; ++j) {
            register sint32 d0, d1;
            register sint32 filt_coef10 = *fptr++;                          /* next coefficient pair */
            register sint32 filt_coef0  = _pack2(filt_coef10, filt_coef10);  /* low half in both lanes */
            register sint32 filt_coef1  = _packh2(filt_coef10, filt_coef10); /* high half in both lanes */

            d1 = *dptr1--;
            d0 = *dptr0--;

            t0 = _add2(d0, d1);
            t1 = _sub2(d0, d1);

            temp = _mpy2ll(t0, filt_coef0);
            accum0l = _lsadd(_loll(temp), accum0l);
            accum0r = _lsadd(_hill(temp), accum0r);

            temp = _mpy2ll(t1, filt_coef1);
            accum1l = _lsadd(_loll(temp), accum1l);
            accum1r = _lsadd(_hill(temp), accum1r);
        }

        t0 = _pack2(_sat((accum0r + 16384)>>15), _sat((accum0l + 16384)>>15));
        t1 = _pack2(_sat((accum1r + 16384)>>15), _sat((accum1l + 16384)>>15));

        /* Entry i is read before entry i-L (< i) is overwritten. */
        out = _add2(t0, hist0[i]);
        hist0[(i-L)] = out;   /* overlap history, branch 0 */
        out = _add2(t1, hist1[i]);
        hist1[(i-L)] = out;   /* overlap history, branch 1 */
    }

    return;
}
unsigned int mask1_44, mask2_44, mask3_44; unsigned int mask4_44, mask5_44; const short *restrict in0; const short *restrict in1; const short *restrict in2; const short *restrict in3; const short *restrict in4; /* -------------------------------------------------------------------- */ /* Load mask values (reverse order for mask rotation) */ /* -------------------------------------------------------------------- */ mask_temp = _mem8_const ((void *) &mask_ptr[21]); mask1_3210 = _itoll(_packlh2(_loll(mask_temp),_loll(mask_temp)), _packlh2(_hill(mask_temp),_hill(mask_temp))); mask1_44 = _pack2((int) mask_ptr[20], (int) mask_ptr[20]); mask_temp = _mem8_const ((void *) &mask_ptr[16]); mask2_3210 = _itoll(_packlh2(_loll(mask_temp),_loll(mask_temp)), _packlh2(_hill(mask_temp),_hill(mask_temp))); mask2_44 = _pack2((int) mask_ptr[15], (int) mask_ptr[15]); mask_temp = _mem8_const ((void *) &mask_ptr[11]); mask3_3210 = _itoll(_packlh2(_loll(mask_temp),_loll(mask_temp)), _packlh2(_hill(mask_temp),_hill(mask_temp))); mask3_44 = _pack2((int) mask_ptr[10], (int) mask_ptr[10]); mask_temp = _mem8_const ((void *) &mask_ptr[6]); mask4_3210 = _itoll(_packlh2(_loll(mask_temp),_loll(mask_temp)), _packlh2(_hill(mask_temp),_hill(mask_temp))); mask4_44 = _pack2((int) mask_ptr[5], (int) mask_ptr[5]);
static __inline void *optimized_mem_set(void *mem, int ch, size_t n) { char * restrict dst1, * restrict dst2; int pre_bytes, post_bytes, wfill, i; unsigned char *outbuf = mem; unsigned int count = n; dst1 = (char *)outbuf; #if defined(_TMS320C6400) || defined(_TMS320C6740) || defined(_TMS320C6600) || \ defined(_TI_C6X_TESLA) /*---------------------------------------------------------------------*/ /* We do not use 'dwfill' on other variations of the C6x architecture, */ /* so limit 'dwfill' references to the architectures that use it. */ /*---------------------------------------------------------------------*/ { long long dwfill; /*------------------------------------------------------------------*/ /* Set up 64-bit and 32-bit fill values. */ /*------------------------------------------------------------------*/ wfill = _pack2 (ch, ch); wfill = _packl4(wfill, wfill); dwfill = _itoll (wfill, wfill); /*------------------------------------------------------------------*/ /* Calculate # of bytes to pre-copy to get to an alignment of 8 */ /*------------------------------------------------------------------*/ pre_bytes = (8 - (int) dst1) & 7; if (count > pre_bytes) { count -= pre_bytes; if (pre_bytes & 1) { *dst1 = ch; dst1 += 1; } if (pre_bytes & 2) { _amem2(dst1) = wfill; dst1 += 2; } if (pre_bytes & 4) { _amem4(dst1) = wfill; dst1 += 4; } } /*------------------------------------------------------------------*/ /* Double word fills */ /*------------------------------------------------------------------*/ post_bytes = count > 0 ? 
count : 0; dst2 = dst1 + 8; if (count > 15) for (i = 0; i < count >> 4; i++) { _amem8(dst1) = dwfill; dst1 += 16; _amem8(dst2) = dwfill; dst2 += 16; post_bytes -= 16; } /*------------------------------------------------------------------*/ /* Finish transfer with 8, 4, 2 and/or 1-byte writes */ /*------------------------------------------------------------------*/ if (post_bytes & 8) { _mem8(dst1) = dwfill; dst1 += 8; } if (post_bytes & 4) { _mem4(dst1) = wfill; dst1 += 4; } if (post_bytes & 2) { dst1[0] = ch; dst1[1] = ch; dst1 += 2; } if (post_bytes & 1) { *dst1 = ch; dst1 += 1; } } #else /*--------------------------------------------------------------------*/ /* Set up 32-bit fill value. */ /*--------------------------------------------------------------------*/ wfill = _mpy(0x101, (int)ch); wfill += (wfill << 16); /*--------------------------------------------------------------------*/ /* Calculate number of bytes to pre-copy to get to an alignment of 4 */ /*--------------------------------------------------------------------*/ pre_bytes = (4 - (int) dst1) & 3; if (count > pre_bytes) { count -= pre_bytes; if (pre_bytes & 1) { *dst1 = ch; dst1 += 1; } if (pre_bytes & 2) { _amem2(dst1) = wfill; dst1 += 2; } } /*--------------------------------------------------------------------*/ /* Double word fills */ /*--------------------------------------------------------------------*/ post_bytes = count > 0 ? count : 0; dst2 = dst1 + 4; if (count > 7) for (i = 0; i < count >> 3; i++) { _amem4(dst1) = wfill; dst1 += 8; _amem4(dst2) = wfill; dst2 += 8; post_bytes -= 8; } /*--------------------------------------------------------------------*/ /* Finish transfer with up to 7 single-byte writes. 
*/ /*--------------------------------------------------------------------*/ if (post_bytes) { *dst1++ = ch; post_bytes--; } if (post_bytes) { *dst1++ = ch; post_bytes--; } if (post_bytes) { *dst1++ = ch; post_bytes--; } if (post_bytes) { *dst1++ = ch; post_bytes--; } if (post_bytes) { *dst1++ = ch; post_bytes--; } if (post_bytes) { *dst1++ = ch; post_bytes--; } if (post_bytes) { *dst1++ = ch; post_bytes--; } #endif return dst1; }