Esempi in C++ (Cpp) per _amem4

Esempio n. 1

0

Mostra file

File: interpol_0_n_TI.c Progetto: jfwang213/graduate_demo

/**
 * Copy a block of four rows of four bytes from the reference picture into
 * the current image, without any filtering.
 *
 * All four source rows are loaded before the first destination row is
 * stored.
 *
 * NOTE(review): the source is read with _mem4 and the destination written
 * with _amem4; per the TI C6000 intrinsic reference the latter expects a
 * 4-byte aligned address on every row -- confirm that callers guarantee it.
 *
 * @param image            Destination: top-left byte of the block in the current frame.
 * @param refPicLXl        Source: top-left byte of the block in the reference picture.
 * @param PicWidthSamples  Distance in bytes between consecutive source rows.
 * @param stride           Distance in bytes between consecutive destination rows.
 */
void luma_sample_interp_0_0_TI(unsigned char image [], unsigned char refPicLXl[],	const short PicWidthSamples, const short stride){

	unsigned int uiRow[4];
	unsigned char* pSrc = refPicLXl;
	unsigned char* pDst = image;
	int iRow;

	/* Gather the four source rows, one 32-bit word per row. */
	for (iRow = 0; iRow < 4; iRow++) {
		uiRow[iRow] = _mem4 (pSrc + iRow * PicWidthSamples);
	}

	/* Scatter them to the destination, one 32-bit word per row. */
	for (iRow = 0; iRow < 4; iRow++) {
		_amem4 (pDst + iRow * stride) = uiRow[iRow];
	}

}

Esempio n. 2

0

Mostra file

File: memset.c Progetto: dannf/ti-cgt-c6x

/*
 * memset() built on 32-bit stores.
 *
 * The routine works in three phases:
 *   1. up to 3 leading bytes are written so the cursor reaches a multiple
 *      of 4 (only when len is larger than that lead-in),
 *   2. the bulk is written 8 bytes per iteration as two 32-bit stores
 *      through two separate cursors that are 4 bytes apart,
 *   3. the remaining bytes (at most 7) are written one at a time.
 *
 * dst  - start of the region to fill
 * fill - value to store; only its low 8 bits are used for the word stores
 * len  - number of bytes to fill
 *
 * Returns dst.
 */
_CODE_ACCESS void *memset(void *dst, int fill, size_t len)
{
    char *restrict lo_cursor, *restrict hi_cursor;
    int   lead, remaining, word, n;

    lo_cursor = (char *)dst;

    /* Replicate the fill byte into all four bytes of a 32-bit word. */
    word  = _mpy(0x101, fill & 0xff);
    word += word << 16;

    /* Number of bytes between dst and the next multiple of 4. */
    lead = (4 - (int) dst) & 3;

    if (len > lead)
    {
        len -= lead;

        if (lead & 1)
        {
            *lo_cursor = fill;
            lo_cursor += 1;
        }
        if (lead & 2)
        {
            _amem2(lo_cursor) = word;
            lo_cursor += 2;
        }
    }

    /* Bulk phase: two word stores per iteration, 8 bytes in total. */
    remaining = len;
    hi_cursor = lo_cursor + 4;

    if (len > 7)
    {
        for (n = 0; n < len >> 3; n++)
        {
            _amem4(lo_cursor) = word;
            lo_cursor += 8;

            _amem4(hi_cursor) = word;
            hi_cursor += 8;

            remaining -= 8;
        }
    }

    /* Tail phase: at most seven single-byte stores. */
    for (n = 0; n < 7 && remaining; n++)
    {
        *lo_cursor++ = fill;
        remaining--;
    }

    return dst;
}

Esempio n. 3

0

Mostra file

File: memset.c Progetto: dannf/ti-cgt-c6x

/*
 * memset() built on 64-bit stores.
 *
 * Phases:
 *   1. up to 7 leading bytes (1, 2 and/or 4-byte stores) bring the cursor
 *      to a multiple of 8, provided len is larger than that lead-in,
 *   2. the bulk is written 16 bytes per iteration as two 64-bit stores
 *      through two cursors that are 8 bytes apart,
 *   3. what is left (fewer than 16 bytes) is written with one 8-byte,
 *      one 4-byte, two 1-byte and one 1-byte store as selected by the
 *      bits of the remaining count. These tail stores use the _mem*
 *      forms rather than the _amem* forms used in the bulk loop.
 *
 * dst  - start of the region to fill
 * fill - value to store
 * len  - number of bytes to fill
 *
 * Returns dst.
 */
_CODE_ACCESS void *memset(void *dst, int fill, size_t len)
{
    char  *restrict lo_cursor, *restrict hi_cursor;
    int    lead, remaining, word, n;
    double pair_lo, pair_hi;

    lo_cursor = (char *)dst;

    /* Build the 32-bit pattern, then two identical 64-bit patterns. */
    word    = _pack2 (fill, fill);
    word    = _packl4(word, word);
    pair_lo = _itod  (word, word);
    pair_hi = _itod  (word, word);

    /* Number of bytes between dst and the next multiple of 8. */
    lead = (8 - (int) dst) & 7;

    if (len > lead)
    {
        len -= lead;

        if (lead & 1)
        {
            *lo_cursor = fill;
            lo_cursor += 1;
        }
        if (lead & 2)
        {
            _amem2(lo_cursor) = word;
            lo_cursor += 2;
        }
        if (lead & 4)
        {
            _amem4(lo_cursor) = word;
            lo_cursor += 4;
        }
    }

    /* Bulk phase: two 64-bit stores per iteration, 16 bytes in total. */
    remaining = len;
    hi_cursor = lo_cursor + 8;

    if (len > 15)
    {
        for (n = 0; n < len >> 4; n++)
        {
            _amemd8(lo_cursor) = pair_lo;
            lo_cursor += 16;

            _amemd8(hi_cursor) = pair_hi;
            hi_cursor += 16;

            remaining -= 16;
        }
    }

    /* Tail phase: one store group per set bit of the remaining count. */
    if (remaining & 8)
    {
        _memd8(lo_cursor) = pair_lo;
        lo_cursor += 8;
    }
    if (remaining & 4)
    {
        _mem4 (lo_cursor) = word;
        lo_cursor += 4;
    }
    if (remaining & 2)
    {
        *lo_cursor++ = word;
        *lo_cursor++ = word;
    }
    if (remaining & 1)
    {
        *lo_cursor++ = fill;
    }

    return dst;
}

Esempio n. 4

0

Mostra file

File: interpol_0_n_TI.c Progetto: jfwang213/graduate_demo

/**
This function allows to get the luminance prediction of a non IDR picture when xFracl = 0 and yFracl = 1.

It produces four output rows of four bytes each. Every output byte is the
byte-wise average of
  - a vertically filtered value built from six consecutive reference rows
    with the weights 1, -5, 20, 20, -5, 1, offset by 16 and shifted right
    by 5, and
  - the unfiltered reference byte of the corresponding row (input1..input4).

Nine reference rows are read, starting two rows above refPicLXl.

NOTE(review): the descriptions of the packed intrinsics in the comments
below (_packh4/_packl4, _dotpsu4, _spack2, _sadd2, _shr2, _spacku4, _swap4,
_avgu4, _unpkhu4/_unpklu4, _extu) follow the TI C6000 intrinsic reference
and cannot be verified from this file alone -- confirm byte ordering against
that reference before modifying the arithmetic.

@param image Table of current frame.
@param refPicLXl Table of the reference decoded picture buffer.
@param PicWidthSamples Stride of the reference buffer.
@param stride Stride of the current image.
*/
void luma_sample_interp_0_1_TI(unsigned char image [], unsigned char refPicLXl[],	const short PicWidthSamples, const short stride){

	/* No horizontal interpolation */


	unsigned int uiLine1,uiLine2,uiLine3,uiLine4,uiLine5,uiLine6,uiLine7,uiLine8,uiLine9;
	unsigned int uiTmpLine12_h,uiTmpLine34_h,uiTmpLine12_l,uiTmpLine34_l,uiTmpLine1234_4,uiTmpLine1234_2,uiTmpLine1234_3,uiTmpLine1234_1;
	unsigned int uiTmpLine56_h,uiTmpLine78_h,uiTmpLine56_l,uiTmpLine78_l,uiTmpLine5678_4,uiTmpLine5678_2,uiTmpLine5678_3,uiTmpLine5678_1;
	unsigned int tmpc1_1,tmpc1_2,tmpc2_1,tmpc2_2,tmpc1,tmpc2;
	unsigned int tmpc3_1,tmpc3_2,tmpc4_1,tmpc4_2,tmpc3,tmpc4;
	unsigned int tmp12,tmp34,tmpend1,tmpend2,tmpend3,tmpend4;
	unsigned int tmpl9l,tmpl9h;
	unsigned int input1,input2,input3,input4;
	unsigned char* pRefImgPtr;
	unsigned char* pImgPtr;

	/* Start two rows above the block: the filter needs rows -2 .. +6. */
	pRefImgPtr = refPicLXl-(PicWidthSamples<<1);
	pImgPtr = image;

	/* Load nine consecutive reference rows, four bytes each. */
	uiLine1 = _mem4(pRefImgPtr);
	pRefImgPtr += PicWidthSamples;
	uiLine2 = _mem4(pRefImgPtr);
	pRefImgPtr += PicWidthSamples;
	uiLine3 = _mem4(pRefImgPtr);
	pRefImgPtr += PicWidthSamples;
	uiLine4 = _mem4(pRefImgPtr);
	pRefImgPtr += PicWidthSamples;
	uiLine5 = _mem4(pRefImgPtr);
	pRefImgPtr += PicWidthSamples;
	uiLine6 = _mem4(pRefImgPtr);
	pRefImgPtr += PicWidthSamples;
	uiLine7 = _mem4(pRefImgPtr);
	pRefImgPtr += PicWidthSamples;
	uiLine8 = _mem4(pRefImgPtr);
	pRefImgPtr += PicWidthSamples;
	uiLine9 = _mem4(pRefImgPtr);


	/* Rows 3..6 are the rows at refPicLXl + 0..3 strides; they are the
	   unfiltered samples averaged into output rows 1..4 at the end. */
	input1 = uiLine3;
	input2 = uiLine4;
	input3 = uiLine5;
	input4 = uiLine6;


	/* Two rounds of byte packing on rows 1..4. Presumably this transposes
	   the 4x4 byte tile so that each uiTmpLine1234_k holds one byte column
	   of rows 1..4 in a single word -- TODO confirm lane order. */
	uiTmpLine12_h = _packh4(uiLine1,uiLine2);
	uiTmpLine34_h = _packh4(uiLine3,uiLine4);
	uiTmpLine12_l = _packl4(uiLine1,uiLine2);
	uiTmpLine34_l = _packl4(uiLine3,uiLine4);
	uiTmpLine1234_4 = _packh4(uiTmpLine12_h,uiTmpLine34_h);
	uiTmpLine1234_2 = _packl4(uiTmpLine12_h,uiTmpLine34_h);
	uiTmpLine1234_3 = _packh4(uiTmpLine12_l,uiTmpLine34_l);
	uiTmpLine1234_1 = _packl4(uiTmpLine12_l,uiTmpLine34_l);

	/* Same packing for rows 5..8. */
	uiTmpLine56_h = _packh4(uiLine5,uiLine6);
	uiTmpLine78_h = _packh4(uiLine7,uiLine8);
	uiTmpLine56_l = _packl4(uiLine5,uiLine6);
	uiTmpLine78_l = _packl4(uiLine7,uiLine8);
	uiTmpLine5678_4 = _packh4(uiTmpLine56_h,uiTmpLine78_h);
	uiTmpLine5678_2 = _packl4(uiTmpLine56_h,uiTmpLine78_h);
	uiTmpLine5678_3 = _packh4(uiTmpLine56_l,uiTmpLine78_l);
	uiTmpLine5678_1 = _packl4(uiTmpLine56_l,uiTmpLine78_l);

	/* ---- Output row 1: taps on rows 1..6 ------------------------------
	   Coefficient bytes: 0x01 = 1, 0xFB = -5 (as a signed byte), 0x14 = 20.
	   0x01FB1414 weights rows 1..4 with 1,-5,20,20 and 0xFB010000 weights
	   rows 5..8 with -5,1,0,0. */
	tmpc1_1 = _dotpsu4(0x01FB1414,uiTmpLine1234_1);
	tmpc1_2 = _dotpsu4(0xFB010000,uiTmpLine5678_1);

	tmpc2_1 = _dotpsu4(0x01FB1414,uiTmpLine1234_2);
	tmpc2_2 = _dotpsu4(0xFB010000,uiTmpLine5678_2);

	/* Pack the two column results into 16-bit halves of one word. */
	tmpc1 = _spack2(tmpc1_1,tmpc2_1);
	tmpc2 = _spack2(tmpc1_2,tmpc2_2);

	/* Sum both partial dot products, add 16 to each half, shift right 5. */
	tmp12 = _sadd2(tmpc1,tmpc2);
	tmp12 = _shr2(_sadd2(tmp12,0x00100010),5);

	tmpc3_1 = _dotpsu4(0x01FB1414,uiTmpLine1234_3);
	tmpc3_2 = _dotpsu4(0xFB010000,uiTmpLine5678_3);

	tmpc4_1 = _dotpsu4(0x01FB1414,uiTmpLine1234_4);
	tmpc4_2 = _dotpsu4(0xFB010000,uiTmpLine5678_4);

	tmpc3 = _spack2(tmpc3_1,tmpc4_1);
	tmpc4 = _spack2(tmpc3_2,tmpc4_2);

	tmp34 = _sadd2(tmpc3,tmpc4);
	tmp34 = _shr2(_sadd2(tmp34,0x00100010),5);

	/* Narrow the four 16-bit results to bytes and reorder them to match
	   the byte order of the loaded rows. */
	tmpend1 = _spacku4(tmp34,tmp12);
	tmpend1 = _swap4(tmpend1);

	/* Average the filtered row with the unfiltered row and store it. */
	_amem4(pImgPtr) = _avgu4(tmpend1,input1);
	pImgPtr += stride;

	/* ---- Output row 2: taps on rows 2..7 ------------------------------
	   0x0001FB14 weights rows 1..4 with 0,1,-5,20 and 0x14FB0100 weights
	   rows 5..8 with 20,-5,1,0. */
	tmpc1_1 = _dotpsu4(0x0001FB14,uiTmpLine1234_1);
	tmpc1_2 = _dotpsu4(0x14FB0100,uiTmpLine5678_1);

	tmpc2_1 = _dotpsu4(0x0001FB14,uiTmpLine1234_2);
	tmpc2_2 = _dotpsu4(0x14FB0100,uiTmpLine5678_2);

	tmpc1 = _spack2(tmpc1_1,tmpc2_1);
	tmpc2 = _spack2(tmpc1_2,tmpc2_2);

	tmp12 = _sadd2(tmpc1,tmpc2);
	tmp12 = _shr2(_sadd2(tmp12,0x00100010),5);

	tmpc3_1 = _dotpsu4(0x0001FB14,uiTmpLine1234_3);
	tmpc3_2 = _dotpsu4(0x14FB0100,uiTmpLine5678_3);

	tmpc4_1 = _dotpsu4(0x0001FB14,uiTmpLine1234_4);
	tmpc4_2 = _dotpsu4(0x14FB0100,uiTmpLine5678_4);

	tmpc3 = _spack2(tmpc3_1,tmpc4_1);
	tmpc4 = _spack2(tmpc3_2,tmpc4_2);

	tmp34 = _sadd2(tmpc3,tmpc4);
	tmp34 = _shr2(_sadd2(tmp34,0x00100010),5);

	tmpend2 = _spacku4(tmp34,tmp12);
	tmpend2 = _swap4(tmpend2);

	_amem4(pImgPtr) = _avgu4(tmpend2,input2);
	pImgPtr += stride;

	/* ---- Output row 3: taps on rows 3..8 ------------------------------
	   0x000001FB weights rows 1..4 with 0,0,1,-5 and 0x1414FB01 weights
	   rows 5..8 with 20,20,-5,1. */
	tmpc1_1 = _dotpsu4(0x000001FB,uiTmpLine1234_1);
	tmpc1_2 = _dotpsu4(0x1414FB01,uiTmpLine5678_1);

	tmpc2_1 = _dotpsu4(0x000001FB,uiTmpLine1234_2);
	tmpc2_2 = _dotpsu4(0x1414FB01,uiTmpLine5678_2);

	tmpc1 = _spack2(tmpc1_1,tmpc2_1);
	tmpc2 = _spack2(tmpc1_2,tmpc2_2);

	tmp12 = _sadd2(tmpc1,tmpc2);
	tmp12 = _shr2(_sadd2(tmp12,0x00100010),5);

	tmpc3_1 = _dotpsu4(0x000001FB,uiTmpLine1234_3);
	tmpc3_2 = _dotpsu4(0x1414FB01,uiTmpLine5678_3);

	tmpc4_1 = _dotpsu4(0x000001FB,uiTmpLine1234_4);
	tmpc4_2 = _dotpsu4(0x1414FB01,uiTmpLine5678_4); 

	tmpc3 = _spack2(tmpc3_1,tmpc4_1);
	tmpc4 = _spack2(tmpc3_2,tmpc4_2);

	tmp34 = _sadd2(tmpc3,tmpc4);
	tmp34 = _shr2(_sadd2(tmp34,0x00100010),5);

	tmpend3 = _spacku4(tmp34,tmp12);
	tmpend3 = _swap4(tmpend3);

	_amem4(pImgPtr) = _avgu4(tmpend3,input3);
	pImgPtr += stride;

	/* ---- Output row 4: taps on rows 4..9 ------------------------------
	   Row 9 does not fit in the two packed groups, so it is widened to
	   16-bit lanes here and added separately with weight 1. */
	uiLine9 = _swap4(uiLine9);
	tmpl9h = _unpkhu4 (uiLine9);
	tmpl9l = _unpklu4 (uiLine9);

	/* Row 4 has weight 1: extracting the low byte replaces a dot product
	   with the coefficient word 0x00000001. 0xFB1414FB weights rows 5..8
	   with -5,20,20,-5. */
	tmpc1_1 = _extu(uiTmpLine1234_1,24,24);// same result as _dotpsu4(0x00000001,uiTmpLine1234_1)
	tmpc1_2 = _dotpsu4(0xFB1414FB,uiTmpLine5678_1);

	tmpc2_1 = _extu(uiTmpLine1234_2,24,24);// same result as _dotpsu4(0x00000001,uiTmpLine1234_2)
	tmpc2_2 = _dotpsu4(0xFB1414FB,uiTmpLine5678_2);

	tmpc1 = _spack2(tmpc1_1,tmpc2_1);
	tmpc2 = _spack2(tmpc1_2,tmpc2_2);

	tmp12 = _sadd2(tmpc1,tmpc2);
	tmp12 = _sadd2(tmp12,tmpl9l);
	tmp12 = _shr2(_sadd2(tmp12,0x00100010),5);

	tmpc3_1 = _extu(uiTmpLine1234_3,24,24);// same result as _dotpsu4(0x00000001,uiTmpLine1234_3)
	tmpc3_2 = _dotpsu4(0xFB1414FB,uiTmpLine5678_3);

	tmpc4_1 = _extu(uiTmpLine1234_4,24,24);// same result as _dotpsu4(0x00000001,uiTmpLine1234_4)
	tmpc4_2 = _dotpsu4(0xFB1414FB,uiTmpLine5678_4);

	tmpc3 = _spack2(tmpc3_1,tmpc4_1);
	tmpc4 = _spack2(tmpc3_2,tmpc4_2);

	tmp34 = _sadd2(tmpc3,tmpc4);
	tmp34 = _sadd2(tmp34,tmpl9h);
	tmp34 = _shr2(_sadd2(tmp34,0x00100010),5);

	tmpend4 = _spacku4(tmp34,tmp12);
	tmpend4 = _swap4(tmpend4);

	/* Last row: no pointer advance is needed afterwards. */
	_amem4(pImgPtr) = _avgu4(tmpend4,input4);

}

Esempio n. 5

0

Mostra file

File: ORILIB_BufferAlign_inner.c Progetto: manub686/orilib

	Cplx16 const * const restrict unaligned_raw_samples, 
	ORILIB_t_AlignState * alignStateInpOut,
	Cplx16 * const restrict aligned_raw_samples
	){

#ifdef DEBUG_MODE
	assert(alignStateInpOut->nAlignedSamplesAlreadyFilled <= SYNC_BUFFER_SIZE_ENERGY * 2);
	//implementing the same scheme without this condition will take more thought, which i
	//didn't want to put in at date(then).

	assert(SYNC_ALIGNED_SAMPLE_BUF_LEN_ACTUAL >= 4 * SYNC_BUFFER_SIZE_ENERGY);
#endif


	//get indices
	Uint32 uaks1 = _amem4(&alignStateInpOut->uaks1);
	Uint32 uaks2 = _amem4(&alignStateInpOut->uaks2);
	Uint32 offset = _amem4(&alignStateInpOut->nAlignedSamplesAlreadyFilled);

	Uint32 uai1 = uaks1 + offset;
	//Uint32 uai2 = uaks2 + offset;
	//uar = uak1, so need need to create a separate variable

	Cplx16 *alignedSampleBuf;

	Uint32 i;

	Cplx16 * alignedSampleBufOut = alignStateInpOut->alignedSampleLookbackBuf;


	//careful with copying pointers -- honor the restrict flags and do not copy

Esempio n. 6

0

Mostra file

File: cxcmp.cpp Progetto: Jeaniowang/EasyMulticoreDSP

/*
 * cvAbsDiff: dst = |src1 - src2|, element by element.
 *
 * srcarr1, srcarr2 - input arrays (resolved to CvMat through cvGetMat)
 * dstarr           - output array
 *
 * The three arrays must have no COI set and must agree in size and type;
 * otherwise an error is raised through CV_ERROR / CV_ERROR_FROM_CODE.
 * Only CV_8U and CV_32S depths are handled; any other depth raises
 * CV_StsUnsupportedFormat.
 *
 * The data is processed as one flat run of src1->step * src1->height bytes.
 * NOTE(review): src2->step and dst->step are never compared with
 * src1->step, so the flat walk presumably relies on all three matrices
 * sharing the same row pitch -- confirm against the callers.
 *
 * On _TMS320C6X the 8-bit path handles four bytes per iteration with
 * _subabs4 and aligned 32-bit accesses. Any 1..3 bytes left over at the
 * end are handled one at a time, so the word accesses never run past the
 * flat byte count.
 * NOTE(review): the aligned accesses assume the three data pointers are
 * 4-byte aligned -- TODO confirm.
 */
CV_IMPL  void
cvAbsDiff( const void* srcarr1, const void* srcarr2, void* dstarr )
{

    CV_FUNCNAME( "cvAbsDiff" );

    __BEGIN__;

    int coi1 = 0, coi2 = 0, coi3 = 0;
    CvMat srcstub1, *src1 = (CvMat*)srcarr1;
    CvMat srcstub2, *src2 = (CvMat*)srcarr2;
    CvMat dststub,  *dst = (CvMat*)dstarr;
    CvSize size;
    int type, depth, pixel_size;

    CV_CALL( src1 = cvGetMat( src1, &srcstub1, &coi1 ));
    CV_CALL( src2 = cvGetMat( src2, &srcstub2, &coi2 ));
    CV_CALL( dst = cvGetMat( dst, &dststub, &coi3 ));

    if( coi1 != 0 || coi2 != 0 || coi3 != 0 )
        CV_ERROR( CV_BadCOI, "" );

    if( !CV_ARE_SIZES_EQ( src1, src2 ) )
        CV_ERROR_FROM_CODE( CV_StsUnmatchedSizes );

    type = CV_MAT_TYPE(src1->type);
    depth = CV_MAT_DEPTH(type);

    if( !CV_ARE_SIZES_EQ( src1, dst ))
        CV_ERROR_FROM_CODE( CV_StsUnmatchedSizes );

    if( !CV_ARE_TYPES_EQ( src1, src2 ))
        CV_ERROR_FROM_CODE( CV_StsUnmatchedFormats );

    if( !CV_ARE_TYPES_EQ( src1, dst ))
        CV_ERROR_FROM_CODE( CV_StsUnmatchedFormats );

    /* Flatten the matrix: total byte count including any row padding. */
    size.width = src1->step * src1->height;
    size.height = 1;
    pixel_size = CV_DEPTH_BYTES[depth];

    if(depth == CV_8U)
    {
        int idx;
        int count;
        unsigned char * p1;
        unsigned char * p2;
        unsigned char * pdst;
        p1 = src1->data.ptr ;
        p2 = src2->data.ptr;
        pdst = dst->data.ptr;

        /* Number of 8-bit elements to process; computed once. */
        count = size.width/pixel_size;

#ifdef _TMS320C6X
        /* Full groups of four bytes only: stop before a partial group so
           the 32-bit load/store never touches bytes beyond 'count'. */
        for (idx = 0; idx + 4 <= count; idx+=4)
        {
            _amem4(pdst) = _subabs4(_amem4_const(p1), _amem4_const(p2) );
            p1 += 4;
            p2 += 4;
            pdst += 4;
        }

        /* Remaining 0..3 bytes, one at a time. */
        for (; idx < count; idx++)
        {
            (*pdst) = (unsigned char)(((*p1) > (*p2)) ? ((*p1)-(*p2)) : ((*p2)-(*p1)));
            pdst++;
            p1++;
            p2++;
        }
#else
        for (idx = 0; idx < count; idx+=1)
        {
            (*pdst) = abs((*p1)-(*p2));
            pdst++;
            p1++;
            p2++;
        }
#endif
    }
    else if(depth == CV_32S)
    {
        int idx;
        int count;
        int * p1;
        int * p2;
        int * pdst;
        p1 = src1->data.i;
        p2 = src2->data.i;
        pdst = dst->data.i;

        /* Number of 32-bit elements to process; computed once. */
        count = size.width/pixel_size;

        for (idx = 0; idx < count; idx++)
        {
#ifdef _TMS320C6X
            /* DSP build: _ssub / _abs intrinsics instead of plain C. */
            *pdst = _abs(_ssub(*p1, *p2));
#else
            *pdst = abs((*p1)-(*p2));
#endif
            p1 += 1;
            p2 += 1;
            pdst += 1;
        }
    }
    else
    {
        CV_ERROR( CV_StsUnsupportedFormat, "unsupported matrix type." );
    }

    __END__;
}

Esempio n. 7

0

Mostra file

File: copy_zero_init.c Progetto: dannf/ti-cgt-c6x

/*
 * Fill n bytes starting at mem with the byte value ch, using the widest
 * stores the target supports.
 *
 * Two implementations are selected at compile time:
 *   - C6400 / C6740 / C6600 / Tesla: align to 8 bytes, then two 64-bit
 *     stores per iteration (16 bytes), then an 8/4/2/1-byte tail.
 *   - every other target: align to 4 bytes, then two 32-bit stores per
 *     iteration (8 bytes), then up to 7 single-byte stores.
 *
 * mem - start of the region to fill
 * ch  - fill value; only its low 8 bits are stored
 * n   - number of bytes to fill
 *
 * Returns the write cursor after the fill, i.e. the address one past the
 * last byte written (mem + n). Unlike the standard memset it does NOT
 * return mem.
 */
static __inline void *optimized_mem_set(void *mem, int ch, size_t n)
{
   char  * restrict dst1, * restrict dst2;
   int    pre_bytes, post_bytes, wfill, i;

   unsigned char *outbuf = mem;
   unsigned int count = n;

   dst1 = (char *)outbuf;

#if defined(_TMS320C6400) || defined(_TMS320C6740) || defined(_TMS320C6600) || \
    defined(_TI_C6X_TESLA)

   /*---------------------------------------------------------------------*/
   /* We do not use 'dwfill' on other variations of the C6x architecture, */
   /* so limit 'dwfill' references to the architectures that use it.      */
   /*---------------------------------------------------------------------*/
   {
      long long dwfill;

      /*------------------------------------------------------------------*/
      /* Set up 64-bit and 32-bit fill values.                            */
      /*------------------------------------------------------------------*/
      wfill  = _pack2 (ch, ch);
      wfill  = _packl4(wfill, wfill);
      dwfill = _itoll (wfill, wfill);

      /*------------------------------------------------------------------*/
      /* Calculate # of bytes to pre-copy to get to an alignment of 8     */
      /*------------------------------------------------------------------*/
      pre_bytes = (8 - (int) dst1) & 7;

      if (count > pre_bytes)
      {
         count -= pre_bytes;
         if (pre_bytes & 1) { *dst1        = ch;    dst1 += 1; }
         if (pre_bytes & 2) { _amem2(dst1) = wfill; dst1 += 2; }
         if (pre_bytes & 4) { _amem4(dst1) = wfill; dst1 += 4; }
      }

      /*------------------------------------------------------------------*/
      /* Double word fills: two 64-bit stores (16 bytes) per iteration    */
      /*------------------------------------------------------------------*/
      post_bytes = count > 0 ? count : 0;
      dst2       = dst1 + 8;

      if (count > 15)
         for (i = 0; i < count >> 4; i++)
         {
            _amem8(dst1) = dwfill; dst1 += 16;
            _amem8(dst2) = dwfill; dst2 += 16;
            post_bytes -= 16;
         }

      /*------------------------------------------------------------------*/
      /* Finish transfer with 8, 4, 2 and/or 1-byte writes                */
      /*------------------------------------------------------------------*/
      if (post_bytes & 8) { _mem8(dst1) = dwfill; dst1 += 8; }
      if (post_bytes & 4) { _mem4(dst1) = wfill;  dst1 += 4; }
      if (post_bytes & 2) {  dst1[0]    = ch;
                             dst1[1]    = ch;     dst1 += 2; }
      if (post_bytes & 1) { *dst1       = ch;     dst1 += 1; }
   }

#else

   /*--------------------------------------------------------------------*/
   /* Set up 32-bit fill value.                                          */
   /* Mask ch to 8 bits first: without the mask, a ch outside 0..255     */
   /* would make the word stores write a different byte pattern than the */
   /* single-byte stores below (which keep only the low 8 bits).         */
   /*--------------------------------------------------------------------*/
   wfill  = _mpy(0x101, ch & 0xff);
   wfill += (wfill << 16);

   /*--------------------------------------------------------------------*/
   /* Calculate number of bytes to pre-copy to get to an alignment of 4  */
   /*--------------------------------------------------------------------*/
   pre_bytes = (4 - (int) dst1) & 3;

   if (count > pre_bytes)
   {
      count -= pre_bytes;
      if (pre_bytes & 1) { *dst1        = ch;    dst1 += 1; }
      if (pre_bytes & 2) { _amem2(dst1) = wfill; dst1 += 2; }
   }

   /*--------------------------------------------------------------------*/
   /* Word fills: two 32-bit stores (8 bytes) per iteration              */
   /*--------------------------------------------------------------------*/
   post_bytes = count > 0 ? count : 0;
   dst2       = dst1 + 4;

   if (count > 7)
      for (i = 0; i < count >> 3; i++)
      {
         _amem4(dst1) = wfill; dst1 += 8;
         _amem4(dst2) = wfill; dst2 += 8;
         post_bytes -= 8;
      }

   /*--------------------------------------------------------------------*/
   /* Finish transfer with up to 7 single-byte writes.                   */
   /*--------------------------------------------------------------------*/
   if (post_bytes) { *dst1++ = ch; post_bytes--; }
   if (post_bytes) { *dst1++ = ch; post_bytes--; }
   if (post_bytes) { *dst1++ = ch; post_bytes--; }
   if (post_bytes) { *dst1++ = ch; post_bytes--; }
   if (post_bytes) { *dst1++ = ch; post_bytes--; }
   if (post_bytes) { *dst1++ = ch; post_bytes--; }
   if (post_bytes) { *dst1++ = ch; post_bytes--; }

#endif

   return dst1;
}