Example 1
/** VBX Motion Estimation.
 *  Similar to the scalar version, but scans vertically, which makes it easier to align vectors.
 *  vbw_mtx_motest_byte_setup must be run before calling this function.
 *
 *  @param[out] result  Accumulated absolute-difference (SAD) results, one per search offset.
 *  @param[in]  x       Reference block in host memory.
 *  @param[in]  y       Search image in host memory.
 *  @param[in]  m       Motion-estimation state initialized by vbw_mtx_motest_byte_setup.
 *  @returns Negative on error condition; see vbw_exit_codes.h.
 */
int vbw_mtx_motest_byte(output_type *result, input_type *x, input_type *y, vbw_motest_t *m)
{
	int  j;

	int sub_block_width      = m->block_width+m->search_width;

	for( j = 0; j < m->block_height; j++ ) {
		vbx_dma_to_vector( m->v_block+j*sub_block_width, x+j*m->image_width, sub_block_width*sizeof(input_type) );
	}

	for( j = 0; j < m->block_height+m->search_height; j++ ) {
		vbx_dma_to_vector( m->v_img  +j*sub_block_width, y+j*m->image_width, sub_block_width*sizeof(input_type) );
	}

	// column-ize the reference block
	vbx_set_vl( m->block_width );
	vbx_set_2D( m->block_height, m->block_width*sizeof(input_type), sub_block_width*sizeof(input_type), 0 );
	vbx_2D( VVB, VMOV, (vbx_byte_t*)m->v_block, (vbx_byte_t*)m->v_block, 0 );

	//Do column by column

	for( j=0; j < m->search_width; j++ )
	{
		// column-ize the search image
		vbx_set_vl( m->block_width );
		vbx_set_2D( m->block_height+m->search_height,  m->block_width*sizeof(input_type), sub_block_width*sizeof(input_type), 0 );
		vbx_2D( VVBU, VMOV, m->v_img_sub, m->v_img+j, 0 );

		// search the image columnwise
		vbx_set_vl( m->block_width*m->block_height );
		vbx_set_2D( m->search_height, m->search_width*sizeof(output_type), 0,  m->block_width*sizeof(input_type) );
		vbx_acc_2D( VVBWU, VABSDIFF, (vbx_uword_t*)m->v_result+j, m->v_block, m->v_img_sub );
	}

	// Write back result
	vbx_dma_to_host( result, m->v_result, m->result_size );

	return VBW_SUCCESS;
}
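Usage note: a minimal call-site sketch, assuming 16x16 blocks in a 640-pixel-wide image. The vbw_motest_t field names follow the function body above, but the exact signature and behaviour of vbw_mtx_motest_byte_setup, and the host buffers ref_block/search_img, are assumptions for illustration.

/* Hedged usage sketch -- field values and the setup call are assumptions. */
extern input_type ref_block[];   /* hypothetical host-side reference block */
extern input_type search_img[];  /* hypothetical host-side search image    */

vbw_motest_t m;
m.image_width   = 640;           /* stride of x and y in pixels */
m.block_width   = 16;            /* reference block dimensions  */
m.block_height  = 16;
m.search_width  = 16;            /* search window extent        */
m.search_height = 16;

if( vbw_mtx_motest_byte_setup( &m ) < 0 ) {  /* assumed to allocate v_block, v_img, v_result */
	VBX_EXIT(-1);
}

output_type result[16*16];       /* one SAD per search offset */
int rc = vbw_mtx_motest_byte( result, ref_block, search_img, &m );
vbx_sync();                      /* wait for the DMA write-back to complete */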
Example 2
/** Internal helper function to reverse and optionally rotate a vector of words *in the scratchpad*.
 *  This function uses a merge reverse algorithm that is faster on large vectors.
 *  @pre v_src contains the elements to reverse.
 *  @pre v_src, v_scratch0, and v_scratch1 must all be the same length.
 *  @pre v_scratch1 and v_src must not overlap.
 *  @pre v_src *may* overlap v_scratch0 (will clobber v_src).
 *  @pre MXP must be 2 lanes or more.
 *  @pre N is a multiple of SP_WIDTH_B.
 *  @pre NUM_ROWS == N*4 / SP_WIDTH_B.
 *  @pre v_mask must be SP_WIDTH_B bytes long.
 *  @post v_scratch0 and v_scratch1 contents are modified, with one containing the result.
 *  @post v_src clobbered only if v_src overlaps v_scratch0.
 *
 *  @param[in]  v_scratch1 *in scratch*.
 *  @param[in]  v_src *in scratch*.
 *  @param[in]  N is the number of words to reverse.
 *  @param[in]  v_scratch0 *in scratch*.
 *  @param[in]  v_mask *in scratch*.
 *  @param[in]  SP_WIDTH_B is typically the scratchpad width in bytes; it is the length of the data worked on at a time.
 *  @param[in]  NUM_ROWS is the number of rows of length SP_WIDTH_B bytes.
 *  @param[in]  rot16 TRUE to swap upper and lower half-words of each word in result.
 *  @returns    the scratchpad address where the result resides. This will be equal to either v_scratch0 or v_scratch1,
 *              and will depend on log2(MXP vector lanes).
 */
static vbx_word_t *vec_rev_merge_w( vbx_word_t *v_scratch1, vbx_word_t *v_src, const unsigned int N, vbx_word_t *v_scratch0,
                                    vbx_word_t *v_mask, const unsigned int SP_WIDTH_B, const unsigned int NUM_ROWS, const unsigned int rot16 )
{
#if !VBX_SKIP_ALL_CHECKS
	if( !N || !v_scratch0 || !v_src || !v_scratch1 || !v_mask || SP_WIDTH_B < 8 ) {
		VBX_PRINTF("Helper function vec_rev_merge_w: zero length, null pointer, or row length (vector lanes) too short.");
		VBX_EXIT(-1);
	}
#endif

	vbx_word_t *v_scratch[2] = { v_scratch0, v_scratch1 };
	unsigned int W = SP_WIDTH_B/4/2;                                         // half the number of words in a row
	unsigned int sel = 1;

	if( rot16 ) {
		vbx_set_vl( W );
		vbx_set_2D( NUM_ROWS, -SP_WIDTH_B, 0, SP_WIDTH_B );
		vbx_2D( SVWU, VROTL, (vbx_uword_t *)(v_scratch[sel]+N-W), 16, (vbx_uword_t *)v_src );
		vbx_2D( SVWU, VROTL, (vbx_uword_t *)(v_scratch[sel]+N-(W*2)), 16, (vbx_uword_t *)(v_src+W) );
	} else {
		vbx_set_vl( W );
		vbx_set_2D( NUM_ROWS, -SP_WIDTH_B, SP_WIDTH_B, 0 );
		vbx_2D( VVW, VMOV, v_scratch[sel]+N-W, v_src, 0 );
		vbx_2D( VVW, VMOV, v_scratch[sel]+N-(W*2), v_src+W, 0 );
	}

	vbx_set_vl( SP_WIDTH_B/4 );
	vbx_set_2D( NUM_ROWS, SP_WIDTH_B, SP_WIDTH_B, 0 );

	while( W > 1 ) {
		// set up odd/even mask register
		W /= 2;
		vbx( SEW, VAND, v_mask, W, 0 );
		vbx_2D( VVW, VCMV_NZ, v_scratch[!sel], v_scratch[sel]-W, v_mask );
		vbx_2D( VVW, VCMV_Z , v_scratch[!sel], v_scratch[sel]+W, v_mask );
		sel = !sel;
	}

	return v_scratch[sel];
}
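A sketch of how a caller might satisfy the preconditions above. The vector_lanes field is assumed to give the MXP lane count (so scratchpad width is lanes * 4 bytes), and filling v_src is elided.

/* Hedged setup sketch -- derives SP_WIDTH_B and NUM_ROWS per the @pre list. */
vbx_mxp_t *mxp = VBX_GET_THIS_MXP();
const unsigned int SP_WIDTH_B = mxp->vector_lanes * sizeof(vbx_word_t); /* assumed field */
const unsigned int N          = 4 * SP_WIDTH_B;          /* multiple of SP_WIDTH_B   */
const unsigned int NUM_ROWS   = N*4 / SP_WIDTH_B;        /* rows of SP_WIDTH_B bytes */

vbx_word_t *v_src      = vbx_sp_malloc( N * sizeof(vbx_word_t) );
vbx_word_t *v_scratch0 = v_src;          /* overlap permitted; clobbers v_src */
vbx_word_t *v_scratch1 = vbx_sp_malloc( N * sizeof(vbx_word_t) );
vbx_word_t *v_mask     = vbx_sp_malloc( SP_WIDTH_B );

/* ... fill v_src ... */
vbx_word_t *v_rev = vec_rev_merge_w( v_scratch1, v_src, N, v_scratch0,
                                     v_mask, SP_WIDTH_B, NUM_ROWS, 0 /* no rot16 */ );
/* reversed data is at *v_rev, which equals either v_scratch0 or v_scratch1 */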
Example 3
File: test.c Project: 8l/mxp
int VBX_T(vbw_vec_reverse_test)()
{
	unsigned int aN[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 20, 25, 31, 32, 33, 35, 40, 48, 60, 61, 62, 63, 64, 64, 65,
	                      66, 67, 68, 70, 80, 90, 99, 100, 101, 110, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 224, 224,
	                      256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 400, 450, 512, 550, 600, 650, 700, 768, 768, 900,
	                      900, 1023, 1024, 1200, 1400, 1600, 1800, 2048, 2048, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800,
	                      2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4096, 4096, 4100, 4200, 4300,
	                      4400, 4500, 4600, 4700, 4800, 4900, 5000, 6000, 7000, 8000, 8192, 8192, 9000, 10000, 11000, 12000,
	                      13000, 14000, 15000, 16000, 16384, 16384, 20000, 25000, 30000, 32767, 32768, 32768, 35000, 40000,
	                      45000, 50000, 55000, 60000, 65000, 65535, 65536, 65536 };

	int retval;
	unsigned int N;
	unsigned int NBYTES;
	unsigned int NREPS = 100;
	unsigned int i,k;

	vbx_timestamp_t start=0,finish=0;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const unsigned int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;

	for( i=0; i<sizeof(aN)/sizeof(aN[0]); i++ ) {
		N = aN[i];
		//printf( "testing with vector size %d\n", N );

		NBYTES = sizeof(vbx_sp_t)*N;
		if( 2*NBYTES > VBX_SCRATCHPAD_SIZE ) continue;

		vbx_sp_t *vsrc = vbx_sp_malloc( NBYTES );
		vbx_sp_t *vdst = vbx_sp_malloc( NBYTES );
		//printf("bytes alloc: %d\n", NBYTES );

		if( !vsrc ) VBX_EXIT(-1);
		if( !vdst ) VBX_EXIT(-1);

		#if   ( VBX_TEMPLATE_T == BYTESIZE_DEF || VBX_TEMPLATE_T == UBYTESIZE_DEF )
			unsigned int mask = 0x007F;
		#elif ( VBX_TEMPLATE_T == HALFSIZE_DEF || VBX_TEMPLATE_T == UHALFSIZE_DEF )
			unsigned int mask = 0x7FFF;
		#else
			unsigned int mask = 0xFFFF;
		#endif

		vbx_set_vl( N );
		vbx( SV(T), VMOV, vdst,   -1, 0 );       // Fill the destination vector with -1
		vbx( SE(T), VAND, vsrc, mask, 0 );       // Fill the source vector with enumerated values
		//VBX_T(print_vector)( "vsrcInit", vsrc, N );
		//VBX_T(print_vector)( "vdstInit", vdst, N );

		/** measure performance of function call **/
		vbx_sync();
		start = vbx_timestamp();
		for(k=0; k<NREPS; k++ ) {
			retval = VBX_T(vbw_vec_reverse)( vdst, vsrc, N );
			vbx_sync();
		}
		finish = vbx_timestamp();
		printf( "length %d (%s):\tvbware sp f():\t%llu", N, VBX_EXPAND_AND_QUOTE(BYTEHALFWORD), (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) );
		//VBX_T(print_vector)( "vsrcPost", vsrc, N );
		//VBX_T(print_vector)( "vdstPost", vdst, N );

		#if VERIFY_VBWARE_ALGORITHM
			VBX_T(verify_vector)( vsrc, vdst, N );
		#else
			printf(" [VERIFY OFF]");
		#endif

		printf("\treturn value: %X", retval);

		vbx_set_vl( N );
		vbx( SE(T), VAND, vsrc, mask, 0 );       // Reset the source vector

		/** measure performance of simple algorithm **/
		vbx_sync();
		vbx_set_vl( 1 );
		vbx_set_2D( N, -sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );

		start = vbx_timestamp();
		for(k=0; k<NREPS; k++ ) {
			vbx_2D( VV(T), VMOV, vdst+N-1, vsrc, 0 );
			vbx_sync();
		}
		finish = vbx_timestamp();

		printf( "\tsimple (vl=1):\t%llu", (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) );

		#if VERIFY_SIMPLE_ALGORITHM
			VBX_T(verify_vector)( vsrc, vdst, N );
		#else
			printf(" [VERIFY OFF]");
		#endif
		printf("\tcycles\n");

		vbx_sp_free();
	}

	vbx_sp_free();
	printf("All tests passed successfully.\n");

	return 0;
}
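For reference, the "simple (vl=1)" baseline timed above reduces to a single 2D move with vector length 1 and a negative destination stride; isolated, it reads:

/* Reverse N elements: N rows of length 1, with the destination pointer
 * stepping backwards one element per row (the baseline timed above). */
vbx_set_vl( 1 );
vbx_set_2D( N, -sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
vbx_2D( VV(T), VMOV, vdst+N-1, vsrc, 0 );
vbx_sync();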
Example 4
/* takes in precomputed butterfly twiddle factors (tw_r, tw_i) */
static int vector_fix_fft_dif_long_fly(short fr[], short fi[], short fr2[], short fi2[], short tw_r[], short tw_i[], short m, short inverse, short real)
{
	int i, j, l, k, scale, shift, a1,a2,bfly,mul,flight,swap,row_num;
	short  wr, wi;

	vptr_half v_fr, v_fi, v_fr2, v_fi2, v_tmp;
	vptr_half v_twr, v_twi;
	vptr_half v_arp, v_aip, v_brp, v_bip, v_crp, v_cip;
	vptr_half v_temp;
	vptr_half v_twr2, v_twi2;
	const int n = 1 << m;
	const int half = n >> 1;

	scale = 0;
	mul = 0;
	swap = m >> 1;

	l = m-1;
	flight = 1;
	bfly = half;

	const int INROWS = 1<<swap;
	const int INCOLS = 1<<(m-swap);

	if ( !(m%2) ){
		swap--;
	}

	// allocate space in vector memory for vectors
	v_fr  = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
	v_fi  = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
	v_fr2 = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
	v_fi2 = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );

	v_twr   = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	v_twi   = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	v_temp  = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );

	if( v_fr  == NULL || v_fi == NULL  || v_fr2 == NULL || v_fi2== NULL  || \
	    v_twr == NULL || v_twi == NULL || v_temp == NULL) {
	 	VBX_EXIT(-1);
	}

	v_twr2  = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	v_twi2  = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	if( v_twr2 == NULL || v_twi2 == NULL) {
	 	VBX_EXIT(-1);
	}
	vbx_dma_to_vector( v_fr, fr, n*sizeof(vbx_half_t) );
	vbx_dma_to_vector( v_fi, fi, n*sizeof(vbx_half_t) );
	vbx_dma_to_vector( v_twr, tw_r, half*sizeof(vbx_half_t) );
	vbx_dma_to_vector( v_twi, tw_i, half*sizeof(vbx_half_t) );

#if 1
	if( real ) {
		vector_fix_fft_untangle_real_scratch( v_fr, v_fi, v_fr2, v_fi2, v_twr, v_twi, m, inverse );
	}
#endif

	while (l > swap) {
		if (inverse) {
			// variable scaling, depending upon data
			shift = 0;
			if( isAbsOutOfRangeV(v_fr,v_fi,v_temp,n) ) {
				shift = 1;
				scale++;
			}
		} else {
			// fixed scaling, for proper normalization
			// -- overall factor of 1/n, distributed to maximize arithmetic accuracy
			shift = 1;
		}
		// shift will be performed on each data point exactly once during pass

		SWAP( v_fr, v_fr2, v_tmp );
		SWAP( v_fi, v_fi2, v_tmp );

		if (shift){
			vbx_set_vl( n );
			vbx(SVH,VSHR,  v_fr2, 1,  v_fr2 );
			vbx(SVH,VSHR,  v_fi2, 1,  v_fi2 );
		}

		vbx_set_vl( 1<<l );
		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<(l+1)), sizeof(vbx_half_t)*(1<<(l+1)) );
		vbx_2D( VVH, VADD, v_fr, v_fr2, v_fr2 + (1<<l) );
		vbx_2D( VVH, VADD, v_fi, v_fi2, v_fi2 + (1<<l) );

		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<(l+1)), sizeof(vbx_half_t)*(1<<(l+1)), sizeof(vbx_half_t)*(1<<(l+1)) );
		vbx_2D( VVH, VSUB, v_fr2, v_fr2, v_fr2 + (1<<l) );
		vbx_2D( VVH, VSUB, v_fi2, v_fi2, v_fi2 + (1<<l) );

		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l+1), 0 );
		vbx_2D( VVH, VMULFXP, &v_fr[n>>1],  v_fr2,      v_twr );
		vbx_2D( VVH, VMULFXP,  v_temp,      v_fi2,      v_twi );

		vbx_set_vl( n>>1 ); // vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l) );
		vbx( VVH, VSUB, &v_fr[n>>1], &v_fr[n>>1], v_temp );

		vbx_set_vl( 1<<l );
		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<(l+1)), 0 );
		vbx_2D( VVH, VMULFXP, &v_fi[n>>1],  v_fi2,      v_twr );
		vbx_2D( VVH, VMULFXP,  v_temp,      v_fr2,      v_twi );

		vbx_set_vl( n>>1 ); //vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l) );
		vbx( VVH, VADD,    &v_fi[n>>1], &v_fi[n>>1], v_temp );

		l--;
		mul++;
		flight <<= 1;

		if( l > swap ) {
			vbx_set_vl( 1<<l );
			vbx( VVWH, VMOV, v_twr, v_twr, 0 );
			vbx( VVWH, VMOV, v_twi, v_twi, 0 );
		}
	}

	if ( !(m%2) ) {
		l++;
		flight >>=1;
	}
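The listing for this example is truncated above. For orientation, here is a scalar reference sketch (an assumption-based illustration, not code from the project) of the radix-2 decimation-in-frequency butterfly that each pass of the while loop vectorizes: the VADD/VSUB pairs form the sums and differences across all 'flight' groups at once, and VMULFXP applies the fixed-point twiddle multiply. The Q15 >>15 scaling and the helper itself are assumptions.

/* Hypothetical scalar DIF butterfly: one pass over 'flight' groups, each of
 * size 2*half_len (half_len == 1<<l in the vector code above). */
static void dif_pass_ref( short fr[], short fi[], const short wr[], const short wi[],
                          int flight, int half_len )
{
	int g, i;
	for( g = 0; g < flight; g++ ) {
		int base = g * 2 * half_len;
		for( i = 0; i < half_len; i++ ) {
			short ar = fr[base+i],          ai = fi[base+i];
			short br = fr[base+i+half_len], bi = fi[base+i+half_len];
			fr[base+i] = ar + br;                       /* the VADD step */
			fi[base+i] = ai + bi;
			/* (a - b) * twiddle: the VSUB then VMULFXP/VSUB/VADD steps,
			 * assuming Q15 fixed-point twiddles */
			fr[base+i+half_len] = (short)(((ar-br)*wr[i] - (ai-bi)*wi[i]) >> 15);
			fi[base+i+half_len] = (short)(((ai-bi)*wr[i] + (ar-br)*wi[i]) >> 15);
		}
	}
}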
Example 5
void vbx_mtx_fdct( vbx_mtx_fdct_t *v, dt *block_v, dt *image,
	int start_x, int start_y, int end_x, int end_y,int num_tile_x, int num_tile_y )
{
//	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
//	const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
	const int BIG_TILE_SIZE = num_tile_x * num_tile_y * DCT_SIZE;

	int next_x=start_x+1;
	int next_y=start_y;
	int get_next=1;
	if( start_x == end_x   &&   start_y == end_y ) {
		get_next=0;
	}
	if( start_x == end_x ) {
		next_x = 0;
		next_y++;
	} 

	const vbx_half_t *vimageDMA = v->vimage[!v->db]; // dma
//	const vbx_half_t *vblockDMA = v->vblock[!v->db]; // dma // never used directly 

	const vbx_half_t *vimageVPU = v->vimage[ v->db]; // active
	const vbx_half_t *vblockVPU = v->vblock[ v->db]; // active

	const vbx_half_t *vblockTMP = v->vblock[ 2    ]; // temp

	const vbx_half_t *vcoeff    = v->vcoeff;
	const vbx_half_t *vprods    = v->vprods;
	const vbx_half_t *vaccum    = v->vaccum;
	const vbx_half_t *vflags    = v->vflags;

#if DMA
	// First, prefetch the next chunk of the next image for a future call to fdct_tile()
#if NUM_TILE_Y > 1
	if( get_next ) // get row 0
		getBigTileImageY( vimageDMA,
		        image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, 0 );
#else
	if( get_next ) // get row 0
		getBigTileImage( vimageDMA,
		        image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, 0 );
#endif
#endif

	int r;
	for( r=0; r < BLOCK_SIZE; r++ ) {
		// perform multiply of the whole BIG_TILE with row 'r' of the image matrix -- before had dct matrix switching
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE );                                                                                              // for the length of tiled rows
		vbx_set_2D( BLOCK_SIZE, NUM_TILE_X*BLOCK_SIZE*sizeof(dt),                                    0, NUM_TILE_X*BLOCK_SIZE*sizeof(dt) ); // for all rows of tiled coefficients
		vbx_set_3D( NUM_TILE_Y, NUM_TILE_X * DCT_SIZE*sizeof(dt),     NUM_TILE_X * DCT_SIZE*sizeof(dt),                               0  ); // for all groups Y
		vbx_3D( VVH, VMUL,                                vprods, vimageVPU + r*NUM_TILE_X*BLOCK_SIZE,                            vcoeff); // for all 'columns' of tiled data

#if ACCUMULATE
		// accumulate the multiply operations
#if 0 & USE_ACCUM_FLAGS 
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
		vbx( VVH, VADD, vaccum, vprods+0, vprods+1 );
		vbx_set_2D( BLOCK_SIZE-2, 0, 0, sizeof(dt) );
		vbx_2D( VVH, VADD, vaccum, vaccum, vprods+2 );
		vbx( VVH, VCMV_Z, vblockTMP+r, vaccum, vflags );
#elif BLOCK4
                //case DCT 4
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
		vbx( VVH, VADD, vaccum, vprods, vprods+1 );
		vbx( VVH, VADD, vaccum, vaccum, vprods+2 );
		vbx( VVH, VADD, vaccum, vaccum, vprods+3 );
		vbx( VVH, VCMV_Z, vblockTMP+r, vaccum, vflags );
#else
                //correct?
		vbx_set_vl( BLOCK_SIZE );
		vbx_set_2D( BLOCK_SIZE,   NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt) );
		vbx_set_3D( NUM_TILE_X,   BLOCK_SIZE*sizeof(dt),            BLOCK_SIZE*sizeof(dt),            BLOCK_SIZE*sizeof(dt) );
#if NUM_TILE_Y == 1
		vbx_acc_3D( VVH, VOR,   vblockTMP + r,      vprods ,  vprods );
#else
		int y; 
		for (y=0; y< NUM_TILE_Y; y++){
			vbx_acc_3D( VVH, VOR,   vblockTMP + r + y*NUM_TILE_X*DCT_SIZE,      vprods+ y*NUM_TILE_X*DCT_SIZE,  vprods+ y*NUM_TILE_X*DCT_SIZE );
		}
#endif
#endif
#endif

#if 0
// Don't do DMA reads here yet: a DMA write may still be in progress; give it a chance to finish.
#if DMA
		// every other iteration, prefetch the next row of the next image
		// NB: with 2D DMA, we could issue this as a single DMA request at the top of the function
		// instead, we must intersperse these 1D DMA strips to ensure they don't block the instruction queue
#if NUM_TILE_Y > 1
		if( !(r&1) && get_next )
			getBigTileImageY( vimageDMA,
			                  image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE,
			                  (1+((r-1)>>1)) ); //BLOCK_SIZE/2 rows added
#else
		if( !(r&1) && get_next )
			getBigTileImage( vimageDMA,
			                 image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE,
			                 (1+((r-1)>>1)) ); //BLOCK_SIZE/2 rows added
#endif
#endif
#endif
	}

	vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE );
	vbx( SVH, VSHR, vblockTMP, SHIFT_AMOUNT, vblockTMP );

	// now do the transposed version

	for( r=0; r < BLOCK_SIZE; r++ ) {
		// perform multiply of the whole BIG_TILE with row 'r' of the image matrix -- before had dct matrix switching
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE );                                                                                              // for the length of tiled rows
		vbx_set_2D( BLOCK_SIZE, NUM_TILE_X * BLOCK_SIZE*sizeof(dt),     NUM_TILE_X * BLOCK_SIZE*sizeof(dt),                            0 ); // for all 'columns' of tiled data 
		vbx_set_3D( NUM_TILE_Y, NUM_TILE_X * DCT_SIZE*sizeof(dt),       NUM_TILE_X * DCT_SIZE*sizeof(dt),                              0 ); // for all groups Y
		vbx_3D( VVH, VMUL,                             vprods,                        vblockTMP,  vcoeff + r*NUM_TILE_X*BLOCK_SIZE); // for all rows of tiled coefficients

#if ACCUMULATE
		// accumulate the multiply operations
#if 0 & USE_ACCUM_FLAGS
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
		vbx( VVH, VADD, vaccum, vprods+0, vprods+1 );
		vbx_set_2D( BLOCK_SIZE-2, 0, 0, sizeof(dt) );
		vbx_2D( VVH, VADD, vaccum, vaccum, vprods+2 );
		vbx( VVH, VCMV_Z, vblockVPU+r, vaccum,   vflags );

#elif BLOCK4
		//case DCT 4
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
		vbx( VVH, VADD, vaccum, vprods, vprods+1 );
		vbx( VVH, VADD, vaccum, vaccum, vprods+2 );
		vbx( VVH, VADD, vaccum, vaccum, vprods+3 );
		//vbx( VVH, VCMV_Z, vblockVPU+r, vaccum, vflags );
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE - (BLOCK_SIZE-1) );                    // for the length of a tiled row
		vbx_set_2D( BLOCK_SIZE, 1*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), 0);// for all tiled rows 
#if NUM_TILE_Y == 1
		vbx_2D(VVH, VCMV_Z, vblockVPU+r*NUM_TILE_X*BLOCK_SIZE, vaccum, vflags  );  // 
#else
		int y;
		for (y=0; y< NUM_TILE_Y; y++){
			vbx_2D(VVH, VCMV_Z, vblockVPU+r*NUM_TILE_X*BLOCK_SIZE + y*NUM_TILE_X*DCT_SIZE , vaccum+y*NUM_TILE_X*DCT_SIZE, vflags  );  // 
		}
#endif
#else
		//correct?
		vbx_set_vl( BLOCK_SIZE );                                                                                              // for the length of a row
		vbx_set_2D( BLOCK_SIZE,   sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt) );            // for all rows in that block
		vbx_set_3D( NUM_TILE_X,   BLOCK_SIZE*sizeof(dt),            BLOCK_SIZE*sizeof(dt),            BLOCK_SIZE*sizeof(dt) ); // for all tiled blocks horizontally(x)
#if NUM_TILE_Y == 1
		vbx_acc_3D( VVH, VOR,   vblockVPU + r*NUM_TILE_X*BLOCK_SIZE ,    vprods ,  vprods );
#else
		int y;
		for (y=0; y< NUM_TILE_Y; y++){ 
			vbx_acc_3D( VVH, VOR,   vblockVPU + r*NUM_TILE_X*BLOCK_SIZE + y*NUM_TILE_X*DCT_SIZE,      vprods+ y*NUM_TILE_X*DCT_SIZE,  vprods+ y*NUM_TILE_X*DCT_SIZE );
		}
#endif
#endif
#endif

#if DMA
		// every other iteration, prefetch the next row of the next image
		// NB: with 2D DMA, we could issue this as a single DMA request at the top of the function
		// instead, we must intersperse these 1D DMA strips to ensure they don't block the instruction queue
#if NUM_TILE_Y > 1
		//if( !(r&1) && r<(BLOCK_SIZE-1)  && get_next )
		if( get_next )
			getBigTileImageY( 
			                  vimageDMA,
			                  image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE,
			                  r );
			                  //(BLOCK_SIZE/2 +1+((r-1)>>1)) ); // BLOCK/2 -1 rows
#else
		//if( !(r&1) && r<(BLOCK_SIZE-1)  && get_next )
		if( get_next )
			getBigTileImage( vimageDMA,
			                 image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE,
			                 r );
			                 //(BLOCK_SIZE/2 +1+((r-1)>>1)) ); // BLOCK/2 -1 rows
#endif
#endif
	}

	vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE );
	vbx( SVH, VSHR, vblockVPU, SHIFT_AMOUNT, vblockVPU );
#if DMA2
	// Write result back to memory as one big block
	vbx_dma_to_host( block_v, vblockVPU, BIG_TILE_SIZE*sizeof(dt) );
#endif 

	v->db = !v->db;
#ifdef DEBUG 
	{
		vbx_sync();
		int i,j;
		printf("%d\n", !db);
		for(i=0;i<BLOCK_SIZE*NUM_TILE_Y;i++){
			for(j=0;j<BLOCK_SIZE*NUM_TILE_X;j++){
				printf(" %4d", block_v[i*BLOCK_SIZE*NUM_TILE_X+j]);
			}
			printf("\n");
		}
	}
#endif
}
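A hedged sketch of the double-buffered driver loop this function appears to expect: each call processes the big tile prefetched during the previous call, prefetches the tile at (start_x+1, start_y) (wrapping to the next row at end_x), and flips v->db. The raster iteration order matches the next_x/next_y logic at the top of the function; the block_out layout and the vbx_mtx_fdct_t setup are assumptions.

/* Hypothetical driver loop -- output layout per tile is an assumption. */
int tx, ty;
const int tile_elems = NUM_TILE_X * NUM_TILE_Y * DCT_SIZE;  /* == BIG_TILE_SIZE */
for( ty = 0; ty <= end_y; ty++ ) {
	for( tx = 0; tx <= end_x; tx++ ) {
		vbx_mtx_fdct( v, block_out + (ty*(end_x+1) + tx)*tile_elems, image,
		              tx, ty, end_x, end_y, NUM_TILE_X, NUM_TILE_Y );
	}
}
vbx_sync();   /* drain the final DMA write-back */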