/** VBX motion estimation on byte data.
 *
 * Similar to the scalar version, but the search proceeds one column offset at a
 * time (scanning vertically) because that makes it easier to align vectors.
 * vbw_mtx_motest_byte_setup should be run prior to running this function.
 *
 * @param[out] result receives m->result_size bytes copied from m->v_result.
 * @param[in]  x first row of the reference data; rows are m->image_width elements apart.
 * @param[in]  y first row of the search data; rows are m->image_width elements apart.
 * @param[in]  m motion-estimation state (block/search dimensions and working buffers).
 * @returns negative on error condition. See vbw_exit_codes.h
 */
int vbw_mtx_motest_byte(output_type *result, input_type *x, input_type *y, vbw_motest_t *m)
{
    /* Every staged row holds the block width plus the horizontal search range. */
    int staged_width = m->block_width+m->search_width;
    int row;
    int col;

    /* Bring the reference rows into m->v_block. */
    for( row = 0; row < m->block_height; row++ ) {
        vbx_dma_to_vector( m->v_block+row*staged_width, x+row*m->image_width, staged_width*sizeof(input_type) );
    }

    /* Bring the search rows (block height plus vertical search range) into m->v_img. */
    for( row = 0; row < m->block_height+m->search_height; row++ ) {
        vbx_dma_to_vector( m->v_img +row*staged_width, y+row*m->image_width, staged_width*sizeof(input_type) );
    }

    /* Pack the reference block in place: rows staged_width apart become block_width apart. */
    vbx_set_vl( m->block_width );
    vbx_set_2D( m->block_height, m->block_width*sizeof(input_type), staged_width*sizeof(input_type), 0 );
    vbx_2D( VVB, VMOV, (vbx_byte_t*)m->v_block, (vbx_byte_t*)m->v_block, 0 );

    /* Handle one horizontal search offset per iteration. */
    for( col = 0; col < m->search_width; col++ ) {
        /* Pack the search rows starting at this column offset into m->v_img_sub. */
        vbx_set_vl( m->block_width );
        vbx_set_2D( m->block_height+m->search_height, m->block_width*sizeof(input_type), staged_width*sizeof(input_type), 0 );
        vbx_2D( VVBU, VMOV, m->v_img_sub, m->v_img+col, 0 );

        /* For each vertical offset, accumulate |block - image| into column 'col' of the result. */
        vbx_set_vl( m->block_width*m->block_height );
        vbx_set_2D( m->search_height, m->search_width*sizeof(output_type), 0, m->block_width*sizeof(input_type) );
        vbx_acc_2D( VVBWU, VABSDIFF, (vbx_uword_t*)m->v_result+col, m->v_block, m->v_img_sub );
    }

    /* Copy the result table back to the host. */
    vbx_dma_to_host( result, m->v_result, m->result_size );

    return VBW_SUCCESS;
}
/** Internal helper that reverses, and optionally half-word-rotates, a vector of words *in the scratchpad*.
 *
 * Uses a merge-style reversal that is faster on large vectors: the rows are first copied in
 * reverse row order with the two half-rows exchanged, then the elements inside every row are
 * merged with conditional moves while the step size is halved each pass.
 *
 * @pre v_src contains the elements to reverse.
 * @pre v_src, v_scratch0, and v_scratch1 must all be the same length.
 * @pre v_scratch1 and v_src must not overlap.
 * @pre v_src *may* overlap v_scratch0 (will clobber v_src).
 * @pre MXP must be 2 lanes or more.
 * @pre N is a multiple of SP_WIDTH_B.
 * @pre NUM_ROWS == N*4 / SP_WIDTH_B.
 * @pre v_mask must be SP_WIDTH_B bytes long.
 * @post v_scratch0 and v_scratch1 contents are modified, with one containing the result.
 * @post v_src clobbered only if v_src overlaps v_scratch0.
 *
 * @param[in] v_scratch1 *in scratch*.
 * @param[in] v_src *in scratch*.
 * @param[in] N is the number of words to reverse.
 * @param[in] v_scratch0 *in scratch*.
 * @param[in] v_mask *in scratch*.
 * @param[in] SP_WIDTH_B typically the scratchpad width in bytes, it is the length of the data to be worked on at a time.
 * @param[in] NUM_ROWS is the number of rows of length SP_WIDTH_B bytes.
 * @param[in] rot16 TRUE to swap upper and lower half-words of each word in result.
 * @returns the scratchpad address where the result resides. This will be equal to either v_scratch0 or v_scratch1,
 *          and will depend on log2(MXP vector lanes).
 */
static vbx_word_t *vec_rev_merge_w( vbx_word_t *v_scratch1, vbx_word_t *v_src, const unsigned int N, vbx_word_t *v_scratch0, vbx_word_t *v_mask, const unsigned int SP_WIDTH_B, const unsigned int NUM_ROWS, const unsigned int rot16 )
{
#if !VBX_SKIP_ALL_CHECKS
    if( !( N && v_scratch0 && v_src && v_scratch1 && v_mask && SP_WIDTH_B >= 8 ) ) {
        VBX_PRINTF("Helper function vec_rev_merge_w: null pointer or row length (vector lanes) too short.");
        VBX_EXIT(-1);
    }
#endif

    /* Ping-pong buffers: v_cur always holds the newest data, v_alt is the next destination. */
    vbx_word_t *v_cur = v_scratch1;
    vbx_word_t *v_alt = v_scratch0;
    vbx_word_t *v_swap;

    const unsigned int half_row = SP_WIDTH_B/4/2;   /* half the number of words in a row */
    unsigned int step;

    /* First pass: write rows in reverse order and exchange the two halves of each row. */
    vbx_set_vl( half_row );
    if( rot16 ) {
        /* Rotating every word left by 16 swaps its half-words while it is being copied. */
        vbx_set_2D( NUM_ROWS, -SP_WIDTH_B, 0, SP_WIDTH_B );
        vbx_2D( SVWU, VROTL, (vbx_uword_t *)(v_cur+N-half_row), 16, (vbx_uword_t *)v_src );
        vbx_2D( SVWU, VROTL, (vbx_uword_t *)(v_cur+N-(half_row*2)), 16, (vbx_uword_t *)(v_src+half_row) );
    } else {
        vbx_set_2D( NUM_ROWS, -SP_WIDTH_B, SP_WIDTH_B, 0 );
        vbx_2D( VVW, VMOV, v_cur+N-half_row, v_src, 0 );
        vbx_2D( VVW, VMOV, v_cur+N-(half_row*2), v_src+half_row, 0 );
    }

    /* Remaining passes work on whole rows; the single-row mask is reused for every row. */
    vbx_set_vl( SP_WIDTH_B/4 );
    vbx_set_2D( NUM_ROWS, SP_WIDTH_B, SP_WIDTH_B, 0 );

    for( step = half_row/2; step != 0; step /= 2 ) {
        /* set up odd/even mask register */
        vbx( SEW, VAND, v_mask, step, 0 );
        vbx_2D( VVW, VCMV_NZ, v_alt, v_cur-step, v_mask );
        vbx_2D( VVW, VCMV_Z , v_alt, v_cur+step, v_mask );

        v_swap = v_cur;
        v_cur  = v_alt;
        v_alt  = v_swap;
    }

    return v_cur;
}
int VBX_T(vbw_vec_reverse_test)() { unsigned int aN[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 20, 25, 31, 32, 33, 35, 40, 48, 60, 61, 62, 63, 64, 64, 65, 66, 67, 68, 70, 80, 90, 99, 100, 101, 110, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 400, 450, 512, 550, 600, 650, 700, 768, 768, 900, 900, 1023, 1024, 1200, 1400, 1600, 1800, 2048, 2048, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4096, 4096, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 6000, 7000, 8000, 8192, 8192, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 16384, 16384, 20000, 25000, 30000, 32767, 32768, 32768, 35000, 40000, 45000, 50000, 55000, 60000, 65000, 65535, 65536, 65536 }; int retval; unsigned int N; unsigned int NBYTES; unsigned int NREPS = 100; unsigned int i,k; vbx_timestamp_t start=0,finish=0; vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); const unsigned int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size; for( i=0; i<sizeof(aN)/4; i++ ) { N = aN[i]; //printf( "testing with vector size %d\n", N ); NBYTES = sizeof(vbx_sp_t)*N; if( 2*NBYTES > VBX_SCRATCHPAD_SIZE ) continue; vbx_sp_t *vsrc = vbx_sp_malloc( NBYTES ); vbx_sp_t *vdst = vbx_sp_malloc( NBYTES ); //printf("bytes alloc: %d\n", NBYTES ); if( !vsrc ) VBX_EXIT(-1); if( !vdst ) VBX_EXIT(-1); #if ( VBX_TEMPLATE_T == BYTESIZE_DEF | VBX_TEMPLATE_T == UBYTESIZE_DEF ) unsigned int mask = 0x007F; #elif ( VBX_TEMPLATE_T == HALFSIZE_DEF | VBX_TEMPLATE_T == UHALFSIZE_DEF ) unsigned int mask = 0x7FFF; #else unsigned int mask = 0xFFFF; #endif vbx_set_vl( N ); vbx( SV(T), VMOV, vdst, -1, 0 ); // Fill the destination vector with -1 vbx( SE(T), VAND, vsrc, mask, 0 ); // Fill the source vector with enumerated values //VBX_T(print_vector)( "vsrcInit", vsrc, N ); //VBX_T(print_vector)( "vdstInit", vdst, N ); /** measure performance of function call **/ vbx_sync(); start = vbx_timestamp(); 
for(k=0; k<NREPS; k++ ) { retval = VBX_T(vbw_vec_reverse)( vdst, vsrc, N ); vbx_sync(); } finish = vbx_timestamp(); printf( "length %d (%s):\tvbware sp f():\t%llu", N, VBX_EXPAND_AND_QUOTE(BYTEHALFWORD), (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) ); //VBX_T(print_vector)( "vsrcPost", vsrc, N ); //VBX_T(print_vector)( "vdstPost", vdst, N ); #if VERIFY_VBWARE_ALGORITHM VBX_T(verify_vector)( vsrc, vdst, N ); #else printf(" [VERIFY OFF]"); #endif printf("\treturn value: %X", retval); vbx_set_vl( N ); vbx( SE(T), VAND, vsrc, mask, 0 ); // Reset the source vector /** measure performance of simple algorithm **/ vbx_sync(); vbx_set_vl( 1 ); vbx_set_2D( N, -sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 ); start = vbx_timestamp(); for(k=0; k<NREPS; k++ ) { vbx_2D( VV(T), VMOV, vdst+N-1, vsrc, 0 ); vbx_sync(); } finish = vbx_timestamp(); printf( "\tsimple (vl=1):\t%llu", (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) ); #if VERIFY_SIMPLE_ALGORITHM VBX_T(verify_vector)( vsrc, vdst, N ); #else printf(" [VERIFY OFF]"); #endif printf("\tcycles\n"); vbx_sp_free(); } vbx_sp_free(); printf("All tests passed successfully.\n"); return 0; }
/* takes in precomputed bfly */
/*
 * Fixed-point FFT pass loop on 16-bit data held in vector memory (the "dif" in the name
 * suggests decimation-in-frequency -- NOTE(review): confirm).
 *
 * What the visible code does with each argument:
 *   fr, fi     - n = 1<<m half-words each, copied into vector memory as the working data.
 *   fr2, fi2   - not referenced in the portion of the function present here.
 *   tw_r, tw_i - n/2 half-words each, copied into vector memory as the twiddle tables.
 *   m          - log2 of the transform length (n is computed as 1 << m).
 *   inverse    - non-zero: a pass is scaled only when isAbsOutOfRangeV() reports out-of-range
 *                data, and 'scale' counts those passes; zero: every pass is scaled by 1/2.
 *   real       - non-zero: vector_fix_fft_untangle_real_scratch() is run before the passes.
 */
static int vector_fix_fft_dif_long_fly(short fr[], short fi[], short fr2[], short fi2[], short tw_r[], short tw_i[], short m, short inverse, short real)
{
    int i, j, l, k, scale, shift, a1,a2,bfly,mul,flight,swap,row_num;
    short wr, wi;

    vptr_half v_fr, v_fi, v_fr2, v_fi2, v_tmp;
    vptr_half v_twr, v_twi;
    vptr_half v_arp, v_aip, v_brp, v_bip, v_crp, v_cip;
    vptr_half v_temp;
    vptr_half v_twr2, v_twi2;

    const int n = 1 << m;      // transform length
    const int half = n >> 1;   // length of each twiddle table

    scale = 0;                 // number of passes on which a data-dependent shift was applied
    mul = 0;                   // number of passes completed
    swap = m >> 1;             // the pass loop below runs while l > swap
    l = m-1;                   // log2 of the current butterfly span
    flight = 1;                // number of rows per 2D vector operation; doubles every pass
    bfly = half;

    const int INROWS = 1<<swap;
    const int INCOLS = 1<<(m-swap);

    // for even m, lower the threshold so the loop below runs one extra pass
    if ( !(m%2) ){
        swap--;
    }

    // allocate space in vector memory for vectors
    v_fr = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
    v_fi = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
    v_fr2 = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
    v_fi2 = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );

    v_twr = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
    v_twi = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );

    v_temp = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );

    if( v_fr == NULL || v_fi == NULL || v_fr2 == NULL || v_fi2== NULL || \
        v_twr == NULL || v_twi == NULL || v_temp == NULL) {
        VBX_EXIT(-1);
    }

    v_twr2 = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
    v_twi2 = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
    if( v_twr2 == NULL || v_twi2 == NULL) {
        VBX_EXIT(-1);
    }

    // copy the input data and the twiddle tables into vector memory
    vbx_dma_to_vector( v_fr, fr, n*sizeof(vbx_half_t) );
    vbx_dma_to_vector( v_fi, fi, n*sizeof(vbx_half_t) );
    vbx_dma_to_vector( v_twr, tw_r, half*sizeof(vbx_half_t) );
    vbx_dma_to_vector( v_twi, tw_i, half*sizeof(vbx_half_t) );

#if 1
    if(real){
        vector_fix_fft_untangle_real_scratch( v_fr, v_fi, v_fr2, v_fi2, v_twr,v_twi, m, inverse);
    }
#endif

    while (l > swap) {
        if (inverse) {
            // variable scaling, depending upon data
            shift = 0;
            if( isAbsOutOfRangeV(v_fr,v_fi,v_temp,n) ) {
                shift = 1;
                scale++;
            }
        } else {
            // fixed scaling, for proper normalization
            // -- overall factor of 1/n, distributed to maximize arithmetic accuracy
            shift = 1;
        }
        // shift will be performed on each data point exactly once during pass

        // after the swap, v_fr2/v_fi2 hold this pass's input and v_fr/v_fi receive its output
        SWAP( v_fr, v_fr2, v_tmp );
        SWAP( v_fi, v_fi2, v_tmp );

        if (shift){
            // halve every input sample in place
            vbx_set_vl( n );
            vbx(SVH,VSHR, v_fr2, 1, v_fr2 );
            vbx(SVH,VSHR, v_fi2, 1, v_fi2 );
        }

        // Sums: out[0 .. n/2) = in[lower half of group] + in[upper half of group].
        // NB: '+' binds tighter than '<<', so every '1<<l+1' below is 1<<(l+1),
        //     i.e. the stride of one whole group of 2*(1<<l) elements.
        vbx_set_vl( 1<<l );
        vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l+1), sizeof(vbx_half_t)*(1<<l+1) );
        vbx_2D( VVH, VADD, v_fr, v_fr2, v_fr2 + (1<<l) );
        vbx_2D( VVH, VADD, v_fi, v_fi2, v_fi2 + (1<<l) );

        // Differences: computed in place in the lower half of each input group.
        vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l+1), sizeof(vbx_half_t)*(1<<l+1), sizeof(vbx_half_t)*(1<<l+1) );
        vbx_2D( VVH, VSUB, v_fr2, v_fr2, v_fr2 + (1<<l) );
        vbx_2D( VVH, VSUB, v_fi2, v_fi2, v_fi2 + (1<<l) );

        // Real part of (difference * twiddle): dr*twr - di*twi, written to the upper half of v_fr.
        // The twiddle operand has stride 0, so the same table is reused for every group.
        vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l+1), 0 );
        vbx_2D( VVH, VMULFXP, &v_fr[n>>1], v_fr2, v_twr );
        vbx_2D( VVH, VMULFXP, v_temp, v_fi2, v_twi );

        vbx_set_vl( n>>1 );
        // vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l) );
        vbx( VVH, VSUB, &v_fr[n>>1], &v_fr[n>>1], v_temp );

        // Imaginary part of (difference * twiddle): di*twr + dr*twi, written to the upper half of v_fi.
        vbx_set_vl( 1<<l );
        vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l+1), 0 );
        vbx_2D( VVH, VMULFXP, &v_fi[n>>1], v_fi2, v_twr );
        vbx_2D( VVH, VMULFXP, v_temp, v_fr2, v_twi );

        vbx_set_vl( n>>1 );
        //vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l) );
        vbx( VVH, VADD, &v_fi[n>>1], &v_fi[n>>1], v_temp );

        // next pass: half the span, twice as many groups
        l--;
        mul++;
        flight <<= 1;

        if( l > swap ) {
            // Rewrite the twiddle tables in place for the next, shorter pass.
            // NOTE(review): presumably the word-to-half move keeps every other half-word entry --
            // confirm against the VVWH semantics.
            vbx_set_vl( 1<<l );
            vbx( VVWH, VMOV, v_twr, v_twr, 0 );
            vbx( VVWH, VMOV, v_twi, v_twi, 0 );
        }
    }

    // undo the final loop update when m is even
    if ( !(m%2) ) {
        l++;
        flight >>=1;
    }
    // NOTE(review): the function text stops here in this file -- there is no return statement
    // or closing brace before the next definition. Restore the missing tail.
/**
 * Runs one double-buffered forward-DCT step over a "big tile" of blocks in vector memory.
 *
 * The active image buffer (v->vimage[v->db]) is multiplied row by row with the tiled
 * coefficients and accumulated into the temporary block buffer (v->vblock[2]); the result is
 * shifted right by SHIFT_AMOUNT, then the same multiply/accumulate is repeated in transposed
 * form into the active block buffer (v->vblock[v->db]), which is shifted as well. Finally
 * v->db is toggled so the next call works on the other buffer pair.
 *
 * Build-time switches select the details: DMA (prefetch the next tile's image rows into the
 * inactive image buffer), DMA2 (write the result to block_v), ACCUMULATE / BLOCK4 /
 * NUM_TILE_Y (accumulation strategy) and DEBUG (dump block_v).
 *
 * @param v           DCT state: image/block double buffers, coefficient and work vectors, db.
 * @param block_v     host destination of the result (written only when DMA2 is enabled).
 * @param image       host image base used to address the next tile when prefetching.
 * @param start_x     x index of the current tile.
 * @param start_y     y index of the current tile.
 * @param end_x       last x index; reaching it wraps the prefetch to x = 0 of the next row.
 * @param end_y       last y index; at (end_x, end_y) no prefetch is issued.
 * @param num_tile_x  used only to size the DMA2 write-back (together with num_tile_y and DCT_SIZE).
 * @param num_tile_y  see num_tile_x.
 */
void vbx_mtx_fdct( vbx_mtx_fdct_t *v, dt *block_v, dt *image, int start_x, int start_y, int end_x, int end_y,int num_tile_x, int num_tile_y )
{
//  vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
//  const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
    const int BIG_TILE_SIZE = num_tile_x * num_tile_y * DCT_SIZE;

    // work out which tile comes next and whether there is one to prefetch
    int next_x=start_x+1;
    int next_y=start_y;
    int get_next=1;
    if( start_x == end_x && start_y == end_y ) {
        get_next=0;
    }
    if( start_x == end_x ) {
        next_x = 0;
        next_y++;
    }

    const vbx_half_t *vimageDMA = v->vimage[!v->db]; // dma
//  const vbx_half_t *vblockDMA = v->vblock[!v->db]; // dma // never used directly

    const vbx_half_t *vimageVPU = v->vimage[ v->db]; // active
    const vbx_half_t *vblockVPU = v->vblock[ v->db]; // active

    const vbx_half_t *vblockTMP = v->vblock[ 2 ]; // temp

    const vbx_half_t *vcoeff = v->vcoeff;
    const vbx_half_t *vprods = v->vprods;
    const vbx_half_t *vaccum = v->vaccum;
    const vbx_half_t *vflags = v->vflags;

#if DMA
    // First, prefetch the next chunk of the next image for a future call to fdct_tile()
#if NUM_TILE_Y > 1
    if( get_next ) // get row 0
        getBigTileImageY( vimageDMA, image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, 0 );
#else
    if( get_next ) // get row 0
        getBigTileImage( vimageDMA, image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, 0 );
#endif
#endif

    int r;
    for( r=0; r < BLOCK_SIZE; r++ ) {
        // perform multiply of the whole BIG_TILE with row 'r' of the image matrix -- before had dct matrix switching
        vbx_set_vl( NUM_TILE_X * BLOCK_SIZE ); // for the length of tiled rows
        vbx_set_2D( BLOCK_SIZE, NUM_TILE_X*BLOCK_SIZE*sizeof(dt), 0, NUM_TILE_X*BLOCK_SIZE*sizeof(dt) ); // for all rows of tiled coefficients
        vbx_set_3D( NUM_TILE_Y, NUM_TILE_X * DCT_SIZE*sizeof(dt), NUM_TILE_X * DCT_SIZE*sizeof(dt), 0 ); // for all groups Y
        vbx_3D( VVH, VMUL, vprods, vimageVPU + r*NUM_TILE_X*BLOCK_SIZE, vcoeff); // for all 'columns' of tiled data

#if ACCUMULATE
        // accumulate the multiply operations
#if 0 & USE_ACCUM_FLAGS
        vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
        vbx( VVH, VADD, vaccum, vprods+0, vprods+1 );
        vbx_set_2D( BLOCK_SIZE-2, 0, 0, sizeof(dt) );
        vbx_2D( VVH, VADD, vaccum, vaccum, vprods+2 );
        vbx( VVH, VCMV_Z, vblockTMP+r, vaccum, vflags );
#elif BLOCK4
        //case DCT 4
        vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
        vbx( VVH, VADD, vaccum, vprods, vprods+1 );
        vbx( VVH, VADD, vaccum, vaccum, vprods+2 );
        vbx( VVH, VADD, vaccum, vaccum, vprods+3 );
        vbx( VVH, VCMV_Z, vblockTMP+r, vaccum, vflags );
#else
        //correct?
        vbx_set_vl( BLOCK_SIZE );
        vbx_set_2D( BLOCK_SIZE, NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt) );
        vbx_set_3D( NUM_TILE_X, BLOCK_SIZE*sizeof(dt), BLOCK_SIZE*sizeof(dt), BLOCK_SIZE*sizeof(dt) );
#if NUM_TILE_Y == 1
        vbx_acc_3D( VVH, VOR, vblockTMP + r, vprods , vprods );
#else
        int y;
        for (y=0; y< NUM_TILE_Y; y++){
            vbx_acc_3D( VVH, VOR, vblockTMP + r + y*NUM_TILE_X*DCT_SIZE, vprods+ y*NUM_TILE_X*DCT_SIZE, vprods+ y*NUM_TILE_X*DCT_SIZE );
        }
#endif
#endif
#endif

#if 0
        // dont do DMA READS here yet. a DMA WRITE may still be in progress, give it chance to finish
#if DMA
        // every other iteration, prefetch the next row of the next image
        // NB: with 2D DMA, we could issue this as a single DMA request at the top of the file
        // instead, we must intersperse these 1D DMA strips to ensure they don't block the instruction queue
#if NUM_TILE_Y > 1
        if( !(r&1) && get_next )
            getBigTileImageY( vimageDMA, image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, (1+((r-1)>>1)) ); //BLOCK_SIZE/2 rows added
#else
        if( !(r&1) && get_next )
            getBigTileImage( vimageDMA, image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, (1+((r-1)>>1)) ); //BLOCK_SIZE/2 rows added
#endif
#endif
#endif
    }

    // scale the intermediate result in place
    vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE );
    vbx( SVH, VSHR, vblockTMP, SHIFT_AMOUNT, vblockTMP );

    // now do the transposed version

    for( r=0; r < BLOCK_SIZE; r++ ) {
        // perform multiply of the whole BIG_TILE with row 'r' of the image matrix -- before had dct matrix switching
        vbx_set_vl( NUM_TILE_X * BLOCK_SIZE ); // for the length of tiled rows
        vbx_set_2D( BLOCK_SIZE, NUM_TILE_X * BLOCK_SIZE*sizeof(dt), NUM_TILE_X * BLOCK_SIZE*sizeof(dt), 0 ); // for all 'columns' of tiled data
        vbx_set_3D( NUM_TILE_Y, NUM_TILE_X * DCT_SIZE*sizeof(dt), NUM_TILE_X * DCT_SIZE*sizeof(dt), 0 ); // for all groups Y
        vbx_3D( VVH, VMUL, vprods, vblockTMP, vcoeff + r*NUM_TILE_X*BLOCK_SIZE); // for all rows of tiled coefficients

#if ACCUMULATE
        // accumulate the multiply operations
#if 0 & USE_ACCUM_FLAGS
        vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
        vbx( VVH, VADD, vaccum, vprods+0, vprods+1 );
        vbx_set_2D( BLOCK_SIZE-2, 0, 0, sizeof(dt) );
        vbx_2D( VVH, VADD, vaccum, vaccum, vprods+2 );
        vbx( VVH, VCMV_Z, vblockVPU+r, vaccum, vflags );
#elif BLOCK4
        //case DCT 4
        vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
        vbx( VVH, VADD, vaccum, vprods, vprods+1 );
        vbx( VVH, VADD, vaccum, vaccum, vprods+2 );
        vbx( VVH, VADD, vaccum, vaccum, vprods+3 );
        //vbx( VVH, VCMV_Z, vblockVPU+r, vaccum, vflags );
        vbx_set_vl( NUM_TILE_X * BLOCK_SIZE - (BLOCK_SIZE-1) ); // for the length of a tiled row
        vbx_set_2D( BLOCK_SIZE, 1*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), 0);// for all tiled rows
#if NUM_TILE_Y == 1
        vbx_2D(VVH, VCMV_Z, vblockVPU+r*NUM_TILE_X*BLOCK_SIZE, vaccum, vflags ); //
#else
        int y;
        for (y=0; y< NUM_TILE_Y; y++){
            vbx_2D(VVH, VCMV_Z, vblockVPU+r*NUM_TILE_X*BLOCK_SIZE + y*NUM_TILE_X*DCT_SIZE , vaccum+y*NUM_TILE_X*DCT_SIZE, vflags ); //
        }
#endif
#else
        //correct?
        vbx_set_vl( BLOCK_SIZE ); // for the length of a row
        vbx_set_2D( BLOCK_SIZE, sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt) ); // for all rows in that block
        vbx_set_3D( NUM_TILE_X, BLOCK_SIZE*sizeof(dt), BLOCK_SIZE*sizeof(dt), BLOCK_SIZE*sizeof(dt) ); // for all tiled blocks horizontally(x)
#if NUM_TILE_Y == 1
        vbx_acc_3D( VVH, VOR, vblockVPU + r*NUM_TILE_X*BLOCK_SIZE , vprods , vprods );
#else
        int y;
        for (y=0; y< NUM_TILE_Y; y++){
            vbx_acc_3D( VVH, VOR, vblockVPU + r*NUM_TILE_X*BLOCK_SIZE + y*NUM_TILE_X*DCT_SIZE, vprods+ y*NUM_TILE_X*DCT_SIZE, vprods+ y*NUM_TILE_X*DCT_SIZE );
        }
#endif
#endif
#endif

#if DMA
        // every other iteration, prefetch the next row of the next image
        // NB: with 2D DMA, we could issue this as a single DMA request at the top of the file
        // instead, we must intersperse these 1D DMA strips to ensure they don't block the instruction queue
#if NUM_TILE_Y > 1
        //if( !(r&1) && r<(BLOCK_SIZE-1) && get_next )
        if( get_next )
            getBigTileImageY( vimageDMA, image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, r ); //(BLOCK_SIZE/2 +1+((r-1)>>1)) ); // BLOCK/2 -1 rows
#else
        //if( !(r&1) && r<(BLOCK_SIZE-1) && get_next )
        if( get_next )
            getBigTileImage( vimageDMA, image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, r ); //(BLOCK_SIZE/2 +1+((r-1)>>1)) ); // BLOCK/2 -1 rows
#endif
#endif
    }

    // scale the final result in place
    vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE );
    vbx( SVH, VSHR, vblockVPU, SHIFT_AMOUNT, vblockVPU );

#if DMA2
    // Write result back to memory as one big block
    vbx_dma_to_host( block_v, vblockVPU, BIG_TILE_SIZE*sizeof(dt) );
#endif

    // flip the double-buffer selector for the next call
    v->db = !v->db;

#ifdef DEBUG
    {
        vbx_sync();
        int i,j;
        // v->db was just toggled, so !v->db is the buffer index this call worked on.
        // (The selector lives in the state struct; no local 'db' exists in this function.)
        printf("%d\n", !v->db);
        for(i=0;i<BLOCK_SIZE*NUM_TILE_Y;i++){
            for(j=0;j<BLOCK_SIZE*NUM_TILE_X;j++){
                printf(" %4d", block_v[i*BLOCK_SIZE*NUM_TILE_X+j]);
            }
            printf("\n");
        }
    }
#endif
}