/** VBX Motion Estimation, using vbx_3d ops.
 *
 * vbw_mtx_motest_3D_byte_setup should be run prior to running this function.
 * Using bytes as input data. block_height must be an even number (the final
 * SAD reduction adds the two halves of each column pairwise).
 *
 * @param[out] result  SAD results in main memory; one row of search_width
 *                     entries is DMA'd out per search row.
 * @param[in]  x       reference block in main memory (image_width-pitched rows).
 * @param[in]  y       search window in main memory (image_width-pitched rows).
 * @param[in]  m       motion-estimation state (scratchpad vectors, geometry)
 *                     prepared by the setup routine.
 * @returns negative on error condition. See vbw_exit_codes.h
 */
int vbw_mtx_motest_3d_byte(output_type *result, input_type* x, input_type *y, vbw_motest_t *m)
{
	int l,j;
	int sub_block_width = m->block_width+m->search_width;

	// DMA the reference block into scratchpad, rows packed contiguously
	// (block_width elements per row).
	for( j = 0; j < m->block_height; j++ ) {
		vbx_dma_to_vector( m->v_block+j*m->block_width, x+j*m->image_width, m->block_width*sizeof(input_type) );
	}
	// DMA the (block+search)-sized sub-image into scratchpad, sub_block_width
	// elements per row.
	for( j = 0; j < m->block_height+m->search_height; j++ ) {
		vbx_dma_to_vector( m->v_img+j*sub_block_width, y+j*m->image_width, sub_block_width*sizeof(input_type) );
	}

	// 3D dimension steps the candidate position horizontally across the search
	// width: destination advances by one row-SAD column, source by one element.
	vbx_set_3D( m->search_width, m->block_height*sizeof(intermediate_type), sizeof(input_type), 0 );
	for( l = 0; l < m->search_height; l++ ) {
		//Accumulate each row into a vbx of row SADs
		vbx_set_vl( m->block_width );
		vbx_set_2D( m->block_height, sizeof(intermediate_type), sub_block_width*sizeof(input_type), m->block_width*sizeof(input_type) );
		vbx_acc_3D( VVBHU, VABSDIFF, m->v_row_sad, m->v_img+l*sub_block_width, m->v_block );

		//Accumulate the SADs: add the two halves of each column of row-SADs
		//pairwise (this is why block_height must be even).
		vbx_set_vl( m->block_height/2 );
		vbx_set_2D( m->search_width, sizeof(output_type), m->block_height*sizeof(intermediate_type), m->block_height*sizeof(intermediate_type) );
		vbx_acc_2D( VVHWU, VADD, (vbx_uword_t*)m->v_result+l*m->search_width, m->v_row_sad, m->v_row_sad+(m->block_height/2) );

		//Transfer the line to host
		vbx_dma_to_host( result+l*m->search_width, m->v_result+l*m->search_width, m->search_width*sizeof(output_type) );
	}
	return VBW_SUCCESS;
}
/** In-scratchpad matrix transpose.
 *
 * Moves one element per inner operation (VL=1); the 2D dimension walks the
 * input columns while writing destination rows, and the 3D dimension walks
 * the input rows, producing the INCOLS x INROWS transpose of v_src in v_dst.
 *
 * @param[out] v_dst   destination *in scratch*; must not overlap v_src.
 * @param[in]  v_src   source *in scratch*, INROWS x INCOLS elements.
 * @param[in]  INROWS  number of rows in the source matrix.
 * @param[in]  INCOLS  number of columns in the source matrix.
 * @returns VBW_SUCCESS.
 */
int vbw_mtx_xp(vbx_sp_t *v_dst, vbx_sp_t *v_src, const int INROWS, const int INCOLS )
{
	vbx_set_vl( 1 );
	// 2D: for each source column, dest advances a whole output row, source one element.
	vbx_set_2D( INCOLS, INROWS*sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
	// 3D: for each source row, dest advances one element, source a whole input row.
	vbx_set_3D( INROWS, sizeof(vbx_sp_t), INCOLS*sizeof(vbx_sp_t), 0 );
	vbxx_3D( VMOV, v_dst, v_src);
	return VBW_SUCCESS;
}
/** Internal helper function to reverse and optionally rotate a vector of words *in the scratchpad*.
 * This function uses a merge reverse algorithm that is faster on large vectors.
 * @pre v_src contains the elements to reverse.
 * @pre v_src, v_scratch0, and v_scratch1 must all be the same length.
 * @pre v_scratch1 and v_src must not overlap.
 * @pre v_src *may* overlap v_scratch0 (will clobber v_src).
 * @pre MXP must be 2 lanes or more.
 * @pre N is a multiple of SP_WIDTH_B.
 * @pre NUM_ROWS == N*4 / SP_WIDTH_B.
 * @pre v_mask must be SP_WIDTH_B bytes long.
 * @post v_scratch0 and v_scratch1 contents are modified, with one containing the result.
 * @post v_src clobbered only if v_src overlaps v_scratch0.
 *
 * @param[in] v_scratch1 *in scratch*.
 * @param[in] v_src *in scratch*.
 * @param[in] N is the number of words to reverse.
 * @param[in] v_scratch0 *in scratch*.
 * @param[in] v_mask *in scratch*.
 * @param[in] SP_WIDTH_B typically the scratchpad width in bytes, it is the length of the data to be worked on at a time.
 * @param[in] NUM_ROWS is the number of rows of length SP_WIDTH_B bytes.
 * @param[in] rot16 TRUE to swap upper and lower half-words of each word in result.
 * @returns the scratchpad address where the result resides. This will be equal to either v_scratch0 or v_scratch1,
 *          and will depend on log2(MXP vector lanes).
 */
static vbx_word_t *vec_rev_merge_w( vbx_word_t *v_scratch1, vbx_word_t *v_src, const unsigned int N, vbx_word_t *v_scratch0,
		vbx_word_t *v_mask, const unsigned int SP_WIDTH_B, const unsigned int NUM_ROWS, const unsigned int rot16 )
{
#if !VBX_SKIP_ALL_CHECKS
	if( !N || !v_scratch0 || !v_src || !v_scratch1 || !v_mask || SP_WIDTH_B < 8) {
		VBX_PRINTF("Helper function vec_rev_merge_w: null pointer or row length (vector lanes) too short.");
		VBX_EXIT(-1);
	}
#endif
	vbx_word_t *v_scratch[2] = { v_scratch0, v_scratch1 };
	unsigned int W = SP_WIDTH_B/4/2; // half the number of words in a row
	unsigned int sel = 1;            // which scratch buffer is currently the destination

	// First pass: reverse the order of the rows (negative dest row stride),
	// swapping the two half-rows of each row in the process. With rot16 the
	// same move also rotates each word by 16 bits via a scalar-vector VROTL.
	if( rot16 ) {
		vbx_set_vl( W );
		vbx_set_2D( NUM_ROWS, -SP_WIDTH_B, 0, SP_WIDTH_B );
		vbx_2D( SVWU, VROTL, (vbx_uword_t *)(v_scratch[sel]+N-W), 16, (vbx_uword_t *)v_src );
		vbx_2D( SVWU, VROTL, (vbx_uword_t *)(v_scratch[sel]+N-(W*2)), 16, (vbx_uword_t *)(v_src+W) );
	} else {
		vbx_set_vl( W );
		vbx_set_2D( NUM_ROWS, -SP_WIDTH_B, SP_WIDTH_B, 0 );
		vbx_2D( VVW, VMOV, v_scratch[sel]+N-W, v_src, 0 );
		vbx_2D( VVW, VMOV, v_scratch[sel]+N-(W*2), v_src+W, 0 );
	}

	// Merge passes: log2 rounds of conditional moves, ping-ponging between the
	// two scratch buffers, swapping elements W apart each round until W == 1.
	vbx_set_vl( SP_WIDTH_B/4 );
	vbx_set_2D( NUM_ROWS, SP_WIDTH_B, SP_WIDTH_B, 0 );
	while( W > 1 ) {
		// set up odd/even mask register
		W /= 2;
		vbx( SEW, VAND, v_mask, W, 0 );
		vbx_2D( VVW, VCMV_NZ, v_scratch[!sel], v_scratch[sel]-W, v_mask );
		vbx_2D( VVW, VCMV_Z , v_scratch[!sel], v_scratch[sel]+W, v_mask );
		sel = !sel;
	}
	return v_scratch[sel];
}
/** VBX Motion Estimation.
 *
 * Similar to the scalar version but scans vertically as it makes it easier to
 * align vectors. vbw_mtx_motest_byte_setup should be run prior to running
 * this function.
 *
 * @param[out] result  SAD results in main memory (m->result_size bytes).
 * @param[in]  x       reference block in main memory (image_width-pitched rows).
 * @param[in]  y       search window in main memory (image_width-pitched rows).
 * @param[in]  m       motion-estimation state prepared by the setup routine.
 * @returns negative on error condition. See vbw_exit_codes.h
 */
int vbw_mtx_motest_byte(output_type *result, input_type *x, input_type *y, vbw_motest_t *m)
{
	int j;
	int sub_block_width = m->block_width+m->search_width;

	// DMA full sub_block_width-wide rows in; the block is compacted below.
	for( j = 0; j < m->block_height; j++ ) {
		vbx_dma_to_vector( m->v_block+j*sub_block_width, x+j*m->image_width, sub_block_width*sizeof(input_type) );
	}
	for( j = 0; j < m->block_height+m->search_height; j++ ) {
		vbx_dma_to_vector( m->v_img +j*sub_block_width, y+j*m->image_width, sub_block_width*sizeof(input_type) );
	}

	// column-ize the reference block: pack sub_block_width-pitched rows down
	// to a contiguous block_width * block_height layout (in place).
	vbx_set_vl( m->block_width );
	vbx_set_2D( m->block_height, m->block_width*sizeof(input_type), sub_block_width*sizeof(input_type), 0 );
	vbx_2D( VVB, VMOV, (vbx_byte_t*)m->v_block, (vbx_byte_t*)m->v_block, 0 );

	//Do column by column
	for( j=0; j < m->search_width; j++ ) {
		// column-ize the search image at horizontal offset j
		vbx_set_vl( m->block_width );
		vbx_set_2D( m->block_height+m->search_height, m->block_width*sizeof(input_type), sub_block_width*sizeof(input_type), 0 );
		vbx_2D( VVBU, VMOV, m->v_img_sub, m->v_img+j, 0 );

		// search the image columnwise: one whole-block SAD per vertical offset,
		// results land in column j of the result matrix.
		vbx_set_vl( m->block_width*m->block_height );
		vbx_set_2D( m->search_height, m->search_width*sizeof(output_type), 0, m->block_width*sizeof(input_type) );
		vbx_acc_2D( VVBWU, VABSDIFF, (vbx_uword_t*)m->v_result+j, m->v_block, m->v_img_sub );
	}

	// Write back result
	vbx_dma_to_host( result, m->v_result, m->result_size );
	return VBW_SUCCESS;
}
/** Test/benchmark harness for VBX_T(vbw_vec_reverse).
 *
 * For a sweep of vector lengths it fills a source vector with enumerated
 * (masked) values, times NREPS calls of the vbware reverse, optionally
 * verifies the result, then times the naive VL=1 reversed-stride move for
 * comparison and prints both cycle counts.
 *
 * @returns 0 on completion.
 */
int VBX_T(vbw_vec_reverse_test)()
{
	// Lengths chosen to hit boundary cases (powers of two, +/-1, duplicates
	// to exercise repeated allocation) up to 65536 elements.
	unsigned int aN[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 20, 25, 31, 32, 33, 35, 40, 48, 60, 61, 62, 63, 64,
	                      64, 65, 66, 67, 68, 70, 80, 90, 99, 100, 101, 110, 128, 128, 144, 144, 160, 160, 176, 176,
	                      192, 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 400, 450, 512, 550,
	                      600, 650, 700, 768, 768, 900, 900, 1023, 1024, 1200, 1400, 1600, 1800, 2048, 2048, 2100,
	                      2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600,
	                      3700, 3800, 3900, 4000, 4096, 4096, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900,
	                      5000, 6000, 7000, 8000, 8192, 8192, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000,
	                      16384, 16384, 20000, 25000, 30000, 32767, 32768, 32768, 35000, 40000, 45000, 50000, 55000,
	                      60000, 65000, 65535, 65536, 65536 };
	int retval;
	unsigned int N;
	unsigned int NBYTES;
	unsigned int NREPS = 100;
	unsigned int i,k;
	vbx_timestamp_t start=0,finish=0;
	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const unsigned int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;

	for( i=0; i<sizeof(aN)/4; i++ ) {
		N = aN[i];
		//printf( "testing with vector size %d\n", N );
		NBYTES = sizeof(vbx_sp_t)*N;
		// Skip lengths that would not fit (src + dst) in the scratchpad.
		if( 2*NBYTES > VBX_SCRATCHPAD_SIZE ) continue;
		vbx_sp_t *vsrc = vbx_sp_malloc( NBYTES );
		vbx_sp_t *vdst = vbx_sp_malloc( NBYTES );
		//printf("bytes alloc: %d\n", NBYTES );
		if( !vsrc ) VBX_EXIT(-1);
		if( !vdst ) VBX_EXIT(-1);

		// Mask keeps enumerated test values within the element type's range.
		// NOTE(review): `|` works here (preprocessor arithmetic) but `||` is
		// the conventional operator for these conditions.
#if ( VBX_TEMPLATE_T == BYTESIZE_DEF | VBX_TEMPLATE_T == UBYTESIZE_DEF )
		unsigned int mask = 0x007F;
#elif ( VBX_TEMPLATE_T == HALFSIZE_DEF | VBX_TEMPLATE_T == UHALFSIZE_DEF )
		unsigned int mask = 0x7FFF;
#else
		unsigned int mask = 0xFFFF;
#endif

		vbx_set_vl( N );
		vbx( SV(T), VMOV, vdst, -1, 0 );   // Fill the destination vector with -1
		vbx( SE(T), VAND, vsrc, mask, 0 ); // Fill the source vector with enumerated values
		//VBX_T(print_vector)( "vsrcInit", vsrc, N );
		//VBX_T(print_vector)( "vdstInit", vdst, N );

		/** measure performance of function call **/
		vbx_sync();
		start = vbx_timestamp();
		for(k=0; k<NREPS; k++ ) {
			retval = VBX_T(vbw_vec_reverse)( vdst, vsrc, N );
			vbx_sync();
		}
		finish = vbx_timestamp();
		printf( "length %d (%s):\tvbware sp f():\t%llu", N, VBX_EXPAND_AND_QUOTE(BYTEHALFWORD),
		        (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) );
		//VBX_T(print_vector)( "vsrcPost", vsrc, N );
		//VBX_T(print_vector)( "vdstPost", vdst, N );
#if VERIFY_VBWARE_ALGORITHM
		VBX_T(verify_vector)( vsrc, vdst, N );
#else
		printf(" [VERIFY OFF]");
#endif
		printf("\treturn value: %X", retval);

		vbx_set_vl( N );
		vbx( SE(T), VAND, vsrc, mask, 0 ); // Reset the source vector

		/** measure performance of simple algorithm **/
		vbx_sync();
		vbx_set_vl( 1 );
		vbx_set_2D( N, -sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
		start = vbx_timestamp();
		for(k=0; k<NREPS; k++ ) {
			vbx_2D( VV(T), VMOV, vdst+N-1, vsrc, 0 );
			vbx_sync();
		}
		finish = vbx_timestamp();
		printf( "\tsimple (vl=1):\t%llu", (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) );
#if VERIFY_SIMPLE_ALGORITHM
		VBX_T(verify_vector)( vsrc, vdst, N );
#else
		printf(" [VERIFY OFF]");
#endif
		printf("\tcycles\n");
		vbx_sp_free(); // release this iteration's scratchpad allocations
	}
	vbx_sp_free();
	printf("All tests passed successfully.\n");
	return 0;
}
/** Reverse a vector of N elements in the scratchpad.
 *
 * Strategy depends on N: tiny vectors use a VL=1 reversed-stride move; small
 * vectors use the dedicated word-reverse helpers; large vectors use the
 * merge-reverse helper (vec_rev_merge_w), chunked when scratchpad space for
 * the extra scratch buffer and mask is not available. Sub-word element types
 * are handled by reversing at word granularity and then rotating half-words
 * (rot16) and/or bytes (rot8) back into order.
 *
 * @param[out] v_dst  destination *in scratch*.
 * @param[in]  v_src  source *in scratch*.
 * @param[in]  N      number of elements to reverse.
 * @returns VBW_SUCCESS.
 */
int vbw_vec_reverse( vbx_sp_t *v_dst, vbx_sp_t *v_src, const unsigned int N )
{
	// Element-size-derived constants: whether half-word/byte rotations are
	// needed after word-level reversal, and the element<->word shift amounts.
	const int VBW_ROT16= sizeof(vbx_sp_t) <=sizeof(vbx_half_t);
	const int VBW_ROT8= sizeof(vbx_sp_t)== sizeof(vbx_byte_t);
	const int VBW_RSHIFT_T_TO_W= (sizeof(vbx_sp_t)==sizeof(vbx_word_t)? 0: sizeof(vbx_sp_t)==sizeof(vbx_half_t)? 1:/*byte_sized*/2);
	const int VBW_LSHIFT_W_TO_T= VBW_RSHIFT_T_TO_W;
	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const unsigned int NUM_LANES = this_mxp->vector_lanes;
	//printf("\n%d\n",VBX_SKIP_ALL_CHECKS);

	// Can the whole vector fit in the scratchpad width?
	if( N < (NUM_LANES << VBW_LSHIFT_W_TO_T) ){
		// Tiny vector: element-by-element reversed-stride move.
		vbx_set_vl( 1 );
		vbx_set_2D( N, (int)-sizeof(vbx_sp_t), (int)sizeof(vbx_sp_t), 0 );
		vbxx_2D(VMOV, v_dst+N-1, v_src);
		return VBW_SUCCESS;
	}

	// Below this word-count the simple reverse helpers beat merge-reverse
	// (threshold depends on lane count).
	unsigned int threshold_w = (NUM_LANES >= 32 ? VL1_THRESHOLD_V32_UP :
	                            NUM_LANES == 16 ? VL1_THRESHOLD_V16 :
	                            NUM_LANES == 8  ? VL1_THRESHOLD_V8 :
	                            UINT_MAX);
	unsigned int N_w = N >> VBW_RSHIFT_T_TO_W; // Equivalent number of words in the vector

	if( N_w && N_w <= threshold_w ) {
		if( VBW_ROT16){
			// remainder of elements that can't add to a whole word
			unsigned int stub_t = N - (N_w << VBW_LSHIFT_W_TO_T);
			if( stub_t ) {
				// Reverse the stub element-by-element into the front of v_dst.
				vbx_set_vl( 1 );
				vbx_set_2D( stub_t, (int)-sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
				vbxx_2D(VMOV, v_dst+stub_t-1, v_src+N-stub_t);
				v_dst += stub_t;
			}
			vec_rev_rot16_w(v_dst, v_src, N_w);
		}else{
			vec_rev_w(v_dst, v_src, N_w);
		}
		if( VBW_ROT8){
			vec_rot8_h(v_dst, v_dst, N_w*2);
		}
		return VBW_SUCCESS;
	}

	// Large-vector path: merge reverse, possibly in chunks.
	const unsigned int SP_WIDTH_B = this_mxp->scratchpad_alignment_bytes;
	const unsigned int FREE_BYTES = vbx_sp_getfree();
	// Parameter order into vec_rev_merge_w depends on whether log2(lanes) is
	// odd or even (which scratch buffer the result lands in).
	const unsigned int ODD_LOG_SEL = NUM_LANES & 0x55555555 ? 1 : 0;
	vbx_word_t *v_mask, *v_result;
	vbx_word_t *v_scratch[2] = {0,0};
	unsigned int num_rows_w = N_w / NUM_LANES;
	unsigned int working_set_w = num_rows_w * NUM_LANES;
	unsigned int tail_t = N - (working_set_w << VBW_LSHIFT_W_TO_T);
	unsigned int remaining_w = working_set_w;

	// Reverse the tail that doesn't fill a whole row, element-by-element.
	if( tail_t ) {
		vbx_set_vl( 1 );
		vbx_set_2D( tail_t, (int)-sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
		vbxx_2D(MOV == VMOV ? VMOV : VMOV, v_dst+tail_t-1, v_src+N-tail_t); // NOTE(review): plain VMOV in original; see below
		v_dst += tail_t;
	}
	vbx_word_t *v_src_w = (vbx_word_t *)v_src;
	vbx_word_t *v_dst_w = (vbx_word_t *)v_dst;
	if(!num_rows_w) {
		return VBW_SUCCESS;
	}
	remaining_w = working_set_w;

	// While the remaining words plus one mask row don't fit in free scratch,
	// peel off chunks and merge-reverse them from the end of the source.
	while( remaining_w*sizeof(vbx_word_t) + SP_WIDTH_B > FREE_BYTES ) {
		if( remaining_w <= threshold_w*2 ) {
			// Small enough remainder: finish with the simple helpers.
			if( VBW_ROT16){
				vec_rev_rot16_w(v_dst_w, v_src_w, remaining_w);
			}else{
				vec_rev_w(v_dst_w, v_src_w, remaining_w);
			}
			if( VBW_ROT8){
				vec_rot8_h(v_dst_w, v_dst_w, remaining_w*2);
			}
			return VBW_SUCCESS;
		}
		working_set_w = VBX_PAD_DN( (remaining_w - NUM_LANES)/2, NUM_LANES );
		v_mask = v_dst_w + (working_set_w*2); // mask row lives just past the two scratch halves
		remaining_w -= working_set_w;
		v_scratch[0] = v_dst_w;
		v_scratch[1] = v_dst_w + working_set_w;
		num_rows_w = working_set_w / NUM_LANES;
		v_result = vec_rev_merge_w( v_scratch[ODD_LOG_SEL], v_src_w + remaining_w, working_set_w,
		                            v_scratch[!ODD_LOG_SEL], v_mask, SP_WIDTH_B, num_rows_w, VBW_ROT16 );
#if !VBX_SKIP_ALL_CHECKS
		if( v_result != v_dst_w ) {
			VBX_PRINTF("Unexpected behavior: merge reverse returned the wrong vector. Parameter order was chosen based on NUM_LANES.");
			VBX_EXIT(-1);
		}
#endif
		if( VBW_ROT8){
			vec_rot8_h(v_result, v_result, working_set_w*2);
		}
		v_dst_w += working_set_w;
	}

	// Final chunk: allocate a second scratch buffer and mask, merge-reverse
	// the rest in one shot.
	vbx_sp_push();
	v_scratch[0] = v_dst_w;
	v_scratch[1] = (vbx_word_t*)vbx_sp_malloc( remaining_w * sizeof(vbx_word_t) );
#if !VBX_SKIP_ALL_CHECKS
	if( !v_scratch[1] ) {
		VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
		VBX_EXIT(-1);
	}
#endif
	v_mask = (vbx_word_t*)vbx_sp_malloc( SP_WIDTH_B );
#if !VBX_SKIP_ALL_CHECKS
	if( !v_mask ) {
		VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
		VBX_EXIT(-1);
	}
#endif
	num_rows_w = remaining_w / NUM_LANES;
	v_result = vec_rev_merge_w( v_scratch[ODD_LOG_SEL], v_src_w, remaining_w,
	                            v_scratch[!ODD_LOG_SEL], v_mask, SP_WIDTH_B, num_rows_w, VBW_ROT16 );
#if !VBX_SKIP_ALL_CHECKS
	if( v_result != v_dst_w ) {
		VBX_PRINTF("Unexpected behavior: merge reverse returned the wrong vector. Parameter order was chosen based on NUM_LANES.");
		VBX_EXIT(-1);
	}
#endif
	if( VBW_ROT8){
		vec_rot8_h(v_result, v_result, remaining_w*2);
	}
	vbx_sp_pop();
	return VBW_SUCCESS;
}
/* takes in precomputed bfly */
/* Fixed-point decimation-in-frequency FFT stages using precomputed twiddle
 * factors (long-butterfly stages, down to stage `swap`).
 *
 * NOTE(review): this chunk appears truncated — the stages at and below `swap`
 * and the function epilogue are not visible here.
 *
 * @param[in,out] fr,fi    real/imaginary data in main memory (n = 1<<m points).
 * @param[in]     fr2,fi2  secondary buffers (unused directly here; scratchpad
 *                         copies are used instead).
 * @param[in]     tw_r,tw_i precomputed twiddle factors (n/2 each).
 * @param[in]     m        log2 of the transform size.
 * @param[in]     inverse  nonzero for inverse transform (variable scaling).
 * @param[in]     real     nonzero to untangle a packed real transform first.
 */
static int vector_fix_fft_dif_long_fly(short fr[], short fi[], short fr2[], short fi2[],
		short tw_r[], short tw_i[], short m, short inverse, short real)
{
	int i, j, l, k, scale, shift, a1,a2,bfly,mul,flight,swap,row_num;
	short wr, wi;
	vptr_half v_fr, v_fi, v_fr2, v_fi2, v_tmp;
	vptr_half v_twr, v_twi;
	vptr_half v_arp, v_aip, v_brp, v_bip, v_crp, v_cip;
	vptr_half v_temp;
	vptr_half v_twr2, v_twi2;
	const int n = 1 << m;
	const int half = n >> 1;

	scale = 0;
	mul = 0;
	swap = m >> 1;      // stage at which processing switches strategy
	l = m-1;            // current stage index (largest butterflies first)
	flight = 1;         // number of butterfly groups in this stage
	bfly = half;        // butterflies per group
	const int INROWS = 1<<swap;
	const int INCOLS = 1<<(m-swap);
	if ( !(m%2) ){
		swap--;
	}

	// allocate space in vector memory for vectors
	v_fr = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
	v_fi = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
	v_fr2 = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
	v_fi2 = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
	v_twr = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	v_twi = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	v_temp = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
	if( v_fr == NULL || v_fi == NULL || v_fr2 == NULL || v_fi2== NULL || \
	    v_twr == NULL || v_twi == NULL || v_temp == NULL) {
		VBX_EXIT(-1);
	}
	v_twr2 = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	v_twi2 = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	if( v_twr2 == NULL || v_twi2 == NULL) {
		VBX_EXIT(-1);
	}

	vbx_dma_to_vector( v_fr, fr, n*sizeof(vbx_half_t) );
	vbx_dma_to_vector( v_fi, fi, n*sizeof(vbx_half_t) );
	vbx_dma_to_vector( v_twr, tw_r, half*sizeof(vbx_half_t) );
	vbx_dma_to_vector( v_twi, tw_i, half*sizeof(vbx_half_t) );
#if 1
	if(real){
		vector_fix_fft_untangle_real_scratch( v_fr, v_fi, v_fr2, v_fi2, v_twr,v_twi, m, inverse);
	}
#endif

	// Long-butterfly stages: process while butterfly span exceeds the swap point.
	while (l > swap) {
		if (inverse) {
			// variable scaling, depending upon data
			shift = 0;
			if( isAbsOutOfRangeV(v_fr,v_fi,v_temp,n) ) {
				shift = 1;
				scale++;
			}
		}
		else {
			// fixed scaling, for proper normalization
			// -- overall factor of 1/n, distributed to maximize arithmetic accuracy
			shift = 1;
		}
		// shift will be performed on each data point exactly once during pass
		SWAP( v_fr, v_fr2, v_tmp );
		SWAP( v_fi, v_fi2, v_tmp );
		if (shift){
			vbx_set_vl( n );
			vbx(SVH,VSHR, v_fr2, 1, v_fr2 );
			vbx(SVH,VSHR, v_fi2, 1, v_fi2 );
		}

		// Butterfly adds/subs: a' = a + b, b' = a - b, per group of 1<<l.
		vbx_set_vl( 1<<l );
		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l+1), sizeof(vbx_half_t)*(1<<l+1) );
		vbx_2D( VVH, VADD, v_fr, v_fr2, v_fr2 + (1<<l) );
		vbx_2D( VVH, VADD, v_fi, v_fi2, v_fi2 + (1<<l) );
		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l+1), sizeof(vbx_half_t)*(1<<l+1), sizeof(vbx_half_t)*(1<<l+1) );
		vbx_2D( VVH, VSUB, v_fr2, v_fr2, v_fr2 + (1<<l) );
		vbx_2D( VVH, VSUB, v_fi2, v_fi2, v_fi2 + (1<<l) );

		// Twiddle multiply of the difference terms (fixed-point):
		// re = br*wr - bi*wi ; im = bi*wr + br*wi
		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l+1), 0 );
		vbx_2D( VVH, VMULFXP, &v_fr[n>>1], v_fr2, v_twr );
		vbx_2D( VVH, VMULFXP, v_temp, v_fi2, v_twi );
		vbx_set_vl( n>>1 );
		// vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l) );
		vbx( VVH, VSUB, &v_fr[n>>1], &v_fr[n>>1], v_temp );
		vbx_set_vl( 1<<l );
		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l+1), 0 );
		vbx_2D( VVH, VMULFXP, &v_fi[n>>1], v_fi2, v_twr );
		vbx_2D( VVH, VMULFXP, v_temp, v_fr2, v_twi );
		vbx_set_vl( n>>1 );
		//vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l) );
		vbx( VVH, VADD, &v_fi[n>>1], &v_fi[n>>1], v_temp );

		l--;
		mul++;
		flight <<= 1;
		// Compact the twiddle factors (every other entry) for the next stage.
		if( l > swap ) {
			vbx_set_vl( 1<<l );
			vbx( VVWH, VMOV, v_twr, v_twr, 0 );
			vbx( VVWH, VMOV, v_twi, v_twi, 0 );
		}
	}
	if ( !(m%2) ) {
		l++;
		flight >>=1;
	}
/** Transpose an INROWS x INCOLS matrix from main memory to main memory.
 *
 * Chooses a strategy based on size: scalar loop for tiny matrices, a plain
 * copy for 1D matrices, a single in-scratchpad transpose when everything
 * fits, and otherwise a tiled transpose that uses a fast merge transpose for
 * square power-of-two tiles when enough vector lanes are available.
 *
 * @param[out] out     destination matrix (INCOLS x INROWS) in main memory.
 * @param[in]  in      source matrix (INROWS x INCOLS) in main memory.
 * @param[in]  INROWS  source row count.
 * @param[in]  INCOLS  source column count.
 * @returns VBW_SUCCESS, or VBW_ERROR_SP_ALLOC_FAILED if a tile cannot be allocated.
 */
int vbw_mtx_xp_ext(vbx_mm_t *out, vbx_mm_t *in, const int INROWS, const int INCOLS )
{
	typedef vbx_mm_t vbx_sp_t; // local alias: scratchpad elements match the mm type here
	int elements = INROWS * INCOLS;

	// Small matrices: scalar transpose beats vector setup overhead.
	if(elements < SCALAR_THRESHOLD) {
		vbx_sync(); // in case the input is waiting on a DMA transfer
		int i,j;
		for(i = 0; i < INROWS; i++) {
			for(j = 0; j < INCOLS; j++) {
				out[j*INROWS+i] = in[i*INCOLS+j];
			}
		}
		return VBW_SUCCESS;
	}

	vbx_sp_push();
	vbx_sp_t *v_in;
	vbx_sp_t *v_out;
	int tile_height = 0;
	int tile_width = 0;
	int prev_tile_width = 0;
	int tile_y = 0;
	int tile_x = 0;
	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	int SP_WIDTH_B = this_mxp->scratchpad_alignment_bytes;
	int SP_SIZE = vbx_sp_getfree();
	int max_sp_elements = vbx_sp_getfree() / sizeof(vbx_sp_t);
	int max_tile_elements = VBX_PAD_DN( SP_SIZE/2, SP_WIDTH_B ) / sizeof(vbx_sp_t);

	if( INROWS == 1 || INCOLS == 1 ) {
		// 1D transpose becomes a simple copy operation
		if( elements <= max_sp_elements ) {
			// We can use the whole scratchpad for this
			v_in = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
			vbx_dma_to_vector( v_in, in, elements*sizeof(vbx_mm_t) );
			v_out = v_in;
			vbx_dma_to_host( out, v_out, elements*sizeof(vbx_mm_t) );
		} else {
			// To test this, you'll need a very large 1D matrix (or a small SP)
			tile_width = max_sp_elements;
			v_in = (vbx_sp_t*)vbx_sp_malloc( tile_width * sizeof(vbx_sp_t) );
			for (tile_x = 0; tile_x < elements; tile_x += tile_width) {
				if( tile_x + tile_width > elements) tile_width = elements - tile_x;
				vbx_dma_to_vector( v_in, in + tile_x, tile_width*sizeof(vbx_mm_t) );
				v_out = v_in;
				vbx_dma_to_host( out+tile_x, v_out, tile_width*sizeof(vbx_mm_t) );
			}
		}
	} else if( elements < max_tile_elements ) {
		// Matrix is small enough to handle entirely in SP
		v_in = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
		v_out = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
		vbx_dma_to_vector( v_in, in, elements*sizeof(vbx_mm_t) );
		vbw_mtx_xp(v_out,v_in,INROWS,INCOLS);
		vbx_dma_to_host( out, v_out, elements*sizeof(vbx_mm_t) );
	} else {
		// At this point we know at least one full tile will be needed
#define QUICK_A_LANES_THRESHOLD 8 // Use merge transpose if there are at least this many lanes
#define QUICK_A_TILE_WIDTH 128
#define QUICK_A_TILE_ELEMENTS (QUICK_A_TILE_WIDTH*QUICK_A_TILE_WIDTH)
#define QUICK_A_VF_ELEMENTS (QUICK_A_TILE_ELEMENTS/2)
#define QUICK_A_REQ_ELEMENTS (2*VBX_PAD_UP(QUICK_A_TILE_ELEMENTS,SP_WIDTH_B/sizeof(vbx_sp_t)) + VBX_PAD_UP(QUICK_A_VF_ELEMENTS,sizeof(vbx_sp_t)))
#define QUICK_B_LANES_THRESHOLD 16 // Use smaller merge transpose tile only if there are a lot of lanes
#define QUICK_B_TILE_WIDTH 64      // and only if larger tile A size cannot be used.
#define QUICK_B_TILE_ELEMENTS (QUICK_B_TILE_WIDTH*QUICK_B_TILE_WIDTH)
#define QUICK_B_VF_ELEMENTS (QUICK_B_TILE_ELEMENTS/2)
#define QUICK_B_REQ_ELEMENTS (2*VBX_PAD_UP(QUICK_B_TILE_ELEMENTS,SP_WIDTH_B/sizeof(vbx_sp_t)) + VBX_PAD_UP(QUICK_B_VF_ELEMENTS,sizeof(vbx_sp_t)))
		int NUM_LANES = this_mxp->vector_lanes;
		int DMA_BYTES = this_mxp->dma_alignment_bytes;
		int min_tile_dim = DMA_BYTES / sizeof(vbx_sp_t);
		vbx_sp_t *v_out_sel;
		vbx_sp_t *vf = 0; // mask vector; non-NULL selects the merge-transpose path

		if( NUM_LANES >= QUICK_A_LANES_THRESHOLD // Check for appropriate conditions to use merge transpose tiles
		    && INCOLS >= QUICK_A_TILE_WIDTH
		    && INROWS >= QUICK_A_TILE_WIDTH
		    && (unsigned)max_sp_elements >= QUICK_A_REQ_ELEMENTS ) {
			tile_width = tile_height = QUICK_A_TILE_WIDTH;
			vf = (vbx_sp_t *)vbx_sp_malloc( QUICK_A_VF_ELEMENTS * sizeof(vbx_sp_t));
		} else if( NUM_LANES >= QUICK_B_LANES_THRESHOLD
		           && INCOLS >= QUICK_B_TILE_WIDTH
		           && INROWS >= QUICK_B_TILE_WIDTH
		           && (unsigned)max_sp_elements >= QUICK_B_REQ_ELEMENTS ) {
			tile_width = tile_height = QUICK_B_TILE_WIDTH;
			vf = (vbx_sp_t *)vbx_sp_malloc( QUICK_B_VF_ELEMENTS * sizeof(vbx_sp_t));
		} else {
			findTileSize( &tile_height, &tile_width, INROWS, INCOLS, max_tile_elements, min_tile_dim );
		}
		prev_tile_width = tile_width;
		v_in = (vbx_sp_t*)vbx_sp_malloc( tile_height*tile_width * sizeof(vbx_sp_t) );
		v_out = (vbx_sp_t*)vbx_sp_malloc( tile_height*tile_width * sizeof(vbx_sp_t) );
		if( v_out==NULL ) {
			vbx_sp_pop();
			return VBW_ERROR_SP_ALLOC_FAILED;
		}
		vbx_sp_t *v[2] = { v_in, v_out }; // ping-pong buffers for merge transpose

		tile_y = 0; // Reset y position for new col
		while( tile_y < INROWS ) {
			// element-granular 2D/3D transpose setup for the fallback path
			vbx_set_2D( tile_width, tile_height*sizeof(vbx_sp_t), sizeof(vbx_sp_t), sizeof(vbx_sp_t) );
			vbx_set_3D( tile_height, sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t) );
			tile_x = 0; // Reset x position for new row
			while( tile_x < INCOLS ) {
				vbx_dma_to_vector_2D( v_in, in+(tile_y*INCOLS)+tile_x, tile_width*sizeof(vbx_mm_t), tile_height,
				                      tile_width*sizeof(vbx_sp_t), INCOLS*sizeof(vbx_mm_t) );
				v_out_sel = v_out; // select v_out as default vector to DMA to MM

				/* *** merge transpose (matrix must be square and a power of 2 wide) *** */
				if( vf && tile_width == tile_height
				    && (tile_width==QUICK_A_TILE_WIDTH || tile_width==QUICK_B_TILE_WIDTH) ) {
					int src = 0;
					int n;
					for( n=1; n<tile_width; n *= 2 ) { // can't do 1st iteration until entire tile is DMA'd in
						const int nn = 2*n;
						// copy the destination matrix
						vbx_set_vl( tile_width*tile_width );
						// use v_in & v_out as working matrices (clobber v_in)
						vbxx( VMOV, v[!src], v[src]);
						// do the work
						vbx_set_vl( n*tile_width );
						vbxx( VAND, vf, n, (vbx_enum_t*)0 ); // mask for merging: 0101010... then 00110011...
						vbx_set_2D( tile_width/nn, nn*tile_width*sizeof(vbx_sp_t), nn*tile_width*sizeof(vbx_sp_t), 0 );
						vbxx_2D( VCMV_Z, v[!src]+n*tile_width, v[src]+n , vf );
						vbxx_2D( VCMV_Z, v[!src]+n, v[src]+n*tile_width, vf );
						src = !src;
					}
					v_out_sel = v[src]; // depending on the size of the mtx, the final result may be in v_in or v_out
				} else {
					vbx_set_vl( 1 ); // 2D and 3D will be set by the x and y edge conditions, even using merge
					vbxx_3D(VMOV, v_out, v_in );
				}

				vbx_dma_to_host_2D( out+(tile_x*INROWS)+tile_y, v_out_sel, tile_height*sizeof(vbx_mm_t), tile_width,
				                    INROWS*sizeof(vbx_mm_t), tile_height*sizeof(vbx_sp_t) );
				tile_x += tile_width;
				// Set up width for next tile
				if( tile_x + tile_width > INCOLS ) {
					// Temporarily reduce tile width when reaching right edge of matrix
					tile_width = INCOLS - tile_x;
					vbx_set_2D( tile_width, tile_height*sizeof(vbx_sp_t), sizeof(vbx_sp_t), sizeof(vbx_sp_t) );
					vbx_set_3D( tile_height, sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t) );
				}
			}
			tile_y += tile_height;
			// Set up width and height for next row of tiles
			tile_width = prev_tile_width; // Restore original tile width for next row of tiles
			/* *** Permanently reduce tile height when reaching bottom of matrix *** */
			tile_height = ( tile_y + tile_height > INROWS ) ? INROWS - tile_y : tile_height;
		}
	}
	vbx_sp_pop();
	vbx_sync();
	return VBW_SUCCESS;
}
/** Forward DCT of a big tile (num_tile_x x num_tile_y blocks) using the MXP.
 *
 * Performs coeff*image (row pass), shifts, then the transposed pass, shifts,
 * and DMAs the result tile back to main memory. Double-buffers the image in
 * v->vimage[] (v->db selects the active buffer) and interleaves prefetch DMA
 * of the next tile with compute.
 *
 * FIX: the #ifdef DEBUG dump referenced an undeclared identifier `db`
 * (compile error whenever DEBUG is defined); it now prints v->db's state via
 * `!v->db` (the buffer that was just computed, since v->db is flipped above).
 *
 * @param[in,out] v        FDCT state (scratchpad buffers, double-buffer flag).
 * @param[out]    block_v  destination for the DCT result tile in main memory.
 * @param[in]     image    source image in main memory.
 * @param[in]     start_x,start_y  current tile coordinates.
 * @param[in]     end_x,end_y      last tile coordinates (disables prefetch when reached).
 * @param[in]     num_tile_x,num_tile_y  tile grid dimensions.
 */
void vbx_mtx_fdct( vbx_mtx_fdct_t *v, dt *block_v, dt *image,
		int start_x, int start_y, int end_x, int end_y, int num_tile_x, int num_tile_y )
{
//	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
//	const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
	const int BIG_TILE_SIZE = num_tile_x * num_tile_y * DCT_SIZE;

	// Compute coordinates of the next tile to prefetch (row-major walk).
	int next_x=start_x+1;
	int next_y=start_y;
	int get_next=1;
	if( start_x == end_x && start_y == end_y ) {
		get_next=0; // last tile: nothing further to prefetch
	}
	if( start_x == end_x ) {
		next_x = 0;
		next_y++;
	}

	const vbx_half_t *vimageDMA = v->vimage[!v->db]; // dma
//	const vbx_half_t *vblockDMA = v->vblock[!v->db]; // dma // never used directly
	const vbx_half_t *vimageVPU = v->vimage[ v->db]; // active
	const vbx_half_t *vblockVPU = v->vblock[ v->db]; // active
	const vbx_half_t *vblockTMP = v->vblock[ 2 ];    // temp
	const vbx_half_t *vcoeff = v->vcoeff;
	const vbx_half_t *vprods = v->vprods;
	const vbx_half_t *vaccum = v->vaccum;
	const vbx_half_t *vflags = v->vflags;

#if DMA
	// First, prefetch the next chunk of the next image for a future call to fdct_tile()
#if NUM_TILE_Y > 1
	if( get_next ) // get row 0
		getBigTileImageY( vimageDMA, image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, 0 );
#else
	if( get_next ) // get row 0
		getBigTileImage( vimageDMA, image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, 0 );
#endif
#endif

	int r;
	// Row pass: multiply the whole BIG_TILE with row 'r' of the image matrix.
	for( r=0; r < BLOCK_SIZE; r++ ) {
		// perform multiply of the whole BIG_TILE with row 'r' of the image matrix -- before had dct matrix switching
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE );                                                                // for the length of tiled rows
		vbx_set_2D( BLOCK_SIZE, NUM_TILE_X*BLOCK_SIZE*sizeof(dt), 0, NUM_TILE_X*BLOCK_SIZE*sizeof(dt) );      // for all rows of tiled coeffiencents
		vbx_set_3D( NUM_TILE_Y, NUM_TILE_X * DCT_SIZE*sizeof(dt), NUM_TILE_X * DCT_SIZE*sizeof(dt), 0 );      // for all groups Y
		vbx_3D( VVH, VMUL, vprods, vimageVPU + r*NUM_TILE_X*BLOCK_SIZE, vcoeff);                              // for all 'columns' of tiled data
#if ACCUMULATE
		// accumulate the multiply operations
#if 0 & USE_ACCUM_FLAGS
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
		vbx( VVH, VADD, vaccum, vprods+0, vprods+1 );
		vbx_set_2D( BLOCK_SIZE-2, 0, 0, sizeof(dt) );
		vbx_2D( VVH, VADD, vaccum, vaccum, vprods+2 );
		vbx( VVH, VCMV_Z, vblockTMP+r, vaccum, vflags );
#elif BLOCK4 //case DCT 4
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
		vbx( VVH, VADD, vaccum, vprods, vprods+1 );
		vbx( VVH, VADD, vaccum, vaccum, vprods+2 );
		vbx( VVH, VADD, vaccum, vaccum, vprods+3 );
		vbx( VVH, VCMV_Z, vblockTMP+r, vaccum, vflags );
#else //correct?
		vbx_set_vl( BLOCK_SIZE );
		vbx_set_2D( BLOCK_SIZE, NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt) );
		vbx_set_3D( NUM_TILE_X, BLOCK_SIZE*sizeof(dt), BLOCK_SIZE*sizeof(dt), BLOCK_SIZE*sizeof(dt) );
#if NUM_TILE_Y == 1
		vbx_acc_3D( VVH, VOR, vblockTMP + r, vprods , vprods );
#else
		int y;
		for (y=0; y< NUM_TILE_Y; y++){
			vbx_acc_3D( VVH, VOR, vblockTMP + r + y*NUM_TILE_X*DCT_SIZE,
			            vprods+ y*NUM_TILE_X*DCT_SIZE, vprods+ y*NUM_TILE_X*DCT_SIZE );
		}
#endif
#endif
#endif
#if 0 // dont do DMA READS here yet. a DMA WRITE may still be in progress, give it chance to finish
#if DMA
		// every other iteration, prefetch the next row of the next image
		// NB: with 2D DMA, we could issue this as a single DMA request at the top of the file
		// instead, we must intersperse these 1D DMA strips to ensure they don't block the instruction queue
#if NUM_TILE_Y > 1
		if( !(r&1) && get_next )
			getBigTileImageY( vimageDMA, image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, (1+((r-1)>>1)) ); //BLOCK_SIZE/2 rows added
#else
		if( !(r&1) && get_next )
			getBigTileImage( vimageDMA, image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, (1+((r-1)>>1)) ); //BLOCK_SIZE/2 rows added
#endif
#endif
#endif
	}

	vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE );
	vbx( SVH, VSHR, vblockTMP, SHIFT_AMOUNT, vblockTMP );

	// now do the transposed version
	for( r=0; r < BLOCK_SIZE; r++ ) {
		// perform multiply of the whole BIG_TILE with row 'r' of the image matrix -- before had dct matrix switching
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE );                                                                      // for the length of tiled rows
		vbx_set_2D( BLOCK_SIZE, NUM_TILE_X * BLOCK_SIZE*sizeof(dt), NUM_TILE_X * BLOCK_SIZE*sizeof(dt), 0 );        // for all 'columns' of tiled data
		vbx_set_3D( NUM_TILE_Y, NUM_TILE_X * DCT_SIZE*sizeof(dt), NUM_TILE_X * DCT_SIZE*sizeof(dt), 0 );            // for all groups Y
		vbx_3D( VVH, VMUL, vprods, vblockTMP, vcoeff + r*NUM_TILE_X*BLOCK_SIZE);                                    // for all rows of tiled coeffients
#if ACCUMULATE
		// accumulate the multiply operations
#if 0 & USE_ACCUM_FLAGS
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
		vbx( VVH, VADD, vaccum, vprods+0, vprods+1 );
		vbx_set_2D( BLOCK_SIZE-2, 0, 0, sizeof(dt) );
		vbx_2D( VVH, VADD, vaccum, vaccum, vprods+2 );
		vbx( VVH, VCMV_Z, vblockVPU+r, vaccum, vflags );
#elif BLOCK4 //case DCT 4
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
		vbx( VVH, VADD, vaccum, vprods, vprods+1 );
		vbx( VVH, VADD, vaccum, vaccum, vprods+2 );
		vbx( VVH, VADD, vaccum, vaccum, vprods+3 );
		//vbx( VVH, VCMV_Z, vblockVPU+r, vaccum, vflags );
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE - (BLOCK_SIZE-1) );                       // for the length of a tiled row
		vbx_set_2D( BLOCK_SIZE, 1*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), 0);   // for all tiled rows
#if NUM_TILE_Y == 1
		vbx_2D(VVH, VCMV_Z, vblockVPU+r*NUM_TILE_X*BLOCK_SIZE, vaccum, vflags ); //
#else
		int y;
		for (y=0; y< NUM_TILE_Y; y++){
			vbx_2D(VVH, VCMV_Z, vblockVPU+r*NUM_TILE_X*BLOCK_SIZE + y*NUM_TILE_X*DCT_SIZE ,
			       vaccum+y*NUM_TILE_X*DCT_SIZE, vflags ); //
		}
#endif
#else //correct?
		vbx_set_vl( BLOCK_SIZE );                                                                                                   // for the length of a row
		vbx_set_2D( BLOCK_SIZE, sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt) );                  // for all rows in that block
		vbx_set_3D( NUM_TILE_X, BLOCK_SIZE*sizeof(dt), BLOCK_SIZE*sizeof(dt), BLOCK_SIZE*sizeof(dt) );                             // for all tiled blocks horizontally(x)
#if NUM_TILE_Y == 1
		vbx_acc_3D( VVH, VOR, vblockVPU + r*NUM_TILE_X*BLOCK_SIZE , vprods , vprods );
#else
		int y;
		for (y=0; y< NUM_TILE_Y; y++){
			vbx_acc_3D( VVH, VOR, vblockVPU + r*NUM_TILE_X*BLOCK_SIZE + y*NUM_TILE_X*DCT_SIZE,
			            vprods+ y*NUM_TILE_X*DCT_SIZE, vprods+ y*NUM_TILE_X*DCT_SIZE );
		}
#endif
#endif
#endif
#if DMA
		// every other iteration, prefetch the next row of the next image
		// NB: with 2D DMA, we could issue this as a single DMA request at the top of the file
		// instead, we must intersperse these 1D DMA strips to ensure they don't block the instruction queue
#if NUM_TILE_Y > 1
		//if( !(r&1) && r<(BLOCK_SIZE-1) && get_next )
		if( get_next )
			getBigTileImageY( vimageDMA, image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, r ); //(BLOCK_SIZE/2 +1+((r-1)>>1)) ); // BLOCK/2 -1 rows
#else
		//if( !(r&1) && r<(BLOCK_SIZE-1) && get_next )
		if( get_next )
			getBigTileImage( vimageDMA, image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, r ); //(BLOCK_SIZE/2 +1+((r-1)>>1)) ); // BLOCK/2 -1 rows
#endif
#endif
	}

	vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE );
	vbx( SVH, VSHR, vblockVPU, SHIFT_AMOUNT, vblockVPU );

#if DMA2
	// Write result back to memory as one big block
	vbx_dma_to_host( block_v, vblockVPU, BIG_TILE_SIZE*sizeof(dt) );
#endif
	v->db = !v->db; // swap active/DMA buffers for the next call

#ifdef DEBUG
	{
		vbx_sync();
		int i,j;
		printf("%d\n", !v->db); // FIX: was `!db` (undeclared identifier); print the just-computed buffer index
		for(i=0;i<BLOCK_SIZE*NUM_TILE_Y;i++){
			for(j=0;j<BLOCK_SIZE*NUM_TILE_X;j++){
				printf(" %4d", block_v[i*BLOCK_SIZE*NUM_TILE_X+j]);
			}
			printf("\n");
		}
	}
#endif
}
/** Vector motion estimation over a tracked block in the input frame.
 *
 * Extracts luma for a search window around the previous block position,
 * computes SADs against the stored reference block for every offset in the
 * window, picks the minimum (ties broken by smallest displacement from the
 * window center), draws the motion vector, and updates the reference block.
 *
 * @param[in,out] input_buffer  frame pixels; motion vector/marker drawn into it.
 * @param[in,out] last_luma     reference block luma in main memory; re-initialized
 *                              when NULL or when reset is nonzero, updated at the end.
 * @param[in,out] motest_x,motest_y  tracked block position, updated to the best match.
 * @param[in]     start_x,start_y    initial position used by init_vector_motest.
 * @param[in]     reset         nonzero to force re-initialization.
 * @param[in]     image_width,image_height,image_pitch  frame geometry.
 * @returns 0 on success, -1 if aborted by a touchscreen pen event.
 */
int vector_motest(pixel *input_buffer, luma_type **last_luma, int *motest_x, int *motest_y,
		int start_x, int start_y, int reset,
		const int image_width, const int image_height, const int image_pitch)
{
	int y, x, starty, startx;
	unsigned int sad, sad_min, y_min, x_min;
	vbx_uhalf_t *v_search_luma, *v_last_luma;
	vbx_uhalf_t *v_row_temp;
	vbx_uword_t *v_row;
	vbx_uword_t *v_sad;
	pixel color;

	if(*last_luma == NULL || reset){
		init_vector_motest(input_buffer, last_luma, motest_x, motest_y, start_x, start_y, image_pitch);
	}

	v_search_luma = vbx_sp_malloc( MOTEST_BUFFER_SIZE * sizeof(vbx_uhalf_t) );
	v_last_luma = vbx_sp_malloc( MOTEST_BLOCK_SIZE * sizeof(vbx_uhalf_t) );
	v_row_temp = vbx_sp_malloc( MOTEST_BUFFER_WIDTH * sizeof(vbx_uhalf_t) );
	v_row = vbx_sp_malloc( MOTEST_BUFFER_WIDTH * sizeof(vbx_uword_t) );
	v_sad = vbx_sp_malloc( MOTEST_SEARCH_SIZE * sizeof(vbx_uword_t) );
	if(v_sad == NULL){
		// NOTE(review): deliberate hang on allocation failure (embedded demo style).
		printf("Not enough scratchpad for motest\n");
		while(1);
	}

	// Clamp the search window so it stays inside the frame.
	startx = *motest_x-(MOTEST_SEARCH_WIDTH/2);
	starty = *motest_y-(MOTEST_SEARCH_HEIGHT/2);
	if(startx < 0){
		startx = 0;
	}
	if(startx > image_width-MOTEST_BUFFER_WIDTH){
		startx = image_width-MOTEST_BUFFER_WIDTH;
	}
	if(starty < 0){
		starty = 0;
	}
	if(starty > image_height-MOTEST_BUFFER_HEIGHT){
		starty = image_height-MOTEST_BUFFER_HEIGHT;
	}

	vector_rectangle_to_luma(input_buffer, v_search_luma, v_row_temp, v_row, startx, starty,
	                         MOTEST_BUFFER_WIDTH, MOTEST_BUFFER_HEIGHT, image_pitch);
	vbx_dma_to_vector(v_last_luma, *last_luma, MOTEST_BLOCK_SIZE*sizeof(vbx_uhalf_t));

	//Vector compute sad here
	vbx_set_2D(MOTEST_BLOCK_HEIGHT, sizeof(vbx_uword_t), MOTEST_BUFFER_WIDTH*sizeof(vbx_uhalf_t), MOTEST_BLOCK_WIDTH*sizeof(vbx_uhalf_t));
	for(y = 0; y < MOTEST_SEARCH_HEIGHT; y++){
		for(x = 0; x < MOTEST_SEARCH_WIDTH; x++){
			// Per-row SADs for this offset, then reduce by adding the two
			// halves of the row-SAD column.
			vbx_set_vl(MOTEST_BLOCK_WIDTH);
			vbx_acc_2D(VVHWU, VABSDIFF, v_row, v_search_luma+(y*MOTEST_BUFFER_WIDTH)+x, v_last_luma);
			vbx_set_vl(MOTEST_BLOCK_HEIGHT/2);
			vbx_acc(VVWU, VADD, v_sad+(y*MOTEST_SEARCH_WIDTH)+x, v_row, v_row+MOTEST_BLOCK_HEIGHT/2);
		}
#if TOUCHSCREEN
#ifdef TOUCH_INTERRUPTS_VBX
		// Abort early if the user touches the screen.
		if (touchscreen_get_pen(pTouch)) {
			vbx_sp_free();
			return -1;
		}
#endif
#endif
	}
	vbx_sync();

	// Scalar scan for the minimum SAD; ties prefer the smallest displacement
	// from the center of the search window.
	sad_min = INT_MAX;
	y_min = *motest_y;
	x_min = *motest_x;
	for(y = 0; y < MOTEST_SEARCH_HEIGHT; y++){
		for(x = 0; x < MOTEST_SEARCH_WIDTH; x++){
			sad = v_sad[y*MOTEST_SEARCH_WIDTH+x];
			if(sad < sad_min){
				sad_min = sad;
				x_min = x+startx;
				y_min = y+starty;
			} else if(sad == sad_min) {
				if( (abs( x - MOTEST_SEARCH_WIDTH/2) + abs( y - MOTEST_SEARCH_HEIGHT/2)) <
				    (abs((x_min-startx) - MOTEST_SEARCH_WIDTH/2) + abs((y_min-starty) - MOTEST_SEARCH_HEIGHT/2))) {
					x_min = x+startx;
					y_min = y+starty;
				}
			}
		}
	}

	// Draw the motion vector from the old position to the best match.
	color.r = 0;
	color.g = 255;
	color.b = 0;
	color.a = 0;
	scalar_draw_line(*motest_x+(MOTEST_BLOCK_WIDTH/2), *motest_y+(MOTEST_BLOCK_HEIGHT/2),
	                 x_min+(MOTEST_BLOCK_WIDTH/2), y_min+(MOTEST_BLOCK_HEIGHT/2),
	                 color, input_buffer, image_pitch);
	*motest_y = y_min;
	*motest_x = x_min;

	// Update the reference block from the matched window position.
	vbx_set_vl(MOTEST_BLOCK_WIDTH);
	for(y = 0; y < MOTEST_BLOCK_HEIGHT; y++){
		vbx(VVHU, VMOV, v_last_luma+(y*MOTEST_BLOCK_WIDTH),
		    v_search_luma+((y+y_min-starty)*MOTEST_BUFFER_WIDTH)+(x_min-startx), 0);
	}
	vbx_dma_to_host(*last_luma, v_last_luma, MOTEST_BLOCK_SIZE*sizeof(luma_type));

	draw_motest(input_buffer, *motest_x, *motest_y, image_pitch);
	//simple hack to draw thicker
	draw_motest(input_buffer, *motest_x+1, *motest_y+1, image_pitch);

	vbx_sp_free();
	return 0;
}