/// Apply a [1 2 1] low-pass filter to a raw input row.
/// NB: the last two output pixels are not meaningful.
inline static void vbw_sobel_3x3_row(vbx_uhalf_t *lpf, vbx_uhalf_t *raw, const short image_width)
{
	vbx_set_vl(image_width-1);
	vbx(VVHU, VADD, lpf, raw, raw+1);   // lpf[i] = raw[i] + raw[i+1]
	vbx_set_vl(image_width-2);
	vbx(VVHU, VADD, lpf, lpf, lpf+1);   // lpf[i] = raw[i] + 2*raw[i+1] + raw[i+2]
}
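/* Scalar model of the helper above (hypothetical name, illustrative only, not
 * part of the vbware API): only the first image_width-2 outputs are valid,
 * matching the NB above. */
static void scalar_sobel_3x3_row(unsigned short *lpf, const unsigned short *raw, const short image_width)
{
	int i;
	for (i = 0; i < image_width-2; i++) {
		lpf[i] = raw[i] + 2*raw[i+1] + raw[i+2]; // [1 2 1] low-pass
	}
}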
int compare_vbx_lbp_ci_to_scalar_patterns(unsigned short* img, int width, int height, int max_print_errors)
{
	int j, errors = 0;
	unsigned char** scalar_patterns = test_scalar_patterns(img, 0, width, height);

	vbx_uhalf_t* v_in  = (vbx_uhalf_t*)vbx_sp_malloc(3*width*sizeof(vbx_uhalf_t));
	vbx_uhalf_t* v_top = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
	vbx_uhalf_t* v_bot = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
	vbx_ubyte_t* v_lbp = (vbx_ubyte_t*)v_bot;
	unsigned char* lbp = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));

	vbx_set_vl(width);
	for(j=0; j < height - 2; j++){
		vbx_dma_to_vector(v_in, img+j*width, 3*width*sizeof(unsigned short));
		vbx(VVHU, VCUSTOM1, v_top, v_in, v_in+width);
		vbx(VVHU, VCUSTOM1, v_bot, v_in+width, v_in+2*width);
		vbx(SVHBU, VAND, (vbx_ubyte_t*)v_top, 0xf0, v_top);
		vbx(SVHBU, VAND, (vbx_ubyte_t*)v_bot, 0x0f, v_bot);
		vbx(VVBU, VADD, v_lbp, (vbx_ubyte_t*)v_bot, (vbx_ubyte_t*)v_top);
		vbx_dma_to_host(lbp, v_lbp, width*sizeof(unsigned char));
		vbx_sync();
		errors += match_array_byte(lbp, scalar_patterns[0]+j*width, "custom_lbp", width-2, 1, max_print_errors, 1, j);
	}
	vbx_sp_free();
	vbx_shared_free(lbp);
	return errors;
}
void vector_rectangle_to_luma( pixel *input_buffer, vbx_uhalf_t *v_luma_buffer, vbx_uhalf_t *v_row_temp, vbx_uword_t *v_row, int startx, int starty, int width, int height, const int image_pitch ) { int y; vbx_uhalf_t *v_luma; vbx_set_vl(width); for(y = 0; y < height; y++){ v_luma = v_luma_buffer+(y*width); vbx_dma_to_vector(v_row, input_buffer+((y+starty)*image_pitch)+startx, width*sizeof(vbx_uword_t)); //Move the b component into v_luma vbx(SVWHU, VAND, v_luma, 0xFF, v_row); vbx(SVHU, VMUL, v_luma, 25, v_luma); //Move g into v_row_temp and add it to v_luma vbx(SVWHU, VAND, v_row_temp, 0xFF, (vbx_uword_t*)(((vbx_ubyte_t*)v_row)+1)); vbx(SVHU, VMUL, v_row_temp, 129, v_row_temp); vbx(VVHU, VADD, v_luma, v_luma, v_row_temp); //Move r into v_row_temp and add it to v_luma vbx(SVWHU, VAND, v_row_temp, 0xFF, (vbx_uword_t*)(((vbx_ubyte_t*)v_row)+2)); vbx(SVHU, VMUL, v_row_temp, 66, v_row_temp); vbx(VVHU, VADD, v_luma, v_luma, v_row_temp); //divide by 2^8 vbx(SVHU, VSHR, v_luma, 8, v_luma); } }
int compare_vbx_lut_to_vbx_lut_ci(int sz, int max_print_errors)
{
	int f, n, errors = 0;
	vbx_byte_t*  v_pass    = (vbx_byte_t*) vbx_sp_malloc(sz*sizeof(vbx_byte_t));
	vbx_ubyte_t* v_pattern = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_ubyte_t));
	vbx_ubyte_t* v_lutc    = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_ubyte_t));
	vbx_ubyte_t* v_group   = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_ubyte_t));
	vbx_ubyte_t* v_sel     = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_ubyte_t));
	vbx_ubyte_t* v_lut     = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t));
	vbx_ubyte_t* v_idx     = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t));
	unsigned char* lut   = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char));
	unsigned char* lut_c = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char));

	for (n = 0; n < sz; n++) {
		v_pattern[n] = n & 0xff;
	}

	int s, stage = 11;
	for (f = 0; f < face_lbp[stage].count; f++) {
		lbp_feat_t feat = face_lbp[stage].feats[f];
		vbx_set_vl(sz);
		int total = f;
		s = 0;
		while (s < stage) {
			total += face_lbp[s].count;
			s++;
		}
		vbx(SVBU, VCUSTOM0, v_lutc, total, v_pattern);

		vbx(SVB, VMOV, v_pass, feat.fail, 0);
		/* check if pattern is in lut */
		vbx(SVBU, VSHR, v_group, 5, v_pattern);
		for (n = 0; n < 8; n++) {
			vbx(SVB, VADD, v_sel, -n, v_group);
			vbx(SVBW, VCMV_Z, v_lut, feat.lut[n], v_sel);
		}
		vbx(SVBWU, VAND, v_idx, 0x1f, v_pattern);
		vbx(VVWB, VSHR, v_lut, v_idx, v_lut);
		vbx(SVB, VAND, v_lut, 1, v_lut);
		vbx(SVB, VCMV_LEZ, v_pass, feat.pass, v_lut);

		vbx_dma_to_host(lut_c, v_lutc, sz*sizeof(unsigned char));
		vbx_dma_to_host(lut, v_pass, sz*sizeof(unsigned char));
		vbx_sync();
		errors += match_array_byte(lut_c, lut, "custom_lut", sz, 1, max_print_errors, 0, 0);
	}
	vbx_sp_free();
	vbx_shared_free(lut);
	vbx_shared_free(lut_c);
	return errors;
}
int main(void)
{
	vbx_test_init();
	vbx_mxp_print_params();
	int errors = 0;
	unsigned instr_cycles, instr_count, dma_cycles, dma_count;
	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	int lanes = this_mxp->vector_lanes;
	int dma_width = this_mxp->dma_alignment_bytes/4;
	debug(lanes);
	debug(dma_width);

	vbx_set_vl(-1);
	VBX_COUNTER_RESET();
	vbx(SVW, VMOV, 0, 0, 0);
	vbx_sync();
	if(VBX_SIMULATOR)
		printf("simulator\n");
	else
		printf("not simulator\n");

	instr_cycles = VBX_GET_WRITEBACK_CYCLES();
	dma_cycles = VBX_GET_DMA_CYCLES();
	dma_count = VBX_GET_DMAS();
	instr_count = VBX_GET_INSTRUCTIONS();
	debug(instr_cycles);
	debug(dma_cycles);
	debug(dma_count);
	debug(instr_count);

	VBX_TEST_END(errors);
	return 0;
}
int deep_vector_copy_test()
{
	int retval;
	int num_test;
	int total_errors = 0;
	const int NUM_TESTS = TEST_DEEP_SP_NUM_TESTS;
	const int NB = vbx_sp_getfree();
	int NT = NB / sizeof(vbx_sp_t);

	vbx_sp_push();
	vbx_sp_t *v = vbx_sp_malloc( NB );

	srand( 0x1a84c92a );

	for( num_test=0; num_test < NUM_TESTS; num_test++ ) {
		// initialize entire available scratchpad
		vbx_set_vl( NT );
		vbx( SE(T), VAND, v, MSK, 0 );

		// choose random src/dest/length:
		//   -- randomly pick the dest
		//   -- set a window size of 2*K around the dest
		//   -- randomly pick the src within the window
		//   -- randomly pick the length, subject to end-of-scratchpad
		//   -- this 'window' rule increases the probability of overlaps
		//   -- rough distribution: 30% short (pipeline) overlaps, 20% long overlaps, 50% no overlap
		int K, N1, N2, NN;
		N1 = rand() % NT;
		K  = 1 + rand() % ((N1 > 0) ? min(min(N1, NT-N1), 1024) : min(NT, 1024));
		N2 = N1 - K + rand() % (2*K);
		NN = rand() % (NT - max(N1,N2));
		vbx_sp_t *dst = v + N1;
		vbx_sp_t *src = v + N2;
		printf("test:%d dst:0x%08x src:0x%08x len:%08d", num_test, N1, N2, NN );

		// do the copy
		retval = VBX_T(vbw_vec_copy)( dst, src, NN );
		vbx_sync();
		printf(" retval:0x%04x\n", retval);

		// ensure the copy was done properly
		int errors = verify_copy((vbx_mm_t *)v, 0,     N1,    0,       "head")
		           + verify_copy((vbx_mm_t *)v, N1,    NN+N1, (N2-N1), "copy")
		           + verify_copy((vbx_mm_t *)v, NN+N1, NT,    0,       "tail");
		total_errors += errors;
		if( errors ) {
			//break;
		}
	}

	vbx_sp_pop();
	return total_errors;
}
vbx_mtx_fdct_t *vbx_mtx_fdct_init( dt *coeff_v, dt *image )
{
	const int BIG_TILE_SIZE = NUM_TILE_X * NUM_TILE_Y * DCT_SIZE;
	const int num_bytes = BIG_TILE_SIZE * sizeof(dt);
	const int co_bytes  = NUM_TILE_X * DCT_SIZE * sizeof(dt);

	// compute the coefficient matrix in double precision, then truncate to dt
	int i, j;
	double s;
	for (i = 0; i < BLOCK_SIZE; i++) {
		s = (i == 0) ? sqrt(0.125) : 0.5;
		for (j = 0; j < BLOCK_SIZE; j++) {
			c2[i][j] = s * cos((double) ((PI / 8.0) * i * (j + 0.5)));
			cs[i][j] = (dt) (c2[i][j] * SHIFT_DOUBLE + 0.499999);
		}
	}

	vbx_sp_push();

	vbx_mtx_fdct_t *v = vbx_shared_malloc( sizeof(vbx_mtx_fdct_t) );

	v->vcoeff = (vbx_half_t *)vbx_sp_malloc( co_bytes );
	v->vprods = (vbx_half_t *)vbx_sp_malloc( num_bytes );
#if USE_ACCUM_FLAGS
	v->vaccum = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vflags = (vbx_half_t *)vbx_sp_malloc( num_bytes );
#endif
	// interleave ordering to ensure no false hazards
	v->vblock[2] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vimage[0] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vblock[0] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vimage[1] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vblock[1] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	if( !v->vblock[1] ) {
		VBX_PRINTF( "ERROR: out of memory.\n" );
		VBX_EXIT(-1);
	}

	vbx_dma_to_vector( v->vcoeff, coeff_v, co_bytes );

	int row;
	for( row=0; row < BLOCK_SIZE; row++ ) {
		getBigTileImageY(v->vimage[v->db], image, row);
	}

#if USE_ACCUM_FLAGS
	// create a flag vector: first element 0, next 'BLOCK_SIZE-1' elements non-zero, etc.
	vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
	vbx( SEH, VAND, v->vflags, BLOCK_SIZE-1, 0 );
#endif
	return v;
}
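/* For reference, the coefficient formula implemented above is the standard
 * 8-point DCT-II basis (assuming BLOCK_SIZE == 8; illustrative note only):
 *
 *     c2[i][j] = s_i * cos( (PI/8) * i * (j + 0.5) )
 *
 * with s_0 = sqrt(1/8) and s_i = 1/2 for i > 0; cs[][] is simply c2[][]
 * scaled by SHIFT_DOUBLE and rounded to the fixed-point type dt.
 */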
/// Convert a row of aRGB pixels into luma values /// v_luma should not equal v_row_in /// Trashes v_temp static void vbw_rgb2luma(vbx_uhalf_t *v_luma, vbx_uword_t *v_row_in, vbx_uhalf_t *v_temp, const int image_width) { vbx_set_vl(image_width); // Move weighted B into v_luma vbx(SVWHU, VAND, v_temp, 0xFF, v_row_in); vbx(SVHU, VMUL, v_luma, 25, v_temp); // Move weighted G into v_temp and add it to v_luma vbx(SVWHU, VAND, v_temp, 0xFF, (vbx_uword_t*)(((vbx_ubyte_t *)v_row_in)+1)); vbx(SVHU, VMUL, v_temp, 129, v_temp); vbx(VVHU, VADD, v_luma, v_luma, v_temp); // Move weighted R into v_temp and add it to v_luma vbx(SVWHU, VAND, v_temp, 0xFF, (vbx_uword_t*)(((vbx_ubyte_t *)v_row_in)+2)); vbx(SVHU, VMUL, v_temp, 66, v_temp); vbx(VVHU, VADD, v_luma, v_luma, v_temp); vbx(SVHU, VADD, v_luma, 128, v_luma); // for rounding vbx(SVHU, VSHR, v_luma, 8, v_luma); }
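/* Scalar equivalent of the conversion above (hypothetical helper, for
 * illustration only): the 25/129/66 weights are the usual BT.601-style
 * integer luma approximation, with +128 for rounding before the >>8. */
static unsigned short scalar_rgb2luma(unsigned int argb)
{
	unsigned int b = argb & 0xFF;          // low byte
	unsigned int g = (argb >> 8) & 0xFF;
	unsigned int r = (argb >> 16) & 0xFF;
	return (unsigned short)((66*r + 129*g + 25*b + 128) >> 8);
}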
static int isAbsOutOfRangeV( vptr_half v_src_r, vptr_half v_src_i, vptr_half v_temp, int n ) { //used for inverse only vbx_set_vl(n); vbx(SVH, VABSDIFF, v_temp, 0, v_src_r ); // get abs value of real vbx(SVH, VSUB, v_temp, 16383, v_temp ); // if (16383 - v_src) < 0, needs scaling vbx_acc(SVH, VCMV_LTZ, v_temp, 1, v_temp ); // accum # of neg values to see if scaling required vbx_sync(); if( v_temp[0] ){ return 1; } vbx(SVH, VABSDIFF, v_temp, 0, v_src_i ); // get abs value of imag vbx(SVH, VSUB, v_temp, 16383, v_temp ); // if (16383 - v_src) < 0, needs scaling vbx_acc(SVH, VCMV_LTZ, v_temp, 1, v_temp ); // accum # of neg values to see if scaling required vbx_sync(); if( v_temp[0] ){ return 1; } return 0; }
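/* Scalar model of the check above (illustrative only): report whether any
 * real or imaginary element has magnitude greater than 16383, i.e. whether
 * the inverse transform needs scaling. Assumes abs() from <stdlib.h>. */
static int scalar_isAbsOutOfRange(const short *re, const short *im, int n)
{
	int i;
	for (i = 0; i < n; i++) {
		if (abs(re[i]) > 16383 || abs(im[i]) > 16383) {
			return 1;
		}
	}
	return 0;
}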
int compare_vbx_lbp_ci_to_scalar_patterns(unsigned short* img, int log, int width, int height, int max_print_errors)
{
	int j, l, cell, max_cell, errors = 0;
	unsigned char** scalar_patterns = test_scalar_patterns(img, log, width, height);
	max_cell = 1<<log;

	vbx_uhalf_t* v_in  = (vbx_uhalf_t*)vbx_sp_malloc((1+2*max_cell)*width*sizeof(vbx_uhalf_t));
	vbx_uhalf_t* v_top = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
	vbx_uhalf_t* v_bot = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
	vbx_ubyte_t* v_lbp = (vbx_ubyte_t*)v_bot;
	unsigned char* lbp = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));

	vbx_set_vl(width);
	for(l = 0; l < 1; l++){
		cell = 1<<l;
		for(j=0; j < height - 2*cell; j++){
			vbx_dma_to_vector(v_in, img+j*width, (1+2*cell)*width*sizeof(unsigned short));
			vbx(VVHU, VCUSTOM1, v_top, v_in, v_in+(1*cell)*width);
			vbx(VVHU, VCUSTOM1, v_bot, v_in+(1*cell)*width, v_in+(2*cell)*width);
			vbx(SVHBU, VAND, (vbx_ubyte_t*)v_top, 0xf0, v_top);
			vbx(SVHBU, VAND, (vbx_ubyte_t*)v_bot, 0x0f, v_bot);
			vbx(VVBU, VADD, v_lbp, (vbx_ubyte_t*)v_bot, (vbx_ubyte_t*)v_top);
			vbx_dma_to_host(lbp, v_lbp, width*sizeof(unsigned char));
			vbx_sync();
			errors += match_array_byte(lbp, scalar_patterns[l]+j*width, "custom_lbp", width-2*cell, 1, 0, max_print_errors, 1, j);
			if (errors > max_print_errors){
				max_print_errors = 0;
			}
		}
	}
	vbx_sp_free();
	vbx_shared_free(lbp);
	return errors;
}
int vbw_sobel_argb32_3x3(unsigned *output, unsigned *input, const short image_width, const short image_height, const short image_pitch, const short renorm)
{
	size_t free_sp = vbx_sp_getfree();
	size_t vectors_needed = 8;
	size_t partial_width = free_sp/(vectors_needed*sizeof(vbx_uword_t));
	if(partial_width > image_width){
		// enough scratchpad to process an entire row at a time
		vbw_sobel_argb32_3x3_partial(output, input, image_width, image_height, image_pitch, renorm);
	}else{
		// can't process an entire row at a time, so do partial_width columns at a time
		size_t partial_step = partial_width-2;
		int i;
		for(i=0;;i+=partial_step){
			// account for the last tile being smaller
			if(i+partial_width > image_width){
				partial_width = image_width-i;
			}
			vbw_sobel_argb32_3x3_partial(output+i, input+i, partial_width, image_height, image_pitch, renorm);
			if(i+partial_width == image_width){
				// that was the last tile; the exit test must follow the width
				// adjustment above, so it cannot live in the for statement
				break;
			}
		}
	}
	// zero the left and right output columns
	vbx_sp_push();
	vbx_word_t* side = vbx_sp_malloc(sizeof(vbx_word_t));
	vbx_set_vl(1);
	vbx(SVW, VMOV, side, 0, 0);
	vbx_dma_to_host_2D(output,                          /*host_ptr*/
	                   side,                            /*sp_ptr*/
	                   sizeof(vbx_word_t),              /*row len*/
	                   image_height,                    /*num rows*/
	                   image_pitch*sizeof(vbx_word_t),  /*host incr*/
	                   0);                              /*sp incr*/
	vbx_dma_to_host_2D(output+image_width-1,            /*host_ptr*/
	                   side,                            /*sp_ptr*/
	                   sizeof(vbx_word_t),              /*row len*/
	                   image_height,                    /*num rows*/
	                   image_pitch*sizeof(vbx_word_t),  /*host incr*/
	                   0);                              /*sp incr*/
	vbx_sp_pop();
	vbx_sync();
	return VBW_SUCCESS;
}
/** Internal helper function to reverse and optionally rotate a vector of words *in the scratchpad*. * This function uses a merge reverse algorithm that is faster on large vectors. * @pre v_src contains the elements to reverse. * @pre v_src, v_scratch0, and v_scratch1 must all be the same length. * @pre v_scratch1 and v_src must not overlap. * @pre v_src *may* overlap v_scratch0 (will clobber v_src). * @pre MXP must be 2 lanes or more. * @pre N is a multiple of SP_WIDTH_B. * @pre NUM_ROWS == N*4 / SP_WIDTH_B. * @pre v_mask must be SP_WIDTH_B bytes long. * @post v_scratch0 and v_scratch1 contents are modified, with one containing the result. * @post v_src clobbered only if v_src overlaps v_scratch0. * * @param[in] v_scratch1 *in scratch*. * @param[in] v_src *in scratch*. * @param[in] N is the number of words to reverse. * @param[in] v_scratch0 *in scratch*. * @param[in] v_mask *in scratch*. * @param[in] SP_WIDTH_B typically the scratchpad width in bytes, it is the length of the data to be worked on at a time. * @param[in] NUM_ROWS is the number of rows of length SP_WIDTH_B bytes. * @param[in] rot16 TRUE to swap upper and lower half-words of each word in result. * @returns the scratchpad address where the result resides. This will be equal to either v_scratch0 or v_scratch1, * and will depend on log2(MXP vector lanes). */ static vbx_word_t *vec_rev_merge_w( vbx_word_t *v_scratch1, vbx_word_t *v_src, const unsigned int N, vbx_word_t *v_scratch0, vbx_word_t *v_mask, const unsigned int SP_WIDTH_B, const unsigned int NUM_ROWS, const unsigned int rot16 ) { #if !VBX_SKIP_ALL_CHECKS if( !N || !v_scratch0 || !v_src || !v_scratch1 || !v_mask || SP_WIDTH_B < 8) { VBX_PRINTF("Helper function vec_rev_merge_w: null pointer or row length (vector lanes) too short."); VBX_EXIT(-1); } #endif vbx_word_t *v_scratch[2] = { v_scratch0, v_scratch1 }; unsigned int W = SP_WIDTH_B/4/2; // half the number of words in a row unsigned int sel = 1; if( rot16 ) { vbx_set_vl( W ); vbx_set_2D( NUM_ROWS, -SP_WIDTH_B, 0, SP_WIDTH_B ); vbx_2D( SVWU, VROTL, (vbx_uword_t *)(v_scratch[sel]+N-W), 16, (vbx_uword_t *)v_src ); vbx_2D( SVWU, VROTL, (vbx_uword_t *)(v_scratch[sel]+N-(W*2)), 16, (vbx_uword_t *)(v_src+W) ); } else { vbx_set_vl( W ); vbx_set_2D( NUM_ROWS, -SP_WIDTH_B, SP_WIDTH_B, 0 ); vbx_2D( VVW, VMOV, v_scratch[sel]+N-W, v_src, 0 ); vbx_2D( VVW, VMOV, v_scratch[sel]+N-(W*2), v_src+W, 0 ); } vbx_set_vl( SP_WIDTH_B/4 ); vbx_set_2D( NUM_ROWS, SP_WIDTH_B, SP_WIDTH_B, 0 ); while( W > 1 ) { // set up odd/even mask register W /= 2; vbx( SEW, VAND, v_mask, W, 0 ); vbx_2D( VVW, VCMV_NZ, v_scratch[!sel], v_scratch[sel]-W, v_mask ); vbx_2D( VVW, VCMV_Z , v_scratch[!sel], v_scratch[sel]+W, v_mask ); sel = !sel; } return v_scratch[sel]; }
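/* Behavioral model of vec_rev_merge_w in scalar C (illustrative only, not
 * part of the vbware API): the result is the input words in reverse order,
 * with the upper and lower half-words of each word swapped when rot16 is
 * true, exactly what the VROTL-by-16 path above produces. */
static void scalar_rev_merge_w(unsigned int *dst, const unsigned int *src, unsigned int N, int rot16)
{
	unsigned int i, w;
	for (i = 0; i < N; i++) {
		w = src[N-1-i];
		dst[i] = rot16 ? ((w << 16) | (w >> 16)) : w;
	}
}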
int test_lbp_ci(unsigned short* img, int width, int height) { vbx_uhalf_t* v_a1 = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t)); vbx_uhalf_t* v_b1 = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t)); vbx_uhalf_t* v_1h = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t)); vbx_uhalf_t* v_a2 = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t)); vbx_uhalf_t* v_b2 = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t)); vbx_uhalf_t* v_2h = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t)); vbx_uhalf_t* v_a4 = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t)); vbx_uhalf_t* v_b4 = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t)); vbx_uhalf_t* v_4h = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t)); vbx_ubyte_t* v_1b = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t)); vbx_ubyte_t* v_2b = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t)); vbx_ubyte_t* v_4b = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t)); unsigned short* lbp1h = (unsigned short*)vbx_shared_malloc(width*sizeof(unsigned short)); unsigned short* lbp2h = (unsigned short*)vbx_shared_malloc(width*sizeof(unsigned short)); unsigned short* lbp4h = (unsigned short*)vbx_shared_malloc(width*sizeof(unsigned short)); unsigned char* lbp1b = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char)); unsigned char* lbp2b = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char)); unsigned char* lbp4b = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char)); img = img + width; vbx_dma_to_vector(v_a1, img, width*sizeof(unsigned short)); vbx_dma_to_vector(v_b1, img + width, width*sizeof(unsigned short)); vbx_dma_to_vector(v_a2, img, width*sizeof(unsigned short)); vbx_dma_to_vector(v_b2, img + width, width*sizeof(unsigned short)); vbx_dma_to_vector(v_a4, img, width*sizeof(unsigned short)); vbx_dma_to_vector(v_b4, img + width, width*sizeof(unsigned short)); vbx_sync(); int i; int m = 48; for(i=0; i<m; i++){ v_a1[i] = 0; v_b1[i] = 0; v_a2[i] = 0; v_b2[i] = 0; v_a4[i] = 0; v_b4[i] = 0; } int n = 12; int src_a1[] = {0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int src_b1[] = {0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int src_a2[] = {0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int src_b2[] = {0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int src_a4[] = {0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0}; int src_b4[] = {0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0}; for(i=0; i<16; i++){ v_a1[i] = src_a1[i]; v_b1[i] = src_b1[i]; v_a2[i] = src_a2[i]; v_b2[i] = src_b2[i]; v_a4[i] = src_a4[i]; v_b4[i] = src_b4[i]; } vbx_set_vl(width); vbx(VVHU, VCUSTOM1, v_1h, v_a1, v_b1); vbx(VVHU, VCUSTOM2, v_2h, v_a2, v_b2); vbx(VVHU, VCUSTOM3, v_4h, v_a4, v_b4); vbx(VVHB, VADD, v_1b, v_1h, ((vbx_byte_t*)v_1h) + 1); vbx(VVHB, VADD, v_2b, v_2h, ((vbx_byte_t*)v_2h) + 1); vbx(VVHB, VADD, v_4b, v_4h, ((vbx_byte_t*)v_4h) + 1); vbx_dma_to_host(lbp1h, v_1h, width*sizeof(unsigned short)); vbx_dma_to_host(lbp2h, v_2h, width*sizeof(unsigned short)); vbx_dma_to_host(lbp4h, v_4h, width*sizeof(unsigned short)); vbx_dma_to_host(lbp1b, v_1b, width*sizeof(unsigned char)); vbx_dma_to_host(lbp2b, v_2b, width*sizeof(unsigned char)); vbx_dma_to_host(lbp4b, v_4b, width*sizeof(unsigned char)); vbx_sync(); test_print_array_half(v_a1, n); test_print_array_half(v_b1, n); test_print_hex_array_half(lbp1h, n); test_print_hex_array_byte(lbp1b, n); test_print_array_half(v_a2, n); test_print_array_half(v_b2, n); test_print_hex_array_half(lbp2h, n); test_print_hex_array_byte(lbp2b, n); 
test_print_array_half(v_a4, n); test_print_array_half(v_b4, n); test_print_hex_array_half(lbp4h, n); test_print_hex_array_byte(lbp4b, n); vbx_sp_free(); vbx_shared_free(lbp1h); vbx_shared_free(lbp2h); vbx_shared_free(lbp4h); vbx_shared_free(lbp1b); vbx_shared_free(lbp2b); vbx_shared_free(lbp4b); return 0; }
//FIXME stride for match not implemented int compare_LBPPassStage_to_restricted(unsigned short *vbx_img, int log, lbp_stage_t lbp_stage, int window, int width, int height, int max_print_errors) { int l, i, j, cell, errors = 0; unsigned char** scalar_patterns = test_scalar_patterns(vbx_img, log, width, height); unsigned char *pass, *vbx_pass; pass = (unsigned char*)vbx_shared_malloc(width*height*sizeof(unsigned char)); vbx_pass = (unsigned char*)vbx_shared_malloc(width*height*sizeof(unsigned char)); vbx_byte_t** v_lbp =(vbx_byte_t**)vbx_shared_malloc((log+1)*sizeof(vbx_byte_t*)); for (l=0; l<log+1; l++) { v_lbp[l] = (vbx_byte_t*)vbx_sp_malloc((window+1)*width*sizeof(vbx_byte_t)); } vbx_byte_t* v_lut = (vbx_byte_t*)vbx_sp_malloc(width*sizeof(vbx_byte_t)); vbx_byte_t* v_stage = (vbx_byte_t*)vbx_sp_malloc(width*sizeof(vbx_byte_t)); vbx_byte_t* v_pattern; lbp_feat_t feat; int dx, dy, dw, f; for (l=0; l<log+1; l++) { vbx_dma_to_vector(v_lbp[l]+width, scalar_patterns[l], (window)*width*sizeof(unsigned char)); } vbx_sync(); for(j=0; j < height-(window+1); j++) { for (l=0; l<log+1; l++) { vbx_set_vl(width * window); vbx(VVB, VMOV, v_lbp[l], v_lbp[l]+width, NULL); vbx_dma_to_vector(v_lbp[l] + window*width, scalar_patterns[l]+(j+window)*width, width*sizeof(unsigned char)); } vbx_set_vl(width-(window+1)); vbx(SVB, VMOV, v_stage, 0, NULL); for (f = 0; f < lbp_stage.count; f++) { feat = lbp_stage.feats[f]; dx = feat.pos.src.x; dy = feat.pos.src.y; dw = feat.pos.size.x; v_pattern = v_lbp[dw>>1]+(dy*width+dx); vbx(SVBU, VLBPLUT, v_lut, f, v_pattern); vbx(VVB, VADD, v_stage, v_stage, v_lut); } vbx(SVB, VMOV, v_lut, 0, NULL); vbx(SVB, VCMV_GEZ, v_lut, 1, v_stage); vbx_dma_to_host(vbx_pass + j*width, v_lut, (width-(window+1))*sizeof(unsigned char)); vbx_sync(); } unsigned int *iImg, *iiImg; iImg = (unsigned int *)vbx_shared_malloc(width*height*sizeof(unsigned int)); iiImg = (unsigned int *)vbx_shared_malloc(width*height*sizeof(unsigned int)); gen_integrals(vbx_img, iImg, iiImg, width, height); image_t lbp_img = {iImg, {width, height}}; for (j = 0; j < height - (window + 1); j++) { for (i = 0; i < width - (window + 1); i++) { pair_t lbp_p = {i, j}; pass[j*width+i] = LBPPassStage(lbp_img, lbp_stage, lbp_p); } } /* test pass vs vbx pass */ for (j = 0; j < height - (window + 1); j++) { errors += match_array_byte(vbx_pass + j*width, pass + j*width, "pass stage", width - (window + 1), 1, 0, max_print_errors, 1, j); if (errors > max_print_errors){ max_print_errors = 0; } } return errors; }
int VectorBlox_MXP_Initialize(const char* mxp_dev, const char* cma_dev)
{
	PAGE_SIZE = sysconf(_SC_PAGESIZE);
	PAGE_SHIFT = 0;
	int page_size = PAGE_SIZE;
	while((page_size >>= 1)){
		PAGE_SHIFT++;
	}

	char filename[256];
	sprintf(filename, "/dev/%s", mxp_dev);
	the_mxp.mxp_fd = open(filename, O_RDWR);
	assert(the_mxp.mxp_fd >= 0);
	the_mxp.scratchpad_size = get_attr_from_file(mxp_dev, "SCRATCHPAD_KB") * 1024;
	the_mxp.scratchpad_addr = (void*)get_attr_from_file(mxp_dev, "C_S_AXI_BASEADDR");
	void* scratchpad_mmap = mmap(the_mxp.scratchpad_addr, // map at the physical address so the mappings match
	                             the_mxp.scratchpad_size,
	                             PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FIXED,
	                             the_mxp.mxp_fd, 4096);
	assert(scratchpad_mmap == the_mxp.scratchpad_addr);
	the_mxp.scratchpad_end = (void*)get_attr_from_file(mxp_dev, "C_S_AXI_HIGHADDR") + 1;

	// M_AXI_DATA_WIDTH is in bits, convert to bytes
	the_mxp.dma_alignment_bytes = get_attr_from_file(mxp_dev, "C_M_AXI_DATA_WIDTH")/8;
	the_mxp.vector_lanes = get_attr_from_file(mxp_dev, "VECTOR_LANES");
	the_mxp.scratchpad_alignment_bytes = the_mxp.vector_lanes * 4;
	the_mxp.vcustom0_lanes  = get_attr_from_file(mxp_dev, "VCI_0_LANES");
	the_mxp.vcustom1_lanes  = get_attr_from_file(mxp_dev, "VCI_1_LANES");
	the_mxp.vcustom2_lanes  = get_attr_from_file(mxp_dev, "VCI_2_LANES");
	the_mxp.vcustom3_lanes  = get_attr_from_file(mxp_dev, "VCI_3_LANES");
	the_mxp.vcustom4_lanes  = get_attr_from_file(mxp_dev, "VCI_4_LANES");
	the_mxp.vcustom5_lanes  = get_attr_from_file(mxp_dev, "VCI_5_LANES");
	the_mxp.vcustom6_lanes  = get_attr_from_file(mxp_dev, "VCI_6_LANES");
	the_mxp.vcustom7_lanes  = get_attr_from_file(mxp_dev, "VCI_7_LANES");
	the_mxp.vcustom8_lanes  = get_attr_from_file(mxp_dev, "VCI_8_LANES");
	the_mxp.vcustom9_lanes  = get_attr_from_file(mxp_dev, "VCI_9_LANES");
	the_mxp.vcustom10_lanes = get_attr_from_file(mxp_dev, "VCI_10_LANES");
	the_mxp.vcustom11_lanes = get_attr_from_file(mxp_dev, "VCI_11_LANES");
	the_mxp.vcustom12_lanes = get_attr_from_file(mxp_dev, "VCI_12_LANES");
	the_mxp.vcustom13_lanes = get_attr_from_file(mxp_dev, "VCI_13_LANES");
	the_mxp.vcustom14_lanes = get_attr_from_file(mxp_dev, "VCI_14_LANES");
	the_mxp.vcustom15_lanes = get_attr_from_file(mxp_dev, "VCI_15_LANES");
	the_mxp.mask_partitions = get_attr_from_file(mxp_dev, "MASK_PARTITIONS");
	the_mxp.max_masked_vector_length = get_attr_from_file(mxp_dev, "MAX_MASKED_WAVES") * the_mxp.vector_lanes * 4;
	the_mxp.fxp_word_frac_bits = get_attr_from_file(mxp_dev, "MULFXP_WORD_FRACTION_BITS");
	the_mxp.fxp_half_frac_bits = get_attr_from_file(mxp_dev, "MULFXP_HALF_FRACTION_BITS");
	the_mxp.fxp_byte_frac_bits = get_attr_from_file(mxp_dev, "MULFXP_BYTE_FRACTION_BITS");
	the_mxp.core_freq = get_attr_from_file(mxp_dev, "CLOCK_FREQ_HZ");
	the_mxp.instr_port_addr = mmap(NULL, PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, the_mxp.mxp_fd, 0);

	sprintf(filename, "/dev/%s", cma_dev);
	the_mxp.cma_fd = open(filename, O_RDWR);
	assert(the_mxp.cma_fd >= 0);

	the_mxp.init = 0;
	the_mxp.sp = the_mxp.scratchpad_addr;
	the_mxp.spstack = (vbx_void_t **) NULL;
	the_mxp.spstack_top = 0;
	the_mxp.spstack_max = 0;
	_vbx_init(&the_mxp);

	// clear scratchpad
	vbx_set_vl(the_mxp.scratchpad_size);
	vbx(SVB, VMOV, (vbx_byte_t*)the_mxp.scratchpad_addr, 0, 0);
	return 0;
}
int VBX_T(vbw_vec_reverse_test)() { unsigned int aN[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 20, 25, 31, 32, 33, 35, 40, 48, 60, 61, 62, 63, 64, 64, 65, 66, 67, 68, 70, 80, 90, 99, 100, 101, 110, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 400, 450, 512, 550, 600, 650, 700, 768, 768, 900, 900, 1023, 1024, 1200, 1400, 1600, 1800, 2048, 2048, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4096, 4096, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 6000, 7000, 8000, 8192, 8192, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 16384, 16384, 20000, 25000, 30000, 32767, 32768, 32768, 35000, 40000, 45000, 50000, 55000, 60000, 65000, 65535, 65536, 65536 }; int retval; unsigned int N; unsigned int NBYTES; unsigned int NREPS = 100; unsigned int i,k; vbx_timestamp_t start=0,finish=0; vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); const unsigned int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size; for( i=0; i<sizeof(aN)/4; i++ ) { N = aN[i]; //printf( "testing with vector size %d\n", N ); NBYTES = sizeof(vbx_sp_t)*N; if( 2*NBYTES > VBX_SCRATCHPAD_SIZE ) continue; vbx_sp_t *vsrc = vbx_sp_malloc( NBYTES ); vbx_sp_t *vdst = vbx_sp_malloc( NBYTES ); //printf("bytes alloc: %d\n", NBYTES ); if( !vsrc ) VBX_EXIT(-1); if( !vdst ) VBX_EXIT(-1); #if ( VBX_TEMPLATE_T == BYTESIZE_DEF | VBX_TEMPLATE_T == UBYTESIZE_DEF ) unsigned int mask = 0x007F; #elif ( VBX_TEMPLATE_T == HALFSIZE_DEF | VBX_TEMPLATE_T == UHALFSIZE_DEF ) unsigned int mask = 0x7FFF; #else unsigned int mask = 0xFFFF; #endif vbx_set_vl( N ); vbx( SV(T), VMOV, vdst, -1, 0 ); // Fill the destination vector with -1 vbx( SE(T), VAND, vsrc, mask, 0 ); // Fill the source vector with enumerated values //VBX_T(print_vector)( "vsrcInit", vsrc, N ); //VBX_T(print_vector)( "vdstInit", vdst, N ); /** measure performance of function call **/ vbx_sync(); start = vbx_timestamp(); for(k=0; k<NREPS; k++ ) { retval = VBX_T(vbw_vec_reverse)( vdst, vsrc, N ); vbx_sync(); } finish = vbx_timestamp(); printf( "length %d (%s):\tvbware sp f():\t%llu", N, VBX_EXPAND_AND_QUOTE(BYTEHALFWORD), (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) ); //VBX_T(print_vector)( "vsrcPost", vsrc, N ); //VBX_T(print_vector)( "vdstPost", vdst, N ); #if VERIFY_VBWARE_ALGORITHM VBX_T(verify_vector)( vsrc, vdst, N ); #else printf(" [VERIFY OFF]"); #endif printf("\treturn value: %X", retval); vbx_set_vl( N ); vbx( SE(T), VAND, vsrc, mask, 0 ); // Reset the source vector /** measure performance of simple algorithm **/ vbx_sync(); vbx_set_vl( 1 ); vbx_set_2D( N, -sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 ); start = vbx_timestamp(); for(k=0; k<NREPS; k++ ) { vbx_2D( VV(T), VMOV, vdst+N-1, vsrc, 0 ); vbx_sync(); } finish = vbx_timestamp(); printf( "\tsimple (vl=1):\t%llu", (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) ); #if VERIFY_SIMPLE_ALGORITHM VBX_T(verify_vector)( vsrc, vdst, N ); #else printf(" [VERIFY OFF]"); #endif printf("\tcycles\n"); vbx_sp_free(); } vbx_sp_free(); printf("All tests passed successfully.\n"); return 0; }
/** Luma Edge Detection
 *
 * @brief 3x3 Sobel edge detection with 32-bit aRGB image
 *
 * @param[out] output       32-bit aRGB edge-intensity output
 * @param[in]  input        32-bit aRGB input
 * @param[in]  image_width  Image width in pixels
 * @param[in]  image_height Image height in pixels
 * @param[in]  image_pitch  Distance in pixels between the starts of subsequent rows; usually equal to image_width
 * @param[in]  renorm       Number of bits to right-shift the final intensity by
 * @returns Negative on error condition. See vbw_exit_codes.h
 */
int vbw_sobel_argb32_3x3_partial(unsigned *output, unsigned *input, const short image_width, const short image_height, const short image_pitch, const short renorm)
{
	int y;
	vbx_uword_t *v_row_in;
	vbx_uhalf_t *v_luma_top, *v_luma_mid, *v_luma_bot;
	vbx_uword_t *v_row_out;
	vbx_uhalf_t *v_sobel_row_top, *v_sobel_row_mid, *v_sobel_row_bot;
	vbx_uhalf_t *v_gradient_x, *v_gradient_y;
	vbx_uhalf_t *v_tmp;
	void *tmp_ptr;

	vbx_sp_push();

	// Allocate space in scratchpad for vectors
	struct rotating_prefetcher_t v_row_db = rotating_prefetcher(1, image_width*sizeof(vbx_uword_t),
	                                                            input, input+image_pitch*image_height,
	                                                            image_pitch*sizeof(vbx_uword_t));
	v_luma_top = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_luma_mid = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_luma_bot = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_sobel_row_top = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_sobel_row_mid = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_sobel_row_bot = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_row_out = (vbx_uword_t*)vbx_sp_malloc(image_width*sizeof(vbx_uword_t));
	if(v_row_out == NULL){
		vbx_sp_pop();
		return VBW_ERROR_SP_ALLOC_FAILED;
	}

	// Re-use v_sobel_row_bot as v_tmp
	v_tmp = v_sobel_row_bot;

	// Transfer the first 3 input rows and interleave the first 2 rgb2luma and first 2 sobel row calculations
	rp_fetch(&v_row_db);
	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db, 0);
	vbw_rgb2luma(v_luma_top, v_row_in, v_tmp, image_width);        // 1st luma row
	vbw_sobel_3x3_row(v_sobel_row_top, v_luma_top, image_width);  // 1st partial sobel row
	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db, 0);
	vbw_rgb2luma(v_luma_mid, v_row_in, v_tmp, image_width);        // 2nd luma row
	vbw_sobel_3x3_row(v_sobel_row_mid, v_luma_mid, image_width);  // 2nd partial sobel row

	// Set top output row to 0
	vbx_set_vl(image_width);
	vbx(SVWU, VMOV, v_row_out, 0, 0);
	vbx_dma_to_host(output, v_row_out, image_width*sizeof(vbx_uword_t));

	// Calculate edges
	for (y = 0; y < image_height-(FILTER_HEIGHT-1); y++) {
		// Transfer the next input row while processing
		rp_fetch(&v_row_db);
		v_row_in = rp_get_buffer(&v_row_db, 0);

		// Re-use v_sobel_row_bot as v_tmp
		v_tmp = v_sobel_row_bot;

		// Convert aRGB input to luma
		vbw_rgb2luma(v_luma_bot, v_row_in, v_tmp, image_width);

		// Done with v_row_in; re-use for v_gradient_x and v_gradient_y (be careful!)
		v_gradient_x = (vbx_uhalf_t *)v_row_in;
		v_gradient_y = (vbx_uhalf_t *)v_row_in + image_width;

		// Calculate gradient_x
		// Apply [1 2 1]T matrix to all columns
		vbx_set_vl(image_width);
		vbx(SVHU, VSHL, v_gradient_x, 1, v_luma_mid);   // multiply by 2
		vbx(VVHU, VADD, v_tmp, v_luma_top, v_luma_bot);
		vbx(VVHU, VADD, v_tmp, v_tmp, v_gradient_x);
		// For each column, calculate the absolute difference with the 2nd column to the right
		vbx_set_vl(image_width-2);
		vbx(VVH, VABSDIFF, (vbx_half_t*)v_gradient_x, (vbx_half_t*)v_tmp, (vbx_half_t*)v_tmp+2);

		// Calculate gradient_y
		// Apply [1 2 1] matrix to the last row in the window and calculate the
		// absolute difference with the pre-computed first row
		vbw_sobel_3x3_row(v_sobel_row_bot, v_luma_bot, image_width);
		vbx(VVH, VABSDIFF, (vbx_half_t*)v_gradient_y, (vbx_half_t*)v_sobel_row_top, (vbx_half_t*)v_sobel_row_bot);

		// Re-use v_sobel_row_top as v_tmp
		v_tmp = v_sobel_row_top;

		// Sum of absolute gradients
		vbx_set_vl(image_width-2);
		vbx(VVHU, VADD, v_tmp, v_gradient_x, v_gradient_y);
		vbx(SVHU, VSHR, v_tmp, renorm, v_tmp);

		// Threshold
		vbx(SVHU, VSUB, v_gradient_y, 255, v_tmp);
		vbx(SVHU, VCMV_LTZ, v_tmp, 255, v_gradient_y);

		// Copy the result to the low byte of the output row
		// Trick to copy the low byte (b) to the middle two bytes as well
		// Note that the first and last columns are 0
		vbx_set_vl(image_width-2);
		vbx(SVHWU, VMULLO, v_row_out+1, 0x00010101, v_tmp);

		// DMA the result to the output (minus the outside two pixels)
		vbx_dma_to_host(output+(y+1)*image_pitch+1, v_row_out+1, (image_width-2)*sizeof(vbx_uword_t));

		// Rotate luma buffers
		tmp_ptr = (void *)v_luma_top;
		v_luma_top = v_luma_mid;
		v_luma_mid = v_luma_bot;
		v_luma_bot = (vbx_uhalf_t *)tmp_ptr;

		// Rotate v_sobel_row buffers (for gradient_y)
		tmp_ptr = (void *)v_sobel_row_top;
		v_sobel_row_top = v_sobel_row_mid;
		v_sobel_row_mid = v_sobel_row_bot;
		v_sobel_row_bot = (vbx_uhalf_t *)tmp_ptr;
	}

	// Set bottom row to 0
	vbx_set_vl(image_width);
	vbx(SVWU, VMOV, v_row_out, 0, 0);
	vbx_dma_to_host(output+(image_height-1)*image_pitch, v_row_out, image_width*sizeof(vbx_uword_t));

	vbx_sp_pop();
	return VBW_SUCCESS;
}
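/* Scalar model of the per-pixel result computed above (illustrative only):
 * the separable 3x3 Sobel gradients are summed as |Gx|+|Gy|, right-shifted
 * by renorm, and clamped to 255. luma[][] is the 3x3 luma window centered
 * on the pixel; assumes abs() from <stdlib.h>. */
static unsigned scalar_sobel_pixel(unsigned short luma[3][3], short renorm)
{
	int gx = abs((luma[0][0] + 2*luma[1][0] + luma[2][0]) -
	             (luma[0][2] + 2*luma[1][2] + luma[2][2]));
	int gy = abs((luma[0][0] + 2*luma[0][1] + luma[0][2]) -
	             (luma[2][0] + 2*luma[2][1] + luma[2][2]));
	unsigned e = (unsigned)(gx + gy) >> renorm;
	return (e > 255) ? 255 : e;
}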
int vector_motest(pixel *input_buffer, luma_type **last_luma, int *motest_x, int *motest_y, int start_x, int start_y, int reset, const int image_width, const int image_height, const int image_pitch) { int y, x, starty, startx; unsigned int sad, sad_min, y_min, x_min; vbx_uhalf_t *v_search_luma, *v_last_luma; vbx_uhalf_t *v_row_temp; vbx_uword_t *v_row; vbx_uword_t *v_sad; pixel color; if(*last_luma == NULL || reset){ init_vector_motest(input_buffer, last_luma, motest_x, motest_y, start_x, start_y, image_pitch); } v_search_luma = vbx_sp_malloc( MOTEST_BUFFER_SIZE * sizeof(vbx_uhalf_t) ); v_last_luma = vbx_sp_malloc( MOTEST_BLOCK_SIZE * sizeof(vbx_uhalf_t) ); v_row_temp = vbx_sp_malloc( MOTEST_BUFFER_WIDTH * sizeof(vbx_uhalf_t) ); v_row = vbx_sp_malloc( MOTEST_BUFFER_WIDTH * sizeof(vbx_uword_t) ); v_sad = vbx_sp_malloc( MOTEST_SEARCH_SIZE * sizeof(vbx_uword_t) ); if(v_sad == NULL){ printf("Not enough scratchpad for motest\n"); while(1); } startx = *motest_x-(MOTEST_SEARCH_WIDTH/2); starty = *motest_y-(MOTEST_SEARCH_HEIGHT/2); if(startx < 0){ startx = 0; } if(startx > image_width-MOTEST_BUFFER_WIDTH){ startx = image_width-MOTEST_BUFFER_WIDTH; } if(starty < 0){ starty = 0; } if(starty > image_height-MOTEST_BUFFER_HEIGHT){ starty = image_height-MOTEST_BUFFER_HEIGHT; } vector_rectangle_to_luma(input_buffer, v_search_luma, v_row_temp, v_row, startx, starty, MOTEST_BUFFER_WIDTH, MOTEST_BUFFER_HEIGHT, image_pitch); vbx_dma_to_vector(v_last_luma, *last_luma, MOTEST_BLOCK_SIZE*sizeof(vbx_uhalf_t)); //Vector compute sad here vbx_set_2D(MOTEST_BLOCK_HEIGHT, sizeof(vbx_uword_t), MOTEST_BUFFER_WIDTH*sizeof(vbx_uhalf_t), MOTEST_BLOCK_WIDTH*sizeof(vbx_uhalf_t)); for(y = 0; y < MOTEST_SEARCH_HEIGHT; y++){ for(x = 0; x < MOTEST_SEARCH_WIDTH; x++){ vbx_set_vl(MOTEST_BLOCK_WIDTH); vbx_acc_2D(VVHWU, VABSDIFF, v_row, v_search_luma+(y*MOTEST_BUFFER_WIDTH)+x, v_last_luma); vbx_set_vl(MOTEST_BLOCK_HEIGHT/2); vbx_acc(VVWU, VADD, v_sad+(y*MOTEST_SEARCH_WIDTH)+x, v_row, v_row+MOTEST_BLOCK_HEIGHT/2); } #if TOUCHSCREEN #ifdef TOUCH_INTERRUPTS_VBX if (touchscreen_get_pen(pTouch)) { vbx_sp_free(); return -1; } #endif #endif } vbx_sync(); sad_min = INT_MAX; y_min = *motest_y; x_min = *motest_x; for(y = 0; y < MOTEST_SEARCH_HEIGHT; y++){ for(x = 0; x < MOTEST_SEARCH_WIDTH; x++){ sad = v_sad[y*MOTEST_SEARCH_WIDTH+x]; if(sad < sad_min){ sad_min = sad; x_min = x+startx; y_min = y+starty; } else if(sad == sad_min) { if( (abs( x - MOTEST_SEARCH_WIDTH/2) + abs( y - MOTEST_SEARCH_HEIGHT/2)) < (abs((x_min-startx) - MOTEST_SEARCH_WIDTH/2) + abs((y_min-starty) - MOTEST_SEARCH_HEIGHT/2))) { x_min = x+startx; y_min = y+starty; } } } } color.r = 0; color.g = 255; color.b = 0; color.a = 0; scalar_draw_line(*motest_x+(MOTEST_BLOCK_WIDTH/2), *motest_y+(MOTEST_BLOCK_HEIGHT/2), x_min+(MOTEST_BLOCK_WIDTH/2), y_min+(MOTEST_BLOCK_HEIGHT/2), color, input_buffer, image_pitch); *motest_y = y_min; *motest_x = x_min; vbx_set_vl(MOTEST_BLOCK_WIDTH); for(y = 0; y < MOTEST_BLOCK_HEIGHT; y++){ vbx(VVHU, VMOV, v_last_luma+(y*MOTEST_BLOCK_WIDTH), v_search_luma+((y+y_min-starty)*MOTEST_BUFFER_WIDTH)+(x_min-startx), 0); } vbx_dma_to_host(*last_luma, v_last_luma, MOTEST_BLOCK_SIZE*sizeof(luma_type)); draw_motest(input_buffer, *motest_x, *motest_y, image_pitch); //simple hack to draw thicker draw_motest(input_buffer, *motest_x+1, *motest_y+1, image_pitch); vbx_sp_free(); return 0; }
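/* Scalar sketch of the SAD (sum of absolute differences) that each candidate
 * position accumulates above (hypothetical helper, illustrative only; the
 * MOTEST_* constants are the same block/buffer dimensions used in this file): */
static unsigned int scalar_block_sad(const unsigned short *search, const unsigned short *last, int x, int y)
{
	unsigned int sad = 0;
	int i, j;
	for (j = 0; j < MOTEST_BLOCK_HEIGHT; j++) {
		for (i = 0; i < MOTEST_BLOCK_WIDTH; i++) {
			int d = search[(y+j)*MOTEST_BUFFER_WIDTH + (x+i)] - last[j*MOTEST_BLOCK_WIDTH + i];
			sad += (d < 0) ? -d : d;
		}
	}
	return sad;
}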
int vbw_mtx_median_ext_argb32( unsigned *output, unsigned *input, const int filter_height, const int filter_width, const int image_height, const int image_width, const int image_pitch )
{
	const int FREE_BYTES = vbx_sp_getfree();
	int l, k;
	int filter_mid, filter_size;
	int rows_per_l, vl, temp_vl, temp_vl_byte;
	int j, i;
	int partial_row = 0;

	filter_size = filter_height*filter_width;
	filter_mid = filter_size/2;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const int VBX_WIDTH_BYTES = this_mxp->scratchpad_alignment_bytes;

	// Could possibly check for low SP here (less than 6*VBX_WIDTH_BYTES) and assign vl differently.
	// During allocation, the max additional SP needed due to alignment is one VBX_WIDTH_BYTES per vector.
	// Taking that off the top simplifies the calculation and is always correct, but sacrifices a little SP space.
	vl = (FREE_BYTES-3*VBX_WIDTH_BYTES)/((filter_size+2)*sizeof(vbx_uword_t));
	if( vl < 1 ) {
		return VBW_ERROR_SP_ALLOC_FAILED;
	}
	if(vl < image_width){
		rows_per_l = 1;
		partial_row = 1;
	} else {
		rows_per_l = vl/image_width;
		vl = image_width*rows_per_l;
	}

	vbx_sp_push();
	vbx_uword_t *v_input = (vbx_uword_t *)vbx_sp_malloc(filter_size*vl*sizeof(vbx_uword_t));
	vbx_ubyte_t *v_sub   = (vbx_ubyte_t *)vbx_sp_malloc(vl*sizeof(vbx_uword_t));
	vbx_ubyte_t *v_temp  = (vbx_ubyte_t *)vbx_sp_malloc(vl*sizeof(vbx_uword_t));
	vbx_ubyte_t *v_min, *v_max;
	vbx_ubyte_t *v_input_byte = (vbx_ubyte_t *)v_input;
	if( v_temp == NULL ){
		vbx_sp_pop();
		return VBW_ERROR_SP_ALLOC_FAILED;
	}

	for(l = 0; l < image_height-filter_height; l += rows_per_l){
		// detect last pass
		if(l+rows_per_l > image_height-filter_height){
			rows_per_l = (image_height-filter_height)-l;
			vl = image_width*rows_per_l;
		}
		temp_vl = vl;
		for(k = 0; k < image_width; k += temp_vl){
			if(partial_row){
				if(k + temp_vl > image_width){
					temp_vl = image_width - k;
				}
			}
			for(j = 0; j < filter_height; j++){
				vbx_dma_to_vector_2D(v_input+temp_vl*j, input+(l+j)*image_pitch+k,
				                     temp_vl/rows_per_l*sizeof(vbx_uword_t), rows_per_l,
				                     image_width*sizeof(vbx_uword_t), image_pitch*sizeof(vbx_uword_t));
			}

			// arrange all pixels within a filter window into single columns, separated by temp_vl
			//
			// ex. vl = 5, filter = 3
			//    vinput before             vinput after
			//
			//  a00 a01 a02 a03 a04  |  a00 a01 a02 a03 a04 |
			//  a10 a11 a12 a13 a14  |  a10 a11 a12 a13 a14 |
			//  a20 a21 a22 a23 a24  |  a20 a21 a22 a23 a24 |
			//  ??? ??? ??? ??? ???  |  a01 a02 a03 a04 a10 |
			//  ??? ??? ??? ??? ???  |  a11 a12 a13 a14 a20 |
			//  ??? ??? ??? ??? ???  |  a21 a22 a23 a24 a30 |
			//  ??? ??? ??? ??? ???  |  a02 a03 a04 a10 a11 |
			//  ??? ??? ??? ??? ???  |  a12 a13 a14 a20 a21 |
			//  ??? ??? ??? ??? ???  |  a22 a23 a24 a30 a31 |
			//
			vbx_set_vl(temp_vl);
			for(j = 1; j < filter_height; j++){
				for(i = 0; i < filter_width; i++){
					vbx(VVWU, VMOV, v_input+(j*filter_height+i)*temp_vl, v_input+i*temp_vl+j, 0);
				}
			}

			// Bubble-sort each pixel column up to the filter_size/2-th element,
			// working on individual color channels
			temp_vl_byte = temp_vl*sizeof(vbx_uword_t)/sizeof(vbx_ubyte_t);
			vbx_set_vl(temp_vl_byte);
			// sort the lower half of the values in the window
			for(j = 0; j < filter_mid; j++){
				v_min = v_input_byte+j*temp_vl_byte;
				for(i = j+1; i < filter_size; i++){
					v_max = v_input_byte+i*temp_vl_byte;
					vbx(VVBU, VMOV, v_temp, v_min, 0);
					vbx(VVBU, VSUB, v_sub, v_max, v_min);
					vbx(VVBU, VCMV_LTZ, v_min, v_max, v_sub);
					vbx(VVBU, VCMV_LTZ, v_max, v_temp, v_sub);
				}
			}
			// grab the next smallest value, the median; don't sort the rest
			v_min = v_input_byte+filter_mid*temp_vl_byte;
			for(i = filter_mid+1; i < filter_size; i++){
				v_max = v_input_byte+i*temp_vl_byte;
				vbx(VVBU, VSUB, v_sub, v_max, v_min);
				vbx(VVBU, VCMV_LTZ, v_min, v_max, v_sub);
			}

			// DMA out the median value, back to pixels
			vbx_dma_to_host_2D(output+(l*image_pitch)+k, v_input+temp_vl*filter_mid,
			                   temp_vl/rows_per_l*sizeof(vbx_uword_t), rows_per_l,
			                   image_pitch*sizeof(vbx_uword_t), image_width*sizeof(vbx_uword_t));
		}
	}
	vbx_sp_pop();
	vbx_sync();
	return VBW_SUCCESS;
}
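/* Scalar sketch of the selection performed above on every pixel column
 * (illustrative only): a partial selection sort that stops once the median
 * element of the window is in place; the rest is never fully sorted. */
static unsigned char scalar_median(unsigned char *window, int filter_size)
{
	int i, j;
	unsigned char t;
	for (j = 0; j <= filter_size/2; j++) {
		for (i = j+1; i < filter_size; i++) {
			if (window[i] < window[j]) {
				t = window[i]; window[i] = window[j]; window[j] = t;
			}
		}
	}
	return window[filter_size/2];
}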
int vbw_bifilt_argb32_3x3(unsigned *output, unsigned *input, short image_width, const short image_height, const short image_pitch, const short renorm)
{
	//return vbw_sobel_argb32_3x3( output, input, image_width, image_height, image_pitch, renorm);
	int y;
	int xx, yy, sharp;
	vbx_uword_t *v_row_in;
	vbx_ubyte_t *v_luma_top, *v_luma_mid, *v_luma_bot;
	vbx_ubyte_t *v_luma_hii, *v_luma_low;
	vbx_ubyte_t *v_src[W][W];
	vbx_uword_t *v_row_out;
	vbx_ubyte_t *v00, *v01, *v02, *v10, *v11, *v12, *v20, *v21, *v22;
#if W==5
	vbx_ubyte_t *v03, *v04, *v13, *v14, *v23, *v24;
	vbx_ubyte_t *v30, *v31, *v32, *v40, *v41, *v42;
	vbx_ubyte_t *v33, *v34, *v43, *v44;
#endif
	vbx_ubyte_t *v[W][W];
	vbx_uhalf_t *vI, *vW, *vT; // vT == temporary

	vbx_sp_push();

	// Allocate space in scratchpad for vectors
	struct rotating_prefetcher_t v_row_db = rotating_prefetcher(1, image_width*sizeof(vbx_uword_t),
	                                                            input, input+image_height*image_pitch,
	                                                            image_pitch*sizeof(vbx_uword_t));
	v_row_out = (vbx_uword_t*)vbx_sp_malloc(image_width*sizeof(vbx_uword_t));
	vT = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
#if 1
	// save some space by overlapping with v_row_out
	vW = (vbx_uhalf_t*)v_row_out;
	vI = (vbx_uhalf_t*)v_row_out + image_width;
#else
	vW = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	vI = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
#endif
#if W==3
	v_luma_top = (vbx_ubyte_t*)vbx_sp_malloc( 3 * image_width*sizeof(vbx_ubyte_t));
	v_luma_mid = v_luma_top + 1 * image_width*sizeof(vbx_ubyte_t);
	v_luma_bot = v_luma_top + 2 * image_width*sizeof(vbx_ubyte_t);
#else
	v_luma_top = (vbx_ubyte_t*)vbx_sp_malloc( 5 * image_width*sizeof(vbx_ubyte_t));
	v_luma_hii = v_luma_top + 1 * image_width*sizeof(vbx_ubyte_t);
	v_luma_mid = v_luma_top + 2 * image_width*sizeof(vbx_ubyte_t);
	v_luma_low = v_luma_top + 3 * image_width*sizeof(vbx_ubyte_t);
	v_luma_bot = v_luma_top + 4 * image_width*sizeof(vbx_ubyte_t);
#endif
	if(v_luma_top == NULL){
		vbx_sp_pop();
		return VBW_ERROR_SP_ALLOC_FAILED;
	}

	// Transfer the first input rows and do the first rgb2luma conversions
#if W==3
	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db, 0);
	vbw_rgb2luma(vW, v_row_in, vT, image_width);   // 1st luma row
	vbx( SVHBU, VSHR, v_luma_top, 8, vW );         // convert to byte
	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db, 0);
	vbw_rgb2luma(vW, v_row_in, vT, image_width);   // 2nd luma row
	vbx( SVHBU, VSHR, v_luma_mid, 8, vW );         // convert to byte
#else
	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db, 0);
	vbw_rgb2luma(vW, v_row_in, vT, image_width);   // 1st luma row
	vbx( SVHBU, VSHR, v_luma_top, 8, vW );         // convert to byte
	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db, 0);
	vbw_rgb2luma(vW, v_row_in, vT, image_width);   // 2nd luma row
	vbx( SVHBU, VSHR, v_luma_hii, 8, vW );         // convert to byte
	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db, 0);
	vbw_rgb2luma(vW, v_row_in, vT, image_width);   // 3rd luma row
	vbx( SVHBU, VSHR, v_luma_mid, 8, vW );         // convert to byte
	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db, 0);
	vbw_rgb2luma(vW, v_row_in, vT, image_width);   // 4th luma row
	vbx( SVHBU, VSHR, v_luma_low, 8, vW );         // convert to byte
#endif

	// blank out the top and bottom rows
	unsigned *out;
	vbx_set_vl(image_width);
	unsigned COLOUR = ( 200 | (128<<8) | (244<<16) );
	vbx(SVWU, VMOV, v_row_out, COLOUR, 0);
	for( y=0; y<W/2; y++ ) {
		// blank the top output rows
		out = output + image_width*y;
		vbx_dma_to_host( out, v_row_out, image_width*sizeof(vbx_uword_t) );
		// blank the bottom output rows
		out = output + image_width*(image_height-1-y);
		vbx_dma_to_host( out, v_row_out, image_width*sizeof(vbx_uword_t) );
	}

	// Calculate edges
	for (y = 0; y < image_height-(W-1); y++) {
		vbx_set_vl(image_width);
		// Transfer the next input row while processing
		rp_fetch(&v_row_db);
		v_row_in = rp_get_buffer(&v_row_db, 0);
		// Convert aRGB input to luma
		vbw_rgb2luma(vW, v_row_in, vT, image_width);
		vbx( SVHBU, VSHR, v_luma_bot, 8, vW );     // convert to byte

		vbx_sp_push();
		// process only the left half of the image (the right half is kept as grayscale below)
		image_width = image_width/2;
		vbx_set_vl(image_width);
		v[0][0] = v00 = (vbx_ubyte_t*)vbx_sp_malloc( 25 * image_width*sizeof(vbx_ubyte_t));
		v[0][1] = v01 = v00 +  1 * image_width*sizeof(vbx_ubyte_t);
		v[0][2] = v02 = v00 +  2 * image_width*sizeof(vbx_ubyte_t);
		v[1][0] = v10 = v00 +  3 * image_width*sizeof(vbx_ubyte_t);
		v[1][1] = v11 = v00 +  4 * image_width*sizeof(vbx_ubyte_t);
		v[1][2] = v12 = v00 +  5 * image_width*sizeof(vbx_ubyte_t);
		v[2][0] = v20 = v00 +  6 * image_width*sizeof(vbx_ubyte_t);
		v[2][1] = v21 = v00 +  7 * image_width*sizeof(vbx_ubyte_t);
		v[2][2] = v22 = v00 +  8 * image_width*sizeof(vbx_ubyte_t);
#if W==5
		v[0][3] = v03 = v00 +  9 * image_width*sizeof(vbx_ubyte_t);
		v[0][4] = v04 = v00 + 10 * image_width*sizeof(vbx_ubyte_t);
		v[1][3] = v13 = v00 + 11 * image_width*sizeof(vbx_ubyte_t);
		v[1][4] = v14 = v00 + 12 * image_width*sizeof(vbx_ubyte_t);
		v[2][3] = v23 = v00 + 13 * image_width*sizeof(vbx_ubyte_t);
		v[2][4] = v24 = v00 + 14 * image_width*sizeof(vbx_ubyte_t);
		v[3][0] = v30 = v00 + 15 * image_width*sizeof(vbx_ubyte_t);
		v[3][1] = v31 = v00 + 16 * image_width*sizeof(vbx_ubyte_t);
		v[3][2] = v32 = v00 + 17 * image_width*sizeof(vbx_ubyte_t);
		v[3][3] = v33 = v00 + 18 * image_width*sizeof(vbx_ubyte_t);
		v[3][4] = v34 = v00 + 19 * image_width*sizeof(vbx_ubyte_t);
		v[4][0] = v40 = v00 + 20 * image_width*sizeof(vbx_ubyte_t);
		v[4][1] = v41 = v00 + 21 * image_width*sizeof(vbx_ubyte_t);
		v[4][2] = v42 = v00 + 22 * image_width*sizeof(vbx_ubyte_t);
		v[4][3] = v43 = v00 + 23 * image_width*sizeof(vbx_ubyte_t);
		v[4][4] = v44 = v00 + 24 * image_width*sizeof(vbx_ubyte_t);
#endif
		if(v00 == NULL){
			printf("mem alloc failed\n");
			fflush(stdout);
			vbx_sp_pop();
			vbx_sp_pop();
			return VBW_ERROR_SP_ALLOC_FAILED;
		}

		//FIXME -- how to manage row buffers with 5 rows? 3 rows are shown below:
#if W==3
		for( xx=0; xx<W; xx++ ) v_src[0][xx] = v_luma_top+xx;
		for( xx=0; xx<W; xx++ ) v_src[1][xx] = v_luma_mid+xx;
		for( xx=0; xx<W; xx++ ) v_src[2][xx] = v_luma_bot+xx;
#else
		for( xx=0; xx<W; xx++ ) v_src[0][xx] = v_luma_top+xx;
		for( xx=0; xx<W; xx++ ) v_src[1][xx] = v_luma_hii+xx;
		for( xx=0; xx<W; xx++ ) v_src[2][xx] = v_luma_mid+xx;
		for( xx=0; xx<W; xx++ ) v_src[3][xx] = v_luma_low+xx;
		for( xx=0; xx<W; xx++ ) v_src[4][xx] = v_luma_bot+xx;
#endif

		vbx_set_vl( image_width - W + 1 );

		// compute error (absdiff) in pixel colour with neighbours
		for( yy=0; yy<W; yy++ ) {
			for( xx=0; xx<W; xx++ ) {
				vbx( VVBU, VABSDIFF, v[yy][xx], v_luma_mid+(W/2), v_src[yy][xx] );
			}
		}

		// v[][] holds the errors (differences) between pixels
		// efficiently compute a function that looks approximately like exp(-x):
		// large value for small errors, small value for big errors
		for( yy=0; yy<W; yy++ ) {
			for( xx=0; xx<W; xx++ ) {
				vbx( SVBU, VABSDIFF, v[yy][xx], 255, v[yy][xx] ); // 255 - img_err
				// 11 or more iterations is mathematically equivalent to a pure gaussian blur
				// FIXME is this true?
#define NUM_SHARPEN_ITERATIONS 3 // 0 to 10 iterations, practical max is 7 or 8
				for( sharp=0; sharp < NUM_SHARPEN_ITERATIONS; sharp++ ) {
					vbx( VVBU, VMULHI, v[yy][xx], v[yy][xx], v[yy][xx] ); // v*v
				}
			}
		}

		// with the right decimal place, could do the next two instructions using MULFXP and do as BYTES
		// convolve errors with gaussian blur kernel
		for( yy=0; yy<W; yy++ ) {
			for( xx=0; xx<W; xx++ ) {
				vbx( SVBU, VMULHI, v[yy][xx], gauss[yy][xx], v[yy][xx] );
			}
		}

		// sum up the weights for normalization later
		vbx( VVBHU, VADD, vW, v[0][0], v[0][1] );
		vbx( VVBHU, VADD, vT, v[0][2], v[1][0] );  vbx( VVHU, VADD, vW, vW, vT );
		vbx( VVBHU, VADD, vT, v[1][1], v[1][2] );  vbx( VVHU, VADD, vW, vW, vT );
		vbx( VVBHU, VADD, vT, v[2][0], v[2][1] );  vbx( VVHU, VADD, vW, vW, vT );
		vbx( VVBHU, VMOV, vT, v[2][2], 0 );        vbx( VVHU, VADD, vW, vW, vT );
#if (W==5)
		vbx( VVBHU, VADD, vT, v[3][0], v[3][1] );  vbx( VVHU, VADD, vW, vW, vT );
		vbx( VVBHU, VADD, vT, v[3][2], v[4][0] );  vbx( VVHU, VADD, vW, vW, vT );
		vbx( VVBHU, VADD, vT, v[4][1], v[4][2] );  vbx( VVHU, VADD, vW, vW, vT );
		vbx( VVBHU, VADD, vT, v[0][3], v[0][4] );  vbx( VVHU, VADD, vW, vW, vT );
		vbx( VVBHU, VADD, vT, v[1][3], v[1][4] );  vbx( VVHU, VADD, vW, vW, vT );
		vbx( VVBHU, VADD, vT, v[2][3], v[2][4] );  vbx( VVHU, VADD, vW, vW, vT );
		vbx( VVBHU, VADD, vT, v[3][3], v[3][4] );  vbx( VVHU, VADD, vW, vW, vT );
		vbx( VVBHU, VADD, vT, v[4][3], v[4][4] );  vbx( VVHU, VADD, vW, vW, vT );
#endif

		// convolve image with new weights
		for( yy=0; yy<W; yy++ ) {
			for( xx=0; xx<W; xx++ ) {
				vbx( VVBU, VMULHI, v[yy][xx], v_src[yy][xx], v[yy][xx] );
				//vbx( SVBU, VMULHI, v[yy][xx], gauss[yy][xx], v_src[yy][xx] );
				//vbx( SVBU, VMUL  , v[yy][xx], 1            , v_src[yy][xx] );
			}
		}

		// sum up the weighted pixels
		vbx( VVBHU, VADD, vI, v[0][0], v[0][1] );
		vbx( VVBHU, VADD, vT, v[0][2], v[1][0] );  vbx( VVHU, VADD, vI, vI, vT );
		vbx( VVBHU, VADD, vT, v[1][1], v[1][2] );  vbx( VVHU, VADD, vI, vI, vT );
		vbx( VVBHU, VADD, vT, v[2][0], v[2][1] );  vbx( VVHU, VADD, vI, vI, vT );
		vbx( VVBHU, VMOV, vT, v[2][2], 0 );        vbx( VVHU, VADD, vI, vI, vT );
#if (W==5)
		vbx( VVBHU, VADD, vT, v[3][0], v[3][1] );  vbx( VVHU, VADD, vI, vI, vT );
		vbx( VVBHU, VADD, vT, v[3][2], v[4][0] );  vbx( VVHU, VADD, vI, vI, vT );
		vbx( VVBHU, VADD, vT, v[4][1], v[4][2] );  vbx( VVHU, VADD, vI, vI, vT );
		vbx( VVBHU, VADD, vT, v[0][3], v[0][4] );  vbx( VVHU, VADD, vI, vI, vT );
		vbx( VVBHU, VADD, vT, v[1][3], v[1][4] );  vbx( VVHU, VADD, vI, vI, vT );
		vbx( VVBHU, VADD, vT, v[2][3], v[2][4] );  vbx( VVHU, VADD, vI, vI, vT );
		vbx( VVBHU, VADD, vT, v[3][3], v[3][4] );  vbx( VVHU, VADD, vI, vI, vT );
		vbx( VVBHU, VADD, vT, v[4][3], v[4][4] );  vbx( VVHU, VADD, vI, vI, vT );
#endif

		// keep RHS of image as original grayscale
		image_width = image_width*2;
		vbx_set_vl( image_width/2 );
		//vbx( VVWHU, VMOV, vT+image_width/2, (v_row_in) + image_width/2+1, 0 );
		vbx( VVBHU, VMOV, vT+image_width/2, (v_src[0][0]) + image_width/2+1, 0 );
		vbx_sp_pop(); // don't need v[][] data any more

		// compute LHS of image
#if 0
		vbx( VVBHU, VMOV, vT, v_src[2][2], 0 );
		//vbx( SVHU, VSHR, vI, 3, vI );
		//vbx( SVHU, VSHR, vW, 3, vW );
		//vbx( VVHU, VMUL, vT, vI, vW );
		//vbx( SVHU, VSHR, vT, 8, vT );
#else
		uint32_t h = image_width/2;
		vbx( SVHU, VADD, vW, 0x80, vW ); // round
		vbx( SVHU, VSHR, vW, 8, vW );
		vbw_vec_divide_uhalf( vT, vI, vW, h );
		//vbw_vec_divide_uhalf( vT+h, vI+h, vW+h, image_width-W+1-h );
#endif
		// ensure LHS doesn't overflow
		vbx( SVHU, VAND, vT, 0xff, vT );

		// Copy the result to the low byte of the output row
		// Trick to copy the low byte (b) to the middle two bytes as well
		// The first and last W/2 columns are overwritten with COLOUR below
		vbx_set_vl(image_width-W+1);
		vbx(SVHWU, VMULLO, v_row_out+W/2, 0x00010101, vT);

		// blank out the left and right edges
		// then DMA the result to the output
		vbx_set_vl(W/2);
		vbx(SVWU, VMOV, v_row_out, COLOUR, 0 );
		vbx(SVWU, VMOV, v_row_out + image_width - (W/2), COLOUR, 0 );
		vbx_dma_to_host( output+(y+1)*image_pitch, v_row_out, image_width*sizeof(vbx_uword_t) );

		// Rotate luma buffers
		vbx_ubyte_t *tmp_ptr;
		tmp_ptr = v_luma_top;
#if W==3
		v_luma_top = v_luma_mid;
		v_luma_mid = v_luma_bot;
		v_luma_bot = tmp_ptr;
#else
		v_luma_top = v_luma_hii;
		v_luma_hii = v_luma_mid;
		v_luma_mid = v_luma_low;
		v_luma_low = v_luma_bot;
		v_luma_bot = tmp_ptr;
#endif
	}
	vbx_sync();
	vbx_sp_pop();
	return VBW_SUCCESS;
}
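/* Scalar sketch of the range weight computed above (illustrative only): the
 * per-neighbour weight starts as 255-|diff| and is repeatedly squared with
 * only the high byte kept, giving a cheap, sharpening approximation of an
 * exp(-x)-style falloff before the spatial gauss[][] kernel is applied. */
static unsigned char scalar_range_weight(unsigned char center, unsigned char neighbour, int iterations)
{
	int i;
	unsigned int w = 255 - (unsigned int)(center > neighbour ? center - neighbour : neighbour - center);
	for (i = 0; i < iterations; i++) {
		w = (w * w) >> 8;   // VMULHI on bytes: keep the high byte of the product
	}
	return (unsigned char)w;
}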
void vbw_fix16_div( vbx_word_t* v_result, vbx_word_t* v_a, vbx_word_t* v_b, int length )
{
	vbx_uword_t* v_product = vbx_sp_malloc(length*sizeof(vbx_word_t));
	vbx_uword_t* v_tmp     = vbx_sp_malloc(length*sizeof(vbx_word_t));
	vbx_uword_t* v_sub     = vbx_sp_malloc(length*sizeof(vbx_word_t));
	vbx_word_t*  v_neg     = vbx_sp_malloc(length*sizeof(vbx_word_t));
	vbx_uword_t* v_quotient = (vbx_uword_t*)v_result;
	vbx_uword_t* v_numerator   = (vbx_uword_t*)vbx_sp_malloc(length*sizeof(vbx_word_t));
	vbx_uword_t* v_denominator = (vbx_uword_t*)vbx_sp_malloc(length*sizeof(vbx_word_t));

	// n = abs(a); d = abs(b)
	vbx(SVW, VABSDIFF, (vbx_word_t*)v_numerator, 0, v_a);
	vbx(SVW, VABSDIFF, (vbx_word_t*)v_denominator, 0, v_b);

	// record whether the result should be negative:
	// v_neg = msb(v_a ^ v_b)
	vbx(VVW, VXOR, v_neg, v_a, v_b);
	vbx(SVW, VAND, v_neg, 1u<<31, v_neg);

	// successive approximation until v_d * v_q == v_n
	uint32_t bit = 1u<<31;
	/* scalar model:
	 * v_q = bit;
	 * while(bit){
	 *     if(v_q*v_d > v_n){   <-- watch for overflow here
	 *         v_q ^= bit;
	 *     }
	 *     bit >>= 1;
	 *     v_q |= bit;
	 * }
	 */
	vbx(SVWU, VMOV, v_quotient, bit, 0);
	while(bit){
		vbx(VVWU, VMULFXP, v_product, v_denominator, v_quotient);
		vbx(SVWU, VXOR, v_tmp, bit, v_quotient);
		vbx(VVWU, VCMV_FS, v_quotient, v_tmp, v_product);   // clear the bit on overflow
		vbx(VVWU, VSUB, v_sub, v_product, v_numerator);
		vbx(VVWU, VCMV_GTZ, v_quotient, v_tmp, v_sub);      // clear the bit if product > numerator
		bit >>= 1;
		vbx(SVWU, VOR, v_quotient, bit, v_quotient);
	}
	// correct the sign if necessary
	vbx(SVWU, VMUL, v_tmp, -1, (vbx_uword_t*)v_result);
	vbx(VVWU, VCMV_NZ, (vbx_uword_t*)v_result, v_tmp, (vbx_uword_t*)v_neg);
}
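/* Scalar model of the successive-approximation loop above (illustrative
 * only), ignoring the fixed-point scaling that VMULFXP applies: each bit of
 * the quotient is set tentatively and cleared if it makes q*d exceed n. */
static uint32_t scalar_bitwise_div(uint32_t n, uint32_t d)
{
	uint32_t q = 0, bit = 1u << 31;
	while (bit) {
		q |= bit;                       // tentatively set this bit
		if ((uint64_t)q * d > n) {      // too big (or overflowed)? clear it
			q ^= bit;
		}
		bit >>= 1;
	}
	return q;
}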
// vector version of the image blend: img_out = (c*img_in1 + (256-c)*img_in2) >> 8
void vector_blend( output_pointer img_out, input_pointer img_in1, input_pointer img_in2,
                   unsigned int num_row, unsigned int num_column, intermediate_type blending_const )
{
	intermediate_type *v_img1[2];
	input_type *v_img2[2];
	intermediate_type *v_temp;
	intermediate_type blending_const_bar = 256-blending_const;
	int j;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
	const int VBX_WIDTH_BYTES = this_mxp->vector_lanes * sizeof(int);
	const int VBX_DMA_ALIGNMENT = this_mxp->dma_alignment_bytes;

	unsigned int chunk_size = VBX_SCRATCHPAD_SIZE/((3*sizeof(intermediate_type))+(2*sizeof(input_type)));
	chunk_size = VBX_PAD_UP( chunk_size-(VBX_WIDTH_BYTES-1), VBX_DMA_ALIGNMENT );
	unsigned int chunk_size_old = chunk_size;
	unsigned int vector_length = chunk_size;
	unsigned int vector_length_old = vector_length;

	v_img1[0] = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) );
	v_img1[1] = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) );
	v_img2[0] = (input_type *)vbx_sp_malloc( chunk_size*sizeof(input_type) );
	v_img2[1] = (input_type *)vbx_sp_malloc( chunk_size*sizeof(input_type) );
	v_temp = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) );
	if( v_temp == NULL ) {
		VBX_EXIT(0xBADDEAD);
	}

	int bufselect = 0;
	vbx_dma_to_vector( v_img1[bufselect], img_in1, chunk_size*sizeof(input_type) );
	vbx_dma_to_vector( v_img2[bufselect], img_in2, chunk_size*sizeof(input_type) );
	for( j=0; j<num_row*num_column; j+=vector_length_old ) {
		vbx_set_vl(vector_length);
		if( j > 0 ) {
			vbx_dma_to_host( img_out+j-vector_length_old, v_img1[1-bufselect], chunk_size_old*sizeof(output_type) );
		}
		if( (j+vector_length_old) < (num_row*num_column-1) ) {
			if( (j+vector_length_old*2) >= num_row*num_column ) {
				// the last chunk may be shorter
				vector_length = num_row*num_column - j - vector_length_old;
				chunk_size = vector_length;
			}
			vbx_dma_to_vector( v_img1[1-bufselect], img_in1+j+vector_length_old, chunk_size*sizeof(input_type) );
			vbx_dma_to_vector( v_img2[1-bufselect], img_in2+j+vector_length_old, chunk_size*sizeof(input_type) );
		}
		vbx( SVBHU, VMULLO, v_temp, blending_const, v_img1[bufselect] );
		vbx( SVBHU, VMULLO, v_img1[bufselect], blending_const_bar, v_img2[bufselect] );
		vbx( VVHU, VADD, v_img1[bufselect], v_img1[bufselect], v_temp );
		vbx( SVHBU, VSHR, v_img1[bufselect], 8, v_img1[bufselect] );
		bufselect = 1-bufselect;
	}
	vbx_dma_to_host( img_out+j-vector_length_old, v_img1[1-bufselect], chunk_size*sizeof(output_type) );
	vbx_sp_free();
	vbx_sync();
}
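/* Scalar equivalent of the blend computed above (hypothetical helper, for
 * illustration only): */
static void scalar_blend(unsigned char *out, const unsigned char *in1, const unsigned char *in2,
                         unsigned int n, unsigned int c)
{
	unsigned int i;
	for (i = 0; i < n; i++) {
		out[i] = (unsigned char)((c*in1[i] + (256-c)*in2[i]) >> 8);
	}
}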
void vbx_mtx_fdct( vbx_mtx_fdct_t *v, dt *block_v, dt *image,
                   int start_x, int start_y, int end_x, int end_y, int num_tile_x, int num_tile_y )
{
    // vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
    // const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
    const int BIG_TILE_SIZE = num_tile_x * num_tile_y * DCT_SIZE;

    int next_x = start_x+1;
    int next_y = start_y;
    int get_next = 1;
    if( start_x == end_x && start_y == end_y ) {
        get_next = 0;
    }
    if( start_x == end_x ) {
        next_x = 0;
        next_y++;
    }

    const vbx_half_t *vimageDMA = v->vimage[!v->db];  // dma
    // const vbx_half_t *vblockDMA = v->vblock[!v->db]; // dma; never used directly
    const vbx_half_t *vimageVPU = v->vimage[ v->db];  // active
    const vbx_half_t *vblockVPU = v->vblock[ v->db];  // active
    const vbx_half_t *vblockTMP = v->vblock[ 2 ];     // temp
    const vbx_half_t *vcoeff = v->vcoeff;
    const vbx_half_t *vprods = v->vprods;
    const vbx_half_t *vaccum = v->vaccum;
    const vbx_half_t *vflags = v->vflags;

#if DMA
    // First, prefetch the next chunk of the next image for a future call to fdct_tile()
#if NUM_TILE_Y > 1
    if( get_next ) // get row 0
        getBigTileImageY( vimageDMA, image + next_x*NUM_TILE_X*BLOCK_SIZE + next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, 0 );
#else
    if( get_next ) // get row 0
        getBigTileImage( vimageDMA, image + next_x*NUM_TILE_X*BLOCK_SIZE + next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, 0 );
#endif
#endif

    int r;
    for( r=0; r < BLOCK_SIZE; r++ ) {
        // multiply the whole BIG_TILE with row 'r' of the image matrix -- earlier versions switched the DCT matrix instead
        vbx_set_vl( NUM_TILE_X * BLOCK_SIZE );                                                           // for the length of tiled rows
        vbx_set_2D( BLOCK_SIZE, NUM_TILE_X*BLOCK_SIZE*sizeof(dt), 0, NUM_TILE_X*BLOCK_SIZE*sizeof(dt) ); // for all rows of tiled coefficients
        vbx_set_3D( NUM_TILE_Y, NUM_TILE_X*DCT_SIZE*sizeof(dt), NUM_TILE_X*DCT_SIZE*sizeof(dt), 0 );     // for all groups Y
        vbx_3D( VVH, VMUL, vprods, vimageVPU + r*NUM_TILE_X*BLOCK_SIZE, vcoeff );                        // for all 'columns' of tiled data

#if ACCUMULATE
        // accumulate the multiply operations
#if 0 & USE_ACCUM_FLAGS
        vbx_set_vl( NUM_TILE_X*BLOCK_SIZE*NUM_TILE_Y*BLOCK_SIZE - (BLOCK_SIZE-1) );
        vbx( VVH, VADD, vaccum, vprods+0, vprods+1 );
        vbx_set_2D( BLOCK_SIZE-2, 0, 0, sizeof(dt) );
        vbx_2D( VVH, VADD, vaccum, vaccum, vprods+2 );
        vbx( VVH, VCMV_Z, vblockTMP+r, vaccum, vflags );
#elif BLOCK4 // case DCT 4
        vbx_set_vl( NUM_TILE_X*BLOCK_SIZE*NUM_TILE_Y*BLOCK_SIZE - (BLOCK_SIZE-1) );
        vbx( VVH, VADD, vaccum, vprods,  vprods+1 );
        vbx( VVH, VADD, vaccum, vaccum, vprods+2 );
        vbx( VVH, VADD, vaccum, vaccum, vprods+3 );
        vbx( VVH, VCMV_Z, vblockTMP+r, vaccum, vflags );
#else // correct?
        vbx_set_vl( BLOCK_SIZE );
        vbx_set_2D( BLOCK_SIZE, NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt) );
        vbx_set_3D( NUM_TILE_X, BLOCK_SIZE*sizeof(dt), BLOCK_SIZE*sizeof(dt), BLOCK_SIZE*sizeof(dt) );
#if NUM_TILE_Y == 1
        vbx_acc_3D( VVH, VOR, vblockTMP + r, vprods, vprods );
#else
        int y;
        for( y=0; y < NUM_TILE_Y; y++ ) {
            vbx_acc_3D( VVH, VOR, vblockTMP + r + y*NUM_TILE_X*DCT_SIZE,
                        vprods + y*NUM_TILE_X*DCT_SIZE, vprods + y*NUM_TILE_X*DCT_SIZE );
        }
#endif
#endif
#endif

#if 0 // don't do DMA READS here yet; a DMA WRITE may still be in progress, give it a chance to finish
#if DMA
        // every other iteration, prefetch the next row of the next image
        // NB: with 2D DMA, we could issue this as a single DMA request at the top of the file;
        // instead, we must intersperse these 1D DMA strips to ensure they don't block the instruction queue
#if NUM_TILE_Y > 1
        if( !(r&1) && get_next )
            getBigTileImageY( vimageDMA, image + next_x*NUM_TILE_X*BLOCK_SIZE + next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, (1+((r-1)>>1)) ); // BLOCK_SIZE/2 rows added
#else
        if( !(r&1) && get_next )
            getBigTileImage( vimageDMA, image + next_x*NUM_TILE_X*BLOCK_SIZE + next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, (1+((r-1)>>1)) );  // BLOCK_SIZE/2 rows added
#endif
#endif
#endif
    }

    vbx_set_vl( NUM_TILE_X*BLOCK_SIZE*NUM_TILE_Y*BLOCK_SIZE );
    vbx( SVH, VSHR, vblockTMP, SHIFT_AMOUNT, vblockTMP );

    // now do the transposed version
    for( r=0; r < BLOCK_SIZE; r++ ) {
        // multiply the whole BIG_TILE with row 'r' of the coefficient matrix
        vbx_set_vl( NUM_TILE_X * BLOCK_SIZE );                                                           // for the length of tiled rows
        vbx_set_2D( BLOCK_SIZE, NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), 0 ); // for all 'columns' of tiled data
        vbx_set_3D( NUM_TILE_Y, NUM_TILE_X*DCT_SIZE*sizeof(dt), NUM_TILE_X*DCT_SIZE*sizeof(dt), 0 );     // for all groups Y
        vbx_3D( VVH, VMUL, vprods, vblockTMP, vcoeff + r*NUM_TILE_X*BLOCK_SIZE );                        // for all rows of tiled coefficients

#if ACCUMULATE
        // accumulate the multiply operations
#if 0 & USE_ACCUM_FLAGS
        vbx_set_vl( NUM_TILE_X*BLOCK_SIZE*NUM_TILE_Y*BLOCK_SIZE - (BLOCK_SIZE-1) );
        vbx( VVH, VADD, vaccum, vprods+0, vprods+1 );
        vbx_set_2D( BLOCK_SIZE-2, 0, 0, sizeof(dt) );
        vbx_2D( VVH, VADD, vaccum, vaccum, vprods+2 );
        vbx( VVH, VCMV_Z, vblockVPU+r, vaccum, vflags );
#elif BLOCK4 // case DCT 4
        vbx_set_vl( NUM_TILE_X*BLOCK_SIZE*NUM_TILE_Y*BLOCK_SIZE - (BLOCK_SIZE-1) );
        vbx( VVH, VADD, vaccum, vprods,  vprods+1 );
        vbx( VVH, VADD, vaccum, vaccum, vprods+2 );
        vbx( VVH, VADD, vaccum, vaccum, vprods+3 );
        //vbx( VVH, VCMV_Z, vblockVPU+r, vaccum, vflags );
        vbx_set_vl( NUM_TILE_X*BLOCK_SIZE - (BLOCK_SIZE-1) );                        // for the length of a tiled row
        vbx_set_2D( BLOCK_SIZE, 1*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), 0 ); // for all tiled rows
#if NUM_TILE_Y == 1
        vbx_2D( VVH, VCMV_Z, vblockVPU + r*NUM_TILE_X*BLOCK_SIZE, vaccum, vflags );
#else
        int y;
        for( y=0; y < NUM_TILE_Y; y++ ) {
            vbx_2D( VVH, VCMV_Z, vblockVPU + r*NUM_TILE_X*BLOCK_SIZE + y*NUM_TILE_X*DCT_SIZE,
                    vaccum + y*NUM_TILE_X*DCT_SIZE, vflags );
        }
#endif
#else // correct?
        vbx_set_vl( BLOCK_SIZE );                                                                                 // for the length of a row
        vbx_set_2D( BLOCK_SIZE, sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt) ); // for all rows in that block
        vbx_set_3D( NUM_TILE_X, BLOCK_SIZE*sizeof(dt), BLOCK_SIZE*sizeof(dt), BLOCK_SIZE*sizeof(dt) );            // for all tiled blocks horizontally (x)
#if NUM_TILE_Y == 1
        vbx_acc_3D( VVH, VOR, vblockVPU + r*NUM_TILE_X*BLOCK_SIZE, vprods, vprods );
#else
        int y;
        for( y=0; y < NUM_TILE_Y; y++ ) {
            vbx_acc_3D( VVH, VOR, vblockVPU + r*NUM_TILE_X*BLOCK_SIZE + y*NUM_TILE_X*DCT_SIZE,
                        vprods + y*NUM_TILE_X*DCT_SIZE, vprods + y*NUM_TILE_X*DCT_SIZE );
        }
#endif
#endif
#endif

#if DMA
        // every other iteration, prefetch the next row of the next image
        // NB: with 2D DMA, we could issue this as a single DMA request at the top of the file;
        // instead, we must intersperse these 1D DMA strips to ensure they don't block the instruction queue
#if NUM_TILE_Y > 1
        //if( !(r&1) && r<(BLOCK_SIZE-1) && get_next )
        if( get_next )
            getBigTileImageY( vimageDMA, image + next_x*NUM_TILE_X*BLOCK_SIZE + next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, r ); //(BLOCK_SIZE/2 +1+((r-1)>>1)) ); // BLOCK/2 -1 rows
#else
        //if( !(r&1) && r<(BLOCK_SIZE-1) && get_next )
        if( get_next )
            getBigTileImage( vimageDMA, image + next_x*NUM_TILE_X*BLOCK_SIZE + next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, r );  //(BLOCK_SIZE/2 +1+((r-1)>>1)) ); // BLOCK/2 -1 rows
#endif
#endif
    }

    vbx_set_vl( NUM_TILE_X*BLOCK_SIZE*NUM_TILE_Y*BLOCK_SIZE );
    vbx( SVH, VSHR, vblockVPU, SHIFT_AMOUNT, vblockVPU );

#if DMA2
    // Write result back to memory as one big block
    vbx_dma_to_host( block_v, vblockVPU, BIG_TILE_SIZE*sizeof(dt) );
#endif
    v->db = !v->db;

#ifdef DEBUG
    {
        vbx_sync();
        int i, j;
        printf("%d\n", !v->db);
        for( i=0; i < BLOCK_SIZE*NUM_TILE_Y; i++ ) {
            for( j=0; j < BLOCK_SIZE*NUM_TILE_X; j++ ) {
                printf(" %4d", block_v[i*BLOCK_SIZE*NUM_TILE_X + j]);
            }
            printf("\n");
        }
    }
#endif
}
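/*
 * A minimal scalar sketch of what the two tiled passes above compute for a
 * single BLOCK_SIZE x BLOCK_SIZE block: a fixed-point 2D DCT done as two
 * matrix multiplies, with a renormalizing shift after each pass:
 *     tmp = (C*X)     >> SHIFT_AMOUNT
 *     out = (tmp*C^T) >> SHIFT_AMOUNT
 * The function and array names here are illustrative; only the coefficient
 * matrix, SHIFT_AMOUNT, and dt come from this file.
 */
static void scalar_fdct_block( const dt c[BLOCK_SIZE][BLOCK_SIZE],
                               const dt x[BLOCK_SIZE][BLOCK_SIZE],
                               dt out[BLOCK_SIZE][BLOCK_SIZE] )
{
    dt tmp[BLOCK_SIZE][BLOCK_SIZE];
    int i, j, k;
    for( i=0; i < BLOCK_SIZE; i++ ) {         // first pass:  tmp = (C*X) >> SHIFT_AMOUNT
        for( j=0; j < BLOCK_SIZE; j++ ) {
            long acc = 0;
            for( k=0; k < BLOCK_SIZE; k++ )
                acc += (long)c[i][k] * x[k][j];
            tmp[i][j] = (dt)(acc >> SHIFT_AMOUNT);
        }
    }
    for( i=0; i < BLOCK_SIZE; i++ ) {         // second pass: out = (tmp*C^T) >> SHIFT_AMOUNT
        for( j=0; j < BLOCK_SIZE; j++ ) {
            long acc = 0;
            for( k=0; k < BLOCK_SIZE; k++ )
                acc += (long)tmp[i][k] * c[j][k];
            out[i][j] = (dt)(acc >> SHIFT_AMOUNT);
        }
    }
}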
int compare_vbx_lut_to_vbx_lut_ci(int stage, int max_print_errors)
{
    vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
    int vci_lanes = this_mxp->vcustom0_lanes;
    int sz = this_mxp->scratchpad_size/(16*sizeof(vbx_ubyte_t));

    vbx_byte_t*  v_pass    = (vbx_byte_t*) vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_pattern = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_lutc    = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_group   = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_sel     = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_lut     = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t));
    vbx_ubyte_t* v_idx     = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t));
    if(v_idx == NULL) {
        printf("failed to allocate in compare_vbx_lut_to_vbx_lut_ci\n");
    }
    unsigned char* lut   = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char));
    unsigned char* lut_c = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char));

    int f, n, s, errors = 0;
    for (n = 0; n < sz; n++) {
        v_pattern[n] = (n & 0xff);
    }

    for (f = 0; f < face_lbp[stage].count; f++) {
        lbp_feat_t feat = face_lbp[stage].feats[f];
        vbx_set_vl(sz);

        // global feature index = features in all previous stages + f
        int total = f;
        s = 0;
        while(s < stage){
            total += face_lbp[s].count;
            s++;
        }

        // custom instruction path
        if(total < 256) {
            vbx(SVBU, VLBPLUT, v_lutc, total, v_pattern);
        } else {
            vbx(SVBS, VLBPLUT, v_lutc, total-256, v_pattern);
        }

        // plain vector path: check if pattern is in lut
        vbx(SVB, VMOV, v_pass, feat.fail, 0);
        vbx(SVBU, VSHR, v_group, 5, v_pattern);
        for (n = 0; n < 8; n++) {
            vbx(SVB, VADD, v_sel, -n, v_group);
            vbx(SVBW, VCMV_Z, v_lut, feat.lut[n], v_sel);
        }
        vbx(SVBWU, VAND, v_idx, 0x1f, v_pattern);
        vbx(VVWB, VSHR, v_lut, v_idx, v_lut);
        vbx(SVB, VAND, v_lut, 1, v_lut);
        vbx(SVB, VCMV_LEZ, v_pass, feat.pass, v_lut);

        vbx_dma_to_host(lut_c, v_lutc, sz*sizeof(unsigned char));
        vbx_dma_to_host(lut, v_pass, sz*sizeof(unsigned char));
        vbx_sync();
        errors += match_array_byte(lut, lut_c, "custom_lut", sz, 1, max_print_errors, 0, 0);
    }

    vbx_sp_free();
    vbx_shared_free(lut);
    vbx_shared_free(lut_c);
    return errors;
}
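/*
 * Scalar sketch of the per-pattern check that both paths above implement:
 * bits 7:5 of the 8-bit pattern select one of the eight 32-bit words in
 * feat.lut, bits 4:0 select a bit within that word, and a clear bit selects
 * feat.pass while a set bit leaves feat.fail (matching the VCMV_LEZ above).
 * The helper name is illustrative; the lbp_feat_t fields are from this file.
 */
static inline int scalar_lbp_feat_check( const lbp_feat_t *feat, unsigned char pattern )
{
    unsigned group = pattern >> 5;    // which 32-bit lut word
    unsigned bit   = pattern & 0x1f;  // which bit within that word
    return ((feat->lut[group] >> bit) & 1) ? feat->fail : feat->pass;
}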
/* takes in precomputed bfly */
static int vector_fix_fft_dif_long_fly(short fr[], short fi[], short fr2[], short fi2[],
                                       short tw_r[], short tw_i[], short m, short inverse, short real)
{
    int i, j, l, k, scale, shift, a1, a2, bfly, mul, flight, swap, row_num;
    short wr, wi;
    vptr_half v_fr, v_fi, v_fr2, v_fi2, v_tmp;
    vptr_half v_twr, v_twi;
    vptr_half v_arp, v_aip, v_brp, v_bip, v_crp, v_cip;
    vptr_half v_temp;
    vptr_half v_twr2, v_twi2;

    const int n = 1 << m;
    const int half = n >> 1;

    scale = 0;
    mul = 0;
    swap = m >> 1;
    l = m-1;
    flight = 1;
    bfly = half;

    const int INROWS = 1 << swap;
    const int INCOLS = 1 << (m-swap);

    if ( !(m%2) ){
        swap--;
    }

    // allocate space in vector memory for vectors
    v_fr   = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
    v_fi   = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
    v_fr2  = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
    v_fi2  = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
    v_twr  = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
    v_twi  = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
    v_temp = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
    if( v_fr == NULL || v_fi == NULL || v_fr2 == NULL || v_fi2 == NULL ||
        v_twr == NULL || v_twi == NULL || v_temp == NULL ) {
        VBX_EXIT(-1);
    }
    v_twr2 = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
    v_twi2 = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
    if( v_twr2 == NULL || v_twi2 == NULL ) {
        VBX_EXIT(-1);
    }

    vbx_dma_to_vector( v_fr,  fr,   n*sizeof(vbx_half_t) );
    vbx_dma_to_vector( v_fi,  fi,   n*sizeof(vbx_half_t) );
    vbx_dma_to_vector( v_twr, tw_r, half*sizeof(vbx_half_t) );
    vbx_dma_to_vector( v_twi, tw_i, half*sizeof(vbx_half_t) );

#if 1
    if(real){
        vector_fix_fft_untangle_real_scratch( v_fr, v_fi, v_fr2, v_fi2, v_twr, v_twi, m, inverse);
    }
#endif

    while (l > swap) {
        if (inverse) {
            // variable scaling, depending upon data
            shift = 0;
            if( isAbsOutOfRangeV(v_fr, v_fi, v_temp, n) ) {
                shift = 1;
                scale++;
            }
        } else {
            // fixed scaling, for proper normalization --
            // overall factor of 1/n, distributed to maximize arithmetic accuracy
            shift = 1;
        }
        // shift will be performed on each data point exactly once during pass
        SWAP( v_fr, v_fr2, v_tmp );
        SWAP( v_fi, v_fi2, v_tmp );
        if (shift){
            vbx_set_vl( n );
            vbx(SVH, VSHR, v_fr2, 1, v_fr2 );
            vbx(SVH, VSHR, v_fi2, 1, v_fi2 );
        }

        // butterflies: sums land in the first half, twiddled differences in the second
        vbx_set_vl( 1<<l );
        vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<(l+1)), sizeof(vbx_half_t)*(1<<(l+1)) );
        vbx_2D( VVH, VADD, v_fr, v_fr2, v_fr2 + (1<<l) );
        vbx_2D( VVH, VADD, v_fi, v_fi2, v_fi2 + (1<<l) );
        vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<(l+1)), sizeof(vbx_half_t)*(1<<(l+1)), sizeof(vbx_half_t)*(1<<(l+1)) );
        vbx_2D( VVH, VSUB, v_fr2, v_fr2, v_fr2 + (1<<l) );
        vbx_2D( VVH, VSUB, v_fi2, v_fi2, v_fi2 + (1<<l) );

        vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<(l+1)), 0 );
        vbx_2D( VVH, VMULFXP, &v_fr[n>>1], v_fr2, v_twr );
        vbx_2D( VVH, VMULFXP, v_temp, v_fi2, v_twi );
        vbx_set_vl( n>>1 );
        // vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l) );
        vbx( VVH, VSUB, &v_fr[n>>1], &v_fr[n>>1], v_temp );

        vbx_set_vl( 1<<l );
        vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<(l+1)), 0 );
        vbx_2D( VVH, VMULFXP, &v_fi[n>>1], v_fi2, v_twr );
        vbx_2D( VVH, VMULFXP, v_temp, v_fr2, v_twi );
        vbx_set_vl( n>>1 );
        // vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l) );
        vbx( VVH, VADD, &v_fi[n>>1], &v_fi[n>>1], v_temp );

        l--;
        mul++;
        flight <<= 1;
        if( l > swap ) {
            // keep every other twiddle factor for the next, shorter stage
            vbx_set_vl( 1<<l );
            vbx( VVWH, VMOV, v_twr, v_twr, 0 );
            vbx( VVWH, VMOV, v_twi, v_twi, 0 );
        }
    }
    if ( !(m%2) ) {
        l++;
        flight >>= 1;
    }
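/*
 * Scalar sketch of one radix-2 DIF butterfly as vectorized in the loop above,
 * assuming VMULFXP behaves as a Q15 fractional multiply; the helper name is
 * illustrative:
 *     a' = a + b
 *     b' = (a - b) * w        (complex multiply by the twiddle factor w)
 */
static inline void scalar_dif_butterfly( short *ar, short *ai, short *br, short *bi,
                                         short wr, short wi )
{
    short tr = *ar - *br;
    short ti = *ai - *bi;
    *ar = *ar + *br;
    *ai = *ai + *bi;
    *br = (short)(((long)tr*wr - (long)ti*wi) >> 15); // real part of (a-b)*w
    *bi = (short)(((long)ti*wr + (long)tr*wi) >> 15); // imaginary part of (a-b)*w
}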
void vbw_fix16_sqrt( vbx_word_t* v_out, vbx_word_t* v_x, int length )
{
    vbx_sp_push();

    //vbx_word_t* v_tmp = (vbx_word_t *)vbx_sp_malloc(sizeof(vbx_word_t)*length*11);
    vbx_word_t* v_tmp = (vbx_word_t *)vbx_sp_malloc(sizeof(vbx_word_t)*length*10);
    vbx_word_t*  v_result   = v_tmp + 0*length;
    vbx_uword_t* v_bit      = (vbx_uword_t*)v_tmp + 1*length;
    vbx_word_t*  v_num      = v_tmp + 2*length;
    vbx_uword_t* v_else_num = (vbx_uword_t*)v_tmp + 3*length;
    vbx_uword_t* v_t_bit    = (vbx_uword_t*)v_tmp + 4*length;
    vbx_uword_t* v_t_num    = (vbx_uword_t*)v_tmp + 5*length;
    vbx_uword_t* v_t_add    = (vbx_uword_t*)v_tmp + 6*length;
    vbx_word_t*  v_t_sub    = v_tmp + 7*length;
    vbx_uword_t* v_t_result = (vbx_uword_t*)v_tmp + 8*length;
    vbx_uword_t* v_if_num   = (vbx_uword_t*)v_tmp + 9*length;
    //vbx_word_t* v_neg = v_tmp + 10*length;
    v_result = v_out;

    //uint8_t neg = (inValue < 0);
    //vbx(SVW, VMOV, v_neg, 0, 0 );
    //vbx(SVW, VCMV_LTZ, v_neg, 1, v_x);

    //uint32_t num = (neg ? -inValue : inValue);
    vbx(SVW, VABSDIFF, v_num, 0, v_x);
    //uint32_t result = 0;
    vbx(SVW, VMOV, v_result, 0, 0 );
    //uint32_t bit;
    vbx(SVWU, VMOV, v_bit, (1<<30), 0 );

    // Many numbers will be less than 15, so
    // this gives a good balance between time spent
    // in if vs. time spent in the while loop
    // when searching for the starting value.
    /*
    if (num & 0xFFF00000)
        bit = (uint32_t)1 << 30;
    else
        bit = (uint32_t)1 << 18;
    */

    // while (bit > num) bit >>= 2;
    int i, max_iter;
    max_iter = 16; // 1<<30 and >>2 every iter, so max iter = 30/2 + 1
    for(i=0; i<max_iter; i++){
        vbx(VVW, VSUB, v_t_sub, (vbx_word_t*)v_bit, v_num);
        vbx(SVWU, VSHR, v_t_bit, 2, v_bit);
        vbx(VVW, VCMV_GTZ, (vbx_word_t*)v_bit, (vbx_word_t*)v_t_bit, v_t_sub);
    }

    // The main part is executed twice, in order to avoid
    // using 64 bit values in computations.
    /*
    while (bit) {
        if (num >= result + bit) {
            num -= result + bit;
            result = (result >> 1) + bit;
        } else {
            result = (result >> 1);
        }
        bit >>= 2;
    }
    */
    max_iter = 16;
    for(i=0; i<max_iter; i++){
        // v_result + bit
        vbx(VVW, VADD, (vbx_word_t*)v_t_add, (vbx_word_t*)v_bit, v_result);
        // v_num - (v_result + bit)
        vbx(VVW, VSUB, v_t_sub, v_num, (vbx_word_t*)v_t_add);
        // if (v_num - (v_result + bit) >= 0) v_num = v_num - (v_result + bit)
        vbx(VVW, VCMV_GEZ, (vbx_word_t*)v_t_num, v_t_sub, v_t_sub);
        // else v_num stays
        vbx(VVW, VCMV_LTZ, (vbx_word_t*)v_t_num, v_num, v_t_sub);
        vbx(SVW, VSHR, (vbx_word_t*)v_t_result, 1, v_result);
        vbx(VVW, VADD, (vbx_word_t*)v_t_add, (vbx_word_t*)v_bit, (vbx_word_t*)v_t_result);
        // if (v_num - (v_result + bit) >= 0) v_result = (v_result >> 1) + bit
        // else v_result = v_result >> 1
        vbx(VVW, VCMV_GEZ, (vbx_word_t*)v_t_result, (vbx_word_t*)v_t_add, v_t_sub);
        vbx(SVW, VSHR, (vbx_word_t*)v_t_bit, 2, (vbx_word_t*)v_bit);
        // commit the updates only while bit is still nonzero
        vbx(VVW, VCMV_GTZ, v_num, (vbx_word_t*)v_t_num, (vbx_word_t*)v_bit);
        vbx(VVW, VCMV_GTZ, v_result, (vbx_word_t*)v_t_result, (vbx_word_t*)v_bit);
        vbx(VVW, VCMV_GTZ, (vbx_word_t*)v_bit, (vbx_word_t*)v_t_bit, (vbx_word_t*)v_bit);
    }

    //vbx(SVW, VSHL, v_result, 8, v_result);

    /*
    if (num > 65535) {
        // The remainder 'num' is too large to be shifted left
        // by 16, so we have to add 1 to result manually and
        // adjust 'num' accordingly.
        // num = a - (result + 0.5)^2
        //     = num + result^2 - (result + 0.5)^2
        //     = num - result - 0.5
        num -= result;
        num = (num << 16) - 0x8000;
        result = (result << 16) + 0x8000;
    } else {
        num <<= 16;
        result <<= 16;
    }
    bit = 1 << 14;
    */
    vbx(SVW, VSUB, v_t_sub, 65535, v_num);
    vbx(VVWU, VSUB, v_if_num, (vbx_uword_t*)v_num, (vbx_uword_t*)v_result);
    vbx(SVWU, VSHL, v_if_num, 16, v_if_num);
    vbx(SVWU, VADD, v_if_num, (-1*(0x8000)), v_if_num);
    vbx(SVWU, VSHL, v_t_result, 16, (vbx_uword_t*)v_result);
    vbx(SVWU, VADD, v_t_add, (0x8000), v_t_result);
    vbx(SVWU, VSHL, v_else_num, 16, (vbx_uword_t*)v_num);
    vbx(VVWU, VCMV_LTZ, (vbx_uword_t*)v_num, v_if_num, (vbx_uword_t*)v_t_sub);
    vbx(VVWU, VCMV_GEZ, (vbx_uword_t*)v_num, v_else_num, (vbx_uword_t*)v_t_sub);
    vbx(VVWU, VCMV_LTZ, (vbx_uword_t*)v_result, v_t_add, (vbx_uword_t*)v_t_sub);
    vbx(VVWU, VCMV_GEZ, (vbx_uword_t*)v_result, v_t_result, (vbx_uword_t*)v_t_sub);
    vbx(SVWU, VMOV, v_bit, (1<<14), 0);

    max_iter = 8; // 1<<14 and >>2 every iter, so 14/2 + 1
    for(i=0; i<max_iter; i++){
        vbx(VVWU, VADD, v_t_add, v_bit, (vbx_uword_t*)v_result);
        vbx(VVWU, VSUB, (vbx_uword_t*)v_t_sub, (vbx_uword_t*)v_num, v_t_add);
        vbx(VVW, VCMV_GEZ, (vbx_word_t*)v_t_num, v_t_sub, v_t_sub);
        vbx(VVW, VCMV_LTZ, (vbx_word_t*)v_t_num, v_num, v_t_sub);
        vbx(SVWU, VSHR, v_t_result, 1, (vbx_uword_t*)v_result);
        vbx(VVWU, VADD, v_t_add, v_bit, v_t_result);
        vbx(VVW, VCMV_GEZ, (vbx_word_t*)v_t_result, (vbx_word_t*)v_t_add, v_t_sub);
        vbx(SVWU, VSHR, v_t_bit, 2, v_bit);
        vbx(VVWU, VCMV_NZ, (vbx_uword_t*)v_num, v_t_num, v_bit);
        vbx(VVWU, VCMV_NZ, (vbx_uword_t*)v_result, v_t_result, v_bit);
        vbx(VVWU, VCMV_NZ, v_bit, v_t_bit, v_bit);
    }

#ifndef FIXMATH_NO_ROUNDING
    /*
    // Finally, if next bit would have been 1, round the result upwards.
    if (num > result) {
        result++;
    }
    */
    vbx(VVW, VSUB, v_t_sub, v_num, v_result);
    vbx(SVW, VADD, (vbx_word_t*)v_t_result, 1, v_result);
    vbx(VVW, VCMV_GTZ, v_result, (vbx_word_t*)v_t_result, v_t_sub);
#endif

    /* return (neg ? -result : result); */
    vbx(SVW, VSUB, (vbx_word_t*)v_t_result, 0, v_result);
    vbx(VVW, VCMV_LTZ, v_result, (vbx_word_t*)v_t_result, v_x);

    vbx_sp_pop();
}
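/*
 * Usage sketch for vbw_fix16_sqrt(), assuming the libfixmath-style Q16.16
 * format that the commented reference code above suggests; buffer names and
 * sizes are illustrative:
 */
static void example_fix16_sqrt( int n )
{
    vbx_word_t *host = (vbx_word_t*)vbx_shared_malloc( n*sizeof(vbx_word_t) );
    vbx_word_t *v_x  = (vbx_word_t*)vbx_sp_malloc( n*sizeof(vbx_word_t) );
    vbx_word_t *v_r  = (vbx_word_t*)vbx_sp_malloc( n*sizeof(vbx_word_t) );
    int i;
    for( i=0; i<n; i++ ) host[i] = (i+1) << 16;        // Q16.16 values 1.0, 2.0, ...
    vbx_dma_to_vector( v_x, host, n*sizeof(vbx_word_t) );
    vbw_fix16_sqrt( v_r, v_x, n );                     // element-wise sqrt, Q16.16 in/out
    vbx_dma_to_host( host, v_r, n*sizeof(vbx_word_t) );
    vbx_sync();                                        // host[i] now holds sqrt((i+1).0)
    vbx_shared_free( host );
    vbx_sp_free();
}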
/** Luma Edge Detection.
 *
 * @brief 3x3 Sobel edge detection with 8-bit luma image
 *
 * @param[out] output        32-bit aRGB edge-intensity output
 * @param[in]  input         8-bit luma input
 * @param[in]  image_width   Image width in pixels
 * @param[in]  image_height  Image height in pixels
 * @param[in]  image_pitch   Distance in pixels between the starts of subsequent rows; usually equal to image_width
 * @param[in]  renorm        Number of bits to right-shift the final intensity by
 * @returns Negative on error condition. See vbw_exit_codes.h
 */
int vbw_sobel_luma8_3x3(unsigned *output, unsigned char *input, const short image_width,
                        const short image_height, const short image_pitch, const short renorm)
{
    int y;
    vbx_ubyte_t *v_luma_top, *v_luma_mid, *v_luma_bot;
    vbx_uword_t *v_row_out;
    vbx_uhalf_t *v_sobel_row_top, *v_sobel_row_mid, *v_sobel_row_bot;
    vbx_uhalf_t *v_gradient_x, *v_gradient_y;
    vbx_uhalf_t *v_tmp;
    void *tmp_ptr;

    vbx_sp_push();

    // Allocate space in scratchpad for vectors
    rotating_prefetcher_t v_luma = rotating_prefetcher(3, image_width*sizeof(vbx_ubyte_t),
                                                       input, input+image_height*image_pitch,
                                                       image_pitch*sizeof(vbx_ubyte_t));
    v_sobel_row_top = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
    v_sobel_row_mid = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
    v_sobel_row_bot = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
    v_gradient_x    = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
    v_gradient_y    = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
    v_row_out       = (vbx_uword_t *)vbx_sp_malloc(image_width*sizeof(vbx_uword_t));
    if(v_row_out == NULL) {
        vbx_sp_pop();
        return VBW_ERROR_SP_ALLOC_FAILED;
    }

    // Transfer the first 3 input rows and interleave the first 2 sobel row calculations
    rp_fetch(&v_luma);
    rp_fetch(&v_luma);
    v_luma_top = rp_get_buffer(&v_luma, 0);
    vbw_sobel_3x3_row(v_sobel_row_top, v_luma_top, image_width);
    rp_fetch(&v_luma);
    v_luma_mid = rp_get_buffer(&v_luma, 1);
    vbw_sobel_3x3_row(v_sobel_row_mid, v_luma_mid, image_width);

    // Set top output row to 0
    vbx_set_vl(image_width);
    vbx(SVWU, VMOV, v_row_out, 0, 0);
    vbx_dma_to_host(output, v_row_out, image_width*sizeof(vbx_uword_t));

    // Calculate edges
    for (y = 0; y < image_height-(FILTER_HEIGHT-1); y++) {
        // Transfer the next input row while processing
        rp_fetch(&v_luma);
        v_luma_top = rp_get_buffer(&v_luma, 0);
        v_luma_mid = rp_get_buffer(&v_luma, 1);
        v_luma_bot = rp_get_buffer(&v_luma, 2);

        // Start calculating gradient_x
        vbx_set_vl(image_width);
        vbx(SVBHU, VSHL, v_gradient_x, 1, v_luma_mid); // multiply by 2

        // Calculate gradient_y
        // Apply [1 2 1] matrix to last row in window and calculate absolute difference with pre-computed first row
        vbw_sobel_3x3_row(v_sobel_row_bot, v_luma_bot, image_width);
        vbx(VVH, VABSDIFF, (vbx_half_t*)v_gradient_y, (vbx_half_t*)v_sobel_row_top, (vbx_half_t*)v_sobel_row_bot);

        // Re-use v_sobel_row_top
        v_tmp = v_sobel_row_top;

        // Finish calculating gradient_x
        // Apply [1 2 1]^T matrix to all columns
        vbx_set_vl(image_width);
        vbx(VVBHU, VADD, v_tmp, v_luma_top, v_luma_bot);
        vbx(VVHU, VADD, v_tmp, v_tmp, v_gradient_x);
        // For each column, calculate absolute difference with 2nd column to the right
        vbx_set_vl(image_width-2);
        vbx(VVH, VABSDIFF, (vbx_half_t*)v_gradient_x, (vbx_half_t*)v_tmp, (vbx_half_t*)v_tmp+2);

        // Sum of absolute gradients
        //vbx_set_vl(image_width-2);
        vbx(VVHU, VADD, v_tmp, v_gradient_x, v_gradient_y);
        vbx(SVHU, VSHR, v_tmp, renorm, v_tmp);

        // Threshold (saturate at 255)
        vbx(SVHU, VSUB, v_gradient_y, 255, v_tmp);
        vbx(SVHU, VCMV_LTZ, v_tmp, 255, v_gradient_y);

        // Copy the result to the low byte of the output row
        // Trick to copy the low byte (b) to the middle two bytes as well
        // Note that first and last columns are 0
        //vbx_set_vl(image_width-2);
        vbx(SVHWU, VMULLO, v_row_out+1, 0x00010101, v_tmp);

        // DMA the result to the output
        vbx_dma_to_host(output+(y+1)*image_pitch, v_row_out, image_width*sizeof(vbx_uword_t));

        // Rotate v_sobel_row buffers (for gradient_y)
        tmp_ptr = (void *)v_sobel_row_top;
        v_sobel_row_top = v_sobel_row_mid;
        v_sobel_row_mid = v_sobel_row_bot;
        v_sobel_row_bot = (vbx_uhalf_t *)tmp_ptr;
    }

    // Set bottom row to 0
    vbx_set_vl(image_width);
    vbx(SVWU, VMOV, v_row_out, 0, 0);
    vbx_dma_to_host(output+(image_height-1)*image_pitch, v_row_out, image_width*sizeof(vbx_uword_t));

    vbx_sync();
    vbx_sp_pop();
    return VBW_SUCCESS;
}
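/*
 * Scalar sketch of the per-pixel computation vectorized above, using the same
 * separable decomposition ([1 2 1] smoothing crossed with a two-apart
 * absolute difference); the helper name and centered indexing are
 * illustrative:
 */
static unsigned char scalar_sobel_pixel( const unsigned char *in, int pitch,
                                         int x, int y, int renorm )
{
    const unsigned char *t = in + (y-1)*pitch + x;  // row above
    const unsigned char *m = in + (y  )*pitch + x;  // current row
    const unsigned char *b = in + (y+1)*pitch + x;  // row below
    // gradient_y: [1 2 1]-smoothed top and bottom rows, absolute difference
    int dy = (t[-1] + 2*t[0] + t[1]) - (b[-1] + 2*b[0] + b[1]);
    int gy = dy < 0 ? -dy : dy;
    // gradient_x: [1 2 1]^T-smoothed left and right columns, absolute difference
    int dx = (t[-1] + 2*m[-1] + b[-1]) - (t[1] + 2*m[1] + b[1]);
    int gx = dx < 0 ? -dx : dx;
    int e = (gx + gy) >> renorm;                    // renormalize, then saturate
    return (e > 255) ? 255 : (unsigned char)e;
}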