inline int vec_fir_ext(vbx_mm_t *output, vbx_mm_t *input, vbx_mm_t *coeffs, int sample_size, int num_taps)
{
	vbx_sp_push();
	int ret = vec_fir_tiler<vbx_mm_t,false>(output, input, coeffs, sample_size, num_taps);
	vbx_sp_pop();
	return ret;
}
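/* A minimal usage sketch for vec_fir_ext (not part of the library; buffer names and
   sizes are hypothetical). It assumes vbx_shared_malloc() returns DMA-able memory,
   as it is used for elsewhere in this code, and that the host wants a vbx_sync()
   before reading the filtered output. */
static void vec_fir_example(void)
{
	const int SAMPLES = 1024, TAPS = 16;
	vbx_mm_t *in     = (vbx_mm_t*)vbx_shared_malloc( SAMPLES*sizeof(vbx_mm_t) );
	vbx_mm_t *out    = (vbx_mm_t*)vbx_shared_malloc( SAMPLES*sizeof(vbx_mm_t) );
	vbx_mm_t *coeffs = (vbx_mm_t*)vbx_shared_malloc( TAPS*sizeof(vbx_mm_t) );
	// ... fill in[] and coeffs[] ...
	vec_fir_ext( out, in, coeffs, SAMPLES, TAPS );
	vbx_sync(); // wait for the final DMA before the host touches out[]
}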
double test_vector_sp(vbx_mm_t *vector_out, vbx_mm_t *vector_in1, int IN1ROWS, int IN1COLS,
                      vbx_mm_t *vector_in2, int IN2ROWS, int IN2COLS, double scalar_time)
{
	typedef vbx_mm_t vbx_sp_t;
	int retval = -1;
	vbx_timestamp_t time_start, time_stop;
	printf( "\nExecuting MXP matrix multiply... src1[%dx%d] src2[%dx%d]\n",
	        IN1ROWS, IN1COLS, IN2ROWS, IN2COLS );

	vbx_timestamp_start();
	time_start = vbx_timestamp();

	vbx_sp_push();
	vbx_sp_t* v_in1 = (vbx_sp_t*)vbx_sp_malloc(sizeof(vbx_sp_t)*IN1ROWS*IN1COLS);
	vbx_sp_t* v_in2 = (vbx_sp_t*)vbx_sp_malloc(sizeof(vbx_sp_t)*IN2ROWS*IN2COLS);
	vbx_sp_t* v_out = (vbx_sp_t*)vbx_sp_malloc(sizeof(vbx_sp_t)*IN1ROWS*IN2COLS);
	if( v_out != NULL ) {
		vbx_dma_to_vector(v_in1, vector_in1, sizeof(vbx_sp_t)*IN1ROWS*IN1COLS);
		vbx_dma_to_vector(v_in2, vector_in2, sizeof(vbx_sp_t)*IN2ROWS*IN2COLS);
		retval = vbw_mtx_mul( v_out, v_in1, IN1ROWS, IN1COLS, v_in2, IN2ROWS, IN2COLS );
		vbx_dma_to_host(vector_out, v_out, sizeof(vbx_sp_t)*IN1ROWS*IN2COLS);
		vbx_sync();
	} else {
		printf("not enough sp space for sp test\n");
	}
	vbx_sp_pop(); // release the scratchpad allocations made above

	time_stop = vbx_timestamp();
	printf( "...done. retval:0x%08X\n", retval );
	return vbx_print_vector_time( time_start, time_stop, scalar_time );
}
int deep_vector_copy_test()
{
	int retval;
	int num_test;
	int total_errors = 0;
	const int NUM_TESTS = TEST_DEEP_SP_NUM_TESTS;
	const int NB = vbx_sp_getfree();
	int NT = NB / sizeof(vbx_sp_t);

	vbx_sp_push();
	vbx_sp_t *v = vbx_sp_malloc( NB );
	srand( 0x1a84c92a );

	for( num_test=0; num_test < NUM_TESTS; num_test++ ) {
		// initialize entire available scratchpad
		vbx_set_vl( NT );
		vbx( SE(T), VAND, v, MSK, 0 );

		// choose random src/dest/length:
		// -- randomly pick the dest
		// -- set a window size of 2*K around the dest
		// -- randomly pick the src within the window
		// -- randomly pick the length, subject to end-of-scratchpad
		// -- this 'window' rule increases probability of overlaps
		// -- rough distribution: 30% short (pipeline) overlaps, 20% long overlaps, 50% no overlap
		int K, N1, N2, NN;
		N1 = rand() % NT;
		K  = 1 + rand() % ((N1 > 0) ? min(min(N1, NT-N1), 1024) : min(NT, 1024));
		N2 = N1 - K + rand() % (2*K);
		NN = rand() % (NT - max(N1,N2));
		vbx_sp_t *dst = v + N1;
		vbx_sp_t *src = v + N2;
		printf("test:%d src:0x%08x dst:0x%08x len:%08d", num_test, N2, N1, NN );

		// do the copy
		retval = VBX_T(vbw_vec_copy)( dst, src, NN );
		vbx_sync();
		printf(" retval:0x%04x\n", retval);

		// ensure the copy was done properly
		int errors = verify_copy((vbx_mm_t *)v, 0,     N1,    0,       "head")
		           + verify_copy((vbx_mm_t *)v, N1,    NN+N1, (N2-N1), "copy")
		           + verify_copy((vbx_mm_t *)v, NN+N1, NT,    0,       "tail");
		total_errors += errors;
		if( errors ) {
			//break;
		}
	}
	vbx_sp_pop();
	return total_errors;
}
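/* A plausible sketch of the verify_copy() helper the test above relies on (the real
   helper lives elsewhere; this version is an assumption). It presumes the
   enumerated-VAND initialization leaves element i of the scratchpad holding
   (i & MSK), so element i of a checked region must hold ((i+delta) & MSK), and it
   presumes the scratchpad contents are host-readable after vbx_sync(). Returns the
   number of mismatching elements. */
static int verify_copy( vbx_mm_t *v, int lo, int hi, int delta, const char *region )
{
	int i, errors = 0;
	vbx_sync(); // let all vector ops and DMA retire before inspecting the data
	for( i=lo; i < hi; i++ ) {
		vbx_mm_t expected = (vbx_mm_t)((i + delta) & MSK);
		if( v[i] != expected ) {
			if( !errors ) {
				printf("  %s region: first mismatch at %d (got %d, expected %d)\n",
				       region, i, (int)v[i], (int)expected);
			}
			errors++;
		}
	}
	return errors;
}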
vbx_mtx_fdct_t * vbx_mtx_fdct_init( dt *coeff_v, dt *image )
{
	const int BIG_TILE_SIZE = NUM_TILE_X * NUM_TILE_Y * DCT_SIZE;
	const int num_bytes = BIG_TILE_SIZE * sizeof(dt);
	const int co_bytes = NUM_TILE_X * DCT_SIZE * sizeof(dt);

	// compute coefficient matrix in double precision, then truncate to dt
	int i, j;
	double s;
	for (i = 0; i < BLOCK_SIZE; i++) {
		s = (i == 0) ? sqrt(0.125) : 0.5;
		for (j = 0; j < BLOCK_SIZE; j++) {
			// DCT-II basis: s * cos( (pi/8) * i * (j + 0.5) )
			c2[i][j] = s * cos( (PI / 8.0) * i * (j + 0.5) );
			cs[i][j] = (dt) (c2[i][j] * SHIFT_DOUBLE + 0.499999);
		}
	}

	vbx_sp_push();

	vbx_mtx_fdct_t *v = vbx_shared_malloc( sizeof(vbx_mtx_fdct_t) );
	v->vcoeff = (vbx_half_t *)vbx_sp_malloc( co_bytes );
	v->vprods = (vbx_half_t *)vbx_sp_malloc( num_bytes );
#if USE_ACCUM_FLAGS
	v->vaccum = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vflags = (vbx_half_t *)vbx_sp_malloc( num_bytes );
#endif
	// interleave ordering to ensure no false hazards
	v->vblock[2] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vimage[0] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vblock[0] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vimage[1] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vblock[1] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	if( !v->vblock[1] ) {
		VBX_PRINTF( "ERROR: out of memory.\n" );
		VBX_EXIT(-1);
	}

	vbx_dma_to_vector( v->vcoeff, coeff_v, co_bytes );

	int row;
	for( row=0; row < BLOCK_SIZE; row++ ) {
		getBigTileImageY( v->vimage[v->db], image, row );
	}

#if USE_ACCUM_FLAGS
	// create a flag vector: first element 0, next 'BLOCK_SIZE-1' elements non-zero, etc.
	vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
	vbx( SEH, VAND, v->vflags, BLOCK_SIZE-1, 0 );
#endif
	return v;
}
int vbw_sobel_argb32_3x3(unsigned *output, unsigned *input, const short image_width,
                         const short image_height, const short image_pitch, const short renorm)
{
	size_t free_sp = vbx_sp_getfree();
	size_t vectors_needed = 8;
	size_t partial_width = free_sp/(vectors_needed*sizeof(vbx_uword_t));
	if( partial_width > image_width ) {
		// an entire row fits in the scratchpad, so process the image in one pass
		vbw_sobel_argb32_3x3_partial(output, input, image_width, image_height, image_pitch, renorm);
	} else {
		// a full row does not fit, so process partial_width columns at a time
		size_t partial_step = partial_width-2;
		int i;
		for( i=0; ; i += partial_step ) {
			// account for the last tile being smaller
			if( i+partial_width > image_width ) {
				partial_width = image_width-i;
			}
			vbw_sobel_argb32_3x3_partial(output+i, input+i, partial_width, image_height, image_pitch, renorm);
			if( i+partial_width == image_width ) {
				// that was the last tile; the exit test must come after the call,
				// so it cannot live in the for statement itself
				break;
			}
		}
	}

	// zero the first and last output columns with a broadcast DMA
	vbx_sp_push();
	vbx_word_t* side = vbx_sp_malloc(sizeof(vbx_word_t));
	vbx_set_vl(1);
	vbx(SVW, VMOV, side, 0, 0);
	vbx_dma_to_host_2D(output,                        /*host_ptr*/
	                   side,                          /*sp_ptr*/
	                   sizeof(vbx_word_t),            /*row len*/
	                   image_height,                  /*num rows*/
	                   image_pitch*sizeof(vbx_word_t),/*host incr*/
	                   0);                            /*sp incr*/
	vbx_dma_to_host_2D(output+image_width-1,          /*host_ptr*/
	                   side,                          /*sp_ptr*/
	                   sizeof(vbx_word_t),            /*row len*/
	                   image_height,                  /*num rows*/
	                   image_pitch*sizeof(vbx_word_t),/*host incr*/
	                   0);                            /*sp incr*/
	vbx_sp_pop();
	vbx_sync();
	return VBW_SUCCESS;
}
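/* The edge-clearing trick above, pulled out in isolation: a single scratchpad word
   is DMA'd once per row with a scratchpad increment of 0, painting a constant down
   a strided column of host memory. A generic helper might look like this
   (hypothetical name; a sketch, not a library routine). */
static void fill_host_column( unsigned *host_col, int rows, int pitch_px, vbx_word_t value )
{
	vbx_sp_push();
	vbx_word_t *v_px = (vbx_word_t*)vbx_sp_malloc( sizeof(vbx_word_t) );
	vbx_set_vl(1);
	vbx(SVW, VMOV, v_px, value, 0);
	vbx_dma_to_host_2D( host_col,                     /*host_ptr*/
	                    v_px,                         /*sp_ptr*/
	                    sizeof(vbx_word_t),           /*row len*/
	                    rows,                         /*num rows*/
	                    pitch_px*sizeof(vbx_word_t),  /*host incr*/
	                    0 );                          /*sp incr*/
	vbx_sync();
	vbx_sp_pop();
}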
void vbw_fix16_sqrt( vbx_word_t* v_out, vbx_word_t* v_x, int length )
{
	vbx_sp_push();
	//vbx_word_t* v_tmp = (vbx_word_t *)vbx_sp_malloc(sizeof(vbx_word_t)*length*11);
	vbx_word_t* v_tmp = (vbx_word_t *)vbx_sp_malloc(sizeof(vbx_word_t)*length*10);
	vbx_word_t*  v_result   = v_tmp + 0*length;
	vbx_uword_t* v_bit      = (vbx_uword_t*)v_tmp + 1*length;
	vbx_word_t*  v_num      = v_tmp + 2*length;
	vbx_uword_t* v_else_num = (vbx_uword_t*)v_tmp + 3*length;
	vbx_uword_t* v_t_bit    = (vbx_uword_t*)v_tmp + 4*length;
	vbx_uword_t* v_t_num    = (vbx_uword_t*)v_tmp + 5*length;
	vbx_uword_t* v_t_add    = (vbx_uword_t*)v_tmp + 6*length;
	vbx_word_t*  v_t_sub    = v_tmp + 7*length;
	vbx_uword_t* v_t_result = (vbx_uword_t*)v_tmp + 8*length;
	vbx_uword_t* v_if_num   = (vbx_uword_t*)v_tmp + 9*length;
	//vbx_word_t* v_neg = v_tmp + 10*length;
	v_result = v_out;

	//uint8_t neg = (inValue < 0);
	//vbx(SVW, VMOV, v_neg, 0, 0 );
	//vbx(SVW, VCMV_LTZ, v_neg, 1, v_x);
	//uint32_t num = (neg ? -inValue : inValue);
	vbx(SVW, VABSDIFF, v_num, 0, v_x);
	//uint32_t result = 0;
	vbx(SVW, VMOV, v_result, 0, 0 );
	//uint32_t bit;
	vbx(SVWU, VMOV, v_bit, (1<<30), 0 );

	// Many numbers will be less than 15, so this gives a good balance between
	// time spent in the if vs. time spent in the while loop when searching for
	// the starting value:
	//   if (num & 0xFFF00000) bit = (uint32_t)1 << 30;
	//   else                  bit = (uint32_t)1 << 18;
	// while (bit > num) bit >>= 2;
	int i, max_iter;
	max_iter = 16;   // 1<<30 and >>2 every iter, so max iter = 30/2 + 1
	for( i=0; i<max_iter; i++ ) {
		vbx(VVW,  VSUB, v_t_sub, (vbx_word_t*)v_bit, v_num);
		vbx(SVWU, VSHR, v_t_bit, 2, v_bit);
		vbx(VVW,  VCMV_GTZ, (vbx_word_t*)v_bit, (vbx_word_t*)v_t_bit, v_t_sub);
	}

	// The main part is executed twice, in order to avoid
	// using 64 bit values in computations. Scalar equivalent:
	//   while (bit) {
	//       if (num >= result + bit) {
	//           num -= result + bit;
	//           result = (result >> 1) + bit;
	//       } else {
	//           result = (result >> 1);
	//       }
	//       bit >>= 2;
	//   }
	max_iter = 16;
	for( i=0; i<max_iter; i++ ) {
		//v_result + bit
		vbx(VVW, VADD, (vbx_word_t*)v_t_add, (vbx_word_t*)v_bit, v_result);
		//v_num - (v_result + bit)
		vbx(VVW, VSUB, v_t_sub, v_num, (vbx_word_t*)v_t_add);
		//if (v_num - (v_result + bit) >= 0) v_num = v_num - (v_result + bit)
		vbx(VVW, VCMV_GEZ, (vbx_word_t*)v_t_num, v_t_sub, v_t_sub);
		//else v_num stays
		vbx(VVW, VCMV_LTZ, (vbx_word_t*)v_t_num, v_num, v_t_sub);
		vbx(SVW, VSHR, (vbx_word_t*)v_t_result, 1, v_result);
		vbx(VVW, VADD, (vbx_word_t*)v_t_add, (vbx_word_t*)v_bit, (vbx_word_t*)v_t_result);
		//if (v_num - (v_result + bit) >= 0) v_result = (v_result >> 1) + bit
		//else v_result = v_result >> 1
		vbx(VVW, VCMV_GEZ, (vbx_word_t*)v_t_result, (vbx_word_t*)v_t_add, v_t_sub);
		vbx(SVW, VSHR, (vbx_word_t*)v_t_bit, 2, (vbx_word_t*)v_bit);
		vbx(VVW, VCMV_GTZ, v_num, (vbx_word_t*)v_t_num, (vbx_word_t*)v_bit);
		vbx(VVW, VCMV_GTZ, v_result, (vbx_word_t*)v_t_result, (vbx_word_t*)v_bit);
		vbx(VVW, VCMV_GTZ, (vbx_word_t*)v_bit, (vbx_word_t*)v_t_bit, (vbx_word_t*)v_bit);
	}
	//vbx(SVW, VSHL, v_result, 8, v_result);

	// Scalar equivalent of the fractional-part setup:
	//   if (num > 65535) {
	//       // The remainder 'num' is too large to be shifted left
	//       // by 16, so we have to add 1 to result manually and
	//       // adjust 'num' accordingly.
	//       // num = a - (result + 0.5)^2
	//       //     = num + result^2 - (result + 0.5)^2
	//       //     = num - result - 0.5
	//       num -= result;
	//       num = (num << 16) - 0x8000;
	//       result = (result << 16) + 0x8000;
	//   } else {
	//       num <<= 16;
	//       result <<= 16;
	//   }
	//   bit = 1 << 14;
	vbx(SVW,  VSUB, v_t_sub, 65535, v_num);
	vbx(VVWU, VSUB, v_if_num, (vbx_uword_t*)v_num, (vbx_uword_t*)v_result);
	vbx(SVWU, VSHL, v_if_num, 16, v_if_num);
	vbx(SVWU, VADD, v_if_num, (-1*(0x8000)), v_if_num);
	vbx(SVWU, VSHL, v_t_result, 16, (vbx_uword_t*)v_result);
	vbx(SVWU, VADD, v_t_add, (0x8000), v_t_result);
	vbx(SVWU, VSHL, v_else_num, 16, (vbx_uword_t*)v_num);
	vbx(VVWU, VCMV_LTZ, (vbx_uword_t*)v_num, v_if_num, (vbx_uword_t*)v_t_sub);
	vbx(VVWU, VCMV_GEZ, (vbx_uword_t*)v_num, v_else_num, (vbx_uword_t*)v_t_sub);
	vbx(VVWU, VCMV_LTZ, (vbx_uword_t*)v_result, v_t_add, (vbx_uword_t*)v_t_sub);
	vbx(VVWU, VCMV_GEZ, (vbx_uword_t*)v_result, v_t_result, (vbx_uword_t*)v_t_sub);
	vbx(SVWU, VMOV, v_bit, (1<<14), 0);

	max_iter = 8;   // 1<<14 and >>2 every iter, so 14/2 + 1
	for( i=0; i<max_iter; i++ ) {
		vbx(VVWU, VADD, v_t_add, v_bit, (vbx_uword_t*)v_result);
		vbx(VVWU, VSUB, (vbx_uword_t*)v_t_sub, (vbx_uword_t*)v_num, v_t_add);
		vbx(VVW,  VCMV_GEZ, (vbx_word_t*)v_t_num, v_t_sub, v_t_sub);
		vbx(VVW,  VCMV_LTZ, (vbx_word_t*)v_t_num, v_num, v_t_sub);
		vbx(SVWU, VSHR, v_t_result, 1, (vbx_uword_t*)v_result);
		vbx(VVWU, VADD, v_t_add, v_bit, v_t_result);
		vbx(VVW,  VCMV_GEZ, (vbx_word_t*)v_t_result, (vbx_word_t*)v_t_add, v_t_sub);
		vbx(SVWU, VSHR, v_t_bit, 2, v_bit);
		vbx(VVWU, VCMV_NZ, (vbx_uword_t*)v_num, v_t_num, v_bit);
		vbx(VVWU, VCMV_NZ, (vbx_uword_t*)v_result, v_t_result, v_bit);
		vbx(VVWU, VCMV_NZ, v_bit, v_t_bit, v_bit);
	}

#ifndef FIXMATH_NO_ROUNDING
	// Finally, if the next bit would have been 1, round the result upwards:
	//   if (num > result) { result++; }
	vbx(VVW, VSUB, v_t_sub, v_num, v_result);
	vbx(SVW, VADD, (vbx_word_t*)v_t_result, 1, v_result);
	vbx(VVW, VCMV_GTZ, v_result, (vbx_word_t*)v_t_result, v_t_sub);
#endif

	// return (neg ? -result : result);
	vbx(SVW, VSUB, (vbx_word_t*)v_t_result, 0, v_result);
	vbx(VVW, VCMV_LTZ, v_result, (vbx_word_t*)v_t_result, v_x);

	vbx_sp_pop();
}
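/* The scalar routine that the vector code above mirrors, reassembled from the
   commented fragments (this is essentially libfixmath's fix16_sqrt). Useful as a
   per-element reference when checking v_out on the host. */
#include <stdint.h>
static int32_t fix16_sqrt_ref( int32_t inValue )
{
	uint8_t  neg = (inValue < 0);
	uint32_t num = (neg ? -inValue : inValue);
	uint32_t result = 0;
	uint32_t bit = (uint32_t)1 << 30;
	while (bit > num) bit >>= 2;
	// First pass: integer part.
	while (bit) {
		if (num >= result + bit) { num -= result + bit; result = (result >> 1) + bit; }
		else                     { result = (result >> 1); }
		bit >>= 2;
	}
	// Second pass: 16 fractional bits, avoiding 64-bit intermediates.
	if (num > 65535) {
		// num = a - (result + 0.5)^2 = num - result - 0.5
		num -= result;
		num = (num << 16) - 0x8000;
		result = (result << 16) + 0x8000;
	} else {
		num <<= 16;
		result <<= 16;
	}
	bit = 1 << 14;
	while (bit) {
		if (num >= result + bit) { num -= result + bit; result = (result >> 1) + bit; }
		else                     { result = (result >> 1); }
		bit >>= 2;
	}
#ifndef FIXMATH_NO_ROUNDING
	if (num > result) result++;   // round up if the next bit would have been 1
#endif
	return (neg ? -(int32_t)result : (int32_t)result);
}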
int vbw_vec_reverse_ext( vbx_mm_t *dst, vbx_mm_t *src, const unsigned int N ) { typedef vbx_mm_t vbx_sp_t; const int VBW_ROT16= sizeof(vbx_sp_t) <=sizeof(vbx_half_t); const int VBW_ROT8= sizeof(vbx_sp_t)== sizeof(vbx_byte_t); const int VBW_RSHIFT_T_TO_W= (sizeof(vbx_sp_t)==sizeof(vbx_word_t)? 0: sizeof(vbx_sp_t)==sizeof(vbx_half_t)? 1:/*byte_sized*/2); const int VBW_LSHIFT_W_TO_T= VBW_RSHIFT_T_TO_W; // Catch when N is very small if( N<4 ) { unsigned int i = 0; while(i<N) { dst[N-i-1]=src[i]; i++; } return VBW_SUCCESS; } vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); unsigned int SP_WIDTH_B = this_mxp->scratchpad_alignment_bytes; unsigned int FREE_BYTES = vbx_sp_getfree(); // Catch when N is small enough that cached scalar does a better job if( N <= MM_CACHED_SCALAR_THRESHOLD || FREE_BYTES < SP_WIDTH_B*5 ){ unsigned int i; vbx_mm_t *A = (vbx_mm_t*)vbx_remap_cached(src,N*sizeof(vbx_mm_t)); vbx_mm_t *B = (vbx_mm_t*)vbx_remap_cached(dst,N*sizeof(vbx_mm_t)); for( i=0; i<N; i++ ) { B[N-i-1]=A[i]; } vbx_dcache_flush(B,N*sizeof(vbx_mm_t)); return VBW_SUCCESS; } unsigned int NUM_LANES = this_mxp->vector_lanes; unsigned int tile_size_b = VBX_PAD_DN(((FREE_BYTES-SP_WIDTH_B)/2),SP_WIDTH_B); unsigned int tile_size_w = tile_size_b/4; unsigned int tile_size_t = tile_size_w << VBW_LSHIFT_W_TO_T; unsigned int num_tiles = N / tile_size_t; unsigned int rows_per_tile = tile_size_b / SP_WIDTH_B; unsigned int tile_part_t = N - num_tiles * tile_size_t; unsigned int threshold_w = NUM_LANES >= 32 ? VL1_THRESHOLD_V32_UP : NUM_LANES == 16 ? VL1_THRESHOLD_V16 : NUM_LANES == 8 ? VL1_THRESHOLD_V8 : UINT_MAX; if(tile_part_t){ vbx_sp_push(); vbx_sp_t *v_0 = (vbx_sp_t *)vbx_sp_malloc(tile_part_t*sizeof(vbx_sp_t)); vbx_sp_t *v_1 = (vbx_sp_t *)vbx_sp_malloc(tile_part_t*sizeof(vbx_sp_t)); #if !VBX_SKIP_ALL_CHECKS if( !v_0 || !v_1) { VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space."); VBX_EXIT(-1); } #endif vbx_dma_to_vector(v_0, src+N-tile_part_t, tile_part_t*sizeof(vbx_mm_t)); vbw_vec_reverse(v_1, v_0, tile_part_t); vbx_dma_to_host(dst, v_1, tile_part_t*sizeof(vbx_sp_t)); dst += tile_part_t; vbx_sp_pop(); } if(!num_tiles) { return VBW_SUCCESS; } vbx_sp_push(); vbx_word_t *v_mask = (vbx_word_t *)vbx_sp_malloc(SP_WIDTH_B); vbx_word_t *v_scratch[2] = { (vbx_word_t *)vbx_sp_malloc(tile_size_b), (vbx_word_t *)vbx_sp_malloc(tile_size_b) }; vbx_word_t *result; #if !VBX_SKIP_ALL_CHECKS if( !v_scratch[0] || !v_scratch[1] || !v_mask ) { VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space."); VBX_EXIT(-1); } #endif src += (num_tiles - 1) * tile_size_t; if( tile_size_w <= threshold_w) { while( num_tiles ) { vbx_dma_to_vector( v_scratch[0], src, tile_size_b ); if(VBW_ROT16){ vec_rev_rot16_w(v_scratch[1], v_scratch[0], tile_size_w); }else{ vec_rev_w(v_scratch[1], v_scratch[0], tile_size_w); } if( VBW_ROT8){ vec_rot8_h( v_scratch[1], v_scratch[1], tile_size_w*2 ); } vbx_dma_to_host( dst, v_scratch[1], tile_size_b ); dst += tile_size_t; src -= tile_size_t; num_tiles--; } } else { while( num_tiles ) { vbx_dma_to_vector( v_scratch[0], src, tile_size_b ); result = vec_rev_merge_w( v_scratch[1], v_scratch[0], tile_size_w, v_scratch[0], v_mask, SP_WIDTH_B, rows_per_tile, VBW_ROT16 ); if(VBW_ROT8){ vec_rot8_h( result, result, tile_size_w*2 ); } vbx_dma_to_host( dst, result, tile_size_b ); dst += tile_size_t; src -= tile_size_t; num_tiles--; } } vbx_sp_pop(); return VBW_SUCCESS; }
int vbw_vec_reverse( vbx_sp_t *v_dst, vbx_sp_t *v_src, const unsigned int N )
{
	const int VBW_ROT16 = sizeof(vbx_sp_t) <= sizeof(vbx_half_t);
	const int VBW_ROT8  = sizeof(vbx_sp_t) == sizeof(vbx_byte_t);
	const int VBW_RSHIFT_T_TO_W = (sizeof(vbx_sp_t)==sizeof(vbx_word_t) ? 0 :
	                               sizeof(vbx_sp_t)==sizeof(vbx_half_t) ? 1 : /*byte sized*/ 2);
	const int VBW_LSHIFT_W_TO_T = VBW_RSHIFT_T_TO_W;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const unsigned int NUM_LANES = this_mxp->vector_lanes;

	// Can the whole vector fit in the scratchpad width?
	if( N < (NUM_LANES << VBW_LSHIFT_W_TO_T) ) {
		vbx_set_vl( 1 );
		vbx_set_2D( N, (int)-sizeof(vbx_sp_t), (int)sizeof(vbx_sp_t), 0 );
		vbxx_2D(VMOV, v_dst+N-1, v_src);
		return VBW_SUCCESS;
	}

	unsigned int threshold_w = (NUM_LANES >= 32 ? VL1_THRESHOLD_V32_UP :
	                            NUM_LANES == 16 ? VL1_THRESHOLD_V16 :
	                            NUM_LANES == 8  ? VL1_THRESHOLD_V8 : UINT_MAX);

	unsigned int N_w = N >> VBW_RSHIFT_T_TO_W;   // equivalent number of words in the vector
	if( N_w && N_w <= threshold_w ) {
		if( VBW_ROT16 ) {
			// remainder of elements that can't add to a whole word
			unsigned int stub_t = N - (N_w << VBW_LSHIFT_W_TO_T);
			if( stub_t ) {
				vbx_set_vl( 1 );
				vbx_set_2D( stub_t, (int)-sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
				vbxx_2D(VMOV, v_dst+stub_t-1, v_src+N-stub_t);
				v_dst += stub_t;
			}
			vec_rev_rot16_w(v_dst, v_src, N_w);
		} else {
			vec_rev_w(v_dst, v_src, N_w);
		}
		if( VBW_ROT8 ) {
			vec_rot8_h(v_dst, v_dst, N_w*2);
		}
		return VBW_SUCCESS;
	}

	const unsigned int SP_WIDTH_B = this_mxp->scratchpad_alignment_bytes;
	const unsigned int FREE_BYTES = vbx_sp_getfree();
	const unsigned int ODD_LOG_SEL = NUM_LANES & 0x55555555 ? 1 : 0;

	vbx_word_t *v_mask, *v_result;
	vbx_word_t *v_scratch[2] = {0,0};

	unsigned int num_rows_w = N_w / NUM_LANES;
	unsigned int working_set_w = num_rows_w * NUM_LANES;
	unsigned int tail_t = N - (working_set_w << VBW_LSHIFT_W_TO_T);
	unsigned int remaining_w = working_set_w;

	if( tail_t ) {
		vbx_set_vl( 1 );
		vbx_set_2D( tail_t, (int)-sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
		vbxx_2D(VMOV, v_dst+tail_t-1, v_src+N-tail_t);
		v_dst += tail_t;
	}

	vbx_word_t *v_src_w = (vbx_word_t *)v_src;
	vbx_word_t *v_dst_w = (vbx_word_t *)v_dst;

	if( !num_rows_w ) {
		return VBW_SUCCESS;
	}

	remaining_w = working_set_w;
	while( remaining_w*sizeof(vbx_word_t) + SP_WIDTH_B > FREE_BYTES ) {
		if( remaining_w <= threshold_w*2 ) {
			if( VBW_ROT16 ) {
				vec_rev_rot16_w(v_dst_w, v_src_w, remaining_w);
			} else {
				vec_rev_w(v_dst_w, v_src_w, remaining_w);
			}
			if( VBW_ROT8 ) {
				vec_rot8_h(v_dst_w, v_dst_w, remaining_w*2);
			}
			return VBW_SUCCESS;
		}
		working_set_w = VBX_PAD_DN( (remaining_w - NUM_LANES)/2, NUM_LANES );
		v_mask = v_dst_w + (working_set_w*2);
		remaining_w -= working_set_w;
		v_scratch[0] = v_dst_w;
		v_scratch[1] = v_dst_w + working_set_w;
		num_rows_w = working_set_w / NUM_LANES;
		v_result = vec_rev_merge_w( v_scratch[ODD_LOG_SEL], v_src_w + remaining_w, working_set_w,
		                            v_scratch[!ODD_LOG_SEL], v_mask, SP_WIDTH_B, num_rows_w, VBW_ROT16 );
#if !VBX_SKIP_ALL_CHECKS
		if( v_result != v_dst_w ) {
			VBX_PRINTF("Unexpected behavior: merge reverse returned the wrong vector. "
			           "Parameter order was chosen based on NUM_LANES.");
			VBX_EXIT(-1);
		}
#endif
		if( VBW_ROT8 ) {
			vec_rot8_h(v_result, v_result, working_set_w*2);
		}
		v_dst_w += working_set_w;
	}

	vbx_sp_push();
	v_scratch[0] = v_dst_w;
	v_scratch[1] = (vbx_word_t*)vbx_sp_malloc( remaining_w * sizeof(vbx_word_t) );
#if !VBX_SKIP_ALL_CHECKS
	if( !v_scratch[1] ) {
		VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
		VBX_EXIT(-1);
	}
#endif
	v_mask = (vbx_word_t*)vbx_sp_malloc( SP_WIDTH_B );
#if !VBX_SKIP_ALL_CHECKS
	if( !v_mask ) {
		VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
		VBX_EXIT(-1);
	}
#endif
	num_rows_w = remaining_w / NUM_LANES;
	v_result = vec_rev_merge_w( v_scratch[ODD_LOG_SEL], v_src_w, remaining_w,
	                            v_scratch[!ODD_LOG_SEL], v_mask, SP_WIDTH_B, num_rows_w, VBW_ROT16 );
#if !VBX_SKIP_ALL_CHECKS
	if( v_result != v_dst_w ) {
		VBX_PRINTF("Unexpected behavior: merge reverse returned the wrong vector. "
		           "Parameter order was chosen based on NUM_LANES.");
		VBX_EXIT(-1);
	}
#endif
	if( VBW_ROT8 ) {
		vec_rot8_h(v_result, v_result, remaining_w*2);
	}
	vbx_sp_pop();
	return VBW_SUCCESS;
}
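/* Minimal usage sketch (hypothetical buffers): reverse N elements that live in main
   memory. vbw_vec_reverse_ext() tiles through the scratchpad internally and falls
   back to cached scalar code for small N. */
static void reverse_example(void)
{
	const unsigned int N = 100000;
	vbx_mm_t *src = (vbx_mm_t*)vbx_shared_malloc( N*sizeof(vbx_mm_t) );
	vbx_mm_t *dst = (vbx_mm_t*)vbx_shared_malloc( N*sizeof(vbx_mm_t) );
	unsigned int i;
	for( i=0; i<N; i++ ) src[i] = (vbx_mm_t)i;
	vbw_vec_reverse_ext( dst, src, N );
	vbx_sync(); // afterwards dst[0]==N-1 ... dst[N-1]==0
}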
/** Luma Edge Detection * * @brief 3x3 Sobel edge detection with 32-bit aRGB image * * @param[out] output 32-bit aRGB edge-intensity output * @param[in] input 32-bit aRGB input * @param[in] image_width Image width in pixels * @param[in] image_height Image height in pixels * @param[in] image_pitch Distance in pixels between the start of subsequent rows. usually equal to image_width * @param[in] renorm Number of bits to shift the final intensity by to the right * @returns Negative on error condition. See vbw_exit_codes.h */ int vbw_sobel_argb32_3x3_partial(unsigned *output, unsigned *input, const short image_width, const short image_height, const short image_pitch, const short renorm) { int y; vbx_uword_t *v_row_in; vbx_uhalf_t *v_luma_top, *v_luma_mid, *v_luma_bot; vbx_uword_t *v_row_out; vbx_uhalf_t *v_sobel_row_top, *v_sobel_row_mid, *v_sobel_row_bot; vbx_uhalf_t *v_gradient_x, *v_gradient_y; vbx_uhalf_t *v_tmp; void *tmp_ptr; vbx_sp_push(); // Allocate space in scratchpad for vectors struct rotating_prefetcher_t v_row_db=rotating_prefetcher(1,image_width*sizeof(vbx_uword_t), input,input+image_pitch*image_width, image_pitch*sizeof(vbx_uword_t)); v_luma_top = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t)); v_luma_mid = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t)); v_luma_bot = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t)); v_sobel_row_top = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t)); v_sobel_row_mid = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t)); v_sobel_row_bot = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t)); v_row_out = (vbx_uword_t*)vbx_sp_malloc(image_width*sizeof(vbx_uword_t)); if(v_row_out==NULL){ vbx_sp_pop(); return VBW_ERROR_SP_ALLOC_FAILED; } // Re-use v_sobel_row_bot as v_tmp v_tmp = v_sobel_row_bot; // Transfer the first 3 input rows and interleave first 2 rgb2luma and first 2 sobel row calculations rp_fetch(&v_row_db); rp_fetch(&v_row_db); v_row_in=rp_get_buffer(&v_row_db,0); vbw_rgb2luma(v_luma_top, v_row_in, v_tmp, image_width); // 1st luma row vbw_sobel_3x3_row(v_sobel_row_top, v_luma_top, image_width); // 1st partial sobel row rp_fetch(&v_row_db); v_row_in=rp_get_buffer(&v_row_db,0); vbw_rgb2luma(v_luma_mid, v_row_in, v_tmp, image_width); // 2nd luma row vbw_sobel_3x3_row(v_sobel_row_mid, v_luma_mid, image_width); // 2nd partial sobel row // Set top output row to 0 vbx_set_vl(image_width); vbx(SVWU, VMOV, v_row_out, 0, 0); vbx_dma_to_host(output, v_row_out, image_width*sizeof(vbx_uword_t)); // Calculate edges for (y = 0; y < image_height-(FILTER_HEIGHT-1); y++) { // Transfer the next input row while processing rp_fetch(&v_row_db); v_row_in=rp_get_buffer(&v_row_db,0); // Re-use v_sobel_row_bot as v_tmp v_tmp = v_sobel_row_bot; // Convert aRGB input to luma vbw_rgb2luma(v_luma_bot, v_row_in, v_tmp, image_width); // Done with v_row_in; re-use for v_gradient_x and v_gradient_y (be careful!) 
v_gradient_x = (vbx_uhalf_t *)v_row_in;
		v_gradient_y = (vbx_uhalf_t *)v_row_in + image_width;

		// Calculate gradient_x
		// Apply [1 2 1]^T matrix to all columns
		vbx_set_vl(image_width);
		vbx(SVHU, VSHL, v_gradient_x, 1, v_luma_mid); // multiply by 2
		vbx(VVHU, VADD, v_tmp, v_luma_top, v_luma_bot);
		vbx(VVHU, VADD, v_tmp, v_tmp, v_gradient_x);
		// For each column, calculate absolute difference with 2nd column to the right
		vbx_set_vl(image_width-2);
		vbx(VVH, VABSDIFF, (vbx_half_t*)v_gradient_x, (vbx_half_t*)v_tmp, (vbx_half_t*)v_tmp+2);

		// Calculate gradient_y
		// Apply [1 2 1] matrix to last row in window and calculate absolute difference
		// with pre-computed first row
		vbw_sobel_3x3_row(v_sobel_row_bot, v_luma_bot, image_width);
		vbx(VVH, VABSDIFF, (vbx_half_t*)v_gradient_y, (vbx_half_t*)v_sobel_row_top, (vbx_half_t*)v_sobel_row_bot);

		// Re-use v_sobel_row_top as v_tmp
		v_tmp = v_sobel_row_top;

		// sum of absolute gradients
		vbx_set_vl(image_width-2);
		vbx(VVHU, VADD, v_tmp, v_gradient_x, v_gradient_y);
		vbx(SVHU, VSHR, v_tmp, renorm, v_tmp);

		// Threshold
		vbx(SVHU, VSUB, v_gradient_y, 255, v_tmp);
		vbx(SVHU, VCMV_LTZ, v_tmp, 255, v_gradient_y);

		// Copy the result to the low byte of the output row.
		// Trick to copy the low byte (b) to the middle two bytes as well.
		// Note that first and last columns are 0.
		vbx_set_vl(image_width-2);
		vbx(SVHWU, VMULLO, v_row_out+1, 0x00010101, v_tmp);

		// DMA the result to the output (minus the outside two pixels)
		vbx_dma_to_host(output+(y+1)*image_pitch+1, v_row_out+1, (image_width-2)*sizeof(vbx_uword_t));

		// Rotate luma buffers
		tmp_ptr = (void *)v_luma_top;
		v_luma_top = v_luma_mid;
		v_luma_mid = v_luma_bot;
		v_luma_bot = (vbx_uhalf_t *)tmp_ptr;

		// Rotate v_sobel_row buffers (for gradient_y)
		tmp_ptr = (void *)v_sobel_row_top;
		v_sobel_row_top = v_sobel_row_mid;
		v_sobel_row_mid = v_sobel_row_bot;
		v_sobel_row_bot = (vbx_uhalf_t *)tmp_ptr;
	}

	// Set bottom row to 0
	vbx_set_vl(image_width);
	vbx(SVWU, VMOV, v_row_out, 0, 0);
	vbx_dma_to_host(output+(image_height-1)*image_pitch, v_row_out, image_width*sizeof(vbx_uword_t));

	vbx_sp_pop();
	return VBW_SUCCESS;
}
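/* A scalar model of what vbw_rgb2luma() is assumed to produce here (the real vector
   wrapper lives elsewhere in the library; the 1-2-1 channel weighting below is an
   illustration, not the library's actual coefficients). The callers treat the
   result as a halfword luma whose top byte is the 8-bit value (see the SVHBU
   shift-by-8 conversions in vbw_bifilt_argb32_3x3 below). */
static void rgb2luma_model( unsigned short *luma, const unsigned *argb, int width )
{
	int x;
	for( x=0; x < width; x++ ) {
		unsigned p = argb[x];
		unsigned r = (p >> 16) & 0xFF;
		unsigned g = (p >>  8) & 0xFF;
		unsigned b =  p        & 0xFF;
		luma[x] = (unsigned short)((r + 2*g + b) << 6); // weights assumed
	}
}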
/** Luma Edge Detection.
 *
 * @brief 3x3 Sobel edge detection with 8-bit luma image
 *
 * @param[out] output        32-bit aRGB edge-intensity output
 * @param[in]  input         8-bit luma input
 * @param[in]  image_width   Image width in pixels
 * @param[in]  image_height  Image height in pixels
 * @param[in]  image_pitch   Distance in pixels between the start of subsequent rows; usually equal to image_width
 * @param[in]  renorm        Number of bits to shift the final intensity right by
 * @returns Negative on error condition. See vbw_exit_codes.h
 */
int vbw_sobel_luma8_3x3(unsigned *output, unsigned char *input, const short image_width,
                        const short image_height, const short image_pitch, const short renorm)
{
	int y;
	vbx_ubyte_t *v_luma_top, *v_luma_mid, *v_luma_bot;
	vbx_uword_t *v_row_out;
	vbx_uhalf_t *v_sobel_row_top, *v_sobel_row_mid, *v_sobel_row_bot;
	vbx_uhalf_t *v_gradient_x, *v_gradient_y;
	vbx_uhalf_t *v_tmp;
	void *tmp_ptr;

	vbx_sp_push();

	// Allocate space in scratchpad for vectors
	rotating_prefetcher_t v_luma = rotating_prefetcher(3, image_width*sizeof(vbx_ubyte_t),
	                                                   input, input+image_height*image_pitch,
	                                                   image_pitch*sizeof(vbx_ubyte_t));
	v_sobel_row_top = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_sobel_row_mid = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_sobel_row_bot = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_gradient_x    = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_gradient_y    = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_row_out       = (vbx_uword_t *)vbx_sp_malloc(image_width*sizeof(vbx_uword_t));
	if( v_row_out == NULL ) {
		vbx_sp_pop();
		return VBW_ERROR_SP_ALLOC_FAILED;
	}

	// Transfer the first 3 input rows and interleave the first 2 sobel row calculations
	rp_fetch(&v_luma);
	rp_fetch(&v_luma);
	v_luma_top = rp_get_buffer(&v_luma, 0);
	vbw_sobel_3x3_row(v_sobel_row_top, v_luma_top, image_width);
	rp_fetch(&v_luma);
	v_luma_mid = rp_get_buffer(&v_luma, 1);
	vbw_sobel_3x3_row(v_sobel_row_mid, v_luma_mid, image_width);

	// Set top output row to 0
	vbx_set_vl(image_width);
	vbx(SVWU, VMOV, v_row_out, 0, 0);
	vbx_dma_to_host(output, v_row_out, image_width*sizeof(vbx_uword_t));

	// Calculate edges
	for (y = 0; y < image_height-(FILTER_HEIGHT-1); y++) {
		// Transfer the next input row while processing
		rp_fetch(&v_luma);
		v_luma_top = rp_get_buffer(&v_luma, 0);
		v_luma_mid = rp_get_buffer(&v_luma, 1);
		v_luma_bot = rp_get_buffer(&v_luma, 2);

		// Start calculating gradient_x
		vbx_set_vl(image_width);
		vbx(SVBHU, VSHL, v_gradient_x, 1, v_luma_mid); // multiply by 2

		// Calculate gradient_y
		// Apply [1 2 1] matrix to last row in window and calculate absolute difference
		// with pre-computed first row
		vbw_sobel_3x3_row(v_sobel_row_bot, v_luma_bot, image_width);
		vbx(VVH, VABSDIFF, (vbx_half_t*)v_gradient_y, (vbx_half_t*)v_sobel_row_top, (vbx_half_t*)v_sobel_row_bot);

		// Re-use v_sobel_row_top
		v_tmp = v_sobel_row_top;

		// Finish calculating gradient_x
		// Apply [1 2 1]^T matrix to all columns
		vbx_set_vl(image_width);
		vbx(VVBHU, VADD, v_tmp, v_luma_top, v_luma_bot);
		vbx(VVHU, VADD, v_tmp, v_tmp, v_gradient_x);
		// For each column, calculate absolute difference with 2nd column to the right
		vbx_set_vl(image_width-2);
		vbx(VVH, VABSDIFF, (vbx_half_t*)v_gradient_x, (vbx_half_t*)v_tmp, (vbx_half_t*)v_tmp+2);

		// sum of absolute gradients
		//vbx_set_vl(image_width-2);
		vbx(VVHU, VADD, v_tmp, v_gradient_x, v_gradient_y);
		vbx(SVHU, VSHR, v_tmp, renorm, v_tmp);

		// Threshold
		vbx(SVHU, VSUB, v_gradient_y, 255, v_tmp);
		vbx(SVHU, VCMV_LTZ, v_tmp, 255, v_gradient_y);

		// Copy the result to the low byte of the output row.
		// Trick to copy the low byte (b) to the middle two bytes as well.
		// Note that first and last columns are 0.
		//vbx_set_vl(image_width-2);
		vbx(SVHWU, VMULLO, v_row_out+1, 0x00010101, v_tmp);

		// DMA the result to the output
		vbx_dma_to_host(output+(y+1)*image_pitch, v_row_out, image_width*sizeof(vbx_uword_t));

		// Rotate v_sobel_row buffers (for gradient_y)
		tmp_ptr = (void *)v_sobel_row_top;
		v_sobel_row_top = v_sobel_row_mid;
		v_sobel_row_mid = v_sobel_row_bot;
		v_sobel_row_bot = (vbx_uhalf_t *)tmp_ptr;
	}

	// Set bottom row to 0
	vbx_set_vl(image_width);
	vbx(SVWU, VMOV, v_row_out, 0, 0);
	vbx_dma_to_host(output+(image_height-1)*image_pitch, v_row_out, image_width*sizeof(vbx_uword_t));

	vbx_sync();
	vbx_sp_pop();
	return VBW_SUCCESS;
}
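/* A scalar model of vbw_sobel_3x3_row() as the callers above use it (the real
   vector version lives elsewhere; this is an assumption drawn from the comments):
   horizontally convolve one luma row with [1 2 1], producing width-2 values.
   Shown here for halfword luma; the luma8 path feeds byte rows instead. */
static void sobel_3x3_row_model( unsigned short *out, const unsigned short *luma, int width )
{
	int x;
	for( x=0; x < width-2; x++ ) {
		out[x] = (unsigned short)(luma[x] + 2*luma[x+1] + luma[x+2]);
	}
}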
int vbw_mtx_xp_ext(vbx_mm_t *out, vbx_mm_t *in, const int INROWS, const int INCOLS ) { typedef vbx_mm_t vbx_sp_t; int elements = INROWS * INCOLS; if(elements < SCALAR_THRESHOLD) { vbx_sync(); //in case we input is waiting on a DMA transfer int i,j; for(i = 0; i < INROWS; i++) { for(j = 0; j < INCOLS; j++) { out[j*INROWS+i] = in[i*INCOLS+j]; } } return VBW_SUCCESS; } vbx_sp_push(); vbx_sp_t *v_in; vbx_sp_t *v_out; int tile_height = 0; int tile_width = 0; int prev_tile_width = 0; int tile_y = 0; int tile_x = 0; vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); int SP_WIDTH_B = this_mxp->scratchpad_alignment_bytes; int SP_SIZE = vbx_sp_getfree(); int max_sp_elements = vbx_sp_getfree() / sizeof(vbx_sp_t); int max_tile_elements = VBX_PAD_DN( SP_SIZE/2, SP_WIDTH_B ) / sizeof(vbx_sp_t); if( INROWS == 1 || INCOLS == 1 ) { // 1D transpose becomes a simple copy operation if( elements <= max_sp_elements ) { // We can use the whole scratchpad for this v_in = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) ); vbx_dma_to_vector( v_in, in, elements*sizeof(vbx_mm_t) ); v_out = v_in; vbx_dma_to_host( out, v_out, elements*sizeof(vbx_mm_t) ); } else { // To test this, you'll need a very large 1D matrix (or a small SP) tile_width = max_sp_elements; v_in = (vbx_sp_t*)vbx_sp_malloc( tile_width * sizeof(vbx_sp_t) ); for (tile_x = 0; tile_x < elements; tile_x += tile_width) { if( tile_x + tile_width > elements) tile_width = elements - tile_x; vbx_dma_to_vector( v_in, in + tile_x, tile_width*sizeof(vbx_mm_t) ); v_out = v_in; vbx_dma_to_host( out+tile_x, v_out, tile_width*sizeof(vbx_mm_t) ); } } } else if( elements < max_tile_elements ) { // Matrix is small enough to handle entirely in SP v_in = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) ); v_out = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) ); vbx_dma_to_vector( v_in, in, elements*sizeof(vbx_mm_t) ); vbw_mtx_xp(v_out,v_in,INROWS,INCOLS); vbx_dma_to_host( out, v_out, elements*sizeof(vbx_mm_t) ); } else { // At this point we know at least one full tile will be needed #define QUICK_A_LANES_THRESHOLD 8 // Use merge transpose if there are at least this many lanes #define QUICK_A_TILE_WIDTH 128 #define QUICK_A_TILE_ELEMENTS (QUICK_A_TILE_WIDTH*QUICK_A_TILE_WIDTH) #define QUICK_A_VF_ELEMENTS (QUICK_A_TILE_ELEMENTS/2) #define QUICK_A_REQ_ELEMENTS (2*VBX_PAD_UP(QUICK_A_TILE_ELEMENTS,SP_WIDTH_B/sizeof(vbx_sp_t)) + VBX_PAD_UP(QUICK_A_VF_ELEMENTS,sizeof(vbx_sp_t))) #define QUICK_B_LANES_THRESHOLD 16 // Use smaller merge transpose tile only if there are a lot of lanes #define QUICK_B_TILE_WIDTH 64 // and only if larger tile A size cannot be used. 
#define QUICK_B_TILE_ELEMENTS (QUICK_B_TILE_WIDTH*QUICK_B_TILE_WIDTH) #define QUICK_B_VF_ELEMENTS (QUICK_B_TILE_ELEMENTS/2) #define QUICK_B_REQ_ELEMENTS (2*VBX_PAD_UP(QUICK_B_TILE_ELEMENTS,SP_WIDTH_B/sizeof(vbx_sp_t)) + VBX_PAD_UP(QUICK_B_VF_ELEMENTS,sizeof(vbx_sp_t))) int NUM_LANES = this_mxp->vector_lanes; int DMA_BYTES = this_mxp->dma_alignment_bytes; int min_tile_dim = DMA_BYTES / sizeof(vbx_sp_t); vbx_sp_t *v_out_sel; vbx_sp_t *vf = 0; if( NUM_LANES >= QUICK_A_LANES_THRESHOLD // Check for appropriate conditions to use merge transpose tiles && INCOLS >= QUICK_A_TILE_WIDTH && INROWS >= QUICK_A_TILE_WIDTH && (unsigned)max_sp_elements >= QUICK_A_REQ_ELEMENTS ) { tile_width = tile_height = QUICK_A_TILE_WIDTH; vf = (vbx_sp_t *)vbx_sp_malloc( QUICK_A_VF_ELEMENTS * sizeof(vbx_sp_t)); } else if( NUM_LANES >= QUICK_B_LANES_THRESHOLD && INCOLS >= QUICK_B_TILE_WIDTH && INROWS >= QUICK_B_TILE_WIDTH && (unsigned)max_sp_elements >= QUICK_B_REQ_ELEMENTS ) { tile_width = tile_height = QUICK_B_TILE_WIDTH; vf = (vbx_sp_t *)vbx_sp_malloc( QUICK_B_VF_ELEMENTS * sizeof(vbx_sp_t)); } else { findTileSize( &tile_height, &tile_width, INROWS, INCOLS, max_tile_elements, min_tile_dim ); } prev_tile_width = tile_width; v_in = (vbx_sp_t*)vbx_sp_malloc( tile_height*tile_width * sizeof(vbx_sp_t) ); v_out = (vbx_sp_t*)vbx_sp_malloc( tile_height*tile_width * sizeof(vbx_sp_t) ); if( v_out==NULL ) { vbx_sp_pop(); return VBW_ERROR_SP_ALLOC_FAILED; } vbx_sp_t *v[2] = { v_in, v_out }; tile_y = 0; // Reset y position for new col while( tile_y < INROWS ) { vbx_set_2D( tile_width, tile_height*sizeof(vbx_sp_t), sizeof(vbx_sp_t), sizeof(vbx_sp_t) ); vbx_set_3D( tile_height, sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t) ); tile_x = 0; // Reset x position for new row while( tile_x < INCOLS ) { vbx_dma_to_vector_2D( v_in, in+(tile_y*INCOLS)+tile_x, tile_width*sizeof(vbx_mm_t), tile_height, tile_width*sizeof(vbx_sp_t), INCOLS*sizeof(vbx_mm_t) ); v_out_sel = v_out; // select v_out as default vector to DMA to MM /* *** merge transpose (matrix must be square and a power of 2 wide) *** */ if( vf && tile_width == tile_height && (tile_width==QUICK_A_TILE_WIDTH || tile_width==QUICK_B_TILE_WIDTH) ) { int src = 0; int n; for( n=1; n<tile_width; n *= 2 ) { // can't do 1st iteration until entire tile is DMA'd in const int nn = 2*n; // copy the destination matrix vbx_set_vl( tile_width*tile_width ); // use v_in & v_out as working matrices (clobber v_in) vbxx( VMOV, v[!src], v[src]); // do the work vbx_set_vl( n*tile_width ); vbxx( VAND, vf, n, (vbx_enum_t*)0 ); // mask for merging: 0101010... then 00110011... 
vbx_set_2D( tile_width/nn, nn*tile_width*sizeof(vbx_sp_t), nn*tile_width*sizeof(vbx_sp_t), 0 ); vbxx_2D( VCMV_Z, v[!src]+n*tile_width, v[src]+n , vf ); vbxx_2D( VCMV_Z, v[!src]+n, v[src]+n*tile_width, vf ); src = !src; } v_out_sel = v[src]; // depending on the size of the mtx, the final result may be in v_in or v_out } else { vbx_set_vl( 1 ); // 2D and 3D will be set by the x and y edge conditions, even using merge vbxx_3D(VMOV, v_out, v_in ); } vbx_dma_to_host_2D( out+(tile_x*INROWS)+tile_y, v_out_sel, tile_height*sizeof(vbx_mm_t), tile_width, INROWS*sizeof(vbx_mm_t), tile_height*sizeof(vbx_sp_t) ); tile_x += tile_width; // Set up width for next tile if( tile_x + tile_width > INCOLS ) { // Temporarily reduce tile width when reaching right edge of matrix tile_width = INCOLS - tile_x; vbx_set_2D( tile_width, tile_height*sizeof(vbx_sp_t), sizeof(vbx_sp_t), sizeof(vbx_sp_t) ); vbx_set_3D( tile_height, sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t) ); } } tile_y += tile_height; // Set up width and height for next row of tiles tile_width = prev_tile_width; // Restore original tile width for next row of tiles /* *** Permanently reduce tile height when reaching bottom of matrix *** */ tile_height = ( tile_y + tile_height > INROWS ) ? INROWS - tile_y : tile_height; } } vbx_sp_pop(); vbx_sync(); return VBW_SUCCESS; }
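/* Minimal usage sketch for vbw_mtx_xp_ext (hypothetical sizes and buffers):
   transpose a ROWS x COLS matrix held in main memory. Tiling, and whether the
   merge-transpose fast path applies, are decided internally; the routine syncs
   before returning, so out[] is safe to read immediately. */
static void transpose_example(void)
{
	const int ROWS = 512, COLS = 384;
	vbx_mm_t *in  = (vbx_mm_t*)vbx_shared_malloc( ROWS*COLS*sizeof(vbx_mm_t) );
	vbx_mm_t *out = (vbx_mm_t*)vbx_shared_malloc( ROWS*COLS*sizeof(vbx_mm_t) );
	// ... fill in[] ...
	vbw_mtx_xp_ext( out, in, ROWS, COLS ); // afterwards out[c*ROWS+r] == in[r*COLS+c]
}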
int vbw_bifilt_argb32_3x3(unsigned *output, unsigned *input, short image_width, const short image_height, const short image_pitch, const short renorm) { //return vbw_sobel_argb32_3x3( output, input, image_width, image_height, image_pitch, renorm); int y; int xx, yy, sharp; vbx_uword_t *v_row_in; vbx_ubyte_t *v_luma_top, *v_luma_mid, *v_luma_bot; vbx_ubyte_t *v_luma_hii, *v_luma_low; vbx_ubyte_t *v_src[W][W]; vbx_uword_t *v_row_out; vbx_ubyte_t *v00, *v01, *v02, *v10, *v11, *v12, *v20, *v21, *v22; #if W==5 vbx_ubyte_t *v03, *v04, *v13, *v14, *v23, *v24; vbx_ubyte_t *v30, *v31, *v32, *v40, *v41, *v42; vbx_ubyte_t *v33, *v34, *v43, *v44; #endif vbx_ubyte_t *v[W][W]; vbx_uhalf_t *vI, *vW, *vT; // vT== temporary vbx_sp_push(); // Allocate space in scratchpad for vectors struct rotating_prefetcher_t v_row_db=rotating_prefetcher(1,image_width*sizeof(vbx_uword_t), input,input+image_height*image_pitch, image_pitch*sizeof(vbx_uword_t)); v_row_out = (vbx_uword_t*)vbx_sp_malloc(image_width*sizeof(vbx_uword_t)); vT = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t)); #if 1 // save some space by overlapping with v_row_out vW = (vbx_uhalf_t*)v_row_out; vI = (vbx_uhalf_t*)v_row_out + image_width; #else vW = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t)); vI = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t)); #endif #if W==3 v_luma_top = (vbx_ubyte_t*)vbx_sp_malloc( 3 * image_width*sizeof(vbx_ubyte_t)); v_luma_mid = v_luma_top + 1 * image_width*sizeof(vbx_ubyte_t) ; v_luma_bot = v_luma_top + 2 * image_width*sizeof(vbx_ubyte_t) ; #else v_luma_top = (vbx_ubyte_t*)vbx_sp_malloc( 5 * image_width*sizeof(vbx_ubyte_t)); v_luma_hii = v_luma_top + 1 * image_width*sizeof(vbx_ubyte_t) ; v_luma_mid = v_luma_top + 2 * image_width*sizeof(vbx_ubyte_t) ; v_luma_low = v_luma_top + 3 * image_width*sizeof(vbx_ubyte_t) ; v_luma_bot = v_luma_top + 4 * image_width*sizeof(vbx_ubyte_t) ; #endif if(v_luma_bot==NULL){ vbx_sp_pop(); return VBW_ERROR_SP_ALLOC_FAILED; } // Transfer the first 3 input rows and interleave first 2 rgb2luma and first 2 sobel row calculations #if W==3 rp_fetch(&v_row_db); v_row_in = rp_get_buffer(&v_row_db,0); vbw_rgb2luma(vW, v_row_in, vT, image_width); // 1st luma row vbx( SVHBU, VSHR, v_luma_top, 8, vW ); // convert to byte v_row_in = rp_fetch(&v_row_db); v_row_in = rp_get_buffer(&v_row_db,0); vbw_rgb2luma( vW, v_row_in, vT, image_width); // 2nd luma row vbx( SVHBU, VSHR, v_luma_mid, 8, vW ); // convert to byte #else rp_fetch(&v_row_db); v_row_in = rp_get_buffer(&v_row_db,0); vbw_rgb2luma(vW, v_row_in, vT, image_width); // 1st luma row vbx( SVHBU, VSHR, v_luma_top, 8, vW ); // convert to byte rp_fetch(&v_row_db); v_row_in = rp_get_buffer(&v_row_db,0); vbw_rgb2luma( vW, v_row_in, vT, image_width); // 2nd luma row vbx( SVHBU, VSHR, v_luma_hii, 8, vW ); // convert to byte rp_fetch(&v_row_db); v_row_in = rp_get_buffer(&v_row_db,0); vbw_rgb2luma( vW, v_row_in, vT, image_width); // 2nd luma row vbx( SVHBU, VSHR, v_luma_mid, 8, vW ); // convert to byte rp_fetch(&v_row_db); v_row_in = rp_get_buffer(&v_row_db,0); vbw_rgb2luma( vW, v_row_in, vT, image_width); // 2nd luma row vbx( SVHBU, VSHR, v_luma_low, 8, vW ); // convert to byte #endif // blank out the top and bottom rows unsigned *out; vbx_set_vl(image_width); unsigned COLOUR = ( 200 | (128<<8) | (244<<16) ); vbx(SVWU, VMOV, v_row_out, COLOUR, 0); for( y=0; y<W/2; y++ ) { // Set top output rows to 0 out = output + image_width*y; vbx_dma_to_host( out, v_row_out, image_width*sizeof(vbx_uword_t) ); // Set bottom rows to 0 out 
out = output + image_width*(image_height-1-y);
		vbx_dma_to_host( out, v_row_out, image_width*sizeof(vbx_uword_t) );
	}

	// Calculate edges
	for (y = 0; y < image_height-(W-1); y++) {
		vbx_set_vl(image_width);
		// Transfer the next input row while processing
		rp_fetch(&v_row_db);
		v_row_in = rp_get_buffer(&v_row_db,0);
		// Convert aRGB input to luma
		vbw_rgb2luma( vW, v_row_in, vT, image_width);
		vbx( SVHBU, VSHR, v_luma_bot, 8, vW ); // convert to byte

		vbx_sp_push();
		image_width = image_width/2;
		vbx_set_vl(image_width);
		v[0][0] = v00 = (vbx_ubyte_t*)vbx_sp_malloc( 25 * image_width*sizeof(vbx_ubyte_t));
		v[0][1] = v01 = v00 +  1 * image_width*sizeof(vbx_ubyte_t) ;
		v[0][2] = v02 = v00 +  2 * image_width*sizeof(vbx_ubyte_t) ;
		v[1][0] = v10 = v00 +  3 * image_width*sizeof(vbx_ubyte_t) ;
		v[1][1] = v11 = v00 +  4 * image_width*sizeof(vbx_ubyte_t) ;
		v[1][2] = v12 = v00 +  5 * image_width*sizeof(vbx_ubyte_t) ;
		v[2][0] = v20 = v00 +  6 * image_width*sizeof(vbx_ubyte_t) ;
		v[2][1] = v21 = v00 +  7 * image_width*sizeof(vbx_ubyte_t) ;
		v[2][2] = v22 = v00 +  8 * image_width*sizeof(vbx_ubyte_t) ;
#if W==5
		v[0][3] = v03 = v00 +  9 * image_width*sizeof(vbx_ubyte_t) ;
		v[0][4] = v04 = v00 + 10 * image_width*sizeof(vbx_ubyte_t) ;
		v[1][3] = v13 = v00 + 11 * image_width*sizeof(vbx_ubyte_t) ;
		v[1][4] = v14 = v00 + 12 * image_width*sizeof(vbx_ubyte_t) ;
		v[2][3] = v23 = v00 + 13 * image_width*sizeof(vbx_ubyte_t) ;
		v[2][4] = v24 = v00 + 14 * image_width*sizeof(vbx_ubyte_t) ;
		v[3][0] = v30 = v00 + 15 * image_width*sizeof(vbx_ubyte_t) ;
		v[3][1] = v31 = v00 + 16 * image_width*sizeof(vbx_ubyte_t) ;
		v[3][2] = v32 = v00 + 17 * image_width*sizeof(vbx_ubyte_t) ;
		v[3][3] = v33 = v00 + 18 * image_width*sizeof(vbx_ubyte_t) ;
		v[3][4] = v34 = v00 + 19 * image_width*sizeof(vbx_ubyte_t) ;
		v[4][0] = v40 = v00 + 20 * image_width*sizeof(vbx_ubyte_t) ;
		v[4][1] = v41 = v00 + 21 * image_width*sizeof(vbx_ubyte_t) ;
		v[4][2] = v42 = v00 + 22 * image_width*sizeof(vbx_ubyte_t) ;
		v[4][3] = v43 = v00 + 23 * image_width*sizeof(vbx_ubyte_t) ;
		v[4][4] = v44 = v00 + 24 * image_width*sizeof(vbx_ubyte_t) ;
#endif
		if( v00 == NULL ) {
			printf("mem alloc failed\n");
			fflush(stdout);
			vbx_sp_pop();
			vbx_sp_pop();
			return VBW_ERROR_SP_ALLOC_FAILED;
		}

		//FIXME -- how to manage row buffers with 5 rows? 3 rows are shown below:
#if W==3
		for( xx=0; xx<W; xx++ ) v_src[0][xx] = v_luma_top+xx;
		for( xx=0; xx<W; xx++ ) v_src[1][xx] = v_luma_mid+xx;
		for( xx=0; xx<W; xx++ ) v_src[2][xx] = v_luma_bot+xx;
#else
		for( xx=0; xx<W; xx++ ) v_src[0][xx] = v_luma_top+xx;
		for( xx=0; xx<W; xx++ ) v_src[1][xx] = v_luma_hii+xx;
		for( xx=0; xx<W; xx++ ) v_src[2][xx] = v_luma_mid+xx;
		for( xx=0; xx<W; xx++ ) v_src[3][xx] = v_luma_low+xx;
		for( xx=0; xx<W; xx++ ) v_src[4][xx] = v_luma_bot+xx;
#endif

		vbx_set_vl( image_width - W + 1 );
		// compute error (absdiff) in pixel colour with neighbours
		for( yy=0; yy<W; yy++ ) {
			for( xx=0; xx<W; xx++ ) {
				vbx( VVBU, VABSDIFF, v[yy][xx], v_luma_mid+(W/2), v_src[yy][xx] );
			}
		}

		// v[][] holds the errors (differences) between pixels.
		// efficiently compute a function that looks approximately something like exp(-x):
		// large value for small errors, small value for big errors
		for( yy=0; yy<W; yy++ ) {
			for( xx=0; xx<W; xx++ ) {
				vbx( SVBU, VABSDIFF, v[yy][xx], 255, v[yy][xx] ); // 255 - img_err
				// 11 or more iterations is mathematically equivalent to a pure gaussian blur
				// FIXME is this true?
#define NUM_SHARPEN_ITERATIONS 3 // 0 to 10 iterations, practical max is 7 or 8 for( sharp=0; sharp < NUM_SHARPEN_ITERATIONS; sharp++ ) { vbx( VVBU, VMULHI, v[yy][xx], v[yy][xx], v[yy][xx] ); // v*v; } } } // with right decimal place, could do the next two instructions using MULFXP and do as BYTES // convolve errors with gaussian blur kernel for( yy=0; yy<W; yy++ ) { for( xx=0; xx<W; xx++ ) { vbx( SVBU, VMULHI, v[yy][xx], gauss[yy][xx], v[yy][xx] ); } } // sum up the weights for normalization later vbx( VVBHU, VADD, vW, v[0][0], v[0][1] ); vbx( VVBHU, VADD, vT, v[0][2], v[1][0] ); vbx( VVHU, VADD, vW, vW, vT ); vbx( VVBHU, VADD, vT, v[1][1], v[1][2] ); vbx( VVHU, VADD, vW, vW, vT ); vbx( VVBHU, VADD, vT, v[2][0], v[2][1] ); vbx( VVHU, VADD, vW, vW, vT ); vbx( VVBHU, VMOV, vT, v[2][2], 0 ); vbx( VVHU, VADD, vW, vW, vT ); #if (W==5) vbx( VVBHU, VADD, vT, v[3][0], v[3][1] ); vbx( VVHU, VADD, vW, vW, vT ); vbx( VVBHU, VADD, vT, v[3][2], v[4][0] ); vbx( VVHU, VADD, vW, vW, vT ); vbx( VVBHU, VADD, vT, v[4][1], v[4][2] ); vbx( VVHU, VADD, vW, vW, vT ); vbx( VVBHU, VMOV, vT, v[0][3], v[0][4] ); vbx( VVHU, VADD, vW, vW, vT ); vbx( VVBHU, VMOV, vT, v[1][3], v[1][4] ); vbx( VVHU, VADD, vW, vW, vT ); vbx( VVBHU, VMOV, vT, v[2][3], v[2][4] ); vbx( VVHU, VADD, vW, vW, vT ); vbx( VVBHU, VMOV, vT, v[3][3], v[3][4] ); vbx( VVHU, VADD, vW, vW, vT ); vbx( VVBHU, VMOV, vT, v[4][3], v[4][4] ); vbx( VVHU, VADD, vW, vW, vT ); #endif // convolve image with new weights for( yy=0; yy<W; yy++ ) { for( xx=0; xx<W; xx++ ) { vbx( VVBU, VMULHI, v[yy][xx], v_src[yy][xx], v[yy][xx] ); //vbx( SVBU, VMULHI, v[yy][xx], gauss[yy][xx], v_src[yy][xx] ); //vbx( SVBU, VMUL , v[yy][xx], 1 , v_src[yy][xx] ); } } // sum up the weighted pixels vbx( VVBHU, VADD, vI, v[0][0], v[0][1] ); vbx( VVBHU, VADD, vT, v[0][2], v[1][0] ); vbx( VVHU, VADD, vI, vI, vT ); vbx( VVBHU, VADD, vT, v[1][1], v[1][2] ); vbx( VVHU, VADD, vI, vI, vT ); vbx( VVBHU, VADD, vT, v[2][0], v[2][1] ); vbx( VVHU, VADD, vI, vI, vT ); vbx( VVBHU, VMOV, vT, v[2][2], 0 ); vbx( VVHU, VADD, vI, vI, vT ); #if (W==5) vbx( VVBHU, VADD, vT, v[3][0], v[3][1] ); vbx( VVHU, VADD, vI, vI, vT ); vbx( VVBHU, VADD, vT, v[3][2], v[4][0] ); vbx( VVHU, VADD, vI, vI, vT ); vbx( VVBHU, VADD, vT, v[4][1], v[4][2] ); vbx( VVHU, VADD, vI, vI, vT ); vbx( VVBHU, VMOV, vT, v[0][3], v[0][4] ); vbx( VVHU, VADD, vI, vI, vT ); vbx( VVBHU, VMOV, vT, v[1][3], v[1][4] ); vbx( VVHU, VADD, vI, vI, vT ); vbx( VVBHU, VMOV, vT, v[2][3], v[2][4] ); vbx( VVHU, VADD, vI, vI, vT ); vbx( VVBHU, VMOV, vT, v[3][3], v[3][4] ); vbx( VVHU, VADD, vI, vI, vT ); vbx( VVBHU, VMOV, vT, v[4][3], v[4][4] ); vbx( VVHU, VADD, vI, vI, vT ); #endif // keep RHS of image as original grayscale image_width=image_width*2; vbx_set_vl( image_width/2 ); //vbx( VVWHU, VMOV, vT+image_width/2, (v_row_in ) + image_width/2+1, 0 ); vbx( VVBHU, VMOV, vT+image_width/2, (v_src[ 0 ][ 0 ]) + image_width/2+1, 0 ); vbx_sp_pop(); // don't need v[][] data any more // compute LHS of image #if 0 vbx( VVBHU, VMOV, vT, v_src[2][2], 0 ); //vbx( SVHU, VSHR, vI, 3, vI ); //vbx( SVHU, VSHR, vW, 3, vW ); //vbx( VVHU, VMUL, vT, vI, vW ); //vbx( SVHU, VSHR, vT, 8, vT ); #else uint32_t h = image_width/2; vbx( SVHU, VADD, vW, 0x80, vW ); // round vbx( SVHU, VSHR, vW, 8, vW ); vbw_vec_divide_uhalf( vT , vI , vW , h ); //vbw_vec_divide_uhalf( vT+h, vI+h, vW+h, image_width-W+1-h ); #endif // ensure LHS doesn't overflow vbx( SVHU, VAND, vT, 0xff, vT ); // Copy the result to the low byte of the output row // Trick to copy the low byte (b) to the middle two bytes as 
// well. Note that first and last columns are 0.
		vbx_set_vl(image_width-W+1);
		vbx(SVHWU, VMULLO, v_row_out+W/2, 0x00010101, vT);

		// blank out left and right edges, then DMA the result to the output
		vbx_set_vl(W/2);
		vbx(SVWU, VMOV, v_row_out, COLOUR, 0 );
		vbx(SVWU, VMOV, v_row_out + image_width - (W/2), COLOUR, 0 );
		vbx_dma_to_host( output+(y+1)*image_pitch, v_row_out, image_width*sizeof(vbx_uword_t) );

		// Rotate luma buffers
		vbx_ubyte_t *tmp_ptr;
		tmp_ptr = v_luma_top;
#if W==3
		v_luma_top = v_luma_mid;
		v_luma_mid = v_luma_bot;
		v_luma_bot = tmp_ptr;
#else
		v_luma_top = v_luma_hii;
		v_luma_hii = v_luma_mid;
		v_luma_mid = v_luma_low;
		v_luma_low = v_luma_bot;
		v_luma_bot = tmp_ptr;
#endif
	}
	vbx_sync();
	vbx_sp_pop();
	return VBW_SUCCESS;
}
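/* A scalar model of the per-pixel range weighting used above (illustration only):
   the range kernel approximates a falling exp(-err) curve by repeatedly squaring
   (255-err) in 8-bit fixed point -- VMULHI on bytes keeps the high half of the
   8x8-bit product, i.e. (a*b)>>8 -- and then scales by the spatial gaussian
   weight. */
static unsigned char bifilt_weight_model( unsigned char center, unsigned char neighbour,
                                          unsigned char gauss_w, int sharpen_iters )
{
	unsigned err = (center > neighbour) ? (center - neighbour) : (neighbour - center);
	unsigned v = 255 - err;                     // SVBU VABSDIFF against 255
	int s;
	for( s=0; s < sharpen_iters; s++ ) {
		v = (v * v) >> 8;                   // VVBU VMULHI: v*v
	}
	return (unsigned char)((v * gauss_w) >> 8); // SVBU VMULHI with gauss[yy][xx]
}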
int vbw_mtx_median_ext_argb32( unsigned *output, unsigned *input, const int filter_height, const int filter_width, const int image_height, const int image_width, const int image_pitch ) { const int FREE_BYTES = vbx_sp_getfree(); int l,k; int filter_mid, filter_size; int rows_per_l,vl,temp_vl, temp_vl_byte; int j,i; int partial_row = 0; filter_size = filter_height*filter_width; filter_mid = filter_size/2; vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); const int VBX_WIDTH_BYTES = this_mxp->scratchpad_alignment_bytes; // Could possibly check for low SP here (less than 6*VBX_WIDTH_BYTES) and assign vl differently // During allocation, max additional SP bytes needed due to alignment is one VBX_WIDTH_BYTES per vector // Taking that off the top simplifies calculation and will always be correct, but sacrifices a little SP space vl = (FREE_BYTES-3*VBX_WIDTH_BYTES)/((filter_size+2)*sizeof(vbx_uword_t)); if( vl < 1 ) { return VBW_ERROR_SP_ALLOC_FAILED; } if(vl < image_width){ rows_per_l = 1; partial_row = 1; } else { rows_per_l = vl/image_width; vl = image_width*rows_per_l; } vbx_sp_push(); vbx_uword_t *v_input = (vbx_uword_t *)vbx_sp_malloc(filter_size*vl*sizeof(vbx_uword_t)); vbx_ubyte_t *v_sub = (vbx_ubyte_t *)vbx_sp_malloc(vl*sizeof(vbx_uword_t)); vbx_ubyte_t *v_temp = (vbx_ubyte_t *)vbx_sp_malloc(vl*sizeof(vbx_uword_t)); vbx_ubyte_t *v_min, *v_max; vbx_ubyte_t *v_input_byte = (vbx_ubyte_t *)v_input; if( v_temp == NULL ){ vbx_sp_pop(); return VBW_ERROR_SP_ALLOC_FAILED; } for(l = 0; l < image_height-filter_height; l+= rows_per_l){ // detect last pass if(l+rows_per_l > image_height-filter_height){ rows_per_l = (image_height-filter_height)-l; vl = image_width*rows_per_l; } temp_vl = vl; for(k = 0; k < image_width; k += temp_vl){ if(partial_row){ if(k + temp_vl > image_width){ temp_vl = image_width - k; } } for(j = 0; j < filter_height; j++){ vbx_dma_to_vector_2D(v_input+temp_vl*j, input+(l+j)*image_pitch+k, temp_vl/rows_per_l*sizeof(vbx_uword_t), rows_per_l, image_width*sizeof(vbx_uword_t), image_pitch*sizeof(vbx_uword_t)); } // arrange all pixels within a filter window into single columns, seperated by temp_vl // // ex. vl = 5, filter = 3 // vinput before vinput after // // a00 a01 a02 a03 a04 | a00 a01 a02 a03 a04 | // a10 a11 a12 a13 a14 | a10 a11 a12 a13 a14 | // a20 a21 a22 a23 a24 | a20 a21 a22 a23 a24 | // ??? ??? ??? ??? ??? | a01 a02 a03 a04 a10 | // ??? ??? ??? ??? ??? | a11 a12 a13 a14 a20 | // ??? ??? ??? ??? ??? | a21 a22 a23 a24 a30 | // ??? ??? ??? ??? ??? | a02 a03 a04 a10 a11 | // ??? ??? ??? ??? ??? | a12 a13 a14 a20 a21 | // ??? ??? ??? ??? ??? 
// | a22 a23 a24 a30 a31 |
			//
			vbx_set_vl(temp_vl);
			for(j = 1; j < filter_width; j++){      // j iterates over horizontal shifts
				for(i = 0; i < filter_height; i++){ // i iterates over filter rows
					vbx(VVWU, VMOV, v_input+(j*filter_height+i)*temp_vl, v_input+i*temp_vl+j, 0);
				}
			}

			// Do the bubble sort up to the filter_size/2'th element of each column,
			// working on individual colour channels
			temp_vl_byte = temp_vl*sizeof(vbx_uword_t)/sizeof(vbx_ubyte_t);
			vbx_set_vl(temp_vl_byte);
			// sort lower half of the values in the window
			for(j = 0; j < filter_mid; j++){
				v_min = v_input_byte+j*temp_vl_byte;
				for(i = j+1; i < filter_size; i++){
					v_max = v_input_byte+i*temp_vl_byte;
					vbx(VVBU, VMOV, v_temp, v_min, 0);
					vbx(VVBU, VSUB, v_sub, v_max, v_min);
					vbx(VVBU, VCMV_LTZ, v_min, v_max, v_sub);
					vbx(VVBU, VCMV_LTZ, v_max, v_temp, v_sub);
				}
			}
			// grab the next smallest value -- the median -- without sorting the rest
			v_min = v_input_byte+filter_mid*temp_vl_byte;
			for(i = filter_mid+1; i < filter_size; i++){
				v_max = v_input_byte+i*temp_vl_byte;
				vbx(VVBU, VSUB, v_sub, v_max, v_min);
				vbx(VVBU, VCMV_LTZ, v_min, v_max, v_sub);
			}

			// DMA out the median values, back to pixels
			vbx_dma_to_host_2D(output+(l*image_pitch)+k, v_input+temp_vl*filter_mid,
			                   temp_vl/rows_per_l*sizeof(vbx_uword_t), rows_per_l,
			                   image_pitch*sizeof(vbx_uword_t), image_width*sizeof(vbx_uword_t));
		}
	}
	vbx_sp_pop();
	vbx_sync();
	return VBW_SUCCESS;
}
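/* A scalar model of the median selection above, per colour channel: selection-sort
   only the lower half of the window, then one more min pass over the remainder
   picks the median; the upper half of the window is never fully sorted. */
static unsigned char median_model( unsigned char *window, int filter_size )
{
	int i, j;
	const int filter_mid = filter_size/2;
	for( j=0; j < filter_mid; j++ ) {
		for( i=j+1; i < filter_size; i++ ) {
			if( window[i] < window[j] ) {
				unsigned char t = window[j];
				window[j] = window[i];
				window[i] = t;
			}
		}
	}
	unsigned char median = window[filter_mid];
	for( i=filter_mid+1; i < filter_size; i++ ) {
		if( window[i] < median ) {
			median = window[i];
		}
	}
	return median;
}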