/** VBX Motion Estimation, using vbx_3d ops.
 * vbw_mtx_motest_3d_byte_setup should be run prior to running this function.
 * Uses bytes as input data. block_height must be an even number.
 *
 * @param[out] result
 * @param[in]  x
 * @param[in]  y
 * @param[in]  m
 * @returns negative on error condition. See vbw_exit_codes.h
 */
int vbw_mtx_motest_3d_byte(output_type *result, input_type *x, input_type *y, vbw_motest_t *m)
{
    int l, j;
    int sub_block_width = m->block_width + m->search_width;

    for( j = 0; j < m->block_height; j++ ) {
        vbx_dma_to_vector( m->v_block+j*m->block_width, x+j*m->image_width, m->block_width*sizeof(input_type) );
    }
    for( j = 0; j < m->block_height+m->search_height; j++ ) {
        vbx_dma_to_vector( m->v_img+j*sub_block_width, y+j*m->image_width, sub_block_width*sizeof(input_type) );
    }

    vbx_set_3D( m->search_width, m->block_height*sizeof(intermediate_type), sizeof(input_type), 0 );
    for( l = 0; l < m->search_height; l++ ) {
        // Accumulate each row into a vector of row SADs
        vbx_set_vl( m->block_width );
        vbx_set_2D( m->block_height, sizeof(intermediate_type), sub_block_width*sizeof(input_type), m->block_width*sizeof(input_type) );
        vbx_acc_3D( VVBHU, VABSDIFF, m->v_row_sad, m->v_img+l*sub_block_width, m->v_block );

        // Accumulate the row SADs
        vbx_set_vl( m->block_height/2 );
        vbx_set_2D( m->search_width, sizeof(output_type), m->block_height*sizeof(intermediate_type), m->block_height*sizeof(intermediate_type) );
        vbx_acc_2D( VVHWU, VADD, (vbx_uword_t*)m->v_result+l*m->search_width, m->v_row_sad, m->v_row_sad+(m->block_height/2) );

        // Transfer the line to the host
        vbx_dma_to_host( result+l*m->search_width, m->v_result+l*m->search_width, m->search_width*sizeof(output_type) );
    }
    return VBW_SUCCESS;
}
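For reference, the displacement scores the 3D ops produce are plain sums of absolute differences (SAD) over the block at every offset in the search window. A minimal scalar sketch of the same computation (the helper name is ours; x is assumed to point at the reference block and y at the top-left of the search area, as in the DMA loops above):

/* Scalar reference sketch (hypothetical helper, not part of the wrapper library):
 * for each displacement (l,j) in the search window, sum the absolute
 * differences against the reference block. */
void scalar_motest_byte(output_type *result, input_type *x, input_type *y, vbw_motest_t *m)
{
    int l, j, i, k;
    for (l = 0; l < m->search_height; l++) {
        for (j = 0; j < m->search_width; j++) {
            unsigned int sad = 0;
            for (i = 0; i < m->block_height; i++) {
                for (k = 0; k < m->block_width; k++) {
                    int a = x[i*m->image_width + k];         // reference block pixel
                    int b = y[(l+i)*m->image_width + (j+k)]; // candidate block pixel
                    sad += (a > b) ? (a - b) : (b - a);
                }
            }
            result[l*m->search_width + j] = sad;
        }
    }
}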
int main(void)
{
    vbx_test_init();

    vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
    const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
    const int required_vectors = 4;
    int N = VBX_SCRATCHPAD_SIZE / sizeof(vbx_mm_t) / required_vectors;
    int PRINT_LENGTH = min( N, MAX_PRINT_LENGTH );

    double scalar_time, vector_time;
    int errors = 0;

    vbx_mxp_print_params();
    printf( "\nAdd test...\n" );
    printf( "Vector length: %d\n", N );

    vbx_mm_t *scalar_in1 = malloc( N*sizeof(vbx_mm_t) );
    vbx_mm_t *scalar_in2 = malloc( N*sizeof(vbx_mm_t) );
    vbx_mm_t *scalar_out = malloc( N*sizeof(vbx_mm_t) );

    vbx_mm_t *vector_in1 = vbx_shared_malloc( N*sizeof(vbx_mm_t) );
    vbx_mm_t *vector_in2 = vbx_shared_malloc( N*sizeof(vbx_mm_t) );
    vbx_mm_t *vector_out = vbx_shared_malloc( N*sizeof(vbx_mm_t) );
    // vbx_mm_t *vector_out = vector_in2 - 5;

    vbx_sp_t *v_in1 = vbx_sp_malloc( N*sizeof(vbx_sp_t) );
    vbx_sp_t *v_in2 = vbx_sp_malloc( N*sizeof(vbx_sp_t) );
    vbx_sp_t *v_out = vbx_sp_malloc( N*sizeof(vbx_sp_t) );
    // vbx_sp_t *v_out = v_in2-5;

    VBX_T(test_zero_array)( scalar_out, N );
    VBX_T(test_zero_array)( vector_out, N );

    VBX_T(test_init_array)( scalar_in1, N, 1 );
    VBX_T(test_copy_array)( vector_in1, scalar_in1, N );
    VBX_T(test_init_array)( scalar_in2, N, 1 );
    VBX_T(test_copy_array)( vector_in2, scalar_in2, N );

    VBX_T(test_print_array)( scalar_in1, PRINT_LENGTH );
    VBX_T(test_print_array)( scalar_in2, PRINT_LENGTH );

    scalar_time = test_scalar( scalar_out, scalar_in1, scalar_in2, N );
    VBX_T(test_print_array)( scalar_out, PRINT_LENGTH );

    vbx_dma_to_vector( v_in1, (void *)vector_in1, N*sizeof(vbx_sp_t) );
    vbx_dma_to_vector( v_in2, (void *)vector_in2, N*sizeof(vbx_sp_t) );
    vector_time = test_vector( v_out, v_in1, v_in2, N, scalar_time );
    vbx_dma_to_host( (void *)vector_out, v_out, N*sizeof(vbx_sp_t) );
    vbx_sync();
    VBX_T(test_print_array)( vector_out, PRINT_LENGTH );

    errors += VBX_T(test_verify_array)( scalar_out, vector_out, N );

    VBX_TEST_END(errors);
    return 0;
}
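test_scalar and test_vector are defined elsewhere in the suite. As a rough sketch of what the scalar baseline does (an elementwise add, timed with the timestamp counter; the use of vbx_print_scalar_time here is our assumption based on the suite's other timing helpers):

/* Plausible sketch of the scalar reference used above; the real test_scalar
 * lives elsewhere in the test suite. */
double test_scalar( vbx_mm_t *out, vbx_mm_t *in1, vbx_mm_t *in2, int N )
{
    int i;
    vbx_timestamp_t time_start, time_stop;
    vbx_timestamp_start();
    time_start = vbx_timestamp();
    for( i = 0; i < N; i++ ) {
        out[i] = in1[i] + in2[i];  // elementwise add, the kernel under test
    }
    time_stop = vbx_timestamp();
    return vbx_print_scalar_time( time_start, time_stop );
}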
double test_vector_sp(vbx_mm_t *vector_out,
                      vbx_mm_t *vector_in1, int IN1ROWS, int IN1COLS,
                      vbx_mm_t *vector_in2, int IN2ROWS, int IN2COLS,
                      double scalar_time )
{
    typedef vbx_mm_t vbx_sp_t;
    int retval = -1;
    vbx_timestamp_t time_start, time_stop;

    printf( "\nExecuting MXP matrix multiply... src1[%dx%d] src2[%dx%d]\n",
            IN1ROWS, IN1COLS, IN2ROWS, IN2COLS );

    vbx_timestamp_start();
    time_start = vbx_timestamp();

    vbx_sp_push();
    vbx_sp_t *v_in1 = (vbx_sp_t*)vbx_sp_malloc( sizeof(vbx_sp_t)*IN1ROWS*IN1COLS );
    vbx_sp_t *v_in2 = (vbx_sp_t*)vbx_sp_malloc( sizeof(vbx_sp_t)*IN2ROWS*IN2COLS );
    vbx_sp_t *v_out = (vbx_sp_t*)vbx_sp_malloc( sizeof(vbx_sp_t)*IN1ROWS*IN2COLS );
    if( v_out != NULL ) {
        vbx_dma_to_vector( v_in1, vector_in1, sizeof(vbx_sp_t)*IN1ROWS*IN1COLS );
        vbx_dma_to_vector( v_in2, vector_in2, sizeof(vbx_sp_t)*IN2ROWS*IN2COLS );
        retval = vbw_mtx_mul( v_out, v_in1, IN1ROWS, IN1COLS, v_in2, IN2ROWS, IN2COLS );
        vbx_dma_to_host( vector_out, v_out, sizeof(vbx_sp_t)*IN1ROWS*IN2COLS );
        vbx_sync();
    } else {
        printf( "not enough scratchpad space for the sp test\n" );
    }
    vbx_sp_pop();

    time_stop = vbx_timestamp();
    printf( "...done. retval:0x%08X\n", retval );
    return vbx_print_vector_time( time_start, time_stop, scalar_time );
}
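The scalar_time baseline handed to vbx_print_vector_time is typically produced by a plain triple-loop multiply. A minimal sketch (the helper and parameter names are ours):

/* Minimal scalar matrix multiply of the kind used to produce scalar_time
 * (a sketch; the suite's own scalar baseline lives elsewhere). */
void scalar_mtx_mul( vbx_mm_t *out, vbx_mm_t *in1, int AROWS, int ACOLS,
                     vbx_mm_t *in2, int BCOLS )
{
    int i, j, k;
    for( i = 0; i < AROWS; i++ ) {
        for( j = 0; j < BCOLS; j++ ) {
            vbx_mm_t sum = 0;
            for( k = 0; k < ACOLS; k++ ) {
                sum += in1[i*ACOLS + k] * in2[k*BCOLS + j];
            }
            out[i*BCOLS + j] = sum;
        }
    }
}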
int compare_vbx_lbp_ci_to_scalar_patterns(unsigned short* img, int width, int height, int max_print_errors)
{
    int j, errors = 0;
    unsigned char** scalar_patterns = test_scalar_patterns(img, 0, width, height);

    vbx_ubyte_t* v_in  = (vbx_ubyte_t*)vbx_sp_malloc(3*width*sizeof(vbx_word_t));
    vbx_ubyte_t* v_top = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_half_t));
    vbx_ubyte_t* v_bot = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_half_t));
    vbx_ubyte_t* v_lbp = v_bot;
    unsigned char* lbp = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));

    vbx_set_vl(width);
    for (j = 0; j < height - 2; j++) {
        vbx_dma_to_vector(v_in, img+j*width, 3*width*sizeof(unsigned short));
        vbx(VVHU,  VCUSTOM1, v_top, v_in, v_in+width);
        vbx(VVHU,  VCUSTOM1, v_bot, v_in+width, v_in+2*width);
        vbx(SVHBU, VAND, v_top, 0xf0, v_top);
        vbx(SVHBU, VAND, v_bot, 0x0f, v_bot);
        vbx(VVBU,  VADD, v_lbp, v_bot, v_top);
        vbx_dma_to_host(lbp, v_lbp, width*sizeof(unsigned char));
        vbx_sync();
        errors += match_array_byte(lbp, scalar_patterns[0]+j*width, "custom_lbp",
                                   width-2, 1, max_print_errors, 1, j);
    }
    vbx_sp_free();
    vbx_shared_free(lbp);
    return errors;
}
void vector_rectangle_to_luma( pixel *input_buffer, vbx_uhalf_t *v_luma_buffer,
                               vbx_uhalf_t *v_row_temp, vbx_uword_t *v_row,
                               int startx, int starty, int width, int height,
                               const int image_pitch )
{
    int y;
    vbx_uhalf_t *v_luma;

    vbx_set_vl(width);
    for (y = 0; y < height; y++) {
        v_luma = v_luma_buffer + (y*width);
        vbx_dma_to_vector(v_row, input_buffer+((y+starty)*image_pitch)+startx, width*sizeof(vbx_uword_t));

        // Move the b component into v_luma
        vbx(SVWHU, VAND, v_luma, 0xFF, v_row);
        vbx(SVHU,  VMUL, v_luma, 25, v_luma);

        // Move g into v_row_temp and add it to v_luma
        vbx(SVWHU, VAND, v_row_temp, 0xFF, (vbx_uword_t*)(((vbx_ubyte_t*)v_row)+1));
        vbx(SVHU,  VMUL, v_row_temp, 129, v_row_temp);
        vbx(VVHU,  VADD, v_luma, v_luma, v_row_temp);

        // Move r into v_row_temp and add it to v_luma
        vbx(SVWHU, VAND, v_row_temp, 0xFF, (vbx_uword_t*)(((vbx_ubyte_t*)v_row)+2));
        vbx(SVHU,  VMUL, v_row_temp, 66, v_row_temp);
        vbx(VVHU,  VADD, v_luma, v_luma, v_row_temp);

        // Divide by 2^8
        vbx(SVHU, VSHR, v_luma, 8, v_luma);
    }
}
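In scalar terms, each iteration computes the usual BT.601-style integer luma approximation from a packed 0x00RRGGBB word: the byte offsets above put B at offset 0 (weight 25), G at +1 (weight 129), and R at +2 (weight 66). A per-pixel sketch of the equivalent (the helper name is ours):

/* Scalar equivalent of the conversion above (sketch):
 * luma = (66*R + 129*G + 25*B) >> 8 on packed 0x00RRGGBB pixels. */
static inline unsigned short pixel_to_luma( unsigned int p )
{
    unsigned int b = p & 0xFF;         // low byte, weight 25
    unsigned int g = (p >> 8) & 0xFF;  // weight 129
    unsigned int r = (p >> 16) & 0xFF; // weight 66
    return (unsigned short)((66*r + 129*g + 25*b) >> 8);
}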
int dma_bandwidth_test()
{
    const int num_iter = 64;
    vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
    int scratchpad_size = this_mxp->scratchpad_size;
    uint8_t *buf = vbx_shared_malloc(scratchpad_size);
    vbx_ubyte_t *v_buf = vbx_sp_malloc(scratchpad_size);
    vbx_timestamp_t time_start, time_stop;
    int i;
    int len;
    int to_host;
    int errors = 0;

    vbx_mxp_print_params();

    // dma_alignment_bytes gives the DMA master data bus width in bytes.
    double bytes_per_sec = ((double)this_mxp->core_freq) * this_mxp->dma_alignment_bytes;
    double max_megabytes_per_sec = bytes_per_sec/(1024*1024);
    printf("\nMax available bandwidth = %s Megabytes/s\n",
           vbx_eng(max_megabytes_per_sec, 4));
    printf("\n");

    for (to_host = 0; to_host < 2; to_host++) {
        for (len = 32; len <= scratchpad_size; len *= 2) {
            printf("DMA %s, %d bytes\n", to_host ? "write" : "read", len);
            vbx_timestamp_start();
            if (to_host) {
                time_start = vbx_timestamp();
                for (i = 0; i < num_iter; i++) {
                    vbx_dma_to_host(buf, v_buf, len);
                }
                vbx_sync();
                time_stop = vbx_timestamp();
            } else {
                time_start = vbx_timestamp();
                for (i = 0; i < num_iter; i++) {
                    vbx_dma_to_vector(v_buf, buf, len);
                }
                vbx_sync();
                time_stop = vbx_timestamp();
            }
            print_dma_bandwidth(time_start, time_stop, len, num_iter, max_megabytes_per_sec);
            printf("\n");
        }
        printf("\n");
    }
    vbx_shared_free(buf);
    vbx_sp_free();
    return errors;
}
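print_dma_bandwidth is defined elsewhere; the arithmetic it needs is just bytes moved per elapsed second. A sketch under the assumption that vbx_timestamp_freq() returns the timestamp counter frequency in Hz:

/* Sketch of the bandwidth arithmetic behind print_dma_bandwidth (the real
 * helper lives elsewhere): measured megabytes/s against the theoretical max. */
static void print_dma_bandwidth_sketch(vbx_timestamp_t start, vbx_timestamp_t stop,
                                       int len, int num_iter, double max_mb_per_s)
{
    double cycles  = (double)(stop - start);
    double seconds = cycles / (double)vbx_timestamp_freq();
    double mb_per_s = ((double)len * num_iter) / seconds / (1024*1024);
    printf("%s Megabytes/s (%.1f%% of max)\n",
           vbx_eng(mb_per_s, 4), 100.0 * mb_per_s / max_mb_per_s);
}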
vbx_mtx_fdct_t *vbx_mtx_fdct_init( dt *coeff_v, dt *image )
{
    const int BIG_TILE_SIZE = NUM_TILE_X * NUM_TILE_Y * DCT_SIZE;
    const int num_bytes = BIG_TILE_SIZE * sizeof(dt);
    const int co_bytes = NUM_TILE_X * DCT_SIZE * sizeof(dt);

    // Compute the coefficient matrix in double precision, truncated to dt
    int i, j;
    double s;
    for (i = 0; i < BLOCK_SIZE; i++) {
        s = (i == 0) ? sqrt(0.125) : 0.5;
        for (j = 0; j < BLOCK_SIZE; j++) {
            c2[i][j] = s * cos((double)((PI / 8.0) * i * (j + 0.5)));
            cs[i][j] = (dt)(c2[i][j] * SHIFT_DOUBLE + 0.499999);
        }
    }

    vbx_sp_push();
    vbx_mtx_fdct_t *v = vbx_shared_malloc( sizeof(vbx_mtx_fdct_t) );
    v->vcoeff = (vbx_half_t *)vbx_sp_malloc( co_bytes );
    v->vprods = (vbx_half_t *)vbx_sp_malloc( num_bytes );
#if USE_ACCUM_FLAGS
    v->vaccum = (vbx_half_t *)vbx_sp_malloc( num_bytes );
    v->vflags = (vbx_half_t *)vbx_sp_malloc( num_bytes );
#endif
    // Interleave ordering to ensure no false hazards
    v->vblock[2] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
    v->vimage[0] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
    v->vblock[0] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
    v->vimage[1] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
    v->vblock[1] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
    if( !v->vblock[1] ) {
        VBX_PRINTF( "ERROR: out of memory.\n" );
        VBX_EXIT(-1);
    }

    vbx_dma_to_vector( v->vcoeff, coeff_v, co_bytes );

    v->db = 0; // start with the first double buffer
    int row;
    for( row = 0; row < BLOCK_SIZE; row++ ) {
        getBigTileImageY(v->vimage[v->db], image, row);
    }

#if USE_ACCUM_FLAGS
    // Create a flag vector: first element 0, next 'BLOCK_SIZE-1' elements non-zero, etc.
    vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
    vbx( SEH, VAND, v->vflags, BLOCK_SIZE-1, 0 );
#endif
    return v;
}
double test_vector_power( vbx_word_t *vector_out, vbx_word_t *vector_in1,
                          vbx_word_t *vector_in2, int N, double scalar_time )
{
    int retval;
    vbx_timestamp_t time_start, time_stop;

    printf("\nExecuting MXP vector software power...");

    vbx_word_t *v_out = vbx_sp_malloc( N*sizeof(vbx_word_t) );
    vbx_word_t *v_in1 = vbx_sp_malloc( N*sizeof(vbx_word_t) );
    vbx_word_t *v_in2 = vbx_sp_malloc( N*sizeof(vbx_word_t) );
    vbx_dma_to_vector( v_in1, vector_in1, N*sizeof(vbx_word_t) );
    vbx_dma_to_vector( v_in2, vector_in2, N*sizeof(vbx_word_t) );

    vbx_timestamp_start();
    time_start = vbx_timestamp();
    retval = vbw_vec_power_word( v_out, v_in1, v_in2, N );
    vbx_sync();
    time_stop = vbx_timestamp();

    vbx_dma_to_host( vector_out, v_out, N*sizeof(vbx_word_t) );
    vbx_sync();

    printf("done. retval:%X\n", retval);
    return vbx_print_vector_time(time_start, time_stop, scalar_time);
}
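A scalar reference for the same kernel, assuming vbw_vec_power_word computes elementwise integer exponentiation out[i] = in1[i] raised to in2[i] (check the wrapper's documentation to confirm); exponentiation by squaring keeps each element O(log exp):

/* Scalar reference for the software power kernel (sketch; the semantics of
 * vbw_vec_power_word are assumed, not confirmed by this file). */
void scalar_vec_power( vbx_word_t *out, vbx_word_t *in1, vbx_word_t *in2, int N )
{
    int i;
    for( i = 0; i < N; i++ ) {
        vbx_word_t base = in1[i], exp = in2[i], r = 1;
        while( exp > 0 ) {          // exponentiation by squaring
            if( exp & 1 ) r *= base;
            base *= base;
            exp >>= 1;
        }
        out[i] = r;
    }
}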
/** VBX Motion Estimation.
 * Similar to the scalar version, but scans vertically because that makes it easier to align vectors.
 * vbw_mtx_motest_byte_setup should be run prior to running this function.
 *
 * @param[out] result
 * @param[in]  x
 * @param[in]  y
 * @param[in]  m
 * @returns negative on error condition. See vbw_exit_codes.h
 */
int vbw_mtx_motest_byte(output_type *result, input_type *x, input_type *y, vbw_motest_t *m)
{
    int j;
    int sub_block_width = m->block_width + m->search_width;

    for( j = 0; j < m->block_height; j++ ) {
        vbx_dma_to_vector( m->v_block+j*sub_block_width, x+j*m->image_width, sub_block_width*sizeof(input_type) );
    }
    for( j = 0; j < m->block_height+m->search_height; j++ ) {
        vbx_dma_to_vector( m->v_img+j*sub_block_width, y+j*m->image_width, sub_block_width*sizeof(input_type) );
    }

    // Column-ize the reference block
    vbx_set_vl( m->block_width );
    vbx_set_2D( m->block_height, m->block_width*sizeof(input_type), sub_block_width*sizeof(input_type), 0 );
    vbx_2D( VVB, VMOV, (vbx_byte_t*)m->v_block, (vbx_byte_t*)m->v_block, 0 );

    // Do column by column
    for( j = 0; j < m->search_width; j++ ) {
        // Column-ize the search image
        vbx_set_vl( m->block_width );
        vbx_set_2D( m->block_height+m->search_height, m->block_width*sizeof(input_type), sub_block_width*sizeof(input_type), 0 );
        vbx_2D( VVBU, VMOV, m->v_img_sub, m->v_img+j, 0 );

        // Search the image columnwise
        vbx_set_vl( m->block_width*m->block_height );
        vbx_set_2D( m->search_height, m->search_width*sizeof(output_type), 0, m->block_width*sizeof(input_type) );
        vbx_acc_2D( VVBWU, VABSDIFF, (vbx_uword_t*)m->v_result+j, m->v_block, m->v_img_sub );
    }

    // Write back the result
    vbx_dma_to_host( result, m->v_result, m->result_size );
    return VBW_SUCCESS;
}
int compare_vbx_lbp_ci_to_scalar_patterns(unsigned short* img, int log, int width, int height, int max_print_errors)
{
    int j, l, cell, max_cell, errors = 0;
    unsigned char** scalar_patterns = test_scalar_patterns(img, log, width, height);
    max_cell = 1 << log;

    vbx_uhalf_t* v_in  = (vbx_uhalf_t*)vbx_sp_malloc((1+2*max_cell)*width*sizeof(vbx_half_t));
    vbx_uhalf_t* v_top = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_half_t));
    vbx_uhalf_t* v_bot = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_half_t));
    vbx_ubyte_t* v_lbp = (vbx_ubyte_t*)v_bot;
    unsigned char* lbp = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));

    vbx_set_vl(width);
    for (l = 0; l < log+1; l++) {
        cell = 1 << l;
        for (j = 0; j < height - 2*cell; j++) {
            vbx_dma_to_vector(v_in, img+j*width, (1+2*cell)*width*sizeof(unsigned short));
            vbx(VVHU,  VCUSTOM1, v_top, v_in, v_in+(1*cell)*width);
            vbx(VVHU,  VCUSTOM1, v_bot, v_in+(1*cell)*width, v_in+(2*cell)*width);
            vbx(SVHBU, VAND, (vbx_ubyte_t*)v_top, 0xf0, v_top);
            vbx(SVHBU, VAND, (vbx_ubyte_t*)v_bot, 0x0f, v_bot);
            vbx(VVBU,  VADD, v_lbp, (vbx_ubyte_t*)v_bot, (vbx_ubyte_t*)v_top);
            vbx_dma_to_host(lbp, v_lbp, width*sizeof(unsigned char));
            vbx_sync();
            errors += match_array_byte(lbp, scalar_patterns[l]+j*width, "custom_lbp",
                                       width-2*cell, 1, 0, max_print_errors, 1, j);
            if (errors > max_print_errors) {
                max_print_errors = 0;
            }
        }
    }
    vbx_sp_free();
    vbx_shared_free(lbp);
    return errors;
}
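The scalar patterns being matched against are classic local binary patterns: each pixel is compared with its 8 neighbours at radius cell, and the comparison bits are packed into one byte. The exact bit ordering produced by test_scalar_patterns and the VCUSTOM ops is hardware-defined; the sketch below uses the conventional ordering only:

/* Sketch of a classic 8-neighbour LBP at radius `cell` (conventional bit
 * order; the custom instructions' actual packing may differ). */
unsigned char scalar_lbp(unsigned short *img, int width, int x, int y, int cell)
{
    int dx[8] = {-1,  0,  1, 1, 1, 0, -1, -1};
    int dy[8] = {-1, -1, -1, 0, 1, 1,  1,  0};
    unsigned short center = img[y*width + x];
    unsigned char pattern = 0;
    int n;
    for (n = 0; n < 8; n++) {
        pattern <<= 1;
        if (img[(y + dy[n]*cell)*width + (x + dx[n]*cell)] >= center) {
            pattern |= 1;  // neighbour >= centre contributes a 1 bit
        }
    }
    return pattern;
}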
// FIXME: stride for match not implemented
int compare_LBPPassStage_to_restricted(unsigned short *vbx_img, int log, lbp_stage_t lbp_stage,
                                       int window, int width, int height, int max_print_errors)
{
    int l, i, j, cell, errors = 0;
    unsigned char** scalar_patterns = test_scalar_patterns(vbx_img, log, width, height);
    unsigned char *pass, *vbx_pass;
    pass     = (unsigned char*)vbx_shared_malloc(width*height*sizeof(unsigned char));
    vbx_pass = (unsigned char*)vbx_shared_malloc(width*height*sizeof(unsigned char));

    vbx_byte_t** v_lbp = (vbx_byte_t**)vbx_shared_malloc((log+1)*sizeof(vbx_byte_t*));
    for (l = 0; l < log+1; l++) {
        v_lbp[l] = (vbx_byte_t*)vbx_sp_malloc((window+1)*width*sizeof(vbx_byte_t));
    }
    vbx_byte_t* v_lut   = (vbx_byte_t*)vbx_sp_malloc(width*sizeof(vbx_byte_t));
    vbx_byte_t* v_stage = (vbx_byte_t*)vbx_sp_malloc(width*sizeof(vbx_byte_t));
    vbx_byte_t* v_pattern;

    lbp_feat_t feat;
    int dx, dy, dw, f;

    for (l = 0; l < log+1; l++) {
        vbx_dma_to_vector(v_lbp[l]+width, scalar_patterns[l], (window)*width*sizeof(unsigned char));
    }
    vbx_sync();

    for (j = 0; j < height-(window+1); j++) {
        for (l = 0; l < log+1; l++) {
            vbx_set_vl(width * window);
            vbx(VVB, VMOV, v_lbp[l], v_lbp[l]+width, NULL);
            vbx_dma_to_vector(v_lbp[l] + window*width, scalar_patterns[l]+(j+window)*width, width*sizeof(unsigned char));
        }

        vbx_set_vl(width-(window+1));
        vbx(SVB, VMOV, v_stage, 0, NULL);
        for (f = 0; f < lbp_stage.count; f++) {
            feat = lbp_stage.feats[f];
            dx = feat.pos.src.x;
            dy = feat.pos.src.y;
            dw = feat.pos.size.x;
            v_pattern = v_lbp[dw>>1] + (dy*width+dx);
            vbx(SVBU, VLBPLUT, v_lut, f, v_pattern);
            vbx(VVB, VADD, v_stage, v_stage, v_lut);
        }
        vbx(SVB, VMOV, v_lut, 0, NULL);
        vbx(SVB, VCMV_GEZ, v_lut, 1, v_stage);
        vbx_dma_to_host(vbx_pass + j*width, v_lut, (width-(window+1))*sizeof(unsigned char));
        vbx_sync();
    }

    unsigned int *iImg, *iiImg;
    iImg  = (unsigned int *)vbx_shared_malloc(width*height*sizeof(unsigned int));
    iiImg = (unsigned int *)vbx_shared_malloc(width*height*sizeof(unsigned int));
    gen_integrals(vbx_img, iImg, iiImg, width, height);

    image_t lbp_img = {iImg, {width, height}};
    for (j = 0; j < height - (window + 1); j++) {
        for (i = 0; i < width - (window + 1); i++) {
            pair_t lbp_p = {i, j};
            pass[j*width+i] = LBPPassStage(lbp_img, lbp_stage, lbp_p);
        }
    }

    /* test pass vs vbx pass */
    for (j = 0; j < height - (window + 1); j++) {
        errors += match_array_byte(vbx_pass + j*width, pass + j*width, "pass stage",
                                   width - (window + 1), 1, 0, max_print_errors, 1, j);
        if (errors > max_print_errors) {
            max_print_errors = 0;
        }
    }
    vbx_sp_free();
    return errors;
}
int vbw_vec_reverse_ext( vbx_mm_t *dst, vbx_mm_t *src, const unsigned int N )
{
    typedef vbx_mm_t vbx_sp_t;
    const int VBW_ROT16 = sizeof(vbx_sp_t) <= sizeof(vbx_half_t);
    const int VBW_ROT8  = sizeof(vbx_sp_t) == sizeof(vbx_byte_t);
    const int VBW_RSHIFT_T_TO_W =
        (sizeof(vbx_sp_t) == sizeof(vbx_word_t) ? 0 :
         sizeof(vbx_sp_t) == sizeof(vbx_half_t) ? 1 : /* byte sized */ 2);
    const int VBW_LSHIFT_W_TO_T = VBW_RSHIFT_T_TO_W;

    // Catch when N is very small
    if( N < 4 ) {
        unsigned int i = 0;
        while( i < N ) {
            dst[N-i-1] = src[i];
            i++;
        }
        return VBW_SUCCESS;
    }

    vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
    unsigned int SP_WIDTH_B = this_mxp->scratchpad_alignment_bytes;
    unsigned int FREE_BYTES = vbx_sp_getfree();

    // Catch when N is small enough that cached scalar does a better job
    if( N <= MM_CACHED_SCALAR_THRESHOLD || FREE_BYTES < SP_WIDTH_B*5 ) {
        unsigned int i;
        vbx_mm_t *A = (vbx_mm_t*)vbx_remap_cached(src, N*sizeof(vbx_mm_t));
        vbx_mm_t *B = (vbx_mm_t*)vbx_remap_cached(dst, N*sizeof(vbx_mm_t));
        for( i = 0; i < N; i++ ) {
            B[N-i-1] = A[i];
        }
        vbx_dcache_flush(B, N*sizeof(vbx_mm_t));
        return VBW_SUCCESS;
    }

    unsigned int NUM_LANES = this_mxp->vector_lanes;
    unsigned int tile_size_b = VBX_PAD_DN( ((FREE_BYTES-SP_WIDTH_B)/2), SP_WIDTH_B );
    unsigned int tile_size_w = tile_size_b/4;
    unsigned int tile_size_t = tile_size_w << VBW_LSHIFT_W_TO_T;
    unsigned int num_tiles = N / tile_size_t;
    unsigned int rows_per_tile = tile_size_b / SP_WIDTH_B;
    unsigned int tile_part_t = N - num_tiles * tile_size_t;
    unsigned int threshold_w = NUM_LANES >= 32 ? VL1_THRESHOLD_V32_UP :
                               NUM_LANES == 16 ? VL1_THRESHOLD_V16 :
                               NUM_LANES ==  8 ? VL1_THRESHOLD_V8 : UINT_MAX;

    if( tile_part_t ) {
        vbx_sp_push();
        vbx_sp_t *v_0 = (vbx_sp_t *)vbx_sp_malloc( tile_part_t*sizeof(vbx_sp_t) );
        vbx_sp_t *v_1 = (vbx_sp_t *)vbx_sp_malloc( tile_part_t*sizeof(vbx_sp_t) );
#if !VBX_SKIP_ALL_CHECKS
        if( !v_0 || !v_1 ) {
            VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
            VBX_EXIT(-1);
        }
#endif
        vbx_dma_to_vector( v_0, src+N-tile_part_t, tile_part_t*sizeof(vbx_mm_t) );
        vbw_vec_reverse( v_1, v_0, tile_part_t );
        vbx_dma_to_host( dst, v_1, tile_part_t*sizeof(vbx_sp_t) );
        dst += tile_part_t;
        vbx_sp_pop();
    }

    if( !num_tiles ) {
        return VBW_SUCCESS;
    }

    vbx_sp_push();
    vbx_word_t *v_mask = (vbx_word_t *)vbx_sp_malloc( SP_WIDTH_B );
    vbx_word_t *v_scratch[2] = { (vbx_word_t *)vbx_sp_malloc( tile_size_b ),
                                 (vbx_word_t *)vbx_sp_malloc( tile_size_b ) };
    vbx_word_t *result;
#if !VBX_SKIP_ALL_CHECKS
    if( !v_scratch[0] || !v_scratch[1] || !v_mask ) {
        VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
        VBX_EXIT(-1);
    }
#endif

    src += (num_tiles - 1) * tile_size_t;

    if( tile_size_w <= threshold_w ) {
        while( num_tiles ) {
            vbx_dma_to_vector( v_scratch[0], src, tile_size_b );
            if( VBW_ROT16 ) {
                vec_rev_rot16_w( v_scratch[1], v_scratch[0], tile_size_w );
            } else {
                vec_rev_w( v_scratch[1], v_scratch[0], tile_size_w );
            }
            if( VBW_ROT8 ) {
                vec_rot8_h( v_scratch[1], v_scratch[1], tile_size_w*2 );
            }
            vbx_dma_to_host( dst, v_scratch[1], tile_size_b );
            dst += tile_size_t;
            src -= tile_size_t;
            num_tiles--;
        }
    } else {
        while( num_tiles ) {
            vbx_dma_to_vector( v_scratch[0], src, tile_size_b );
            result = vec_rev_merge_w( v_scratch[1], v_scratch[0], tile_size_w,
                                      v_scratch[0], v_mask, SP_WIDTH_B, rows_per_tile, VBW_ROT16 );
            if( VBW_ROT8 ) {
                vec_rot8_h( result, result, tile_size_w*2 );
            }
            vbx_dma_to_host( dst, result, tile_size_b );
            dst += tile_size_t;
            src -= tile_size_t;
            num_tiles--;
        }
    }
    vbx_sp_pop();
    return VBW_SUCCESS;
}
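Typical use of the external-memory reverse, with both buffers in DMA-able shared memory (a sketch; the size is arbitrary):

/* Example use of vbw_vec_reverse_ext (sketch). The final tile is written back
 * with DMA, so a vbx_sync() is needed before the host reads dst. */
void reverse_example(void)
{
    const unsigned int N = 1024;  // arbitrary length
    vbx_mm_t *src = vbx_shared_malloc( N*sizeof(vbx_mm_t) );
    vbx_mm_t *dst = vbx_shared_malloc( N*sizeof(vbx_mm_t) );
    unsigned int i;
    for( i = 0; i < N; i++ ) {
        src[i] = (vbx_mm_t)i;
    }
    vbw_vec_reverse_ext( dst, src, N );
    vbx_sync();  // wait for outstanding DMA before reading dst
    // dst[0] == N-1, ..., dst[N-1] == 0
    vbx_shared_free( src );
    vbx_shared_free( dst );
}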
// Vector version of the image blend
void vector_blend( output_pointer img_out, input_pointer img_in1, input_pointer img_in2,
                   unsigned int num_row, unsigned int num_column,
                   intermediate_type blending_const )
{
    intermediate_type *v_img1[2];
    input_type *v_img2[2];
    intermediate_type *v_temp;
    intermediate_type blending_const_bar = 256 - blending_const;
    int j;

    vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
    const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
    const int VBX_WIDTH_BYTES = this_mxp->vector_lanes * sizeof(int);
    const int VBX_DMA_ALIGNMENT = this_mxp->dma_alignment_bytes;

    unsigned int chunk_size = VBX_SCRATCHPAD_SIZE / ((3*sizeof(intermediate_type)) + (2*sizeof(input_type)));
    chunk_size = VBX_PAD_UP( chunk_size-(VBX_WIDTH_BYTES-1), VBX_DMA_ALIGNMENT );
    unsigned int chunk_size_old = chunk_size;
    unsigned int vector_length = chunk_size;
    unsigned int vector_length_old = vector_length;

    v_img1[0] = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) );
    v_img1[1] = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) );
    v_img2[0] = (input_type *)vbx_sp_malloc( chunk_size*sizeof(input_type) );
    v_img2[1] = (input_type *)vbx_sp_malloc( chunk_size*sizeof(input_type) );
    v_temp = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) );
    if( v_temp == NULL ) {
        VBX_EXIT(0xBADDEAD);
    }

    int bufselect = 0;
    vbx_dma_to_vector( v_img1[bufselect], img_in1, chunk_size*sizeof(input_type) );
    vbx_dma_to_vector( v_img2[bufselect], img_in2, chunk_size*sizeof(input_type) );

    for( j = 0; j < num_row*num_column; j += vector_length_old ) {
        vbx_set_vl( vector_length );
        if( j > 0 ) {
            vbx_dma_to_host( img_out+j-vector_length_old, v_img1[1-bufselect], chunk_size_old*sizeof(output_type) );
        }
        if( (j+vector_length_old) < (num_row*num_column-1) ) {
            if( (j+vector_length_old*2) >= num_row*num_column ) {
                // Last chunk may be shorter
                vector_length = num_row*num_column - j - vector_length_old;
                chunk_size = vector_length;
            }
            vbx_dma_to_vector( v_img1[1-bufselect], img_in1+j+vector_length_old, chunk_size*sizeof(input_type) );
            vbx_dma_to_vector( v_img2[1-bufselect], img_in2+j+vector_length_old, chunk_size*sizeof(input_type) );
        }
        vbx( SVBHU, VMULLO, v_temp, blending_const, v_img1[bufselect] );
        vbx( SVBHU, VMULLO, v_img1[bufselect], blending_const_bar, v_img2[bufselect] );
        vbx( VVHU, VADD, v_img1[bufselect], v_img1[bufselect], v_temp );
        vbx( SVHBU, VSHR, v_img1[bufselect], 8, v_img1[bufselect] );
        bufselect = 1-bufselect;
    }
    vbx_dma_to_host( img_out+j-vector_length_old, v_img1[1-bufselect], chunk_size*sizeof(output_type) );

    vbx_sp_free();
    vbx_sync();
}
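In scalar terms the loop body is the standard fixed-point alpha blend, out = (c*in1 + (256-c)*in2) >> 8, with an 8-bit blending constant. A per-pixel sketch (the helper name is ours):

/* Scalar equivalent of the blend above (sketch): classic fixed-point alpha
 * blend with blending_const in [0, 256]. */
void scalar_blend( output_pointer img_out, input_pointer img_in1, input_pointer img_in2,
                   unsigned int num_pixels, intermediate_type blending_const )
{
    intermediate_type blending_const_bar = 256 - blending_const;
    unsigned int i;
    for( i = 0; i < num_pixels; i++ ) {
        img_out[i] = (blending_const*img_in1[i] + blending_const_bar*img_in2[i]) >> 8;
    }
}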
int vbw_mtx_xp_ext( vbx_mm_t *out, vbx_mm_t *in, const int INROWS, const int INCOLS )
{
    typedef vbx_mm_t vbx_sp_t;
    int elements = INROWS * INCOLS;

    if( elements < SCALAR_THRESHOLD ) {
        vbx_sync(); // in case the input is waiting on a DMA transfer
        int i, j;
        for( i = 0; i < INROWS; i++ ) {
            for( j = 0; j < INCOLS; j++ ) {
                out[j*INROWS+i] = in[i*INCOLS+j];
            }
        }
        return VBW_SUCCESS;
    }

    vbx_sp_push();

    vbx_sp_t *v_in;
    vbx_sp_t *v_out;
    int tile_height = 0;
    int tile_width = 0;
    int prev_tile_width = 0;
    int tile_y = 0;
    int tile_x = 0;

    vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
    int SP_WIDTH_B = this_mxp->scratchpad_alignment_bytes;
    int SP_SIZE = vbx_sp_getfree();
    int max_sp_elements = vbx_sp_getfree() / sizeof(vbx_sp_t);
    int max_tile_elements = VBX_PAD_DN( SP_SIZE/2, SP_WIDTH_B ) / sizeof(vbx_sp_t);

    if( INROWS == 1 || INCOLS == 1 ) {
        // 1D transpose becomes a simple copy operation
        if( elements <= max_sp_elements ) {
            // We can use the whole scratchpad for this
            v_in = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
            vbx_dma_to_vector( v_in, in, elements*sizeof(vbx_mm_t) );
            v_out = v_in;
            vbx_dma_to_host( out, v_out, elements*sizeof(vbx_mm_t) );
        } else {
            // To test this, you'll need a very large 1D matrix (or a small scratchpad)
            tile_width = max_sp_elements;
            v_in = (vbx_sp_t*)vbx_sp_malloc( tile_width * sizeof(vbx_sp_t) );
            for( tile_x = 0; tile_x < elements; tile_x += tile_width ) {
                if( tile_x + tile_width > elements ) {
                    tile_width = elements - tile_x;
                }
                vbx_dma_to_vector( v_in, in + tile_x, tile_width*sizeof(vbx_mm_t) );
                v_out = v_in;
                vbx_dma_to_host( out+tile_x, v_out, tile_width*sizeof(vbx_mm_t) );
            }
        }
    } else if( elements < max_tile_elements ) {
        // Matrix is small enough to handle entirely in the scratchpad
        v_in  = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
        v_out = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
        vbx_dma_to_vector( v_in, in, elements*sizeof(vbx_mm_t) );
        vbw_mtx_xp( v_out, v_in, INROWS, INCOLS );
        vbx_dma_to_host( out, v_out, elements*sizeof(vbx_mm_t) );
    } else {
        // At this point we know at least one full tile will be needed
#define QUICK_A_LANES_THRESHOLD 8 // Use merge transpose if there are at least this many lanes
#define QUICK_A_TILE_WIDTH 128
#define QUICK_A_TILE_ELEMENTS (QUICK_A_TILE_WIDTH*QUICK_A_TILE_WIDTH)
#define QUICK_A_VF_ELEMENTS (QUICK_A_TILE_ELEMENTS/2)
#define QUICK_A_REQ_ELEMENTS (2*VBX_PAD_UP(QUICK_A_TILE_ELEMENTS,SP_WIDTH_B/sizeof(vbx_sp_t)) + VBX_PAD_UP(QUICK_A_VF_ELEMENTS,sizeof(vbx_sp_t)))
#define QUICK_B_LANES_THRESHOLD 16 // Use the smaller merge transpose tile only if there are a lot of lanes
#define QUICK_B_TILE_WIDTH 64      // and only if the larger tile A size cannot be used.
#define QUICK_B_TILE_ELEMENTS (QUICK_B_TILE_WIDTH*QUICK_B_TILE_WIDTH)
#define QUICK_B_VF_ELEMENTS (QUICK_B_TILE_ELEMENTS/2)
#define QUICK_B_REQ_ELEMENTS (2*VBX_PAD_UP(QUICK_B_TILE_ELEMENTS,SP_WIDTH_B/sizeof(vbx_sp_t)) + VBX_PAD_UP(QUICK_B_VF_ELEMENTS,sizeof(vbx_sp_t)))

        int NUM_LANES = this_mxp->vector_lanes;
        int DMA_BYTES = this_mxp->dma_alignment_bytes;
        int min_tile_dim = DMA_BYTES / sizeof(vbx_sp_t);
        vbx_sp_t *v_out_sel;
        vbx_sp_t *vf = 0;

        if( NUM_LANES >= QUICK_A_LANES_THRESHOLD // Check for appropriate conditions to use merge transpose tiles
            && INCOLS >= QUICK_A_TILE_WIDTH
            && INROWS >= QUICK_A_TILE_WIDTH
            && (unsigned)max_sp_elements >= QUICK_A_REQ_ELEMENTS ) {
            tile_width = tile_height = QUICK_A_TILE_WIDTH;
            vf = (vbx_sp_t *)vbx_sp_malloc( QUICK_A_VF_ELEMENTS * sizeof(vbx_sp_t) );
        } else if( NUM_LANES >= QUICK_B_LANES_THRESHOLD
                   && INCOLS >= QUICK_B_TILE_WIDTH
                   && INROWS >= QUICK_B_TILE_WIDTH
                   && (unsigned)max_sp_elements >= QUICK_B_REQ_ELEMENTS ) {
            tile_width = tile_height = QUICK_B_TILE_WIDTH;
            vf = (vbx_sp_t *)vbx_sp_malloc( QUICK_B_VF_ELEMENTS * sizeof(vbx_sp_t) );
        } else {
            findTileSize( &tile_height, &tile_width, INROWS, INCOLS, max_tile_elements, min_tile_dim );
        }
        prev_tile_width = tile_width;

        v_in  = (vbx_sp_t*)vbx_sp_malloc( tile_height*tile_width * sizeof(vbx_sp_t) );
        v_out = (vbx_sp_t*)vbx_sp_malloc( tile_height*tile_width * sizeof(vbx_sp_t) );
        if( v_out == NULL ) {
            vbx_sp_pop();
            return VBW_ERROR_SP_ALLOC_FAILED;
        }
        vbx_sp_t *v[2] = { v_in, v_out };

        tile_y = 0; // Reset y position for a new column
        while( tile_y < INROWS ) {
            vbx_set_2D( tile_width, tile_height*sizeof(vbx_sp_t), sizeof(vbx_sp_t), sizeof(vbx_sp_t) );
            vbx_set_3D( tile_height, sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t) );
            tile_x = 0; // Reset x position for a new row
            while( tile_x < INCOLS ) {
                vbx_dma_to_vector_2D( v_in, in+(tile_y*INCOLS)+tile_x,
                                      tile_width*sizeof(vbx_mm_t), tile_height,
                                      tile_width*sizeof(vbx_sp_t), INCOLS*sizeof(vbx_mm_t) );
                v_out_sel = v_out; // Select v_out as the default vector to DMA to main memory

                /* *** merge transpose (matrix must be square and a power of 2 wide) *** */
                if( vf && tile_width == tile_height
                    && (tile_width == QUICK_A_TILE_WIDTH || tile_width == QUICK_B_TILE_WIDTH) ) {
                    int src = 0;
                    int n;
                    for( n = 1; n < tile_width; n *= 2 ) { // Can't do the 1st iteration until the entire tile is DMA'd in
                        const int nn = 2*n;
                        // Copy the destination matrix
                        vbx_set_vl( tile_width*tile_width );
                        // Use v_in & v_out as working matrices (clobber v_in)
                        vbxx( VMOV, v[!src], v[src] );
                        // Do the work
                        vbx_set_vl( n*tile_width );
                        vbxx( VAND, vf, n, (vbx_enum_t*)0 ); // Mask for merging: 0101010... then 00110011...
                        vbx_set_2D( tile_width/nn, nn*tile_width*sizeof(vbx_sp_t), nn*tile_width*sizeof(vbx_sp_t), 0 );
                        vbxx_2D( VCMV_Z, v[!src]+n*tile_width, v[src]+n, vf );
                        vbxx_2D( VCMV_Z, v[!src]+n, v[src]+n*tile_width, vf );
                        src = !src;
                    }
                    v_out_sel = v[src]; // Depending on the size of the matrix, the final result may be in v_in or v_out
                } else {
                    vbx_set_vl( 1 ); // 2D and 3D will be set by the x and y edge conditions, even using merge
                    vbxx_3D( VMOV, v_out, v_in );
                }

                vbx_dma_to_host_2D( out+(tile_x*INROWS)+tile_y, v_out_sel,
                                    tile_height*sizeof(vbx_mm_t), tile_width,
                                    INROWS*sizeof(vbx_mm_t), tile_height*sizeof(vbx_sp_t) );

                tile_x += tile_width;
                // Set up the width for the next tile
                if( tile_x + tile_width > INCOLS ) {
                    // Temporarily reduce the tile width when reaching the right edge of the matrix
                    tile_width = INCOLS - tile_x;
                    vbx_set_2D( tile_width, tile_height*sizeof(vbx_sp_t), sizeof(vbx_sp_t), sizeof(vbx_sp_t) );
                    vbx_set_3D( tile_height, sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t) );
                }
            }
            tile_y += tile_height;
            // Set up the width and height for the next row of tiles
            tile_width = prev_tile_width; // Restore the original tile width for the next row of tiles
            /* *** Permanently reduce the tile height when reaching the bottom of the matrix *** */
            tile_height = ( tile_y + tile_height > INROWS ) ? INROWS - tile_y : tile_height;
        }
    }
    vbx_sp_pop();
    vbx_sync();
    return VBW_SUCCESS;
}
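Typical use of the transpose (a sketch; the dimensions are arbitrary):

/* Example use of vbw_mtx_xp_ext (sketch). The function ends with vbx_sync(),
 * so the output is complete when it returns. */
void transpose_example( vbx_mm_t *out, vbx_mm_t *in )
{
    const int ROWS = 64, COLS = 48;  // arbitrary dimensions
    int rc = vbw_mtx_xp_ext( out, in, ROWS, COLS );
    if( rc != VBW_SUCCESS ) {
        printf( "transpose failed: %d\n", rc );
        return;
    }
    // out[c*ROWS + r] == in[r*COLS + c] for all r, c
}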
int vector_motest(pixel *input_buffer, luma_type **last_luma, int *motest_x, int *motest_y,
                  int start_x, int start_y, int reset,
                  const int image_width, const int image_height, const int image_pitch)
{
    int y, x, starty, startx;
    unsigned int sad, sad_min, y_min, x_min;
    vbx_uhalf_t *v_search_luma, *v_last_luma;
    vbx_uhalf_t *v_row_temp;
    vbx_uword_t *v_row;
    vbx_uword_t *v_sad;
    pixel color;

    if( *last_luma == NULL || reset ) {
        init_vector_motest(input_buffer, last_luma, motest_x, motest_y, start_x, start_y, image_pitch);
    }

    v_search_luma = vbx_sp_malloc( MOTEST_BUFFER_SIZE * sizeof(vbx_uhalf_t) );
    v_last_luma   = vbx_sp_malloc( MOTEST_BLOCK_SIZE * sizeof(vbx_uhalf_t) );
    v_row_temp    = vbx_sp_malloc( MOTEST_BUFFER_WIDTH * sizeof(vbx_uhalf_t) );
    v_row         = vbx_sp_malloc( MOTEST_BUFFER_WIDTH * sizeof(vbx_uword_t) );
    v_sad         = vbx_sp_malloc( MOTEST_SEARCH_SIZE * sizeof(vbx_uword_t) );
    if( v_sad == NULL ) {
        printf("Not enough scratchpad for motest\n");
        while(1);
    }

    startx = *motest_x - (MOTEST_SEARCH_WIDTH/2);
    starty = *motest_y - (MOTEST_SEARCH_HEIGHT/2);
    if( startx < 0 ) {
        startx = 0;
    }
    if( startx > image_width-MOTEST_BUFFER_WIDTH ) {
        startx = image_width-MOTEST_BUFFER_WIDTH;
    }
    if( starty < 0 ) {
        starty = 0;
    }
    if( starty > image_height-MOTEST_BUFFER_HEIGHT ) {
        starty = image_height-MOTEST_BUFFER_HEIGHT;
    }

    vector_rectangle_to_luma(input_buffer, v_search_luma, v_row_temp, v_row,
                             startx, starty, MOTEST_BUFFER_WIDTH, MOTEST_BUFFER_HEIGHT, image_pitch);
    vbx_dma_to_vector(v_last_luma, *last_luma, MOTEST_BLOCK_SIZE*sizeof(vbx_uhalf_t));

    // Vector compute of the SADs
    vbx_set_2D(MOTEST_BLOCK_HEIGHT, sizeof(vbx_uword_t),
               MOTEST_BUFFER_WIDTH*sizeof(vbx_uhalf_t), MOTEST_BLOCK_WIDTH*sizeof(vbx_uhalf_t));
    for( y = 0; y < MOTEST_SEARCH_HEIGHT; y++ ) {
        for( x = 0; x < MOTEST_SEARCH_WIDTH; x++ ) {
            vbx_set_vl(MOTEST_BLOCK_WIDTH);
            vbx_acc_2D(VVHWU, VABSDIFF, v_row, v_search_luma+(y*MOTEST_BUFFER_WIDTH)+x, v_last_luma);
            vbx_set_vl(MOTEST_BLOCK_HEIGHT/2);
            vbx_acc(VVWU, VADD, v_sad+(y*MOTEST_SEARCH_WIDTH)+x, v_row, v_row+MOTEST_BLOCK_HEIGHT/2);
        }
#if TOUCHSCREEN
#ifdef TOUCH_INTERRUPTS_VBX
        if (touchscreen_get_pen(pTouch)) {
            vbx_sp_free();
            return -1;
        }
#endif
#endif
    }
    vbx_sync();

    sad_min = INT_MAX;
    y_min = *motest_y;
    x_min = *motest_x;
    for( y = 0; y < MOTEST_SEARCH_HEIGHT; y++ ) {
        for( x = 0; x < MOTEST_SEARCH_WIDTH; x++ ) {
            sad = v_sad[y*MOTEST_SEARCH_WIDTH+x];
            if( sad < sad_min ) {
                sad_min = sad;
                x_min = x+startx;
                y_min = y+starty;
            } else if( sad == sad_min ) {
                // On a tie, prefer the candidate closest to the centre of the search window
                if( (abs(x - MOTEST_SEARCH_WIDTH/2) + abs(y - MOTEST_SEARCH_HEIGHT/2)) <
                    (abs((x_min-startx) - MOTEST_SEARCH_WIDTH/2) + abs((y_min-starty) - MOTEST_SEARCH_HEIGHT/2)) ) {
                    x_min = x+startx;
                    y_min = y+starty;
                }
            }
        }
    }

    color.r = 0;
    color.g = 255;
    color.b = 0;
    color.a = 0;
    scalar_draw_line(*motest_x+(MOTEST_BLOCK_WIDTH/2), *motest_y+(MOTEST_BLOCK_HEIGHT/2),
                     x_min+(MOTEST_BLOCK_WIDTH/2), y_min+(MOTEST_BLOCK_HEIGHT/2),
                     color, input_buffer, image_pitch);
    *motest_y = y_min;
    *motest_x = x_min;

    vbx_set_vl(MOTEST_BLOCK_WIDTH);
    for( y = 0; y < MOTEST_BLOCK_HEIGHT; y++ ) {
        vbx(VVHU, VMOV, v_last_luma+(y*MOTEST_BLOCK_WIDTH),
            v_search_luma+((y+y_min-starty)*MOTEST_BUFFER_WIDTH)+(x_min-startx), 0);
    }
    vbx_dma_to_host(*last_luma, v_last_luma, MOTEST_BLOCK_SIZE*sizeof(luma_type));

    draw_motest(input_buffer, *motest_x, *motest_y, image_pitch);
    // Simple hack to draw thicker
    draw_motest(input_buffer, *motest_x+1, *motest_y+1, image_pitch);

    vbx_sp_free();
    return 0;
}
int test_lbp_ci(unsigned short* img, int width, int height)
{
    vbx_uhalf_t* v_a1 = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_b1 = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_1h = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_a2 = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_b2 = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_2h = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_a4 = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_b4 = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_4h = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_ubyte_t* v_1b = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t));
    vbx_ubyte_t* v_2b = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t));
    vbx_ubyte_t* v_4b = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t));

    unsigned short* lbp1h = (unsigned short*)vbx_shared_malloc(width*sizeof(unsigned short));
    unsigned short* lbp2h = (unsigned short*)vbx_shared_malloc(width*sizeof(unsigned short));
    unsigned short* lbp4h = (unsigned short*)vbx_shared_malloc(width*sizeof(unsigned short));
    unsigned char* lbp1b = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));
    unsigned char* lbp2b = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));
    unsigned char* lbp4b = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));

    img = img + width;
    vbx_dma_to_vector(v_a1, img, width*sizeof(unsigned short));
    vbx_dma_to_vector(v_b1, img + width, width*sizeof(unsigned short));
    vbx_dma_to_vector(v_a2, img, width*sizeof(unsigned short));
    vbx_dma_to_vector(v_b2, img + width, width*sizeof(unsigned short));
    vbx_dma_to_vector(v_a4, img, width*sizeof(unsigned short));
    vbx_dma_to_vector(v_b4, img + width, width*sizeof(unsigned short));
    vbx_sync();

    int i;
    int m = 48;
    for (i = 0; i < m; i++) {
        v_a1[i] = 0;
        v_b1[i] = 0;
        v_a2[i] = 0;
        v_b2[i] = 0;
        v_a4[i] = 0;
        v_b4[i] = 0;
    }

    int n = 12;
    int src_a1[] = {0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    int src_b1[] = {0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    int src_a2[] = {0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    int src_b2[] = {0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    int src_a4[] = {0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0};
    int src_b4[] = {0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0};
    for (i = 0; i < 16; i++) {
        v_a1[i] = src_a1[i];
        v_b1[i] = src_b1[i];
        v_a2[i] = src_a2[i];
        v_b2[i] = src_b2[i];
        v_a4[i] = src_a4[i];
        v_b4[i] = src_b4[i];
    }

    vbx_set_vl(width);
    vbx(VVHU, VCUSTOM1, v_1h, v_a1, v_b1);
    vbx(VVHU, VCUSTOM2, v_2h, v_a2, v_b2);
    vbx(VVHU, VCUSTOM3, v_4h, v_a4, v_b4);
    vbx(VVHB, VADD, v_1b, v_1h, ((vbx_byte_t*)v_1h) + 1);
    vbx(VVHB, VADD, v_2b, v_2h, ((vbx_byte_t*)v_2h) + 1);
    vbx(VVHB, VADD, v_4b, v_4h, ((vbx_byte_t*)v_4h) + 1);
    vbx_dma_to_host(lbp1h, v_1h, width*sizeof(unsigned short));
    vbx_dma_to_host(lbp2h, v_2h, width*sizeof(unsigned short));
    vbx_dma_to_host(lbp4h, v_4h, width*sizeof(unsigned short));
    vbx_dma_to_host(lbp1b, v_1b, width*sizeof(unsigned char));
    vbx_dma_to_host(lbp2b, v_2b, width*sizeof(unsigned char));
    vbx_dma_to_host(lbp4b, v_4b, width*sizeof(unsigned char));
    vbx_sync();

    test_print_array_half(v_a1, n);
    test_print_array_half(v_b1, n);
    test_print_hex_array_half(lbp1h, n);
    test_print_hex_array_byte(lbp1b, n);

    test_print_array_half(v_a2, n);
    test_print_array_half(v_b2, n);
    test_print_hex_array_half(lbp2h, n);
    test_print_hex_array_byte(lbp2b, n);
    test_print_array_half(v_a4, n);
    test_print_array_half(v_b4, n);
    test_print_hex_array_half(lbp4h, n);
    test_print_hex_array_byte(lbp4b, n);

    vbx_sp_free();
    vbx_shared_free(lbp1h);
    vbx_shared_free(lbp2h);
    vbx_shared_free(lbp4h);
    vbx_shared_free(lbp1b);
    vbx_shared_free(lbp2b);
    vbx_shared_free(lbp4b);
    return 0;
}
/* takes in precomputed bfly */
static int vector_fix_fft_dif_long_fly(short fr[], short fi[], short fr2[], short fi2[],
                                       short tw_r[], short tw_i[],
                                       short m, short inverse, short real)
{
    int i, j, l, k, scale, shift, a1, a2, bfly, mul, flight, swap, row_num;
    short wr, wi;
    vptr_half v_fr, v_fi, v_fr2, v_fi2, v_tmp;
    vptr_half v_twr, v_twi;
    vptr_half v_arp, v_aip, v_brp, v_bip, v_crp, v_cip;
    vptr_half v_temp;
    vptr_half v_twr2, v_twi2;

    const int n = 1 << m;
    const int half = n >> 1;

    scale = 0;
    mul = 0;
    swap = m >> 1;
    l = m-1;
    flight = 1;
    bfly = half;

    const int INROWS = 1 << swap;
    const int INCOLS = 1 << (m-swap);

    if( !(m%2) ) {
        swap--;
    }

    // Allocate space in vector memory for the vectors
    v_fr   = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
    v_fi   = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
    v_fr2  = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
    v_fi2  = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
    v_twr  = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
    v_twi  = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
    v_temp = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
    if( v_fr == NULL || v_fi == NULL || v_fr2 == NULL || v_fi2 == NULL ||
        v_twr == NULL || v_twi == NULL || v_temp == NULL ) {
        VBX_EXIT(-1);
    }

    v_twr2 = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
    v_twi2 = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
    if( v_twr2 == NULL || v_twi2 == NULL ) {
        VBX_EXIT(-1);
    }

    vbx_dma_to_vector( v_fr, fr, n*sizeof(vbx_half_t) );
    vbx_dma_to_vector( v_fi, fi, n*sizeof(vbx_half_t) );
    vbx_dma_to_vector( v_twr, tw_r, half*sizeof(vbx_half_t) );
    vbx_dma_to_vector( v_twi, tw_i, half*sizeof(vbx_half_t) );

#if 1
    if( real ) {
        vector_fix_fft_untangle_real_scratch( v_fr, v_fi, v_fr2, v_fi2, v_twr, v_twi, m, inverse );
    }
#endif

    while (l > swap) {
        if (inverse) {
            // Variable scaling, depending upon data
            shift = 0;
            if( isAbsOutOfRangeV(v_fr, v_fi, v_temp, n) ) {
                shift = 1;
                scale++;
            }
        } else {
            // Fixed scaling, for proper normalization --
            // an overall factor of 1/n, distributed to maximize arithmetic accuracy
            shift = 1;
        }

        // The shift will be performed on each data point exactly once during the pass
        SWAP( v_fr, v_fr2, v_tmp );
        SWAP( v_fi, v_fi2, v_tmp );
        if (shift) {
            vbx_set_vl( n );
            vbx(SVH, VSHR, v_fr2, 1, v_fr2 );
            vbx(SVH, VSHR, v_fi2, 1, v_fi2 );
        }

        vbx_set_vl( 1<<l );
        vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<(l+1)), sizeof(vbx_half_t)*(1<<(l+1)) );
        vbx_2D( VVH, VADD, v_fr, v_fr2, v_fr2 + (1<<l) );
        vbx_2D( VVH, VADD, v_fi, v_fi2, v_fi2 + (1<<l) );
        vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<(l+1)), sizeof(vbx_half_t)*(1<<(l+1)), sizeof(vbx_half_t)*(1<<(l+1)) );
        vbx_2D( VVH, VSUB, v_fr2, v_fr2, v_fr2 + (1<<l) );
        vbx_2D( VVH, VSUB, v_fi2, v_fi2, v_fi2 + (1<<l) );

        vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<(l+1)), 0 );
        vbx_2D( VVH, VMULFXP, &v_fr[n>>1], v_fr2, v_twr );
        vbx_2D( VVH, VMULFXP, v_temp, v_fi2, v_twi );
        vbx_set_vl( n>>1 );
        // vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l) );
        vbx( VVH, VSUB, &v_fr[n>>1], &v_fr[n>>1], v_temp );

        vbx_set_vl( 1<<l );
        vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<(l+1)), 0 );
        vbx_2D( VVH, VMULFXP, &v_fi[n>>1], v_fi2, v_twr );
        vbx_2D( VVH, VMULFXP, v_temp, v_fr2, v_twi );
        vbx_set_vl( n>>1 );
        // vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l) );
        vbx( VVH, VADD, &v_fi[n>>1], &v_fi[n>>1], v_temp );

        l--;
        mul++;
        flight <<= 1;
        if( l > swap ) {
            vbx_set_vl( 1<<l );
            vbx( VVWH, VMOV, v_twr, v_twr, 0 );
            vbx( VVWH, VMOV, v_twi, v_twi, 0 );
        }
    }
    if( !(m%2) ) {
        l++;
        flight >>= 1;
    }
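A scalar sketch of the radix-2 decimation-in-frequency butterfly that each pass above applies across a whole flight at once; the Q15 fixed-point twiddle format is an assumption (VMULFXP's fraction width is configuration-dependent):

/* One radix-2 DIF butterfly (sketch):
 *   a' = a + b
 *   b' = (a - b) * (wr + i*wi)
 * mirroring the vector pass, both inputs are pre-shifted for scaling. */
static void dif_butterfly(short *ar, short *ai, short *br, short *bi,
                          short wr, short wi, int shift)
{
    short xr = *ar >> shift, xi = *ai >> shift;
    short yr = *br >> shift, yi = *bi >> shift;
    short dr = xr - yr, di = xi - yi;
    *ar = xr + yr;                                   // sum half
    *ai = xi + yi;
    *br = (short)(((int)dr*wr - (int)di*wi) >> 15);  // difference * twiddle,
    *bi = (short)(((int)di*wr + (int)dr*wi) >> 15);  // Q15 twiddles assumed
}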
int main(void)
{
    vbx_test_init();

    vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
    const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
    const int required_vectors = 4;
    int N = VBX_PAD_DN( VBX_SCRATCHPAD_SIZE / sizeof(vbx_mm_t) / required_vectors,
                        this_mxp->scratchpad_alignment_bytes );
    int PRINT_LENGTH = min( N, MAX_PRINT_LENGTH );

    double scalar_time, vector_time;
    int errors = 0;

    vbx_mxp_print_params();
    printf( "\nVector copy test...\n" );
    printf( "Vector length: %d\n", N );

    vbx_mm_t *scalar_in  = malloc( N*sizeof(vbx_mm_t) );
    vbx_mm_t *scalar_out = malloc( N*sizeof(vbx_mm_t) );
    vbx_mm_t *vector_in  = vbx_shared_malloc( N*sizeof(vbx_mm_t) );
    vbx_mm_t *vector_out = vbx_shared_malloc( N*sizeof(vbx_mm_t) );
    vbx_sp_t *v_out = vbx_sp_malloc( N*sizeof(vbx_sp_t) );
    vbx_sp_t *v_in  = vbx_sp_malloc( N*sizeof(vbx_sp_t) );

    VBX_T(test_zero_array)( scalar_in, N );
    VBX_T(test_zero_array)( vector_in, N );
    VBX_T(test_init_array)( scalar_in, N, 1 );
    VBX_T(test_copy_array)( vector_in, scalar_in, N );

    scalar_time = test_scalar( scalar_out, scalar_in, N );
    VBX_T(test_print_array)( scalar_out, PRINT_LENGTH );

    vbx_dma_to_vector( v_in, vector_in, N*sizeof(vbx_sp_t) );
    vector_time = test_vector( v_out, v_in, N, scalar_time );
    vbx_dma_to_host( vector_out, v_out, N*sizeof(vbx_sp_t) );
    vbx_sync();
    VBX_T(test_print_array)( vector_out, PRINT_LENGTH );

    errors += VBX_T(test_verify_array)( scalar_out, vector_out, N );
    vbx_sp_free();

#if TEST_DEEP_SP
    errors += deep_vector_copy_test();
#endif
#if DEBUG_MAKE_SP_FULL
    vbx_sp_malloc( vbx_sp_getfree() );
#endif
#if TEST_DEEP_MM
    errors += deep_vector_copy_ext_test();
#endif

    VBX_TEST_END(errors);
    return 0;
}