vbx_void_t *vbx_sp_malloc_debug( int LINE,const char *FNAME, size_t num_bytes ) { // print pretty error messages vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); if( !this_mxp || !this_mxp->init ) { VBX_PRINTF( "ERROR: failed to call _vbx_init().\n" ); VBX_FATAL(LINE,FNAME,-1); } // pad to scratchpad width to reduce occurrence of false hazards size_t padded = VBX_PAD_UP( num_bytes, this_mxp->scratchpad_alignment_bytes ); size_t freesp = (size_t)(this_mxp->scratchpad_end - this_mxp->sp); //VBX_SCRATCHPAD_END - (size_t)vbx_sp; // vbx_sp_getfree(); vbx_void_t *result = NULL; if( VBX_DEBUG_LEVEL && (num_bytes==0) ) { print_sp_malloc_null(); } else if( VBX_DEBUG_LEVEL && freesp < padded ) { print_sp_malloc_full( num_bytes, padded ); } else if( num_bytes > 0 && freesp >= padded ) { result = this_mxp->sp; this_mxp->sp += padded; #if VBX_DEBUG_SP_MALLOC printf("sp_malloc %d bytes padded to %d, sp=0x%08x\n", num_bytes, padded, this_mxp->sp); #endif } if( !result ) { VBX_FATAL(LINE,FNAME,-1); } return result; }
int main(void) { vbx_test_init(); vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size; const int required_vectors = 4; int N = VBX_SCRATCHPAD_SIZE / sizeof(vbx_mm_t) / required_vectors; int PRINT_LENGTH = min( N, MAX_PRINT_LENGTH ); double scalar_time, vector_time; int errors=0; vbx_mxp_print_params(); printf( "\nAdd test...\n" ); printf( "Vector length: %d\n", N ); vbx_mm_t *scalar_in1 = malloc( N*sizeof(vbx_mm_t) ); vbx_mm_t *scalar_in2 = malloc( N*sizeof(vbx_mm_t) ); vbx_mm_t *scalar_out = malloc( N*sizeof(vbx_mm_t) ); vbx_mm_t *vector_in1 = vbx_shared_malloc( N*sizeof(vbx_mm_t) ); vbx_mm_t *vector_in2 = vbx_shared_malloc( N*sizeof(vbx_mm_t) ); vbx_mm_t *vector_out = vbx_shared_malloc( N*sizeof(vbx_mm_t) ); // vbx_mm_t *vector_out = vector_in2 - 5; vbx_sp_t *v_in1 = vbx_sp_malloc( N*sizeof(vbx_sp_t) ); vbx_sp_t *v_in2 = vbx_sp_malloc( N*sizeof(vbx_sp_t) ); vbx_sp_t *v_out = vbx_sp_malloc( N*sizeof(vbx_sp_t) ); // vbx_sp_t *v_out = v_in2-5; VBX_T(test_zero_array)( scalar_out, N ); VBX_T(test_zero_array)( vector_out, N ); VBX_T(test_init_array)( scalar_in1, N, 1 ); VBX_T(test_copy_array)( vector_in1, scalar_in1, N ); VBX_T(test_init_array)( scalar_in2, N, 1 ); VBX_T(test_copy_array)( vector_in2, scalar_in2, N ); VBX_T(test_print_array)( scalar_in1, PRINT_LENGTH ); VBX_T(test_print_array)( scalar_in2, PRINT_LENGTH ); scalar_time = test_scalar( scalar_out, scalar_in1, scalar_in2, N ); VBX_T(test_print_array)( scalar_out, PRINT_LENGTH); vbx_dma_to_vector( v_in1, (void *)vector_in1, N*sizeof(vbx_sp_t) ); vbx_dma_to_vector( v_in2, (void *)vector_in1, N*sizeof(vbx_sp_t) ); vector_time = test_vector( v_out, v_in1, v_in2, N, scalar_time ); vbx_dma_to_host( (void *)vector_out, v_out, N*sizeof(vbx_sp_t) ); vbx_sync(); VBX_T(test_print_array)( vector_out, PRINT_LENGTH ); errors += VBX_T(test_verify_array)( scalar_out, vector_out, N ); VBX_TEST_END(errors); return 0; }
int main(void) { vbx_test_init(); vbx_mxp_print_params(); int errors=0; unsigned instr_cycles,instr_count, dma_cycles,dma_count; vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); int lanes= this_mxp->vector_lanes; int dma_width=this_mxp->dma_alignment_bytes /4; debug(lanes); debug(dma_width); vbx_set_vl(-1); VBX_COUNTER_RESET(); vbx(SVW,VMOV,0,0,0); vbx_sync(); if(VBX_SIMULATOR) printf("simulator\n"); else printf("not simulator\n"); instr_cycles=VBX_GET_WRITEBACK_CYCLES(); dma_cycles=VBX_GET_DMA_CYCLES(); dma_count=VBX_GET_DMAS(); instr_count=VBX_GET_INSTRUCTIONS(); debug(instr_cycles); debug(dma_cycles); debug(dma_count); debug(instr_count ); VBX_TEST_END(errors); return 0; }
vbx_void_t *vbx_sp_malloc_nodebug( size_t num_bytes ) { if( VBX_DEBUG_LEVEL && 0 ) { // print pretty error messages return vbx_sp_malloc_debug( __LINE__, __FILE__, num_bytes ); } // do it, but do not print pretty error messages vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); // check for valid argument values if( !this_mxp || num_bytes==0 ) return NULL; // add padding and allocate // pad to scratchpad width to reduce occurrence of false hazards size_t padded = VBX_PAD_UP( num_bytes, this_mxp->scratchpad_alignment_bytes ); vbx_void_t *old_sp = this_mxp->sp; this_mxp->sp += padded; // scratchpad full if( this_mxp->sp > this_mxp->scratchpad_end ) { this_mxp->sp = old_sp; return NULL; } // success return old_sp; }
int dma_bandwidth_test() { const int num_iter = 64; vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); int scratchpad_size = this_mxp->scratchpad_size; uint8_t *buf = vbx_shared_malloc(scratchpad_size); vbx_ubyte_t *v_buf = vbx_sp_malloc(scratchpad_size); vbx_timestamp_t time_start, time_stop; int i; int len; int to_host; int errors = 0; vbx_mxp_print_params(); // dma_alignment_bytes gives DMA master data bus width in bytes. double bytes_per_sec = \ (((double) this_mxp->core_freq) * this_mxp->dma_alignment_bytes); double max_megabytes_per_sec = bytes_per_sec/(1024*1024); printf("\nMax available bandwidth = %s Megabytes/s\n", vbx_eng(max_megabytes_per_sec, 4)); printf("\n"); for (to_host = 0; to_host < 2; to_host++) { for (len = 32; len <= scratchpad_size ; len *= 2) { printf("DMA %s, %d bytes\n", to_host ? "write" : "read", len); vbx_timestamp_start(); if (to_host) { time_start = vbx_timestamp(); for (i = 0; i < num_iter; i++) { vbx_dma_to_host(buf, v_buf, len); } vbx_sync(); time_stop = vbx_timestamp(); } else { time_start = vbx_timestamp(); for (i = 0; i < num_iter; i++) { vbx_dma_to_vector(v_buf, buf, len); } vbx_sync(); time_stop = vbx_timestamp(); } print_dma_bandwidth(time_start, time_stop, len, num_iter, max_megabytes_per_sec); printf("\n"); } printf("\n"); } vbx_shared_free(buf); vbx_sp_free(); return errors; }
/*
 * Release the whole scratchpad: rewinds sp to scratchpad_addr and empties
 * the push/pop stack. Silently does nothing when there is no MXP handle.
 */
void vbx_sp_free_nodebug()
{
	vbx_mxp_t *mxp = VBX_GET_THIS_MXP();
	if( !mxp ) {
		return;
	}

	mxp->spstack_top = 0;
	mxp->sp = mxp->scratchpad_addr;
}
// -------------------------------------------------------- // Scratchpad manipulation routines int vbx_sp_getused() { vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); int used = 0; if( this_mxp ) used = (int)(this_mxp->sp - this_mxp->scratchpad_addr); return used; }
/*
 * Number of scratchpad bytes still available, i.e. the distance from sp
 * to scratchpad_end. Returns 0 when there is no MXP handle.
 */
int vbx_sp_getfree()
{
	vbx_mxp_t *mxp = VBX_GET_THIS_MXP();
	return mxp ? (int)(mxp->scratchpad_end - mxp->sp) : 0;
}
int deep_vector_copy_ext_test() { vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); int retval; int num_test; int total_errors = 0; const int NUM_TESTS = TEST_DEEP_MM_NUM_TESTS; int NB = this_mxp->scratchpad_size * 10; int NT = NB / sizeof(vbx_mm_t); vbx_mm_t *v = vbx_shared_malloc( NB ); srand( 0x1a84c92a ); int i; for( num_test=0; num_test < NUM_TESTS ; num_test++ ) { // initialize the whole working space for( i=0; i<NT; i++ ) { v[i] = i & MSK; } // choose random src/dest/length: // -- randomly pick the dest // -- set a window size of 2*K around the dest // -- randomly pick the src within the window // -- randomly pick the length, subject to end-of-scratchpad // -- this 'window' rule increases probability of overlaps // -- rough distribution: 30% short (pipeline) overlaps, 20% long overlaps, 50% no overlap int K, N1, N2, NN; N1 = rand() % NT; K = 1 + rand() % ((N1 > 0)? min(min(N1, NT-N1), 1024): min(NT, 1024)); N2 = N1 - K + rand() % (2*K); NN = rand() % (NT - max(N1,N2)); vbx_mm_t *dst = v + N1; vbx_mm_t *src = v + N2; printf("test:%d src:0x%08x dst:0x%08x len:%08d", num_test, N1, N2, NN ); // do the copy retval = VBX_T(vbw_vec_copy_ext)( dst, src, NN ); vbx_sync(); printf(" retval:0x%04x\n",retval); // ensure the copy was done properly int errors = verify_copy(v, 0, N1, 0, "head") + verify_copy(v, N1, NN+N1, (N2-N1), "copy") + verify_copy(v, NN+N1, NT, 0, "tail"); total_errors += errors; if( errors ) { //break; } } return total_errors; }
void vbx_sp_free_debug( int LINE, const char *FNAME ) { vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); if( !this_mxp ) { VBX_PRINTF( "ERROR: failed to call _vbx_init().\n" ); VBX_FATAL(LINE,FNAME,-1); } else { this_mxp->sp = this_mxp->scratchpad_addr; this_mxp->spstack_top = 0; } }
int main(void) { vbx_test_init(); vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size; int N = VBX_SCRATCHPAD_SIZE/sizeof(vbx_word_t)/12; N=1024; int PRINT_LENGTH = min(N, MAX_PRINT_LENGTH); double scalar_time, vector_time; int errors=0; vbx_mxp_print_params(); printf("\nVector power test...\n"); printf("Vector length: %d\n", N); vbx_word_t *scalar_in1 = malloc( N*sizeof(vbx_word_t) ); vbx_word_t *scalar_in2 = malloc( N*sizeof(vbx_word_t) ); vbx_word_t *scalar_out = malloc( N*sizeof(vbx_word_t) ); vbx_word_t *vector_in1 = vbx_shared_malloc( N*sizeof(vbx_word_t) ); vbx_word_t *vector_in2 = vbx_shared_malloc( N*sizeof(vbx_word_t) ); vbx_word_t *vector_out = vbx_shared_malloc( N*sizeof(vbx_word_t) ); if(vector_out==NULL){ printf("malloc_failed\n"); return 1; } test_zero_array_word( scalar_out, N ); test_zero_array_word( vector_out, N ); test_init_array_word( scalar_in1, N, 5 ); test_copy_array_word( vector_in1, scalar_in1, N ); test_init_array_word( scalar_in2, N, 112 ); test_copy_array_word( vector_in2, scalar_in2, N ); test_print_array_word( scalar_in1, PRINT_LENGTH ); test_print_array_word( scalar_in2, PRINT_LENGTH ); scalar_time = test_scalar_power( scalar_out, scalar_in1, scalar_in2, N); test_print_array_word( scalar_out, PRINT_LENGTH ); vector_time = test_vector_power( vector_out, vector_in1, vector_in2, N, scalar_time ); test_print_array_word( vector_out, PRINT_LENGTH ); errors += test_verify_array_word( scalar_out, vector_out, N ); VBX_TEST_END(errors); return 0; }
void vbx_sp_push_realloc(){ vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); //double the stack space this_mxp->spstack_max*=2; size_t spstack_size=this_mxp->spstack_max*sizeof(void*); printf("realloc sp_stack %d\n",this_mxp->spstack_max); this_mxp->spstack=(void**)realloc((void*)this_mxp->spstack,spstack_size); if ( !this_mxp->spstack ) { VBX_PRINTF("ERROR: Failed to malloc %d bytes for spstack.\n", (int)spstack_size); VBX_FATAL(__LINE__, __FILE__, -1); } }
/*
 * Move the scratchpad allocation pointer to new_sp.
 *
 * The request is ignored unless new_sp lies within
 * [scratchpad_addr, scratchpad_end] and passes VBX_IS_ALIGNED(new_sp, 4).
 * When VBX_DEBUG_LEVEL is set, the debug variant runs first so that bad
 * arguments are reported.
 */
void vbx_sp_set_nodebug( vbx_void_t *new_sp )
{
	if( VBX_DEBUG_LEVEL ) {
		// print pretty error messages
		vbx_sp_set_debug( __LINE__, __FILE__, new_sp );
	}

	// do it, but do not print pretty error messages
	vbx_mxp_t *mxp = VBX_GET_THIS_MXP();
	if( !mxp ) {
		return;
	}
	if( !(mxp->scratchpad_addr <= new_sp) || !(new_sp <= mxp->scratchpad_end) ) {
		return;
	}
	if( !VBX_IS_ALIGNED(new_sp, 4) ) {
		return;
	}

	mxp->sp = new_sp;
}
/*
 * Move the scratchpad allocation pointer to new_sp, reporting errors.
 *
 * LINE/FNAME identify the call site and are passed to VBX_FATAL. A missing
 * MXP handle, or an address outside [scratchpad_addr, scratchpad_end] or
 * failing VBX_IS_ALIGNED(new_sp, 4), prints a message and raises VBX_FATAL.
 */
void vbx_sp_set_debug( int LINE, const char *FNAME, vbx_void_t *new_sp )
{
	// print pretty error messages
	vbx_mxp_t *mxp = VBX_GET_THIS_MXP();
	if( !mxp ) {
		VBX_PRINTF( "ERROR: failed to call _vbx_init().\n" );
		VBX_FATAL(LINE,FNAME,-1);
		return;
	}

	int in_range = (mxp->scratchpad_addr <= new_sp) && (new_sp <= mxp->scratchpad_end);
	if( in_range && VBX_IS_ALIGNED(new_sp, 4) ) {
		mxp->sp = new_sp;
		return;
	}

	VBX_PRINTF( "ERROR: attempt to set scratchpad to illegal or unaligned address 0x%08lx.\n", (long int)new_sp );
	VBX_FATAL(LINE,FNAME,-1);
}
int VBX_T(vbw_vec_reverse_test)() { unsigned int aN[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 20, 25, 31, 32, 33, 35, 40, 48, 60, 61, 62, 63, 64, 64, 65, 66, 67, 68, 70, 80, 90, 99, 100, 101, 110, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 400, 450, 512, 550, 600, 650, 700, 768, 768, 900, 900, 1023, 1024, 1200, 1400, 1600, 1800, 2048, 2048, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4096, 4096, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 6000, 7000, 8000, 8192, 8192, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 16384, 16384, 20000, 25000, 30000, 32767, 32768, 32768, 35000, 40000, 45000, 50000, 55000, 60000, 65000, 65535, 65536, 65536 }; int retval; unsigned int N; unsigned int NBYTES; unsigned int NREPS = 100; unsigned int i,k; vbx_timestamp_t start=0,finish=0; vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); const unsigned int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size; for( i=0; i<sizeof(aN)/4; i++ ) { N = aN[i]; //printf( "testing with vector size %d\n", N ); NBYTES = sizeof(vbx_sp_t)*N; if( 2*NBYTES > VBX_SCRATCHPAD_SIZE ) continue; vbx_sp_t *vsrc = vbx_sp_malloc( NBYTES ); vbx_sp_t *vdst = vbx_sp_malloc( NBYTES ); //printf("bytes alloc: %d\n", NBYTES ); if( !vsrc ) VBX_EXIT(-1); if( !vdst ) VBX_EXIT(-1); #if ( VBX_TEMPLATE_T == BYTESIZE_DEF | VBX_TEMPLATE_T == UBYTESIZE_DEF ) unsigned int mask = 0x007F; #elif ( VBX_TEMPLATE_T == HALFSIZE_DEF | VBX_TEMPLATE_T == UHALFSIZE_DEF ) unsigned int mask = 0x7FFF; #else unsigned int mask = 0xFFFF; #endif vbx_set_vl( N ); vbx( SV(T), VMOV, vdst, -1, 0 ); // Fill the destination vector with -1 vbx( SE(T), VAND, vsrc, mask, 0 ); // Fill the source vector with enumerated values //VBX_T(print_vector)( "vsrcInit", vsrc, N ); //VBX_T(print_vector)( "vdstInit", vdst, N ); /** measure performance of function call **/ vbx_sync(); start = vbx_timestamp(); 
for(k=0; k<NREPS; k++ ) { retval = VBX_T(vbw_vec_reverse)( vdst, vsrc, N ); vbx_sync(); } finish = vbx_timestamp(); printf( "length %d (%s):\tvbware sp f():\t%llu", N, VBX_EXPAND_AND_QUOTE(BYTEHALFWORD), (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) ); //VBX_T(print_vector)( "vsrcPost", vsrc, N ); //VBX_T(print_vector)( "vdstPost", vdst, N ); #if VERIFY_VBWARE_ALGORITHM VBX_T(verify_vector)( vsrc, vdst, N ); #else printf(" [VERIFY OFF]"); #endif printf("\treturn value: %X", retval); vbx_set_vl( N ); vbx( SE(T), VAND, vsrc, mask, 0 ); // Reset the source vector /** measure performance of simple algorithm **/ vbx_sync(); vbx_set_vl( 1 ); vbx_set_2D( N, -sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 ); start = vbx_timestamp(); for(k=0; k<NREPS; k++ ) { vbx_2D( VV(T), VMOV, vdst+N-1, vsrc, 0 ); vbx_sync(); } finish = vbx_timestamp(); printf( "\tsimple (vl=1):\t%llu", (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) ); #if VERIFY_SIMPLE_ALGORITHM VBX_T(verify_vector)( vsrc, vdst, N ); #else printf(" [VERIFY OFF]"); #endif printf("\tcycles\n"); vbx_sp_free(); } vbx_sp_free(); printf("All tests passed successfully.\n"); return 0; }
/*
 * Current scratchpad allocation pointer, or NULL when there is no MXP
 * handle.
 */
vbx_void_t *vbx_sp_get()
{
	vbx_mxp_t *mxp = VBX_GET_THIS_MXP();
	if( !mxp ) {
		return NULL;
	}
	return mxp->sp;
}
int main(void) { vbx_test_init(); typedef vbx_word_t vbx_mm_t; vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size; int N = VBX_SCRATCHPAD_SIZE / sizeof(vbx_mm_t ); N = 20; int M = 20; int PRINT_LENGTH = N<MAX_PRINT_LENGTH ? N : MAX_PRINT_LENGTH ; // int PRINT_ROWS = PRINT_LENGTH; int PRINT_ROWS = M<MAX_PRINT_LENGTH ? N : MAX_PRINT_LENGTH; int PRINT_COLS = PRINT_LENGTH; double scalar_time, vector_time,vector2_time; int errors=0; vbx_mxp_print_params(); printf( "\nMatrix multiply test...\n" ); printf( "Matrix dimensions: %d,%d\n", N, M ); vbx_mm_t *scalar_in1 = (vbx_mm_t*)malloc( M*N*sizeof(vbx_mm_t ) ); vbx_mm_t *scalar_in2 = (vbx_mm_t*)malloc( M*N*sizeof(vbx_mm_t ) ); vbx_mm_t *scalar_out = (vbx_mm_t*)malloc( N*N*sizeof(vbx_mm_t ) ); vbx_mm_t *vector_in1 = (vbx_mm_t*)vbx_shared_malloc( M*N*sizeof(vbx_mm_t ) ); vbx_mm_t *vector_in2 = (vbx_mm_t*)vbx_shared_malloc( M*N*sizeof(vbx_mm_t ) ); vbx_mm_t *vector_out = (vbx_mm_t*)vbx_shared_malloc( N*N*sizeof(vbx_mm_t ) ); if ( scalar_in1 == NULL || scalar_in2 == NULL || scalar_out == NULL || vector_in1 == NULL || vector_in2 == NULL || vector_out == NULL ){ printf("Malloc failed\n"); VBX_TEST_END(1); return 0; } test_zero_array_word(scalar_out, N*N ); test_zero_array_word(vector_out, N*N ); test_init_array_word( scalar_in1, M*N, 1 ); test_copy_array_word( vector_in1, scalar_in1, M*N ); test_init_array_word( scalar_in2, M*N, 999 ); //scalar_mtx_xp_MN_word( vector_in2, scalar_in2, N, N ); test_copy_array_word( vector_in2, scalar_in2, M*N ); test_print_matrix_word( scalar_in1, PRINT_COLS, PRINT_ROWS, M ); test_print_matrix_word( scalar_in2, PRINT_ROWS, PRINT_COLS, N ); //change print sizes for outputs PRINT_ROWS=PRINT_COLS=N<PRINT_LENGTH?N:PRINT_LENGTH; scalar_time = test_scalar( scalar_out, scalar_in1, N, M, scalar_in2, M, N); test_print_matrix_word( scalar_out, PRINT_COLS, PRINT_ROWS, N ); vector_time = test_vector( vector_out, vector_in1, N, M, vector_in2, M, N, scalar_time ); 
test_print_matrix_word( vector_out, PRINT_COLS, PRINT_ROWS, N ); errors += test_verify_array_word( scalar_out, vector_out, N*N); vector2_time = test_vector_trans( vector_out, vector_in1, N, M, vector_in2, M, N, scalar_time ); test_print_matrix_word( vector_out, PRINT_COLS, PRINT_ROWS, N ); errors += test_verify_array_word( scalar_out, vector_out, N*N); vector2_time = test_vector_sp( vector_out, vector_in1, N, M, vector_in2, M, N, scalar_time ); test_print_matrix_word( vector_out, PRINT_COLS, PRINT_ROWS, N ); errors += test_verify_array_word( scalar_out, vector_out, N*N); vbx_shared_free(vector_out); vbx_shared_free(vector_in2); vbx_shared_free(vector_in1); free(scalar_out); free(scalar_in2); free(scalar_in1); //errors += orig_test(); VBX_TEST_END(errors); return 0; }
//vector version of rgb converter void vector_blend( output_pointer img_out, input_pointer img_in1, input_pointer img_in2, unsigned int num_row, unsigned int num_column, intermediate_type blending_const ) { intermediate_type *v_img1[2]; input_type *v_img2[2]; intermediate_type *v_temp; intermediate_type blending_const_bar = 256-blending_const; int j; vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size; const int VBX_WIDTH_BYTES = this_mxp->vector_lanes * sizeof(int); const int VBX_DMA_ALIGNMENT = this_mxp->dma_alignment_bytes; unsigned int chunk_size = VBX_SCRATCHPAD_SIZE/((3*sizeof(intermediate_type))+(2*sizeof(input_type))); chunk_size = VBX_PAD_UP( chunk_size-(VBX_WIDTH_BYTES-1), VBX_DMA_ALIGNMENT ); unsigned int chunk_size_old = chunk_size; unsigned int vector_length = chunk_size; unsigned int vector_length_old = vector_length; v_img1[0] = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) ); v_img1[1] = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) ); v_img2[0] = (input_type *)vbx_sp_malloc( chunk_size*sizeof(input_type) ); v_img2[1] = (input_type *)vbx_sp_malloc( chunk_size*sizeof(input_type) ); v_temp = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) ); if( v_temp == NULL ) { VBX_EXIT(0xBADDEAD); } int bufselect = 0; vbx_dma_to_vector( v_img1[bufselect], img_in1, chunk_size*sizeof(input_type) ); vbx_dma_to_vector( v_img2[bufselect], img_in2, chunk_size*sizeof(input_type) ); for( j=0; j<num_row*num_column; j+=vector_length_old ) { vbx_set_vl(vector_length); if( j > 0 ) { vbx_dma_to_host( img_out+j-vector_length_old, v_img1[1-bufselect], chunk_size_old*sizeof(output_type) ); } if( (j+vector_length_old) < (num_row*num_column-1) ) { if( (j+vector_length_old*2) >= num_row*num_column ) { vector_length = num_row*num_column - j - vector_length_old; chunk_size = vector_length; } vbx_dma_to_vector( v_img1[1-bufselect], img_in1+j+vector_length_old, 
chunk_size*sizeof(input_type) ); vbx_dma_to_vector( v_img2[1-bufselect], img_in2+j+vector_length_old, chunk_size*sizeof(input_type) ); } vbx( SVBHU, VMULLO, v_temp, blending_const, v_img1[bufselect] ); vbx( SVBHU, VMULLO, v_img1[bufselect], blending_const_bar, v_img2[bufselect] ); vbx( VVHU, VADD, v_img1[bufselect], v_img1[bufselect], v_temp ); vbx( SVHBU, VSHR, v_img1[bufselect], 8, v_img1[bufselect] ); bufselect = 1-bufselect; } vbx_dma_to_host( img_out+j-vector_length_old, v_img1[1-bufselect], chunk_size*sizeof(output_type) ); vbx_sp_free(); vbx_sync(); }
int compare_vbx_lut_to_vbx_lut_ci(int stage, int max_print_errors) { vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); int vci_lanes = this_mxp->vcustom0_lanes; int sz = this_mxp->scratchpad_size/(16*sizeof(vbx_ubyte_t)); vbx_byte_t* v_pass = (vbx_byte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t)); vbx_ubyte_t* v_pattern = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t)); vbx_ubyte_t* v_lutc = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t)); vbx_ubyte_t* v_group = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t)); vbx_ubyte_t* v_sel = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t)); vbx_ubyte_t* v_lut = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t)); vbx_ubyte_t* v_idx = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t)); if(v_idx == NULL) { printf("failed to allocate in compare_vbx_lut_to_vbx_lut_ci\n"); } unsigned char* lut = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char)); unsigned char* lut_c = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char)); int f, n, s, errors = 0; for (n = 0; n < sz; n++) { v_pattern[n] = (n & 0xff); } for (f = 0; f < face_lbp[stage].count; f++) { lbp_feat_t feat = face_lbp[stage].feats[f]; vbx_set_vl(sz); int total = f; s = 0; while(s < stage){ total += face_lbp[s].count; s++; } if(total < 256) { vbx(SVBU, VLBPLUT, v_lutc, total, v_pattern); } else { vbx(SVBS, VLBPLUT, v_lutc, total-256, v_pattern); } vbx(SVB, VMOV, v_pass, feat.fail, 0); /* check if pattern is in lut */ vbx(SVBU, VSHR, v_group, 5, v_pattern); for (n = 0; n < 8; n++) { vbx(SVB, VADD, v_sel, -n, v_group); vbx(SVBW, VCMV_Z, v_lut, feat.lut[n], v_sel); } vbx(SVBWU, VAND, v_idx, 0x1f, v_pattern); vbx(VVWB, VSHR, v_lut, v_idx, v_lut); vbx(SVB, VAND, v_lut, 1, v_lut); vbx(SVB, VCMV_LEZ, v_pass, feat.pass, v_lut); vbx_dma_to_host(lut_c, v_lutc, sz*sizeof(unsigned char)); vbx_dma_to_host(lut, v_pass, sz*sizeof(unsigned char)); vbx_sync(); errors += match_array_byte(lut, lut_c, "custom_lut", sz, 1, 0, max_print_errors, 0, 0); } vbx_sp_free(); 
vbx_shared_free(lut); vbx_shared_free(lut_c); return errors; }
int main(void) { vbx_test_init(); vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size; const int required_vectors = 4; int N = VBX_PAD_DN(VBX_SCRATCHPAD_SIZE / sizeof(vbx_mm_t) / required_vectors, this_mxp->scratchpad_alignment_bytes); int PRINT_LENGTH = min( N, MAX_PRINT_LENGTH ); double scalar_time, vector_time; int errors=0; vbx_mxp_print_params(); printf( "\nVector copy test...\n" ); printf( "Vector length: %d\n", N ); vbx_mm_t *scalar_in = malloc( N*sizeof(vbx_mm_t) ); vbx_mm_t *scalar_out = malloc( N*sizeof(vbx_mm_t) ); vbx_mm_t *vector_in = vbx_shared_malloc( N*sizeof(vbx_mm_t) ); vbx_mm_t *vector_out = vbx_shared_malloc( N*sizeof(vbx_mm_t) ); vbx_sp_t *v_out = vbx_sp_malloc( N*sizeof(vbx_sp_t) ); vbx_sp_t *v_in = vbx_sp_malloc( N*sizeof(vbx_sp_t) ); VBX_T(test_zero_array)( scalar_in, N ); VBX_T(test_zero_array)( vector_in, N ); VBX_T(test_init_array)( scalar_in, N, 1 ); VBX_T(test_copy_array)( vector_in, scalar_in, N ); scalar_time = test_scalar( scalar_out, scalar_in, N ); VBX_T(test_print_array)( scalar_out, PRINT_LENGTH ); vbx_dma_to_vector( v_in, vector_in, N*sizeof(vbx_sp_t) ); vector_time = test_vector( v_out, v_in, N, scalar_time ); vbx_dma_to_host(vector_out, v_out, N*sizeof(vbx_sp_t) ); vbx_sync(); VBX_T(test_print_array)( vector_out, PRINT_LENGTH ); errors += VBX_T(test_verify_array)( scalar_out, vector_out, N ); vbx_sp_free(); #if TEST_DEEP_SP errors += deep_vector_copy_test(); #endif #if DEBUG_MAKE_SP_FULL vbx_sp_malloc(vbx_sp_getfree()); #endif #if TEST_DEEP_MM errors += deep_vector_copy_ext_test(); #endif VBX_TEST_END(errors); return 0; }
int main_tile() { int i, j, k, l, base, block_num; int x, y; int time_start, time_stop; unsigned int cycles; double vbx_time, scalar_time; int wrong; int total_errors = 0; //all of the initialization can be hard coded without any computation vbx_mtx_fdct_t *v = vbx_mtx_fdct_init( coeff_v, image ); vbx_timestamp_start(); printf("\nGenerating initial data...\n"); dt *image = (dt *) malloc( IMAGE_WIDTH * IMAGE_HEIGHT * sizeof(dt) ); GenerateRandomImage( image, IMAGE_WIDTH, IMAGE_HEIGHT, 0/*seed*/ ); // Allocate memory to store results. // Results are computed BIGTILE_SIZE halfwords at a time. const int BIGTILE_SIZE = NUM_TILE_X * NUM_TILE_Y * DCT_SIZE; dt *block_s = malloc( BIGTILE_SIZE * sizeof(dt) ); dt *block_v = (dt *) vbx_shared_malloc( BIGTILE_SIZE * sizeof(dt) ); dt *coeff_v = (dt *) vbx_shared_malloc( BIGTILE_SIZE * sizeof(dt) ); //Make an uncached 1D version of the coeff matrix for (i = 0; i < NUM_TILE_Y; i++) { // row for (j = 0; j < BLOCK_SIZE; j++) { // row for (k = 0; k < NUM_TILE_X; k++) { // col for (l = 0; l < BLOCK_SIZE; l++) { // col coeff_v[i*NUM_TILE_X*DCT_SIZE + j*DCT_SIZE + k*BLOCK_SIZE + l] = cs[j][l]; } } } } #ifdef DEBUG printf("input matrix is:\n"); for (i = 0; i < BLOCK_SIZE; i++) { base = i * BLOCK_SIZE; for (j = 0; j < BLOCK_SIZE; j++) { printf("%d ", (int) block_s[base + j]); } printf("\n"); } #endif printf("\nRunning DCT...\n"); time_start = vbx_timestamp(); for( y = 0; y < IMG_DOWN; y++ ) { for( x = 0; x < IMG_ACROSS; x++ ) { vbx_mtx_fdct_scalar( block_s, (dt*)cs, image, x/*start_x*/, y/*start_y*/, NUM_TILE_X, NUM_TILE_Y ); } } time_stop = vbx_timestamp(); cycles = time_stop - time_start; scalar_time = (double) cycles; scalar_time /= (double) vbx_timestamp_freq(); scalar_time *= 1000.0; //ms vbx_timestamp_t mxp_cycles = vbx_mxp_cycles(cycles); printf("%dx%d Block Size\n", BLOCK_SIZE, BLOCK_SIZE); printf("Finished, scalar CPU took %0.3f ms \n", scalar_time); printf(" CPU Cycles: %d\n", (int) mxp_cycles); printf(" CPU Cycles per block: 
%f\n", mxp_cycles / ((double) (NUM_BLOCKS))); vbx_sync(); // wait for image to be prefetched time_start = vbx_timestamp(); for( y = 0; y < IMG_DOWN; y++ ) { for( x = 0; x < IMG_ACROSS; x++ ) { vbx_mtx_fdct( v, block_v, image, x/*start_x*/, y/*start_y*/, IMG_ACROSS-1,IMG_DOWN-1,NUM_TILE_X, NUM_TILE_Y ); } } time_stop = vbx_timestamp(); cycles = time_stop - time_start; vbx_time = (double) cycles; vbx_time /= (double) vbx_timestamp_freq(); vbx_time *= 1000.0; //ms mxp_cycles = vbx_mxp_cycles(cycles); printf("Finished, MXP took %0.3f ms \n", vbx_time); printf(" CPU Cycles: %d\n", (int) mxp_cycles); printf(" CPU Cycles per block: %f\n", mxp_cycles / ((double) (NUM_BLOCKS))); printf(" Speedup: %f\n", scalar_time / vbx_time); vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); double vbx_mbps = (double) (NUM_BLOCKS) * 1000 / vbx_time; // blocks per second printf("V%d@%dMHz: %dx%d tile, %dx%d blocks, %f blocks/s, %f megapixel/s\n", this_mxp->vector_lanes, this_mxp->core_freq / 1000000, NUM_TILE_Y, NUM_TILE_X, BLOCK_SIZE, BLOCK_SIZE, vbx_mbps, (vbx_mbps * DCT_SIZE) / 1000000); printf("\nChecking results...\n"); wrong = 0; for (block_num = 0; block_num < NUM_BLOCKS; block_num++) { for (i = 0; i < BLOCK_SIZE; i++) { base = i * BLOCK_SIZE; for (j = 0; j < BLOCK_SIZE; j++) { if (block_s[block_num * DCT_SIZE + base + j] != block_v[block_num * DCT_SIZE + base + j]) { if (wrong < 5) { printf("\nError at %d [%d,%d], result is %d, should be %d\n", block_num, i, j, (int) block_v[block_num * DCT_SIZE + base + j], (int) block_s[block_num * DCT_SIZE + base + j]); } wrong++; } } } } printf("wrong is %d\n\n", wrong); total_errors += wrong; free(block_s); vbx_shared_free(block_v); vbx_shared_free(coeff_v); vbx_mtx_fdct_free( v ); VBX_TEST_END(total_errors); return (0); }
/*
 * Reverse the N elements at src into dst, both in main memory.
 *
 * Three strategies are used, chosen by size and free scratchpad space:
 *   - N < 4: a plain scalar loop.
 *   - N <= MM_CACHED_SCALAR_THRESHOLD, or fewer than 5 scratchpad rows
 *     (SP_WIDTH_B bytes each) free: a scalar loop over cached aliases of
 *     src/dst, followed by a data-cache flush of the destination range.
 *   - otherwise: the data is streamed through the scratchpad in tiles.
 *
 * Returns VBW_SUCCESS on every path shown here; an unexpected scratchpad
 * allocation failure exits through VBX_EXIT(-1) unless
 * VBX_SKIP_ALL_CHECKS is set.
 */
int vbw_vec_reverse_ext( vbx_mm_t *dst, vbx_mm_t *src, const unsigned int N )
{
	typedef vbx_mm_t vbx_sp_t;

	// element-size flags: ROT16 for types no wider than a halfword,
	// ROT8 for byte-sized types
	const int VBW_ROT16= sizeof(vbx_sp_t) <=sizeof(vbx_half_t);
	const int VBW_ROT8= sizeof(vbx_sp_t)== sizeof(vbx_byte_t);
	// shift that converts between a count of elements and a count of words
	const int VBW_RSHIFT_T_TO_W= (sizeof(vbx_sp_t)==sizeof(vbx_word_t)? 0:
	                              sizeof(vbx_sp_t)==sizeof(vbx_half_t)? 1:/*byte_sized*/2);
	const int VBW_LSHIFT_W_TO_T= VBW_RSHIFT_T_TO_W;

	// Catch when N is very small
	if( N<4 ) {
		unsigned int i = 0;
		while(i<N) {
			dst[N-i-1]=src[i];
			i++;
		}
		return VBW_SUCCESS;
	}

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	unsigned int SP_WIDTH_B = this_mxp->scratchpad_alignment_bytes;
	unsigned int FREE_BYTES = vbx_sp_getfree();

	// Catch when N is small enough that cached scalar does a better job
	if( N <= MM_CACHED_SCALAR_THRESHOLD || FREE_BYTES < SP_WIDTH_B*5 ){
		unsigned int i;
		vbx_mm_t *A = (vbx_mm_t*)vbx_remap_cached(src,N*sizeof(vbx_mm_t));
		vbx_mm_t *B = (vbx_mm_t*)vbx_remap_cached(dst,N*sizeof(vbx_mm_t));
		for( i=0; i<N; i++ ) {
			B[N-i-1]=A[i];
		}
		vbx_dcache_flush(B,N*sizeof(vbx_mm_t));
		return VBW_SUCCESS;
	}

	unsigned int NUM_LANES = this_mxp->vector_lanes;
	// tile size in bytes: half of the free space left after setting one
	// SP_WIDTH_B row aside (presumably rounded down to a row multiple by
	// VBX_PAD_DN -- confirm against the macro)
	unsigned int tile_size_b = VBX_PAD_DN(((FREE_BYTES-SP_WIDTH_B)/2),SP_WIDTH_B);
	unsigned int tile_size_w = tile_size_b/4;
	unsigned int tile_size_t = tile_size_w << VBW_LSHIFT_W_TO_T;

	unsigned int num_tiles = N / tile_size_t;
	unsigned int rows_per_tile = tile_size_b / SP_WIDTH_B;
	// elements left over after the full tiles
	unsigned int tile_part_t = N - num_tiles * tile_size_t;
	// tile size (in words) up to which the direct reverse helpers are used
	// instead of vec_rev_merge_w; chosen by lane count
	unsigned int threshold_w = NUM_LANES >= 32 ? VL1_THRESHOLD_V32_UP :
	                           NUM_LANES == 16 ? VL1_THRESHOLD_V16 :
	                           NUM_LANES == 8 ? VL1_THRESHOLD_V8 : UINT_MAX;

	// The leftover elements sit at the end of src, so their reversal is
	// written first, at the start of dst.
	if(tile_part_t){
		vbx_sp_push();
		vbx_sp_t *v_0 = (vbx_sp_t *)vbx_sp_malloc(tile_part_t*sizeof(vbx_sp_t));
		vbx_sp_t *v_1 = (vbx_sp_t *)vbx_sp_malloc(tile_part_t*sizeof(vbx_sp_t));
#if !VBX_SKIP_ALL_CHECKS
		if( !v_0 || !v_1) {
			VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
			VBX_EXIT(-1);
		}
#endif
		vbx_dma_to_vector(v_0, src+N-tile_part_t, tile_part_t*sizeof(vbx_mm_t));
		vbw_vec_reverse(v_1, v_0, tile_part_t);
		vbx_dma_to_host(dst, v_1, tile_part_t*sizeof(vbx_sp_t));
		dst += tile_part_t;
		vbx_sp_pop();
	}

	if(!num_tiles) {
		return VBW_SUCCESS;
	}

	vbx_sp_push();
	// one row for v_mask plus two tile-sized working buffers
	vbx_word_t *v_mask = (vbx_word_t *)vbx_sp_malloc(SP_WIDTH_B);
	vbx_word_t *v_scratch[2] = { (vbx_word_t *)vbx_sp_malloc(tile_size_b), (vbx_word_t *)vbx_sp_malloc(tile_size_b) };
	vbx_word_t *result;

#if !VBX_SKIP_ALL_CHECKS
	if( !v_scratch[0] || !v_scratch[1] || !v_mask ) {
		VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
		VBX_EXIT(-1);
	}
#endif

	// Full tiles are read from the last tile of src towards the first
	// while dst advances forwards.
	src += (num_tiles - 1) * tile_size_t;

	if( tile_size_w <= threshold_w) {
		while( num_tiles ) {
			vbx_dma_to_vector( v_scratch[0], src, tile_size_b );
			if(VBW_ROT16){
				vec_rev_rot16_w(v_scratch[1], v_scratch[0], tile_size_w);
			}else{
				vec_rev_w(v_scratch[1], v_scratch[0], tile_size_w);
			}
			if( VBW_ROT8){
				vec_rot8_h( v_scratch[1], v_scratch[1], tile_size_w*2 );
			}
			vbx_dma_to_host( dst, v_scratch[1], tile_size_b );
			dst += tile_size_t;
			src -= tile_size_t;
			num_tiles--;
		}
	} else {
		while( num_tiles ) {
			vbx_dma_to_vector( v_scratch[0], src, tile_size_b );
			// vec_rev_merge_w returns the buffer that holds the result
			result = vec_rev_merge_w( v_scratch[1], v_scratch[0], tile_size_w, v_scratch[0],
			                          v_mask, SP_WIDTH_B, rows_per_tile, VBW_ROT16 );
			if(VBW_ROT8){
				vec_rot8_h( result, result, tile_size_w*2 );
			}
			vbx_dma_to_host( dst, result, tile_size_b );
			dst += tile_size_t;
			src -= tile_size_t;
			num_tiles--;
		}
	}
	vbx_sp_pop();
	return VBW_SUCCESS;
}
/*
 * Reverse the N elements of scratchpad vector v_src into v_dst.
 *
 * Paths, in the order they are tried:
 *   - N smaller than one row of lanes (NUM_LANES words' worth of elements):
 *     a single 2D VMOV with vector length 1 and a negative destination stride.
 *   - Word count N_w at or below the per-lane-count threshold: vec_rev_w /
 *     vec_rev_rot16_w (plus vec_rot8_h for byte elements), after copying any
 *     sub-word stub with a 2D VMOV.
 *   - Otherwise: vec_rev_merge_w. While the remaining work plus one mask row
 *     does not fit in free scratchpad, a chunk is reversed using the not yet
 *     written part of the destination itself as scratch/mask space; the final
 *     chunk uses scratchpad obtained from vbx_sp_malloc().
 *
 * Returns VBW_SUCCESS. When VBX_SKIP_ALL_CHECKS is zero, an unexpected result
 * pointer from vec_rev_merge_w or a NULL from vbx_sp_malloc prints a message
 * and calls VBX_EXIT(-1).
 */
int vbw_vec_reverse( vbx_sp_t *v_dst, vbx_sp_t *v_src, const unsigned int N )
{
	// Element-size switches derived from sizeof(vbx_sp_t):
	// ROT16 is set for half and byte elements, ROT8 for byte elements only.
	const int VBW_ROT16= sizeof(vbx_sp_t) <=sizeof(vbx_half_t);
	const int VBW_ROT8= sizeof(vbx_sp_t)== sizeof(vbx_byte_t);
	// Shift that converts an element count to a word count (and back).
	const int VBW_RSHIFT_T_TO_W= (sizeof(vbx_sp_t)==sizeof(vbx_word_t)? 0:
	                              sizeof(vbx_sp_t)==sizeof(vbx_half_t)? 1:/*byte_sized*/2);
	const int VBW_LSHIFT_W_TO_T= VBW_RSHIFT_T_TO_W;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const unsigned int NUM_LANES = this_mxp->vector_lanes;

	// Can the whole vector fit in the scratchpad width?
	if( N < (NUM_LANES << VBW_LSHIFT_W_TO_T) ){
		// N single-element moves: destination walks backwards from v_dst+N-1,
		// source walks forwards from v_src.
		vbx_set_vl( 1 );
		vbx_set_2D( N, (int)-sizeof(vbx_sp_t), (int)sizeof(vbx_sp_t), 0 );
		vbxx_2D(VMOV, v_dst+N-1, v_src);
		return VBW_SUCCESS;
	}

	// Threshold (in words) below which the non-merge reverse is used; selected by lane count.
	unsigned int threshold_w = (NUM_LANES >= 32 ? VL1_THRESHOLD_V32_UP :
	                            NUM_LANES == 16 ? VL1_THRESHOLD_V16 :
	                            NUM_LANES == 8 ? VL1_THRESHOLD_V8 : UINT_MAX);
	unsigned int N_w = N >> VBW_RSHIFT_T_TO_W; // Equivalent number of words in the vector

	if( N_w && N_w <= threshold_w ) {
		if( VBW_ROT16){
			// remainder of elements that can't add to a whole word
			unsigned int stub_t = N - (N_w << VBW_LSHIFT_W_TO_T);
			if( stub_t ) {
				// The last stub_t source elements become the first stub_t destination elements.
				vbx_set_vl( 1 );
				vbx_set_2D( stub_t, (int)-sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
				vbxx_2D(VMOV, v_dst+stub_t-1, v_src+N-stub_t);
				v_dst += stub_t;
			}
			vec_rev_rot16_w(v_dst, v_src, N_w);
		}else{
			vec_rev_w(v_dst, v_src, N_w);
		}
		if( VBW_ROT8){
			vec_rot8_h(v_dst, v_dst, N_w*2);
		}
		return VBW_SUCCESS;
	}

	const unsigned int SP_WIDTH_B = this_mxp->scratchpad_alignment_bytes;
	const unsigned int FREE_BYTES = vbx_sp_getfree();
	// Chooses which scratch buffer is passed first to vec_rev_merge_w so that the
	// returned result pointer equals v_dst_w (verified below when checks are enabled).
	const unsigned int ODD_LOG_SEL = NUM_LANES & 0x55555555 ? 1 : 0;

	vbx_word_t *v_mask, *v_result;
	vbx_word_t *v_scratch[2] = {0,0};

	// Split the vector into whole rows of NUM_LANES words plus a tail of leftover elements.
	unsigned int num_rows_w = N_w / NUM_LANES;
	unsigned int working_set_w = num_rows_w * NUM_LANES;
	unsigned int tail_t = N - (working_set_w << VBW_LSHIFT_W_TO_T);
	unsigned int remaining_w = working_set_w;

	if( tail_t ) {
		// Tail elements at the end of the source go to the front of the destination.
		vbx_set_vl( 1 );
		vbx_set_2D( tail_t, (int)-sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
		vbxx_2D(VMOV, v_dst+tail_t-1, v_src+N-tail_t);
		v_dst += tail_t;
	}

	vbx_word_t *v_src_w = (vbx_word_t *)v_src;
	vbx_word_t *v_dst_w = (vbx_word_t *)v_dst;

	if(!num_rows_w) {
		return VBW_SUCCESS;
	}

	remaining_w = working_set_w;

	// Not enough free scratchpad for a full-size scratch buffer plus one mask row:
	// process chunks, borrowing the unwritten part of the destination as scratch.
	while( remaining_w*sizeof(vbx_word_t) + SP_WIDTH_B > FREE_BYTES ) {
		if( remaining_w <= threshold_w*2 ) {
			// Small enough to finish with the non-merge reverse.
			if( VBW_ROT16){
				vec_rev_rot16_w(v_dst_w, v_src_w, remaining_w);
			}else{
				vec_rev_w(v_dst_w, v_src_w, remaining_w);
			}
			if( VBW_ROT8){
				vec_rot8_h(v_dst_w, v_dst_w, remaining_w*2);
			}
			return VBW_SUCCESS;
		}

		// Chunk size: about half of what remains (less one row), rounded down to whole rows,
		// so that two chunk buffers and the mask all lie inside the remaining destination.
		working_set_w = VBX_PAD_DN( (remaining_w - NUM_LANES)/2, NUM_LANES );
		v_mask = v_dst_w + (working_set_w*2);
		remaining_w -= working_set_w;
		v_scratch[0] = v_dst_w;
		v_scratch[1] = v_dst_w + working_set_w;
		num_rows_w = working_set_w / NUM_LANES;

		// Reverse the chunk taken from the current end of the unprocessed source.
		v_result = vec_rev_merge_w( v_scratch[ODD_LOG_SEL], v_src_w + remaining_w, working_set_w,
		                            v_scratch[!ODD_LOG_SEL], v_mask, SP_WIDTH_B, num_rows_w, VBW_ROT16 );

#if !VBX_SKIP_ALL_CHECKS
		if( v_result != v_dst_w ) {
			VBX_PRINTF("Unexpected behavior: merge reverse returned the wrong vector. Parameter order was chosen based on NUM_LANES.");
			VBX_EXIT(-1);
		}
#endif

		if( VBW_ROT8){
			vec_rot8_h(v_result, v_result, working_set_w*2);
		}

		v_dst_w += working_set_w;
	}

	// Final chunk: the second scratch buffer and the mask come from the scratchpad allocator.
	vbx_sp_push();

	v_scratch[0] = v_dst_w;
	v_scratch[1] = (vbx_word_t*)vbx_sp_malloc( remaining_w * sizeof(vbx_word_t) );
#if !VBX_SKIP_ALL_CHECKS
	if( !v_scratch[1] ) {
		VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
		VBX_EXIT(-1);
	}
#endif

	v_mask = (vbx_word_t*)vbx_sp_malloc( SP_WIDTH_B );
#if !VBX_SKIP_ALL_CHECKS
	if( !v_mask ) {
		VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
		VBX_EXIT(-1);
	}
#endif

	num_rows_w = remaining_w / NUM_LANES;
	v_result = vec_rev_merge_w( v_scratch[ODD_LOG_SEL], v_src_w, remaining_w,
	                            v_scratch[!ODD_LOG_SEL], v_mask, SP_WIDTH_B, num_rows_w, VBW_ROT16 );

#if !VBX_SKIP_ALL_CHECKS
	if( v_result != v_dst_w ) {
		VBX_PRINTF("Unexpected behavior: merge reverse returned the wrong vector. Parameter order was chosen based on NUM_LANES.");
		VBX_EXIT(-1);
	}
#endif

	if( VBW_ROT8){
		vec_rot8_h(v_result, v_result, remaining_w*2);
	}

	vbx_sp_pop();

	return VBW_SUCCESS;
}
int main(void) { vbx_timestamp_t time_start, time_stop; double scalar_time, vector_time; input_pointer img1; input_pointer img2; input_pointer sc_img1; input_pointer sc_img2; output_pointer scalar_out; output_pointer vector_out; int i,j; int total_errors = 0; vbx_test_init(); vbx_mxp_print_params(); img1 = vbx_shared_malloc( NUM_OF_ROWS*NUM_OF_COLUMNS*sizeof(input_type) ); img2 = vbx_shared_malloc( NUM_OF_ROWS*NUM_OF_COLUMNS*sizeof(input_type) ); vector_out = vbx_shared_malloc( NUM_OF_ROWS*NUM_OF_COLUMNS*sizeof(output_type) ); sc_img1 = malloc( NUM_OF_ROWS*NUM_OF_COLUMNS*sizeof(input_type) ); sc_img2 = malloc( NUM_OF_ROWS*NUM_OF_COLUMNS*sizeof(input_type) ); scalar_out = malloc( NUM_OF_ROWS*NUM_OF_COLUMNS*sizeof(output_type) ); init_img( img1, img2 ); init_img( sc_img1, sc_img2 ); vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); const int VBX_VECTOR_BYTE_LANES = this_mxp->vector_lanes * sizeof(int); printf("\n"); printf("Num of byte lanes: %d\n", VBX_VECTOR_BYTE_LANES); printf("Initialized data\n\n"); printf("Executing Scalar Image Blend...\n"); vbx_timestamp_start(); time_start = vbx_timestamp(); scalar_blend( scalar_out, sc_img1, sc_img2, NUM_OF_ROWS, NUM_OF_COLUMNS, CONST_BLEND ); time_stop = vbx_timestamp(); printf("Finished Scalar Image Blend\n"); scalar_time = vbx_print_scalar_time(time_start, time_stop); printf("\nExecuting Vector Image Blend...\n"); vbx_timestamp_start(); time_start = vbx_timestamp(); vector_blend( vector_out, img1, img2, NUM_OF_ROWS, NUM_OF_COLUMNS, CONST_BLEND); time_stop = vbx_timestamp(); printf("Finished Vector Image Blend\n"); vector_time = vbx_print_vector_time(time_start, time_stop, scalar_time); int errors = 0; for( j=0; j<NUM_OF_ROWS; j++ ) { for( i = 0; i < NUM_OF_COLUMNS; i++ ) { if( vector_out[j*NUM_OF_COLUMNS+i] != scalar_out[j*NUM_OF_COLUMNS+i] ) { if(errors < 5) printf( "\nFail at sample [%3d,%3d]. 
Scalar: %3d Vector: %3d Img1: %3d Img2: %3d", j, i, scalar_out[j*NUM_OF_COLUMNS+i], vector_out[j*NUM_OF_COLUMNS+i], img1[j*NUM_OF_COLUMNS+i], img2[j*NUM_OF_COLUMNS+i] ); errors++; } } } printf("\n%d errors\n", errors); total_errors += errors; VBX_TEST_END(total_errors); return 0; }
int main(void) { vbx_test_init(); #if 0 vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size; int N = VBX_SCRATCHPAD_SIZE/sizeof(vbx_mm_t)/8; #endif int TEST_LENGTH = TEST_ROWS*TEST_COLS; int NTAP_LENGTH = NTAP_ROWS*NTAP_COLS; int PRINT_COLS = min( TEST_COLS, MAX_PRINT_LENGTH ); int PRINT_ROWS = min( TEST_ROWS, MAX_PRINT_LENGTH ); double scalar_time, vector_time; int errors=0; vbx_mxp_print_params(); printf( "\nMatrix FIR test...\n" ); printf( "Matrix dimensions: %d,%d\n", TEST_ROWS, TEST_COLS ); vbx_mm_t *scalar_in = malloc( TEST_LENGTH*sizeof(vbx_mm_t) ); vbx_mm_t *vector_in = vbx_shared_malloc( TEST_LENGTH*sizeof(vbx_mm_t) ); int32_t *scalar_filt = malloc( NTAP_LENGTH*sizeof(int32_t) ); int32_t *vector_filt = vbx_shared_malloc( NTAP_LENGTH*sizeof(int32_t) ); vbx_mm_t *scalar_out = malloc( TEST_LENGTH*sizeof(vbx_mm_t) ); vbx_mm_t *vector_out = vbx_shared_malloc( TEST_LENGTH*sizeof(vbx_mm_t) ); VBX_T(test_zero_array)( scalar_out, TEST_LENGTH ); VBX_T(test_zero_array)( vector_out, TEST_LENGTH ); VBX_T(test_init_array)( scalar_in, TEST_LENGTH, 1 ); VBX_T(test_copy_array)( vector_in, scalar_in, TEST_LENGTH ); test_init_array_word( scalar_filt, NTAP_LENGTH, 1 ); test_copy_array_word( vector_filt, scalar_filt, NTAP_LENGTH ); VBX_T(test_print_matrix)( scalar_in, PRINT_ROWS, PRINT_COLS, TEST_COLS ); test_print_matrix_word( scalar_filt, NTAP_ROWS, NTAP_COLS, NTAP_COLS ); scalar_time = test_scalar( scalar_out, scalar_in, scalar_filt, TEST_ROWS, TEST_COLS, NTAP_ROWS, NTAP_COLS); VBX_T(test_print_matrix)( scalar_out, PRINT_COLS, PRINT_ROWS, TEST_COLS ); vector_time = test_vector( vector_out, vector_in, vector_filt, TEST_ROWS, TEST_COLS, NTAP_ROWS, NTAP_COLS, scalar_time ); VBX_T(test_print_matrix)( vector_out, PRINT_COLS, PRINT_ROWS, TEST_COLS ); int i; for(i=0; i<TEST_ROWS-NTAP_ROWS; i++){ errors += VBX_T(test_verify_array)( scalar_out+i*TEST_COLS, vector_out+i*TEST_COLS, TEST_COLS-NTAP_COLS ); } VBX_TEST_END(errors); return 0; 
}
int vbw_mtx_median_ext_argb32( unsigned *output, unsigned *input, const int filter_height, const int filter_width, const int image_height, const int image_width, const int image_pitch ) { const int FREE_BYTES = vbx_sp_getfree(); int l,k; int filter_mid, filter_size; int rows_per_l,vl,temp_vl, temp_vl_byte; int j,i; int partial_row = 0; filter_size = filter_height*filter_width; filter_mid = filter_size/2; vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); const int VBX_WIDTH_BYTES = this_mxp->scratchpad_alignment_bytes; // Could possibly check for low SP here (less than 6*VBX_WIDTH_BYTES) and assign vl differently // During allocation, max additional SP bytes needed due to alignment is one VBX_WIDTH_BYTES per vector // Taking that off the top simplifies calculation and will always be correct, but sacrifices a little SP space vl = (FREE_BYTES-3*VBX_WIDTH_BYTES)/((filter_size+2)*sizeof(vbx_uword_t)); if( vl < 1 ) { return VBW_ERROR_SP_ALLOC_FAILED; } if(vl < image_width){ rows_per_l = 1; partial_row = 1; } else { rows_per_l = vl/image_width; vl = image_width*rows_per_l; } vbx_sp_push(); vbx_uword_t *v_input = (vbx_uword_t *)vbx_sp_malloc(filter_size*vl*sizeof(vbx_uword_t)); vbx_ubyte_t *v_sub = (vbx_ubyte_t *)vbx_sp_malloc(vl*sizeof(vbx_uword_t)); vbx_ubyte_t *v_temp = (vbx_ubyte_t *)vbx_sp_malloc(vl*sizeof(vbx_uword_t)); vbx_ubyte_t *v_min, *v_max; vbx_ubyte_t *v_input_byte = (vbx_ubyte_t *)v_input; if( v_temp == NULL ){ vbx_sp_pop(); return VBW_ERROR_SP_ALLOC_FAILED; } for(l = 0; l < image_height-filter_height; l+= rows_per_l){ // detect last pass if(l+rows_per_l > image_height-filter_height){ rows_per_l = (image_height-filter_height)-l; vl = image_width*rows_per_l; } temp_vl = vl; for(k = 0; k < image_width; k += temp_vl){ if(partial_row){ if(k + temp_vl > image_width){ temp_vl = image_width - k; } } for(j = 0; j < filter_height; j++){ vbx_dma_to_vector_2D(v_input+temp_vl*j, input+(l+j)*image_pitch+k, temp_vl/rows_per_l*sizeof(vbx_uword_t), rows_per_l, 
image_width*sizeof(vbx_uword_t), image_pitch*sizeof(vbx_uword_t)); } // arrange all pixels within a filter window into single columns, seperated by temp_vl // // ex. vl = 5, filter = 3 // vinput before vinput after // // a00 a01 a02 a03 a04 | a00 a01 a02 a03 a04 | // a10 a11 a12 a13 a14 | a10 a11 a12 a13 a14 | // a20 a21 a22 a23 a24 | a20 a21 a22 a23 a24 | // ??? ??? ??? ??? ??? | a01 a02 a03 a04 a10 | // ??? ??? ??? ??? ??? | a11 a12 a13 a14 a20 | // ??? ??? ??? ??? ??? | a21 a22 a23 a24 a30 | // ??? ??? ??? ??? ??? | a02 a03 a04 a10 a11 | // ??? ??? ??? ??? ??? | a12 a13 a14 a20 a21 | // ??? ??? ??? ??? ??? | a22 a23 a24 a30 a31 | // vbx_set_vl(temp_vl); for(j = 1; j < filter_height; j++){ for(i = 0; i < filter_width; i++){ vbx(VVWU, VMOV, v_input+(j*filter_height+i)*temp_vl, v_input+i*temp_vl+j, 0); } } //Do the bubble sort up to the filter_size/2^th element on each vbx // work on individual color channels temp_vl_byte = temp_vl*sizeof(vbx_uword_t)/sizeof(vbx_ubyte_t); vbx_set_vl(temp_vl_byte); // sort lower half of the values in the window for(j = 0; j < filter_mid; j++){ v_min = v_input_byte+j*temp_vl_byte; for(i = j+1; i < filter_size; i++){ v_max = v_input_byte+i*temp_vl_byte; vbx(VVBU, VMOV, v_temp, v_min, 0); vbx(VVBU, VSUB, v_sub, v_max, v_min); vbx(VVBU, VCMV_LTZ, v_min, v_max, v_sub); vbx(VVBU, VCMV_LTZ, v_max, v_temp, v_sub); } } // grab next smallest value, the median, don't sort the rest v_min = v_input_byte+filter_mid*temp_vl_byte; for(i = filter_mid+1; i < filter_size; i++){ v_max = v_input_byte+i*temp_vl_byte; vbx(VVBU, VSUB, v_sub, v_max, v_min); vbx(VVBU, VCMV_LTZ, v_min, v_max, v_sub); } // dma out median value // back to pixels vbx_dma_to_host_2D(output+(l*image_pitch)+k, v_input+temp_vl*filter_mid, temp_vl/rows_per_l*sizeof(vbx_uword_t), rows_per_l, image_pitch*sizeof(vbx_uword_t), image_width*sizeof(vbx_uword_t)); } } vbx_sp_pop(); vbx_sync(); return VBW_SUCCESS; }
/*
 * Face-detection test program.
 *
 * Allocates the image buffers, loads a test image into each of them, and then
 * depending on the UNIT build flag either
 *   - UNIT != 0: runs a set of unit comparisons selected by further build
 *     flags (LUT_CI, DOUBLE_LUT, LBP_CI, BLIP), accumulating their error
 *     counts, or
 *   - UNIT == 0: times the scalar face detector and the vector face detector
 *     (unmasked and masked), then compares the two vector result images with
 *     match_array_pixel().
 *
 * The accumulated error count is passed to VBX_TEST_END() and returned.
 */
int main(void)
{
	vbx_timestamp_t time_start, time_stop;
	double scalar_time, vbx_time, vbx_time_masked;
	int i, j, k, l, m, n;
	int errors = 0;

	vbx_test_init();
	vbx_mxp_print_params();

	pixel *input, *scalar_input, *vbx_input, *vbx_input_masked;
	uint16_t *scalar_short;

	// scalar_input is the vbx_remap_cached() view of the same buffer as input.
	input = (pixel *)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(pixel));
	scalar_input = (pixel *)vbx_remap_cached(input, IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(pixel));
	scalar_short = (uint16_t *)malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(uint16_t));
	vbx_input = (pixel *)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(pixel));
	vbx_input_masked = (pixel *)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(pixel));

#if UNIT
	// Extra working images used only by the unit comparisons below.
	unsigned char *vbx_img8;
	unsigned short *img, *vbx_img;
	unsigned int *iImg, *vbx_iImg;
	unsigned int *iiImg, *vbx_iiImg;

	img = (unsigned short*)malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(unsigned short));
	vbx_img = (unsigned short*)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(unsigned short));
	vbx_img8 = (unsigned char*)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(unsigned char));

	iImg = (unsigned int*)malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(unsigned int));
	vbx_iImg = (unsigned int*)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(unsigned int));

	iiImg = (unsigned int*)malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(unsigned int));
	vbx_iiImg = (unsigned int*)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(unsigned int));
#endif//UNIT

	printf("Resolution = %dx%d\n", IMAGE_WIDTH, IMAGE_HEIGHT);
	printf("Initializing data\n");
	vbx_timestamp_start();

	// The loop bound is 1, so only the l == 0 data set is run; the l == 1 and
	// l == 2 branches below are unreachable unless the bound is raised.
	for(l = 0; l < 1; l++){
		// Name strings handed to the print/detect helpers for this data set.
		char *src;
		char *sdst;
		char *vdst;
		char *mdst;

		// Load the same image into all three working buffers.
		if(l == 0){
			load_lenna(input, IMAGE_WIDTH, IMAGE_HEIGHT);
			load_lenna(vbx_input, IMAGE_WIDTH, IMAGE_HEIGHT);
			load_lenna(vbx_input_masked, IMAGE_WIDTH, IMAGE_HEIGHT);
			printf("\nLenna\n");
			src = "lenna";
			sdst = "s_lenna";
			vdst = "v_lenna";
			mdst = "m_lenna";
		}else if(l == 1){
			load_ms(input, IMAGE_WIDTH, IMAGE_HEIGHT);
			load_ms(vbx_input, IMAGE_WIDTH, IMAGE_HEIGHT);
			load_ms(vbx_input_masked, IMAGE_WIDTH, IMAGE_HEIGHT);
			printf("\nMicrosoft\n");
			src = "ms";
			sdst = "s_ms";
			vdst = "v_ms";
			mdst = "m_ms";
		}else if(l == 2){
			load_blank(input, IMAGE_WIDTH, IMAGE_HEIGHT);
			load_blank(vbx_input, IMAGE_WIDTH, IMAGE_HEIGHT);
			load_blank(vbx_input_masked, IMAGE_WIDTH, IMAGE_HEIGHT);
			printf("\nblank\n");
			src = "blank";
			sdst = "s_blank";
			vdst = "v_blank";
			mdst = "m_blank";
		}

#if UNIT
		int window = 20;
		int log=0;
		// Smallest log such that (window/3) >> log drops below 2.
		while(((window/3)>>log) >= 2) log++;

		errors += compare_scalar_rgb2luma_to_vbw_rgb2luma16(img, vbx_img, vbx_input, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_WIDTH, MAX_PRINT_ERRORS);
		vbw_rgb2luma8(vbx_img8, vbx_input, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_WIDTH);

		int s;
#if LUT_CI
#if DOUBLE_LUT
		printf("Testing double lut\n");
		printf("Assign lbp double lut\n");
		assign_lbp_lut_ci2();
		// prev snapshots the error count so only newly added errors are reported.
		int prev = errors;
		printf("Cascade check\n");
		/* errors += cascade_check_2w(face_lbp, face_lbp_max_stage, 256); */
		/* errors += cascade_check_2h(face_lbp, face_lbp_max_stage, 256); */
		errors += cascade_check_2b(face_lbp, face_lbp_max_stage, 256);
		if (errors) {
			printf("errors %d\n", errors-prev);
		}
#else
		assign_lbp_lut_ci();
		printf("Testing cascade\n");
		int prev = errors;
		printf("lut check\n");
		// The outer "#if 0" disables every lut_check variant below; with the
		// following "#elif 1" taken, this configuration runs no further checks.
#if 0
#if 0
		errors += lut_check(256, 0, 0, 0);
		if (errors) {
			printf("errors %d\n", errors-prev);
		}
#elif 1
		int print_errors = 0;
		vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
		int vci_lanes = this_mxp->vcustom0_lanes;
		int num_features = cascade_max_feature();
		int input_length = 10;
		int lut_length = num_features*vci_lanes;
		int lut_iterations = 15;
#if 1
		lut_length = input_length = 128;
		lut_iterations = 13;
		print_errors = 0;
		errors += lut_check2(input_length, lut_length, lut_iterations, print_errors);
		if (errors) {
			printf("errors %d\n", errors-prev);
		}
#elif 1
		input_length = 64;
		lut_length = input_length;
		lut_iterations = 13;
		print_errors = 1;
		errors += lut_check2(input_length, lut_length, lut_iterations, print_errors);
		if (errors) {
			printf("errors %d\n", errors-prev);
		}
#else
		for(s = 2; s < 100; s=s+10){
			errors += lut_check2(s, lut_length, lut_iterations, print_errors);
			if (errors - prev > 0) {
				printf("%d\terrors %d\n", s, errors-prev);
			} else {
				printf("%d\n", s);
			}
			prev = errors;
		}
#endif
#else
		for(s = 0; s < 2000; s=s+100){
			errors += lut_check(s, 0, 0, 0);
			if (errors - prev > 0) {
				printf("%d\terrors %d\n", s, errors-prev);
			} else {
				printf("%d\n", s);
			}
			prev = errors;
		}
#endif
#elif 1
#else
		printf("check cascade\n");
		prev = errors;
		errors += cascade_check(face_lbp, face_lbp_max_stage, 256);
		if (errors) {
			printf("errors %d\n", errors-prev);
		}
		printf("Testing LBP LUT CI\n");
		prev = errors;
		for(s = 0; s < face_lbp_max_stage; s++){
			errors += compare_vbx_lut_to_vbx_lut_ci(s, MAX_PRINT_ERRORS);
		}
		if (errors) {
			printf("errors %d\n", errors-prev);
			prev = errors;
		}
#endif
#endif
#endif

		// Disabled debug dump of the 8-bit image as a nested list.
#if 0
		printf("Printing grey scale img\n");
		printf("grey = [");
		for (j = 0; j < IMAGE_HEIGHT; j++) {
			printf("[");
			for (i = 0; i < IMAGE_WIDTH; i++) {
				printf("%d, ", vbx_img8[j*IMAGE_WIDTH+i]);
			}
			printf("],\n");
		}
		printf("]\n");
#endif

#if LBP_CI
		printf("Testing LBP Pattern CI\n");
		errors += compare_LBPRestrictedCI_to_test_scalar_patterns(vbx_img, vbx_img8, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
#endif

#if BLIP
		printf("Testing BLIP\n");
		for(s = 1; s < 10; s++){
			errors += compare_scalar_BLIP2_to_vector_BLIP(img, vbx_input, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS, s);
		}
#endif

		// Disabled additional comparisons.
#if 0
		errors += compare_LBPRestrictedSums_to_test_scalar_sums_byte(vbx_img, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
		errors += compare_LBPRestrictedSums2_to_test_scalar_sums_half(vbx_img, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
		errors += compare_ScalarLBPRestrictedSums_to_test_scalar_sums_half(vbx_img, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
		errors += compare_ScalarLBPRestrictedPatterns_to_test_scalar_patterns(vbx_img, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
		errors += compare_LBPRestrictedPatterns2_to_test_scalar_patterns(vbx_img, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
		errors += compare_LBPRestricted_to_test_scalar_patterns(vbx_img, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
		/* overflow issues -- using bytes changes lbp pattern */
		errors += compare_LBPRestrictedPatterns_to_test_scalar_patterns(vbx_img, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
		/* requires SKIP_INTEGRALS 0 */
		errors += compare_gen_integrals_to_vector_get_img(img, iImg, iiImg, vbx_img, vbx_iImg, vbx_iiImg, vbx_input, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
		/* redundant test, compare to test_scalar_patterns instead */
		errors += compare_ScalarLBPRestrictedPatterns_to_SATBinaryPattern(vbx_img, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
		errors += compare_SATBinaryPattern_to_test_scalar_patterns(vbx_img, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
		errors += compare_LBPPassStage_to_restricted(vbx_img, log, face_lbp[0], window, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
#endif
#else // UNIT

#if PRINT
		print_python_pixel(scalar_input, src, IMAGE_WIDTH, IMAGE_HEIGHT);
#endif

		// Scalar reference: luma conversion followed by face detection, timed together.
		time_start = vbx_timestamp();
		scalar_rgb2luma(scalar_short, input, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_WIDTH);
		scalar_face_detect_luma(scalar_short, input, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_WIDTH, sdst);
		time_stop = vbx_timestamp();
		scalar_time = vbx_print_scalar_time(time_start, time_stop);

#if PRINT
		print_python_pixel(scalar_input, sdst, IMAGE_WIDTH, IMAGE_HEIGHT);
#endif

		// Vector run with the fifth argument 0 (reported under the "Vector" heading).
		printf("\nVector");
		time_start = vbx_timestamp();
		vector_face_detect((pixel *)vbx_input, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_WIDTH, 0, vdst);
		time_stop = vbx_timestamp();
		vbx_time = vbx_print_vector_time(time_start, time_stop, scalar_time);

#if PRINT
		print_python_pixel(vbx_input, vdst, IMAGE_WIDTH, IMAGE_HEIGHT);
#endif

		// Vector run with the fifth argument 1 (reported under the "Vector Masked" heading).
		printf("\nVector Masked");
		time_start = vbx_timestamp();
		vector_face_detect((pixel *)vbx_input_masked, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_WIDTH, 1, mdst);
		time_stop = vbx_timestamp();
		vbx_time_masked = vbx_print_vector_time(time_start, time_stop, scalar_time);

#if PRINT
		print_python_pixel(vbx_input_masked, mdst, IMAGE_WIDTH, IMAGE_HEIGHT);
#endif

		// Only the two vector results are compared with each other; the
		// comparisons against the scalar buffer are commented out.
		/* errors += match_array_pixel(input, vbx_input, "vector", IMAGE_WIDTH, IMAGE_HEIGHT, 0, MAX_PRINT_ERRORS, 0); */
		/* errors += match_array_pixel(input, vbx_input_masked, "masked", IMAGE_WIDTH, IMAGE_HEIGHT, 0, MAX_PRINT_ERRORS, 0); */
		errors += match_array_pixel(vbx_input, vbx_input_masked, "masked", IMAGE_WIDTH, IMAGE_HEIGHT, 0, MAX_PRINT_ERRORS, 0);
#endif // UNIT
	}

	VBX_TEST_END(errors);
	return errors;
}
/*
 * Transpose the INROWS x INCOLS matrix at in (host memory) into out, so that
 * out[j*INROWS+i] = in[i*INCOLS+j].
 *
 * Paths, in the order they are tried:
 *   - fewer than SCALAR_THRESHOLD elements: scalar double loop;
 *   - a single row or column: straight copy through the scratchpad, tiled if
 *     the vector does not fit in free scratchpad;
 *   - matrix smaller than half the free scratchpad: one DMA in, vbw_mtx_xp()
 *     in scratchpad, one DMA out;
 *   - otherwise: tile by tile, using either the merge transpose (square
 *     QUICK_A / QUICK_B sized tiles) or a 3D VMOV per tile.
 *
 * Returns VBW_SUCCESS, or VBW_ERROR_SP_ALLOC_FAILED if the tile buffers
 * cannot be allocated in the tiled path.
 */
int vbw_mtx_xp_ext(vbx_mm_t *out, vbx_mm_t *in, const int INROWS, const int INCOLS )
{
	typedef vbx_mm_t vbx_sp_t;

	int elements = INROWS * INCOLS;

	if(elements < SCALAR_THRESHOLD) {
		vbx_sync(); // in case the input is still waiting on a DMA transfer
		int i,j;
		for(i = 0; i < INROWS; i++) {
			for(j = 0; j < INCOLS; j++) {
				out[j*INROWS+i] = in[i*INCOLS+j];
			}
		}
		return VBW_SUCCESS;
	}

	// Every path below allocates from the scratchpad; released by vbx_sp_pop() before returning.
	vbx_sp_push();

	vbx_sp_t *v_in;
	vbx_sp_t *v_out;

	int tile_height = 0;
	int tile_width = 0;
	int prev_tile_width = 0;
	int tile_y = 0;
	int tile_x = 0;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	int SP_WIDTH_B = this_mxp->scratchpad_alignment_bytes;
	int SP_SIZE = vbx_sp_getfree();
	// Element capacity of all free scratchpad, and of half of it rounded down to the alignment.
	int max_sp_elements = vbx_sp_getfree() / sizeof(vbx_sp_t);
	int max_tile_elements = VBX_PAD_DN( SP_SIZE/2, SP_WIDTH_B ) / sizeof(vbx_sp_t);

	if( INROWS == 1 || INCOLS == 1 ) { // 1D transpose becomes a simple copy operation
		if( elements <= max_sp_elements ) { // We can use the whole scratchpad for this
			v_in = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
			vbx_dma_to_vector( v_in, in, elements*sizeof(vbx_mm_t) );
			v_out = v_in;
			vbx_dma_to_host( out, v_out, elements*sizeof(vbx_mm_t) );
		} else { // To test this, you'll need a very large 1D matrix (or a small SP)
			tile_width = max_sp_elements;
			v_in = (vbx_sp_t*)vbx_sp_malloc( tile_width * sizeof(vbx_sp_t) );
			for (tile_x = 0; tile_x < elements; tile_x += tile_width) {
				// Shorten the final piece so it ends exactly at the last element.
				if( tile_x + tile_width > elements) tile_width = elements - tile_x;
				vbx_dma_to_vector( v_in, in + tile_x, tile_width*sizeof(vbx_mm_t) );
				v_out = v_in;
				vbx_dma_to_host( out+tile_x, v_out, tile_width*sizeof(vbx_mm_t) );
			}
		}
	} else if( elements < max_tile_elements ) { // Matrix is small enough to handle entirely in SP
		v_in = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
		v_out = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );

		vbx_dma_to_vector( v_in, in, elements*sizeof(vbx_mm_t) );
		vbw_mtx_xp(v_out,v_in,INROWS,INCOLS);
		vbx_dma_to_host( out, v_out, elements*sizeof(vbx_mm_t) );
	} else { // At this point we know at least one full tile will be needed
		// NOTE: these macros are defined inside the function body and are never
		// #undef'd here, so they remain defined for the rest of the translation unit.
		#define QUICK_A_LANES_THRESHOLD 8    // Use merge transpose if there are at least this many lanes
		#define QUICK_A_TILE_WIDTH 128
		#define QUICK_A_TILE_ELEMENTS (QUICK_A_TILE_WIDTH*QUICK_A_TILE_WIDTH)
		#define QUICK_A_VF_ELEMENTS (QUICK_A_TILE_ELEMENTS/2)
		#define QUICK_A_REQ_ELEMENTS (2*VBX_PAD_UP(QUICK_A_TILE_ELEMENTS,SP_WIDTH_B/sizeof(vbx_sp_t)) + VBX_PAD_UP(QUICK_A_VF_ELEMENTS,sizeof(vbx_sp_t)))

		#define QUICK_B_LANES_THRESHOLD 16   // Use smaller merge transpose tile only if there are a lot of lanes
		#define QUICK_B_TILE_WIDTH 64        // and only if larger tile A size cannot be used.
		#define QUICK_B_TILE_ELEMENTS (QUICK_B_TILE_WIDTH*QUICK_B_TILE_WIDTH)
		#define QUICK_B_VF_ELEMENTS (QUICK_B_TILE_ELEMENTS/2)
		#define QUICK_B_REQ_ELEMENTS (2*VBX_PAD_UP(QUICK_B_TILE_ELEMENTS,SP_WIDTH_B/sizeof(vbx_sp_t)) + VBX_PAD_UP(QUICK_B_VF_ELEMENTS,sizeof(vbx_sp_t)))

		int NUM_LANES = this_mxp->vector_lanes;
		int DMA_BYTES = this_mxp->dma_alignment_bytes;
		int min_tile_dim = DMA_BYTES / sizeof(vbx_sp_t);

		vbx_sp_t *v_out_sel;
		// vf stays 0 unless a merge-transpose tile size is chosen; it doubles as the
		// "merge transpose enabled" flag in the tile loop.
		vbx_sp_t *vf = 0;

		if( NUM_LANES >= QUICK_A_LANES_THRESHOLD // Check for appropriate conditions to use merge transpose tiles
		    && INCOLS >= QUICK_A_TILE_WIDTH
		    && INROWS >= QUICK_A_TILE_WIDTH
		    && (unsigned)max_sp_elements >= QUICK_A_REQ_ELEMENTS ) {
			tile_width = tile_height = QUICK_A_TILE_WIDTH;
			vf = (vbx_sp_t *)vbx_sp_malloc( QUICK_A_VF_ELEMENTS * sizeof(vbx_sp_t));
		} else if( NUM_LANES >= QUICK_B_LANES_THRESHOLD
		    && INCOLS >= QUICK_B_TILE_WIDTH
		    && INROWS >= QUICK_B_TILE_WIDTH
		    && (unsigned)max_sp_elements >= QUICK_B_REQ_ELEMENTS ) {
			tile_width = tile_height = QUICK_B_TILE_WIDTH;
			vf = (vbx_sp_t *)vbx_sp_malloc( QUICK_B_VF_ELEMENTS * sizeof(vbx_sp_t));
		} else {
			findTileSize( &tile_height, &tile_width, INROWS, INCOLS, max_tile_elements, min_tile_dim );
		}

		prev_tile_width = tile_width;

		v_in = (vbx_sp_t*)vbx_sp_malloc( tile_height*tile_width * sizeof(vbx_sp_t) );
		v_out = (vbx_sp_t*)vbx_sp_malloc( tile_height*tile_width * sizeof(vbx_sp_t) );

		// v_in and v_out request the same size and v_out is allocated last, so only
		// v_out is tested here.
		if( v_out==NULL ) {
			vbx_sp_pop();
			return VBW_ERROR_SP_ALLOC_FAILED;
		}

		// Ping-pong pair used by the merge transpose.
		vbx_sp_t *v[2] = { v_in, v_out };

		tile_y = 0; // Reset y position for new col
		while( tile_y < INROWS ) {
			// 2D/3D parameters used by the VMOV-based tile transpose for the current tile shape.
			vbx_set_2D( tile_width, tile_height*sizeof(vbx_sp_t), sizeof(vbx_sp_t), sizeof(vbx_sp_t) );
			vbx_set_3D( tile_height, sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t) );
			tile_x = 0; // Reset x position for new row
			while( tile_x < INCOLS ) {
				// Fetch one tile: tile_height rows of tile_width elements, packed contiguously in v_in.
				vbx_dma_to_vector_2D( v_in, in+(tile_y*INCOLS)+tile_x, tile_width*sizeof(vbx_mm_t),
				                      tile_height, tile_width*sizeof(vbx_sp_t), INCOLS*sizeof(vbx_mm_t) );

				v_out_sel = v_out; // select v_out as default vector to DMA to MM

				/* *** merge transpose (matrix must be square and a power of 2 wide) *** */
				if( vf && tile_width == tile_height
				    && (tile_width==QUICK_A_TILE_WIDTH || tile_width==QUICK_B_TILE_WIDTH) ) {
					int src = 0;
					int n;
					for( n=1; n<tile_width; n *= 2 ) { // can't do 1st iteration until entire tile is DMA'd in
						const int nn = 2*n;

						// copy the destination matrix
						vbx_set_vl( tile_width*tile_width ); // use v_in & v_out as working matrices (clobber v_in)
						vbxx( VMOV, v[!src], v[src]);

						// do the work
						vbx_set_vl( n*tile_width );
						vbxx( VAND, vf, n, (vbx_enum_t*)0 ); // mask for merging: 0101010... then 00110011...
						vbx_set_2D( tile_width/nn, nn*tile_width*sizeof(vbx_sp_t), nn*tile_width*sizeof(vbx_sp_t), 0 );
						vbxx_2D( VCMV_Z, v[!src]+n*tile_width, v[src]+n , vf );
						vbxx_2D( VCMV_Z, v[!src]+n, v[src]+n*tile_width, vf );

						src = !src;
					}

					v_out_sel = v[src]; // depending on the size of the mtx, the final result may be in v_in or v_out
				} else {
					vbx_set_vl( 1 ); // 2D and 3D will be set by the x and y edge conditions, even using merge
					vbxx_3D(VMOV, v_out, v_in );
				}

				// Write the transposed tile: tile_width rows of tile_height elements, INROWS apart in out.
				vbx_dma_to_host_2D( out+(tile_x*INROWS)+tile_y, v_out_sel, tile_height*sizeof(vbx_mm_t),
				                    tile_width, INROWS*sizeof(vbx_mm_t), tile_height*sizeof(vbx_sp_t) );

				tile_x += tile_width; // Set up width for next tile
				if( tile_x + tile_width > INCOLS ) { // Temporarily reduce tile width when reaching right edge of matrix
					tile_width = INCOLS - tile_x;
					vbx_set_2D( tile_width, tile_height*sizeof(vbx_sp_t), sizeof(vbx_sp_t), sizeof(vbx_sp_t) );
					vbx_set_3D( tile_height, sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t) );
				}
			}
			tile_y += tile_height; // Set up width and height for next row of tiles
			tile_width = prev_tile_width; // Restore original tile width for next row of tiles
			/* *** Permanently reduce tile height when reaching bottom of matrix *** */
			tile_height = ( tile_y + tile_height > INROWS ) ? INROWS - tile_y : tile_height;
		}
	}

	vbx_sp_pop();
	vbx_sync();
	return VBW_SUCCESS;
}