vbx_mtx_fdct_t *vbx_mtx_fdct_init( dt *coeff_v, dt *image )
{
    const int BIG_TILE_SIZE = NUM_TILE_X * NUM_TILE_Y * DCT_SIZE;
    const int num_bytes = BIG_TILE_SIZE * sizeof(dt);
    const int co_bytes  = NUM_TILE_X * DCT_SIZE * sizeof(dt);

    // Compute the coefficient matrix in double precision, then truncate to dt
    int i, j;
    double s;
    for( i = 0; i < BLOCK_SIZE; i++ ) {
        s = (i == 0) ? sqrt(0.125) : 0.5;
        for( j = 0; j < BLOCK_SIZE; j++ ) {
            c2[i][j] = s * cos( (double)((PI / 8.0) * i * (j + 0.5)) );
            cs[i][j] = (dt)( c2[i][j] * SHIFT_DOUBLE + 0.499999 );
        }
    }

    vbx_sp_push();

    vbx_mtx_fdct_t *v = vbx_shared_malloc( sizeof(vbx_mtx_fdct_t) );

    v->vcoeff = (vbx_half_t *)vbx_sp_malloc( co_bytes );
    v->vprods = (vbx_half_t *)vbx_sp_malloc( num_bytes );
#if USE_ACCUM_FLAGS
    v->vaccum = (vbx_half_t *)vbx_sp_malloc( num_bytes );
    v->vflags = (vbx_half_t *)vbx_sp_malloc( num_bytes );
#endif
    // Interleave the ordering to ensure no false hazards
    v->vblock[2] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
    v->vimage[0] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
    v->vblock[0] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
    v->vimage[1] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
    v->vblock[1] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
    if( !v->vblock[1] ) {
        VBX_PRINTF( "ERROR: out of memory.\n" );
        VBX_EXIT(-1);
    }

    vbx_dma_to_vector( v->vcoeff, coeff_v, co_bytes );

    int row;
    for( row = 0; row < BLOCK_SIZE; row++ ) {
        getBigTileImageY( v->vimage[v->db], image, row );
    }

#if USE_ACCUM_FLAGS
    // Create a flag vector: first element 0, next 'BLOCK_SIZE-1' elements non-zero, etc.
    vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
    vbx( SEH, VAND, v->vflags, BLOCK_SIZE-1, 0 );
#endif

    return v;
}
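/* A minimal scalar sketch of the coefficient table built above, assuming an 8x8
 * DCT-II (BLOCK_SIZE == 8), a Q-format scale factor SHIFT_DOUBLE, 16-bit cs entries,
 * and that <math.h> is available. The helper name and its parameters are hypothetical;
 * it only illustrates the basis function and fixed-point rounding used by the init code. */
static void fdct_coeff_table_ref( double c2_ref[8][8], short cs_ref[8][8], double shift_double )
{
    const double pi = 3.14159265358979323846;
    int i, j;
    for( i = 0; i < 8; i++ ) {
        double s = (i == 0) ? sqrt(0.125) : 0.5;                  // orthonormal scaling for row 0
        for( j = 0; j < 8; j++ ) {
            c2_ref[i][j] = s * cos( (pi / 8.0) * i * (j + 0.5) );        // DCT-II basis function
            cs_ref[i][j] = (short)( c2_ref[i][j] * shift_double + 0.499999 ); // round to fixed point
        }
    }
}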
/** Internal helper function to reverse and optionally rotate a vector of words *in the scratchpad*.
 * This function uses a merge reverse algorithm that is faster on large vectors.
 *
 * @pre v_src contains the elements to reverse.
 * @pre v_src, v_scratch0, and v_scratch1 must all be the same length.
 * @pre v_scratch1 and v_src must not overlap.
 * @pre v_src *may* overlap v_scratch0 (will clobber v_src).
 * @pre MXP must have 2 lanes or more.
 * @pre N is a multiple of SP_WIDTH_B.
 * @pre NUM_ROWS == N*4 / SP_WIDTH_B.
 * @pre v_mask must be SP_WIDTH_B bytes long.
 * @post v_scratch0 and v_scratch1 contents are modified, with one containing the result.
 * @post v_src is clobbered only if v_src overlaps v_scratch0.
 *
 * @param[in] v_scratch1 *in scratch*.
 * @param[in] v_src *in scratch*.
 * @param[in] N is the number of words to reverse.
 * @param[in] v_scratch0 *in scratch*.
 * @param[in] v_mask *in scratch*.
 * @param[in] SP_WIDTH_B typically the scratchpad width in bytes; it is the length of the data to be worked on at a time.
 * @param[in] NUM_ROWS is the number of rows of length SP_WIDTH_B bytes.
 * @param[in] rot16 TRUE to swap the upper and lower half-words of each word in the result.
 * @returns the scratchpad address where the result resides. This will be equal to either v_scratch0 or v_scratch1,
 *          and will depend on log2(MXP vector lanes).
 */
static vbx_word_t *vec_rev_merge_w( vbx_word_t *v_scratch1, vbx_word_t *v_src, const unsigned int N,
                                    vbx_word_t *v_scratch0, vbx_word_t *v_mask,
                                    const unsigned int SP_WIDTH_B, const unsigned int NUM_ROWS,
                                    const unsigned int rot16 )
{
#if !VBX_SKIP_ALL_CHECKS
    if( !N || !v_scratch0 || !v_src || !v_scratch1 || !v_mask || SP_WIDTH_B < 8 ) {
        VBX_PRINTF("Helper function vec_rev_merge_w: null pointer or row length (vector lanes) too short.");
        VBX_EXIT(-1);
    }
#endif

    vbx_word_t *v_scratch[2] = { v_scratch0, v_scratch1 };
    unsigned int W = SP_WIDTH_B/4/2;   // half the number of words in a row
    unsigned int sel = 1;

    if( rot16 ) {
        vbx_set_vl( W );
        vbx_set_2D( NUM_ROWS, -SP_WIDTH_B, 0, SP_WIDTH_B );
        vbx_2D( SVWU, VROTL, (vbx_uword_t *)(v_scratch[sel]+N-W),     16, (vbx_uword_t *)v_src );
        vbx_2D( SVWU, VROTL, (vbx_uword_t *)(v_scratch[sel]+N-(W*2)), 16, (vbx_uword_t *)(v_src+W) );
    } else {
        vbx_set_vl( W );
        vbx_set_2D( NUM_ROWS, -SP_WIDTH_B, SP_WIDTH_B, 0 );
        vbx_2D( VVW, VMOV, v_scratch[sel]+N-W,     v_src,   0 );
        vbx_2D( VVW, VMOV, v_scratch[sel]+N-(W*2), v_src+W, 0 );
    }

    vbx_set_vl( SP_WIDTH_B/4 );
    vbx_set_2D( NUM_ROWS, SP_WIDTH_B, SP_WIDTH_B, 0 );
    while( W > 1 ) {
        // set up the odd/even mask register
        W /= 2;
        vbx( SEW, VAND, v_mask, W, 0 );
        vbx_2D( VVW, VCMV_NZ, v_scratch[!sel], v_scratch[sel]-W, v_mask );
        vbx_2D( VVW, VCMV_Z , v_scratch[!sel], v_scratch[sel]+W, v_mask );
        sel = !sel;
    }
    return v_scratch[sel];
}
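/* A minimal calling sketch for the helper above, assuming all four buffers have already
 * been allocated in the scratchpad and satisfy the preconditions (same length, N*4 a
 * multiple of the scratchpad row width, v_mask one row long). The function and buffer
 * names here are hypothetical and error handling is omitted. */
static void rev_merge_example( vbx_word_t *v_in, vbx_word_t *v_out_a, vbx_word_t *v_out_b,
                               vbx_word_t *v_mask, unsigned int N )
{
    vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
    const unsigned int SP_WIDTH_B = this_mxp->scratchpad_alignment_bytes; // bytes per scratchpad row
    const unsigned int NUM_ROWS   = (N * 4) / SP_WIDTH_B;                 // per the @pre above

    vbx_word_t *v_result = vec_rev_merge_w( v_out_b, v_in, N, v_out_a, v_mask,
                                            SP_WIDTH_B, NUM_ROWS, 0 /* no rot16 */ );
    vbx_sync();
    // v_result aliases either v_out_a or v_out_b, depending on log2(vector lanes).
    (void)v_result;
}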
int VBX_T(vbw_vec_reverse_test)()
{
    unsigned int aN[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 20, 25, 31, 32, 33, 35, 40, 48,
                          60, 61, 62, 63, 64, 64, 65, 66, 67, 68, 70, 80, 90, 99, 100, 101, 110,
                          128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 224, 224, 256, 256,
                          288, 288, 320, 320, 352, 352, 384, 384, 400, 450, 512, 550, 600, 650, 700,
                          768, 768, 900, 900, 1023, 1024, 1200, 1400, 1600, 1800, 2048, 2048, 2100,
                          2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300,
                          3400, 3500, 3600, 3700, 3800, 3900, 4000, 4096, 4096, 4100, 4200, 4300,
                          4400, 4500, 4600, 4700, 4800, 4900, 5000, 6000, 7000, 8000, 8192, 8192,
                          9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 16384, 16384,
                          20000, 25000, 30000, 32767, 32768, 32768, 35000, 40000, 45000, 50000,
                          55000, 60000, 65000, 65535, 65536, 65536 };
    int retval;
    unsigned int N;
    unsigned int NBYTES;
    unsigned int NREPS = 100;
    unsigned int i,k;
    vbx_timestamp_t start=0, finish=0;

    vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
    const unsigned int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;

    for( i=0; i<sizeof(aN)/4; i++ ) {
        N = aN[i];
        //printf( "testing with vector size %d\n", N );

        NBYTES = sizeof(vbx_sp_t)*N;
        if( 2*NBYTES > VBX_SCRATCHPAD_SIZE ) continue;

        vbx_sp_t *vsrc = vbx_sp_malloc( NBYTES );
        vbx_sp_t *vdst = vbx_sp_malloc( NBYTES );
        //printf("bytes alloc: %d\n", NBYTES );
        if( !vsrc ) VBX_EXIT(-1);
        if( !vdst ) VBX_EXIT(-1);

#if ( VBX_TEMPLATE_T == BYTESIZE_DEF | VBX_TEMPLATE_T == UBYTESIZE_DEF )
        unsigned int mask = 0x007F;
#elif ( VBX_TEMPLATE_T == HALFSIZE_DEF | VBX_TEMPLATE_T == UHALFSIZE_DEF )
        unsigned int mask = 0x7FFF;
#else
        unsigned int mask = 0xFFFF;
#endif

        vbx_set_vl( N );
        vbx( SV(T), VMOV, vdst, -1, 0 );        // Fill the destination vector with -1
        vbx( SE(T), VAND, vsrc, mask, 0 );      // Fill the source vector with enumerated values
        //VBX_T(print_vector)( "vsrcInit", vsrc, N );
        //VBX_T(print_vector)( "vdstInit", vdst, N );

        /** measure performance of function call **/
        vbx_sync();
        start = vbx_timestamp();
        for( k=0; k<NREPS; k++ ) {
            retval = VBX_T(vbw_vec_reverse)( vdst, vsrc, N );
            vbx_sync();
        }
        finish = vbx_timestamp();

        printf( "length %d (%s):\tvbware sp f():\t%llu", N, VBX_EXPAND_AND_QUOTE(BYTEHALFWORD),
                (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) );
        //VBX_T(print_vector)( "vsrcPost", vsrc, N );
        //VBX_T(print_vector)( "vdstPost", vdst, N );

#if VERIFY_VBWARE_ALGORITHM
        VBX_T(verify_vector)( vsrc, vdst, N );
#else
        printf(" [VERIFY OFF]");
#endif
        printf("\treturn value: %X", retval);

        vbx_set_vl( N );
        vbx( SE(T), VAND, vsrc, mask, 0 );      // Reset the source vector

        /** measure performance of simple algorithm **/
        vbx_sync();
        vbx_set_vl( 1 );
        vbx_set_2D( N, -sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
        start = vbx_timestamp();
        for( k=0; k<NREPS; k++ ) {
            vbx_2D( VV(T), VMOV, vdst+N-1, vsrc, 0 );
            vbx_sync();
        }
        finish = vbx_timestamp();

        printf( "\tsimple (vl=1):\t%llu",
                (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) );

#if VERIFY_SIMPLE_ALGORITHM
        VBX_T(verify_vector)( vsrc, vdst, N );
#else
        printf(" [VERIFY OFF]");
#endif
        printf("\tcycles\n");

        vbx_sp_free();
    }

    vbx_sp_free();
    printf("All tests passed successfully.\n");
    return 0;
}
int VBX_T(vbw_vec_reverse_test_mm)()
{
    unsigned int aN[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 20, 25, 31, 32, 33, 35, 40, 48,
                          60, 61, 62, 63, 64, 64, 65, 66, 67, 68, 70, 80, 90, 99, 100, 101, 110,
                          128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 224, 224, 256, 256,
                          288, 288, 320, 320, 352, 352, 384, 384, 400, 450, 512, 550, 600, 650, 700,
                          768, 768, 900, 900, 1023, 1024, 1200, 1400, 1600, 1800, 2048, 2048, 2100,
                          2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300,
                          3400, 3500, 3600, 3700, 3800, 3900, 4000, 4096, 4096, 4100, 4200, 4300,
                          4400, 4500, 4600, 4700, 4800, 4900, 5000, 6000, 7000, 8000, 8192, 8192,
                          9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 16384, 16384,
                          20000, 25000, 30000, 32767, 32768, 32768, 35000, 40000, 45000, 50000,
                          55000, 60000, 65000, 65535, 65536, 65536, 65537, 100000, 128000, 256000,
                          333333, 528374, 528374 };
    int retval;
    unsigned int N;
    unsigned int NBYTES;
    unsigned int NREPS = 100;
    unsigned int NREPSFORLARGE = 10;
    unsigned int i,j,k;
    vbx_timestamp_t start=0, finish=0;

    for( i=0; i<sizeof(aN)/4; i++ ) {
        N = aN[i];
        //printf( "testing with vector size %d\n", N );

        if( N > 10000 ) NREPS = NREPSFORLARGE;

        NBYTES = N*sizeof(vbx_mm_t);
        vbx_mm_t *src = (vbx_mm_t *) vbx_shared_malloc( NBYTES );
        vbx_mm_t *dst = (vbx_mm_t *) vbx_shared_malloc( NBYTES );
        //printf("bytes alloc: %d\n", NBYTES );
        if( !src ) VBX_EXIT(-1);
        if( !dst ) VBX_EXIT(-1);

        for( j=0; j<N; j++ ) {
            dst[j] = -1;   // Fill the destination with -1
            src[j] = j;    // Fill the source with enumerated values
        }

        // VBX_T(vbw_vec_reverse_ext)( dst, src, N );

        /** measure performance of function call **/
        start = vbx_timestamp();
        for( k=0; __builtin_expect(k<NREPS,1); k++ ) {
            retval = VBX_T(vbw_vec_reverse_ext)( dst, src, N );
        }
        finish = vbx_timestamp();

        printf( "length %d (%s):\tvbware mm f():\t%llu", N, VBX_EXPAND_AND_QUOTE(BYTEHALFWORD),
                (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) );

#if VERIFY_VBWARE_ALGORITHM
        VBX_T(verify_vector)( src, dst, N );
#else
        printf(" [VERIFY OFF]");
#endif
        printf("\treturn value: %X", retval);

        /** measure performance of scalar **/
        vbx_mm_t *A = vbx_remap_cached( src, N*sizeof(vbx_mm_t) );  // Use cached pointers for better performance
        vbx_mm_t *B = vbx_remap_cached( dst, N*sizeof(vbx_mm_t) );
        start = vbx_timestamp();
        for( k=0; k<NREPS; k++ ) {
            unsigned int m;
            for( m=0; m<N; m++ ) {
                B[N-1-m] = A[m];
            }
            vbx_dcache_flush( A, N*sizeof(vbx_mm_t) );  // Make sure to read from main memory
            vbx_dcache_flush( B, N*sizeof(vbx_mm_t) );  // Make sure writes are committed to memory
        }
        finish = vbx_timestamp();

        printf( "\tscalar (cache friendly):\t%llu",
                (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) );

#if VERIFY_SIMPLE_ALGORITHM
        VBX_T(verify_vector)( src, dst, N );
#else
        printf(" [VERIFY OFF]");
#endif
        printf("\tcycles\n");

        vbx_shared_free(src);
        vbx_shared_free(dst);
    }

    printf("All tests passed successfully.\n");
    return 0;
}
int vbw_vec_reverse_ext( vbx_mm_t *dst, vbx_mm_t *src, const unsigned int N )
{
    typedef vbx_mm_t vbx_sp_t;
    const int VBW_ROT16 = sizeof(vbx_sp_t) <= sizeof(vbx_half_t);
    const int VBW_ROT8  = sizeof(vbx_sp_t) == sizeof(vbx_byte_t);
    const int VBW_RSHIFT_T_TO_W = ( sizeof(vbx_sp_t) == sizeof(vbx_word_t) ? 0 :
                                    sizeof(vbx_sp_t) == sizeof(vbx_half_t) ? 1 : /*byte sized*/ 2 );
    const int VBW_LSHIFT_W_TO_T = VBW_RSHIFT_T_TO_W;

    // Catch when N is very small
    if( N < 4 ) {
        unsigned int i = 0;
        while( i < N ) {
            dst[N-i-1] = src[i];
            i++;
        }
        return VBW_SUCCESS;
    }

    vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
    unsigned int SP_WIDTH_B = this_mxp->scratchpad_alignment_bytes;
    unsigned int FREE_BYTES = vbx_sp_getfree();

    // Catch when N is small enough that cached scalar does a better job
    if( N <= MM_CACHED_SCALAR_THRESHOLD || FREE_BYTES < SP_WIDTH_B*5 ) {
        unsigned int i;
        vbx_mm_t *A = (vbx_mm_t *)vbx_remap_cached( src, N*sizeof(vbx_mm_t) );
        vbx_mm_t *B = (vbx_mm_t *)vbx_remap_cached( dst, N*sizeof(vbx_mm_t) );
        for( i=0; i<N; i++ ) {
            B[N-i-1] = A[i];
        }
        vbx_dcache_flush( B, N*sizeof(vbx_mm_t) );
        return VBW_SUCCESS;
    }

    unsigned int NUM_LANES = this_mxp->vector_lanes;
    unsigned int tile_size_b = VBX_PAD_DN( ((FREE_BYTES-SP_WIDTH_B)/2), SP_WIDTH_B );
    unsigned int tile_size_w = tile_size_b/4;
    unsigned int tile_size_t = tile_size_w << VBW_LSHIFT_W_TO_T;
    unsigned int num_tiles = N / tile_size_t;
    unsigned int rows_per_tile = tile_size_b / SP_WIDTH_B;
    unsigned int tile_part_t = N - num_tiles*tile_size_t;
    unsigned int threshold_w = NUM_LANES >= 32 ? VL1_THRESHOLD_V32_UP :
                               NUM_LANES == 16 ? VL1_THRESHOLD_V16 :
                               NUM_LANES ==  8 ? VL1_THRESHOLD_V8 : UINT_MAX;

    if( tile_part_t ) {
        vbx_sp_push();
        vbx_sp_t *v_0 = (vbx_sp_t *)vbx_sp_malloc( tile_part_t*sizeof(vbx_sp_t) );
        vbx_sp_t *v_1 = (vbx_sp_t *)vbx_sp_malloc( tile_part_t*sizeof(vbx_sp_t) );
#if !VBX_SKIP_ALL_CHECKS
        if( !v_0 || !v_1 ) {
            VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
            VBX_EXIT(-1);
        }
#endif
        vbx_dma_to_vector( v_0, src+N-tile_part_t, tile_part_t*sizeof(vbx_mm_t) );
        vbw_vec_reverse( v_1, v_0, tile_part_t );
        vbx_dma_to_host( dst, v_1, tile_part_t*sizeof(vbx_sp_t) );
        dst += tile_part_t;
        vbx_sp_pop();
    }

    if( !num_tiles ) {
        return VBW_SUCCESS;
    }

    vbx_sp_push();
    vbx_word_t *v_mask = (vbx_word_t *)vbx_sp_malloc( SP_WIDTH_B );
    vbx_word_t *v_scratch[2] = { (vbx_word_t *)vbx_sp_malloc( tile_size_b ),
                                 (vbx_word_t *)vbx_sp_malloc( tile_size_b ) };
    vbx_word_t *result;
#if !VBX_SKIP_ALL_CHECKS
    if( !v_scratch[0] || !v_scratch[1] || !v_mask ) {
        VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
        VBX_EXIT(-1);
    }
#endif

    src += (num_tiles - 1) * tile_size_t;

    if( tile_size_w <= threshold_w ) {
        while( num_tiles ) {
            vbx_dma_to_vector( v_scratch[0], src, tile_size_b );
            if( VBW_ROT16 ) {
                vec_rev_rot16_w( v_scratch[1], v_scratch[0], tile_size_w );
            } else {
                vec_rev_w( v_scratch[1], v_scratch[0], tile_size_w );
            }
            if( VBW_ROT8 ) {
                vec_rot8_h( v_scratch[1], v_scratch[1], tile_size_w*2 );
            }
            vbx_dma_to_host( dst, v_scratch[1], tile_size_b );
            dst += tile_size_t;
            src -= tile_size_t;
            num_tiles--;
        }
    } else {
        while( num_tiles ) {
            vbx_dma_to_vector( v_scratch[0], src, tile_size_b );
            result = vec_rev_merge_w( v_scratch[1], v_scratch[0], tile_size_w, v_scratch[0],
                                      v_mask, SP_WIDTH_B, rows_per_tile, VBW_ROT16 );
            if( VBW_ROT8 ) {
                vec_rot8_h( result, result, tile_size_w*2 );
            }
            vbx_dma_to_host( dst, result, tile_size_b );
            dst += tile_size_t;
            src -= tile_size_t;
            num_tiles--;
        }
    }

    vbx_sp_pop();
    return VBW_SUCCESS;
}
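/* A minimal usage sketch for the main-memory reverse above, assuming the MXP has been
 * initialized elsewhere. The function and buffer names are illustrative only; on return,
 * dst[k] should equal src[N-1-k]. */
static void reverse_mm_example( unsigned int N )
{
    vbx_mm_t *src = (vbx_mm_t *)vbx_shared_malloc( N * sizeof(vbx_mm_t) );
    vbx_mm_t *dst = (vbx_mm_t *)vbx_shared_malloc( N * sizeof(vbx_mm_t) );
    unsigned int i;
    for( i = 0; i < N; i++ ) src[i] = (vbx_mm_t)i;   // enumerate the input
    vbw_vec_reverse_ext( dst, src, N );              // reverse via scratchpad tiles
    vbx_sync();                                      // wait for outstanding vector/DMA work
    vbx_shared_free( src );
    vbx_shared_free( dst );
}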
int vbw_vec_reverse( vbx_sp_t *v_dst, vbx_sp_t *v_src, const unsigned int N )
{
    const int VBW_ROT16 = sizeof(vbx_sp_t) <= sizeof(vbx_half_t);
    const int VBW_ROT8  = sizeof(vbx_sp_t) == sizeof(vbx_byte_t);
    const int VBW_RSHIFT_T_TO_W = ( sizeof(vbx_sp_t) == sizeof(vbx_word_t) ? 0 :
                                    sizeof(vbx_sp_t) == sizeof(vbx_half_t) ? 1 : /*byte sized*/ 2 );
    const int VBW_LSHIFT_W_TO_T = VBW_RSHIFT_T_TO_W;

    vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
    const unsigned int NUM_LANES = this_mxp->vector_lanes;

    //printf("\n%d\n",VBX_SKIP_ALL_CHECKS);

    // Can the whole vector fit in the scratchpad width?
    if( N < (NUM_LANES << VBW_LSHIFT_W_TO_T) ) {
        vbx_set_vl( 1 );
        vbx_set_2D( N, (int)-sizeof(vbx_sp_t), (int)sizeof(vbx_sp_t), 0 );
        vbxx_2D( VMOV, v_dst+N-1, v_src );
        return VBW_SUCCESS;
    }

    unsigned int threshold_w = ( NUM_LANES >= 32 ? VL1_THRESHOLD_V32_UP :
                                 NUM_LANES == 16 ? VL1_THRESHOLD_V16 :
                                 NUM_LANES ==  8 ? VL1_THRESHOLD_V8 : UINT_MAX );
    unsigned int N_w = N >> VBW_RSHIFT_T_TO_W;   // Equivalent number of words in the vector

    if( N_w && N_w <= threshold_w ) {
        if( VBW_ROT16 ) {
            // remainder of elements that can't add up to a whole word
            unsigned int stub_t = N - (N_w << VBW_LSHIFT_W_TO_T);
            if( stub_t ) {
                vbx_set_vl( 1 );
                vbx_set_2D( stub_t, (int)-sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
                vbxx_2D( VMOV, v_dst+stub_t-1, v_src+N-stub_t );
                v_dst += stub_t;
            }
            vec_rev_rot16_w( v_dst, v_src, N_w );
        } else {
            vec_rev_w( v_dst, v_src, N_w );
        }
        if( VBW_ROT8 ) {
            vec_rot8_h( v_dst, v_dst, N_w*2 );
        }
        return VBW_SUCCESS;
    }

    const unsigned int SP_WIDTH_B = this_mxp->scratchpad_alignment_bytes;
    const unsigned int FREE_BYTES = vbx_sp_getfree();
    const unsigned int ODD_LOG_SEL = NUM_LANES & 0x55555555 ? 1 : 0;

    vbx_word_t *v_mask, *v_result;
    vbx_word_t *v_scratch[2] = { 0, 0 };

    unsigned int num_rows_w = N_w / NUM_LANES;
    unsigned int working_set_w = num_rows_w * NUM_LANES;
    unsigned int tail_t = N - (working_set_w << VBW_LSHIFT_W_TO_T);
    unsigned int remaining_w = working_set_w;

    if( tail_t ) {
        vbx_set_vl( 1 );
        vbx_set_2D( tail_t, (int)-sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
        vbxx_2D( VMOV, v_dst+tail_t-1, v_src+N-tail_t );
        v_dst += tail_t;
    }

    vbx_word_t *v_src_w = (vbx_word_t *)v_src;
    vbx_word_t *v_dst_w = (vbx_word_t *)v_dst;

    if( !num_rows_w ) {
        return VBW_SUCCESS;
    }

    remaining_w = working_set_w;
    while( remaining_w*sizeof(vbx_word_t) + SP_WIDTH_B > FREE_BYTES ) {
        if( remaining_w <= threshold_w*2 ) {
            if( VBW_ROT16 ) {
                vec_rev_rot16_w( v_dst_w, v_src_w, remaining_w );
            } else {
                vec_rev_w( v_dst_w, v_src_w, remaining_w );
            }
            if( VBW_ROT8 ) {
                vec_rot8_h( v_dst_w, v_dst_w, remaining_w*2 );
            }
            return VBW_SUCCESS;
        }
        working_set_w = VBX_PAD_DN( (remaining_w - NUM_LANES)/2, NUM_LANES );
        v_mask = v_dst_w + (working_set_w*2);
        remaining_w -= working_set_w;
        v_scratch[0] = v_dst_w;
        v_scratch[1] = v_dst_w + working_set_w;
        num_rows_w = working_set_w / NUM_LANES;
        v_result = vec_rev_merge_w( v_scratch[ODD_LOG_SEL], v_src_w + remaining_w, working_set_w,
                                    v_scratch[!ODD_LOG_SEL], v_mask, SP_WIDTH_B, num_rows_w, VBW_ROT16 );
#if !VBX_SKIP_ALL_CHECKS
        if( v_result != v_dst_w ) {
            VBX_PRINTF("Unexpected behavior: merge reverse returned the wrong vector. "
                       "Parameter order was chosen based on NUM_LANES.");
            VBX_EXIT(-1);
        }
#endif
        if( VBW_ROT8 ) {
            vec_rot8_h( v_result, v_result, working_set_w*2 );
        }
        v_dst_w += working_set_w;
    }

    vbx_sp_push();
    v_scratch[0] = v_dst_w;
    v_scratch[1] = (vbx_word_t *)vbx_sp_malloc( remaining_w * sizeof(vbx_word_t) );
#if !VBX_SKIP_ALL_CHECKS
    if( !v_scratch[1] ) {
        VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
        VBX_EXIT(-1);
    }
#endif
    v_mask = (vbx_word_t *)vbx_sp_malloc( SP_WIDTH_B );
#if !VBX_SKIP_ALL_CHECKS
    if( !v_mask ) {
        VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
        VBX_EXIT(-1);
    }
#endif
    num_rows_w = remaining_w / NUM_LANES;
    v_result = vec_rev_merge_w( v_scratch[ODD_LOG_SEL], v_src_w, remaining_w,
                                v_scratch[!ODD_LOG_SEL], v_mask, SP_WIDTH_B, num_rows_w, VBW_ROT16 );
#if !VBX_SKIP_ALL_CHECKS
    if( v_result != v_dst_w ) {
        VBX_PRINTF("Unexpected behavior: merge reverse returned the wrong vector. "
                   "Parameter order was chosen based on NUM_LANES.");
        VBX_EXIT(-1);
    }
#endif
    if( VBW_ROT8 ) {
        vec_rot8_h( v_result, v_result, remaining_w*2 );
    }

    vbx_sp_pop();
    return VBW_SUCCESS;
}
/* takes in precomputed bfly */
static int vector_fix_fft_dif_long_fly( short fr[], short fi[], short fr2[], short fi2[],
                                        short tw_r[], short tw_i[],
                                        short m, short inverse, short real )
{
    int i, j, l, k, scale, shift, a1, a2, bfly, mul, flight, swap, row_num;
    short wr, wi;
    vptr_half v_fr, v_fi, v_fr2, v_fi2, v_tmp;
    vptr_half v_twr, v_twi;
    vptr_half v_arp, v_aip, v_brp, v_bip, v_crp, v_cip;
    vptr_half v_temp;
    vptr_half v_twr2, v_twi2;

    const int n = 1 << m;
    const int half = n >> 1;

    scale = 0;
    mul = 0;
    swap = m >> 1;
    l = m - 1;
    flight = 1;
    bfly = half;

    const int INROWS = 1 << swap;
    const int INCOLS = 1 << (m - swap);

    if( !(m % 2) ) {
        swap--;
    }

    // allocate space in vector memory for vectors
    v_fr   = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
    v_fi   = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
    v_fr2  = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
    v_fi2  = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
    v_twr  = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
    v_twi  = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
    v_temp = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
    if( v_fr == NULL || v_fi == NULL || v_fr2 == NULL || v_fi2 == NULL ||
        v_twr == NULL || v_twi == NULL || v_temp == NULL ) {
        VBX_EXIT(-1);
    }

    v_twr2 = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
    v_twi2 = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
    if( v_twr2 == NULL || v_twi2 == NULL ) {
        VBX_EXIT(-1);
    }

    vbx_dma_to_vector( v_fr,  fr,   n*sizeof(vbx_half_t) );
    vbx_dma_to_vector( v_fi,  fi,   n*sizeof(vbx_half_t) );
    vbx_dma_to_vector( v_twr, tw_r, half*sizeof(vbx_half_t) );
    vbx_dma_to_vector( v_twi, tw_i, half*sizeof(vbx_half_t) );

#if 1
    if( real ) {
        vector_fix_fft_untangle_real_scratch( v_fr, v_fi, v_fr2, v_fi2, v_twr, v_twi, m, inverse );
    }
#endif

    while( l > swap ) {
        if( inverse ) {
            // variable scaling, depending upon data
            shift = 0;
            if( isAbsOutOfRangeV( v_fr, v_fi, v_temp, n ) ) {
                shift = 1;
                scale++;
            }
        } else {
            // fixed scaling, for proper normalization
            // -- overall factor of 1/n, distributed to maximize arithmetic accuracy
            shift = 1;
        }

        // shift will be performed on each data point exactly once during pass
        SWAP( v_fr, v_fr2, v_tmp );
        SWAP( v_fi, v_fi2, v_tmp );
        if( shift ) {
            vbx_set_vl( n );
            vbx( SVH, VSHR, v_fr2, 1, v_fr2 );
            vbx( SVH, VSHR, v_fi2, 1, v_fi2 );
        }

        vbx_set_vl( 1<<l );
        vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<(l+1)), sizeof(vbx_half_t)*(1<<(l+1)) );
        vbx_2D( VVH, VADD, v_fr, v_fr2, v_fr2 + (1<<l) );
        vbx_2D( VVH, VADD, v_fi, v_fi2, v_fi2 + (1<<l) );
        vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<(l+1)), sizeof(vbx_half_t)*(1<<(l+1)), sizeof(vbx_half_t)*(1<<(l+1)) );
        vbx_2D( VVH, VSUB, v_fr2, v_fr2, v_fr2 + (1<<l) );
        vbx_2D( VVH, VSUB, v_fi2, v_fi2, v_fi2 + (1<<l) );
        vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<(l+1)), 0 );
        vbx_2D( VVH, VMULFXP, &v_fr[n>>1], v_fr2, v_twr );
        vbx_2D( VVH, VMULFXP, v_temp, v_fi2, v_twi );
        vbx_set_vl( n>>1 );
        // vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l) );
        vbx( VVH, VSUB, &v_fr[n>>1], &v_fr[n>>1], v_temp );
        vbx_set_vl( 1<<l );
        vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<(l+1)), 0 );
        vbx_2D( VVH, VMULFXP, &v_fi[n>>1], v_fi2, v_twr );
        vbx_2D( VVH, VMULFXP, v_temp, v_fr2, v_twi );
        vbx_set_vl( n>>1 );
        //vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l) );
        vbx( VVH, VADD, &v_fi[n>>1], &v_fi[n>>1], v_temp );

        l--;
        mul++;
        flight <<= 1;

        if( l > swap ) {
            vbx_set_vl( 1<<l );
            vbx( VVWH, VMOV, v_twr, v_twr, 0 );
            vbx( VVWH, VMOV, v_twi, v_twi, 0 );
        }
    }

    if( !(m % 2) ) {
        l++;
        flight >>= 1;
    }
// vector version of the image blend
void vector_blend( output_pointer img_out, input_pointer img_in1, input_pointer img_in2,
                   unsigned int num_row, unsigned int num_column, intermediate_type blending_const )
{
    intermediate_type *v_img1[2];
    input_type *v_img2[2];
    intermediate_type *v_temp;
    intermediate_type blending_const_bar = 256 - blending_const;
    int j;

    vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
    const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
    const int VBX_WIDTH_BYTES = this_mxp->vector_lanes * sizeof(int);
    const int VBX_DMA_ALIGNMENT = this_mxp->dma_alignment_bytes;

    unsigned int chunk_size = VBX_SCRATCHPAD_SIZE / ((3*sizeof(intermediate_type)) + (2*sizeof(input_type)));
    chunk_size = VBX_PAD_UP( chunk_size-(VBX_WIDTH_BYTES-1), VBX_DMA_ALIGNMENT );
    unsigned int chunk_size_old = chunk_size;
    unsigned int vector_length = chunk_size;
    unsigned int vector_length_old = vector_length;

    v_img1[0] = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) );
    v_img1[1] = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) );
    v_img2[0] = (input_type *)vbx_sp_malloc( chunk_size*sizeof(input_type) );
    v_img2[1] = (input_type *)vbx_sp_malloc( chunk_size*sizeof(input_type) );
    v_temp = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) );
    if( v_temp == NULL ) {
        VBX_EXIT(0xBADDEAD);
    }

    int bufselect = 0;
    vbx_dma_to_vector( v_img1[bufselect], img_in1, chunk_size*sizeof(input_type) );
    vbx_dma_to_vector( v_img2[bufselect], img_in2, chunk_size*sizeof(input_type) );

    for( j=0; j<num_row*num_column; j+=vector_length_old ) {
        vbx_set_vl( vector_length );
        if( j > 0 ) {
            vbx_dma_to_host( img_out+j-vector_length_old, v_img1[1-bufselect], chunk_size_old*sizeof(output_type) );
        }
        if( (j+vector_length_old) < (num_row*num_column-1) ) {
            if( (j+vector_length_old*2) >= num_row*num_column ) {
                vector_length = num_row*num_column - j - vector_length_old;
                chunk_size = vector_length;
            }
            vbx_dma_to_vector( v_img1[1-bufselect], img_in1+j+vector_length_old, chunk_size*sizeof(input_type) );
            vbx_dma_to_vector( v_img2[1-bufselect], img_in2+j+vector_length_old, chunk_size*sizeof(input_type) );
        }
        vbx( SVBHU, VMULLO, v_temp, blending_const, v_img1[bufselect] );
        vbx( SVBHU, VMULLO, v_img1[bufselect], blending_const_bar, v_img2[bufselect] );
        vbx( VVHU, VADD, v_img1[bufselect], v_img1[bufselect], v_temp );
        vbx( SVHBU, VSHR, v_img1[bufselect], 8, v_img1[bufselect] );
        bufselect = 1-bufselect;
    }

    vbx_dma_to_host( img_out+j-vector_length_old, v_img1[1-bufselect], chunk_size*sizeof(output_type) );
    vbx_sp_free();
    vbx_sync();
}
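/* A minimal scalar reference for the blend computed by vector_blend above, assuming 8-bit
 * input channels and a blending constant in [0,256]: out = (c*in1 + (256-c)*in2) >> 8.
 * The helper name is hypothetical; it mirrors the multiply/multiply/add/shift sequence of
 * the vector loop and can be used to verify its output. */
static void scalar_blend_ref( output_pointer out, input_pointer in1, input_pointer in2,
                              unsigned int num_pixels, intermediate_type blending_const )
{
    intermediate_type blending_const_bar = 256 - blending_const;
    unsigned int i;
    for( i = 0; i < num_pixels; i++ ) {
        out[i] = (output_type)(( blending_const * in1[i]
                               + blending_const_bar * in2[i] ) >> 8);
    }
}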