Example #1
/** VBX Motion Estimation, using vbx_3d ops.
 * vbw_mtx_motest_3D_byte_setup should be run prior to running this function.
 * Uses bytes as input data. block_height must be an even number.
 *
 * @param[out] result
 * @param[in] x
 * @param[in] y
 * @param[in] m
 * @returns negative on error condition. See vbw_exit_codes.h
 */
int vbw_mtx_motest_3d_byte(output_type *result, input_type* x, input_type *y, vbw_motest_t *m)
{

	int  l,j;
	int sub_block_width      = m->block_width+m->search_width;

	for( j = 0; j < m->block_height; j++ ) {
		vbx_dma_to_vector( m->v_block+j*m->block_width, x+j*m->image_width, m->block_width*sizeof(input_type) );
	}
	for( j = 0; j < m->block_height+m->search_height; j++ ) {
		vbx_dma_to_vector( m->v_img+j*sub_block_width, y+j*m->image_width, sub_block_width*sizeof(input_type) );
	}

	vbx_set_3D( m->search_width, m->block_height*sizeof(intermediate_type), sizeof(input_type), 0 );

	for( l = 0; l < m->search_height; l++ ) {
		//Accumulate each row into a vbx of row SADs
		vbx_set_vl( m->block_width );
		vbx_set_2D( m->block_height, sizeof(intermediate_type), sub_block_width*sizeof(input_type), m->block_width*sizeof(input_type) );
		vbx_acc_3D( VVBHU, VABSDIFF, m->v_row_sad, m->v_img+l*sub_block_width, m->v_block );

		//Accumulate the SADs
		vbx_set_vl( m->block_height/2 );
		vbx_set_2D( m->search_width, sizeof(output_type), m->block_height*sizeof(intermediate_type), m->block_height*sizeof(intermediate_type) );
		vbx_acc_2D( VVHWU, VADD, (vbx_uword_t*)m->v_result+l*m->search_width, m->v_row_sad, m->v_row_sad+(m->block_height/2) );

		//Transfer the line to host
		vbx_dma_to_host( result+l*m->search_width, m->v_result+l*m->search_width, m->search_width*sizeof(output_type) );

	}

	return VBW_SUCCESS;
}
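A minimal calling sketch (not from the original listing; the setup call's exact signature is an assumption based on the header comment):

/* Hypothetical driver: the buffers and the setup signature are assumptions. */
vbw_motest_t m;
vbw_mtx_motest_3D_byte_setup(&m);   /* must run first, per the comment above */
if (vbw_mtx_motest_3d_byte(result, x, y, &m) < 0) {
	/* error: see vbw_exit_codes.h */
}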
Example #2
/// Apply [1 2 1] low-pass filter to raw input row
/// NB: Last two output pixels are not meaningful
inline static void vbw_sobel_3x3_row(vbx_uhalf_t *lpf, vbx_uhalf_t *raw, const short image_width)
{
	vbx_set_vl(image_width-1);
	vbx(VVHU, VADD, lpf, raw, raw+1);
	vbx_set_vl(image_width-2);
	vbx(VVHU, VADD, lpf, lpf, lpf+1);
}
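The two vector adds compose into the [1 2 1] kernel; a scalar model of what they compute (illustration only, not part of the original):

/* pass 1: lpf[i] = raw[i] + raw[i+1]
 * pass 2: lpf[i] = lpf[i] + lpf[i+1] = raw[i] + 2*raw[i+1] + raw[i+2]
 * The second pass reads one element past the first pass's output, which is
 * why the last two output pixels are not meaningful. */
for (int i = 0; i < image_width - 2; i++)
    lpf[i] = raw[i] + 2*raw[i+1] + raw[i+2];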
Example #3
int compare_vbx_lbp_ci_to_scalar_patterns(unsigned short* img, int width, int height, int max_print_errors)
{
    int j, errors = 0;
    unsigned char** scalar_patterns = test_scalar_patterns(img, 0, width, height);

    vbx_ubyte_t* v_in = (vbx_ubyte_t*)vbx_sp_malloc(3*width*sizeof(vbx_word_t));
    vbx_ubyte_t* v_top = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t));
    vbx_ubyte_t* v_bot = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t));
    vbx_ubyte_t* v_lbp = v_bot;

    unsigned char* lbp = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));

    vbx_set_vl(width);
    for(j=0; j < height - 2; j++){
        vbx_dma_to_vector(v_in, img+j*width, 3*width*sizeof(unsigned char));
        vbx(VVHU, VCUSTOM1, v_top, v_in, v_in+width); 
        vbx(VVHU, VCUSTOM1, v_bot, v_in+width, v_in+2*width); 
        vbx(SVHBU, VAND, v_top, 0xf0, v_top);
        vbx(SVHBU, VAND, v_bot, 0x0f, v_bot);
        vbx(VVBU, VADD, v_lbp, v_bot, v_top); 
        vbx_dma_to_host(lbp, v_lbp, width*sizeof(unsigned char));
        vbx_sync();

        errors += match_array_byte(lbp, scalar_patterns[0]+j*width, "custom_lbp", width-2, 1, max_print_errors, 1, j);

    }
    vbx_sp_free();
    vbx_shared_free(lbp);
    return errors;
}
Example #4
int main(void)
{
	vbx_test_init();
	vbx_mxp_print_params();
	int errors=0;
	unsigned instr_cycles, instr_count, dma_cycles, dma_count;
	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	int lanes = this_mxp->vector_lanes;
	int dma_width = this_mxp->dma_alignment_bytes/4;
	debug(lanes);
	debug(dma_width);
	vbx_set_vl(-1);
	VBX_COUNTER_RESET();
	vbx(SVW,VMOV,0,0,0);
	vbx_sync();
	if(VBX_SIMULATOR)
		printf("simulator\n");
	else
		printf("not simulator\n");
	instr_cycles=VBX_GET_WRITEBACK_CYCLES();
	dma_cycles=VBX_GET_DMA_CYCLES();
	dma_count=VBX_GET_DMAS();
	instr_count=VBX_GET_INSTRUCTIONS();


	debug(instr_cycles);
	debug(dma_cycles);
	debug(dma_count);
	debug(instr_count );

	VBX_TEST_END(errors);
	return 0;
}
Example #5
void vector_rectangle_to_luma(
	pixel *input_buffer,
	vbx_uhalf_t *v_luma_buffer, vbx_uhalf_t *v_row_temp, vbx_uword_t *v_row,
	int startx, int starty, int width, int height, const int image_pitch )
{
	int y;
	vbx_uhalf_t *v_luma;

	vbx_set_vl(width);

	for(y = 0; y < height; y++){
		v_luma = v_luma_buffer+(y*width);
		vbx_dma_to_vector(v_row, input_buffer+((y+starty)*image_pitch)+startx, width*sizeof(vbx_uword_t));

		//Move the b component into v_luma
		vbx(SVWHU, VAND, v_luma, 0xFF, v_row);
		vbx(SVHU,  VMUL, v_luma, 25,   v_luma);

		//Move g into v_row_temp and add it to v_luma
		vbx(SVWHU, VAND, v_row_temp, 0xFF,  (vbx_uword_t*)(((vbx_ubyte_t*)v_row)+1));
		vbx(SVHU,  VMUL, v_row_temp, 129,    v_row_temp);
		vbx(VVHU,  VADD, v_luma,     v_luma, v_row_temp);

		//Move r into v_row_temp and add it to v_luma
		vbx(SVWHU, VAND, v_row_temp, 0xFF,  (vbx_uword_t*)(((vbx_ubyte_t*)v_row)+2));
		vbx(SVHU,  VMUL, v_row_temp, 66,     v_row_temp);
		vbx(VVHU,  VADD, v_luma,     v_luma, v_row_temp);

		//divide by 2^8
		vbx(SVHU,  VSHR, v_luma, 8,  v_luma);
	}
}
Example #6
int vbw_mtx_xp(vbx_sp_t *v_dst, vbx_sp_t *v_src, const int INROWS, const int INCOLS )
{
	vbx_set_vl( 1 );
	vbx_set_2D(  INCOLS, INROWS*sizeof(vbx_sp_t),       sizeof(vbx_sp_t), 0 );
	vbx_set_3D( INROWS,        sizeof(vbx_sp_t), INCOLS*sizeof(vbx_sp_t), 0 );
	vbxx_3D( VMOV, v_dst, v_src);
	return VBW_SUCCESS;
}
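With VL = 1, every 2D iteration moves a single element; a scalar model of the transpose the strides implement (illustration only):

/* The 2D loop steps the destination down one column while the source steps
 * along one row; the 3D loop advances to the next row/column pair.
 * Net effect: v_dst = transpose(v_src). */
for (int r = 0; r < INROWS; r++)
    for (int c = 0; c < INCOLS; c++)
        v_dst[c*INROWS + r] = v_src[r*INCOLS + c];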
Example #7
int compare_vbx_lut_to_vbx_lut_ci(int sz, int max_print_errors)
{
    int f, n, errors = 0;

    vbx_byte_t* v_pass = (vbx_byte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_pattern = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_lutc = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_group = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_sel = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_lut = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t));
    vbx_ubyte_t* v_idx = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t));

    unsigned char* lut = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char));
    unsigned char* lut_c = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char));

    for (n = 0; n < sz; n++) {
        v_pattern[n] = n & 0xff;
    }

    int s, stage = 11;
    for (f = 0; f < face_lbp[stage].count; f++) {
        lbp_feat_t feat = face_lbp[stage].feats[f];

        vbx_set_vl(sz);
        int total = f;
        s = 0;
        while(s < stage){
            total += face_lbp[s].count;
            s++;
        }
        vbx(SVBU, VCUSTOM0, v_lutc, total, v_pattern);

        vbx(SVB, VMOV, v_pass, feat.fail, 0);
        /* check if pattern is in lut */
        vbx(SVBU, VSHR, v_group, 5, v_pattern);
        for (n = 0; n < 8; n++) {
            vbx(SVB, VADD, v_sel, -n, v_group);
            vbx(SVBW, VCMV_Z, v_lut, feat.lut[n], v_sel);
        }

        vbx(SVBWU, VAND, v_idx, 0x1f, v_pattern);
        vbx(VVWB, VSHR, v_lut, v_idx, v_lut);
        vbx(SVB, VAND, v_lut, 1, v_lut);
        vbx(SVB, VCMV_LEZ, v_pass, feat.pass, v_lut);

        vbx_dma_to_host(lut_c, v_lutc, sz*sizeof(unsigned char));
        vbx_dma_to_host(lut, v_pass, sz*sizeof(unsigned char));
        vbx_sync();

        errors += match_array_byte(lut_c, lut, "custom_lut", sz, 1, max_print_errors, 0, 0);

    }
    vbx_sp_free();
    vbx_shared_free(lut);
    vbx_shared_free(lut_c);
    return errors;
}
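A scalar model of the masked-select sequence in the loop above (illustration only; it mirrors the vector ops, not a library API):

/* The top 3 bits of the 8-bit pattern select one of the feature's eight
 * 32-bit lut words (the VCMV_Z loop), and the low 5 bits select a bit inside
 * it; a clear bit selects feat.pass via VCMV_LEZ, a set bit keeps feat.fail. */
static int lbp_feat_scalar(lbp_feat_t feat, unsigned char pattern)
{
    unsigned word = feat.lut[pattern >> 5];
    return ((word >> (pattern & 0x1f)) & 1) ? feat.fail : feat.pass;
}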
Example #8
File: test.c Project: 8l/mxp
int deep_vector_copy_test()
{
	int retval;
	int num_test;
	int total_errors = 0;
	const int NUM_TESTS = TEST_DEEP_SP_NUM_TESTS;
	const int NB = vbx_sp_getfree();

	int NT = NB / sizeof(vbx_sp_t);

	vbx_sp_push();
	vbx_sp_t *v = vbx_sp_malloc( NB );

	srand( 0x1a84c92a );

	for( num_test=0; num_test < NUM_TESTS ; num_test++ ) {

		// initialize entire available scratchpad
		vbx_set_vl( NT );
		vbx( SE(T), VAND, v, MSK, 0 );

		// choose random src/dest/length:
		// -- randomly pick the dest
		// -- set a window size of 2*K around the dest
		// -- randomly pick the src within the window
		// -- randomly pick the length, subject to end-of-scratchpad
		// -- this 'window' rule increases probability of overlaps
		// -- rough distribution: 30% short (pipeline) overlaps, 20% long overlaps, 50% no overlap

		int K, N1, N2, NN;
		N1 = rand() % NT;
		K  = 1 + rand() % ((N1 > 0)? min(min(N1, NT-N1), 1024): min(NT, 1024));
		N2 = N1 - K + rand() % (2*K);
		NN = rand() % (NT - max(N1,N2));
		vbx_sp_t *dst = v + N1;
		vbx_sp_t *src = v + N2;

		printf("test:%d src:0x%08x dst:0x%08x len:%08d", num_test, N1, N2, NN );

		// do the copy
		retval = VBX_T(vbw_vec_copy)( dst, src, NN );
		vbx_sync();
		printf(" retval:0x%04x\n",retval);

		// ensure the copy was done properly
		int errors = verify_copy((vbx_mm_t *)v,     0,    N1,       0, "head")
		           + verify_copy((vbx_mm_t *)v,    N1, NN+N1, (N2-N1), "copy")
		           + verify_copy((vbx_mm_t *)v, NN+N1,    NT,       0, "tail");
		total_errors += errors;
		if( errors ) {
			//break;
		}
	}

	vbx_sp_pop();
	return total_errors;
}
Example #9
vbx_mtx_fdct_t *
vbx_mtx_fdct_init( dt *coeff_v, dt *image )
{
	const int BIG_TILE_SIZE = NUM_TILE_X * NUM_TILE_Y * DCT_SIZE;
	const int num_bytes = BIG_TILE_SIZE * sizeof(dt);
	const int co_bytes = NUM_TILE_X* DCT_SIZE *sizeof(dt);

	//compute coeffs matrix in double and truncated to dt
	int i, j;
	double s;
	for (i = 0; i < BLOCK_SIZE; i++) {
		s = (i == 0) ? sqrt(0.125) : 0.5;
		for (j = 0; j < BLOCK_SIZE; j++) {
			c2[i][j] = s * cos((double) ((PI / 8.0) * i * (j + 0.5)));
			cs[i][j] = (dt) (c2[i][j] * SHIFT_DOUBLE + 0.499999);
		}
	}

	vbx_sp_push();

	vbx_mtx_fdct_t *v = vbx_shared_malloc( sizeof(vbx_mtx_fdct_t) );

	v->vcoeff    = (vbx_half_t *)vbx_sp_malloc( co_bytes );
	v->vprods    = (vbx_half_t *)vbx_sp_malloc( num_bytes );
#if USE_ACCUM_FLAGS
	v->vaccum    = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vflags    = (vbx_half_t *)vbx_sp_malloc( num_bytes );
#endif

	// interleave ordering to ensure no false hazards
	v->vblock[2] = (vbx_half_t *)vbx_sp_malloc( num_bytes );

	v->vimage[0] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vblock[0] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vimage[1] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vblock[1] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	if( !v->vblock[1] ) {
		VBX_PRINTF( "ERROR: out of memory.\n" );
		VBX_EXIT(-1);
	}
	vbx_dma_to_vector( v->vcoeff, coeff_v, co_bytes );

	int row;
	for( row=0; row < BLOCK_SIZE; row++ ) {
		getBigTileImageY(v->vimage[v->db],image,row);
	}
#if USE_ACCUM_FLAGS 
	// create a flag vector first element 0, next 'BLOCK_SIZE-1' element non-zero, etc
	vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
	vbx( SEH, VAND,   v->vflags,       BLOCK_SIZE-1,      0 );
#endif

	return v;
}
Example #10
/** Internal helper function to reverse and optionally rotate a vector of words *in the scratchpad*.
 *  This function uses a merge reverse algorithm that is faster on large vectors.
 *  @pre v_src contains the elements to reverse.
 *  @pre v_src, v_scratch0, and v_scratch1 must all be the same length.
 *  @pre v_scratch1 and v_src must not overlap.
 *  @pre v_src *may* overlap v_scratch0 (will clobber v_src).
 *  @pre MXP must be 2 lanes or more.
 *  @pre N is a multiple of SP_WIDTH_B.
 *  @pre NUM_ROWS == N*4 / SP_WIDTH_B.
 *  @pre v_mask must be SP_WIDTH_B bytes long.
 *  @post v_scratch0 and v_scratch1 contents are modified, with one containing the result.
 *  @post v_src clobbered only if v_src overlaps v_scratch0.
 *
 *  @param[in]  v_scratch1 *in scratch*.
 *  @param[in]  v_src *in scratch*.
 *  @param[in]  N is the number of words to reverse.
 *  @param[in]  v_scratch0 *in scratch*.
 *  @param[in]  v_mask *in scratch*.
 *  @param[in]  SP_WIDTH_B typically the scratchpad width in bytes; the length of the data to be worked on at a time.
 *  @param[in]  NUM_ROWS is the number of rows of length SP_WIDTH_B bytes.
 *  @param[in]  rot16 TRUE to swap upper and lower half-words of each word in result.
 *  @returns    the scratchpad address where the result resides. This will be equal to either v_scratch0 or v_scratch1,
 *              and will depend on log2(MXP vector lanes).
 */
static vbx_word_t *vec_rev_merge_w( vbx_word_t *v_scratch1, vbx_word_t *v_src, const unsigned int N, vbx_word_t *v_scratch0,
                                    vbx_word_t *v_mask, const unsigned int SP_WIDTH_B, const unsigned int NUM_ROWS, const unsigned int rot16 )
{
#if !VBX_SKIP_ALL_CHECKS
	if( !N || !v_scratch0 || !v_src || !v_scratch1 || !v_mask || SP_WIDTH_B < 8) {
		VBX_PRINTF("Helper function vec_rev_merge_w: null pointer or row length (vector lanes) too short.");
		VBX_EXIT(-1);
	}
#endif

	vbx_word_t *v_scratch[2] = { v_scratch0, v_scratch1 };
	unsigned int W = SP_WIDTH_B/4/2;                                         // half the number of words in a row
	unsigned int sel = 1;

	if( rot16 ) {
		vbx_set_vl( W );
		vbx_set_2D( NUM_ROWS, -SP_WIDTH_B, 0, SP_WIDTH_B );
		vbx_2D( SVWU, VROTL, (vbx_uword_t *)(v_scratch[sel]+N-W), 16, (vbx_uword_t *)v_src );
		vbx_2D( SVWU, VROTL, (vbx_uword_t *)(v_scratch[sel]+N-(W*2)), 16, (vbx_uword_t *)(v_src+W) );
	} else {
		vbx_set_vl( W );
		vbx_set_2D( NUM_ROWS, -SP_WIDTH_B, SP_WIDTH_B, 0 );
		vbx_2D( VVW, VMOV, v_scratch[sel]+N-W, v_src, 0 );
		vbx_2D( VVW, VMOV, v_scratch[sel]+N-(W*2), v_src+W, 0 );
	}

	vbx_set_vl( SP_WIDTH_B/4 );
	vbx_set_2D( NUM_ROWS, SP_WIDTH_B, SP_WIDTH_B, 0 );

	while( W > 1 ) {
		// set up odd/even mask register
		W /= 2;
		vbx( SEW, VAND, v_mask, W, 0 );
		vbx_2D( VVW, VCMV_NZ, v_scratch[!sel], v_scratch[sel]-W, v_mask );
		vbx_2D( VVW, VCMV_Z , v_scratch[!sel], v_scratch[sel]+W, v_mask );
		sel = !sel;
	}

	return v_scratch[sel];
}
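A scalar sketch of one pass of the while loop above (in, out, and row_len are illustrative names, not the function's variables):

/* After vbx(SEW, VAND, v_mask, W, 0) the mask holds (index & W). Elements
 * whose index has bit W set take the value W slots to their left (VCMV_NZ),
 * the rest take the value W slots to their right (VCMV_Z), swapping adjacent
 * W-wide groups; halving W each pass completes the reversal within a row. */
for (unsigned i = 0; i < row_len; i++)
    out[i] = (i & W) ? in[i - W] : in[i + W];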
Example #11
/** VBX Motion Estimation.
 *  Similar to the scalar version, but scans vertically, which makes vector alignment easier.
 *  vbw_mtx_motest_byte_setup should be run prior to running this function.
 *
 *  @param[out] result
 *  @param[in] x
 *  @param[in] y
 *  @param[in] m
 *  @returns negative on error condition. See vbw_exit_codes.h
 */
int vbw_mtx_motest_byte(output_type *result, input_type *x, input_type *y, vbw_motest_t *m)
{
	int  j;

	int sub_block_width      = m->block_width+m->search_width;

	for( j = 0; j < m->block_height; j++ ) {
		vbx_dma_to_vector( m->v_block+j*sub_block_width, x+j*m->image_width, sub_block_width*sizeof(input_type) );
	}

	for( j = 0; j < m->block_height+m->search_height; j++ ) {
		vbx_dma_to_vector( m->v_img  +j*sub_block_width, y+j*m->image_width, sub_block_width*sizeof(input_type) );
	}

	// column-ize the reference block
	vbx_set_vl( m->block_width );
	vbx_set_2D( m->block_height, m->block_width*sizeof(input_type), sub_block_width*sizeof(input_type), 0 );
	vbx_2D( VVB, VMOV, (vbx_byte_t*)m->v_block, (vbx_byte_t*)m->v_block, 0 );

	//Do column by column

	for( j=0; j < m->search_width; j++ )
	{
		// column-ize the search image
		vbx_set_vl( m->block_width );
		vbx_set_2D( m->block_height+m->search_height,  m->block_width*sizeof(input_type), sub_block_width*sizeof(input_type), 0 );
		vbx_2D( VVBU, VMOV, m->v_img_sub, m->v_img+j, 0 );

		// search the image columnwise
		vbx_set_vl( m->block_width*m->block_height );
		vbx_set_2D( m->search_height, m->search_width*sizeof(output_type), 0,  m->block_width*sizeof(input_type) );
		vbx_acc_2D( VVBWU, VABSDIFF, (vbx_uword_t*)m->v_result+j, m->v_block, m->v_img_sub );
	}

	// Write back result
	vbx_dma_to_host( result, m->v_result, m->result_size );

	return VBW_SUCCESS;
}
Example #12
int vbw_sobel_argb32_3x3(unsigned *output, unsigned *input, const short image_width, const short image_height, const short image_pitch, const short renorm)
{
	size_t free_sp=vbx_sp_getfree();
	size_t vectors_needed=8;
	size_t partial_width=free_sp/(vectors_needed*sizeof(vbx_uword_t));
	if(partial_width>image_width){
		vbw_sobel_argb32_3x3_partial(output, input, image_width, image_height, image_pitch,renorm);
	}else{
		//can't do an entire row at a time, so do partial_width columns at a time
		size_t partial_step=partial_width-2;
		int i;
		for(i=0;;i+=partial_step){
			//account for last tile being smaller
			if(i+partial_width > image_width){
				partial_width=image_width-i;
			}

			vbw_sobel_argb32_3x3_partial(output+i, input+i, partial_width, image_height, image_pitch,renorm);

			if(i+partial_width == image_width){
				//that was the last tile, so break;
				//this exit test can't live in the for condition because the last tile may be narrower
				break;
			}
		}
	}
	vbx_sp_push();
	vbx_word_t* side=vbx_sp_malloc(sizeof(vbx_word_t));
	vbx_set_vl(1);
	vbx(SVW,VMOV,side,0,0);
	vbx_dma_to_host_2D(output,/*host_ptr*/
	                   side,/*sp_ptr*/
	                   sizeof(vbx_word_t),/*row len*/
	                   image_height,/*num rows*/
	                   image_pitch*sizeof(vbx_word_t),/*host_incr*/
	                   0);/*sp incr*/
	vbx_dma_to_host_2D(output+image_width-1,/*host_ptr*/
	                   side,/*sp_ptr*/
	                   sizeof(vbx_word_t),/*row len*/
	                   image_height,/*num rows*/
	                   image_pitch*sizeof(vbx_word_t),/*host_incr*/
	                   0);/*sp incr*/
	vbx_sp_pop();
	vbx_sync();

	return VBW_SUCCESS;
}
Example #13
/// Convert a row of aRGB pixels into luma values
/// v_luma should not equal v_row_in
/// Trashes v_temp
static void vbw_rgb2luma(vbx_uhalf_t *v_luma, vbx_uword_t *v_row_in, vbx_uhalf_t *v_temp, const int image_width)
{
	vbx_set_vl(image_width);

	// Move weighted B into v_luma
	vbx(SVWHU, VAND, v_temp, 0xFF,   v_row_in);
	vbx(SVHU,  VMUL, v_luma, 25,     v_temp);

	// Move weighted G into v_temp and add it to v_luma
	vbx(SVWHU, VAND, v_temp, 0xFF,   (vbx_uword_t*)(((vbx_ubyte_t *)v_row_in)+1));
	vbx(SVHU,  VMUL, v_temp, 129,    v_temp);
	vbx(VVHU,  VADD, v_luma, v_luma, v_temp);

	// Move weighted R into v_temp and add it to v_luma
	vbx(SVWHU, VAND, v_temp, 0xFF,   (vbx_uword_t*)(((vbx_ubyte_t *)v_row_in)+2));
	vbx(SVHU,  VMUL, v_temp, 66,     v_temp);
	vbx(VVHU,  VADD, v_luma, v_luma, v_temp);

	vbx(SVHU,  VADD, v_luma, 128,    v_luma); // for rounding
	vbx(SVHU,  VSHR, v_luma, 8,      v_luma);
}
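The weights above are the usual integer approximation of BT.601 luma; a scalar equivalent of the whole routine for one pixel (illustration only):

/* Y = (66*R + 129*G + 25*B + 128) >> 8; the +128 rounds to nearest. */
static unsigned char rgb2luma_scalar(unsigned char r, unsigned char g, unsigned char b)
{
    return (unsigned char)((66*r + 129*g + 25*b + 128) >> 8);
}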
Example #14
static int isAbsOutOfRangeV( vptr_half v_src_r, vptr_half v_src_i, vptr_half v_temp, int n )
{

	//used for inverse only
	vbx_set_vl(n);
	vbx(SVH, VABSDIFF, v_temp, 0, v_src_r );    // get abs value of real
	vbx(SVH, VSUB, v_temp, 16383, v_temp );     // if (16383 - v_src) < 0, needs scaling
	vbx_acc(SVH, VCMV_LTZ, v_temp, 1, v_temp ); // accum # of neg values to see if scaling required
	vbx_sync();
	if( v_temp[0] ){
		return 1;
	}

	vbx(SVH, VABSDIFF, v_temp, 0, v_src_i );    // get abs value of imag
	vbx(SVH, VSUB, v_temp, 16383, v_temp );     // if (16383 - v_src) < 0, needs scaling
	vbx_acc(SVH, VCMV_LTZ, v_temp, 1, v_temp ); // accum # of neg values to see if scaling required
	vbx_sync();
	if( v_temp[0] ){
		return 1;
	}

	return 0;
}
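A scalar model of each vector pass above (illustration only):

/* Flag any element whose magnitude exceeds 16383 (2^14 - 1); per the inline
 * comments, the caller then scales before the inverse transform. The vector
 * version accumulates a count instead of breaking early. */
int needs_scaling = 0;
for (int i = 0; i < n; i++)
	if (abs(v_src_r[i]) > 16383) { needs_scaling = 1; break; }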
Example #15
File: test.c Project: 8l/mxp
int compare_vbx_lbp_ci_to_scalar_patterns(unsigned short* img, int log, int width, int height, int max_print_errors)
{
    int j, l, cell, max_cell, errors = 0;
    unsigned char** scalar_patterns = test_scalar_patterns(img, log, width, height);

    max_cell = 1<<log;
    vbx_uhalf_t* v_in = (vbx_uhalf_t*)vbx_sp_malloc((1+2*max_cell)*width*sizeof(vbx_half_t));
    vbx_uhalf_t* v_top = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_half_t));
    vbx_uhalf_t* v_bot = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_half_t));
    vbx_ubyte_t* v_lbp = (vbx_ubyte_t*)v_bot;

    unsigned char* lbp = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));

    vbx_set_vl(width);
    for(l = 0; l < 1; l++){
        cell = 1<<l;
        for(j=0; j < height - 2*cell; j++){
            vbx_dma_to_vector(v_in, img+j*width, (1+2*cell)*width*sizeof(unsigned short));
            vbx(VVHU, VCUSTOM1, v_top, v_in, v_in+(1*cell)*width); 
            vbx(VVHU, VCUSTOM1, v_bot, v_in+(1*cell)*width, v_in+(2*cell)*width); 
            vbx(SVHBU, VAND, (vbx_ubyte_t*)v_top, 0xf0, v_top);
            vbx(SVHBU, VAND, (vbx_ubyte_t*)v_bot, 0x0f, v_bot);
            vbx(VVBU, VADD, v_lbp, v_bot, v_top); 
            vbx_dma_to_host(lbp, v_lbp, width*sizeof(unsigned char));
            vbx_sync();

            errors += match_array_byte(lbp, scalar_patterns[l]+j*width, "custom_lbp", width-2*cell, 1, 0, max_print_errors, 1, j);
            if (errors > max_print_errors){
                max_print_errors = 0;
            }

        }
    }
    vbx_sp_free();
    vbx_shared_free(lbp);
    return errors;
}
Example #16
File: test.c Project: 8l/mxp
int test_lbp_ci(unsigned short* img, int width, int height)
{

    vbx_uhalf_t* v_a1  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_b1  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_1h = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));

    vbx_uhalf_t* v_a2  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_b2  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_2h  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));

    vbx_uhalf_t* v_a4  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_b4  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_4h = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));

    vbx_ubyte_t* v_1b  = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t));
    vbx_ubyte_t* v_2b  = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t));
    vbx_ubyte_t* v_4b  = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t));

    unsigned short* lbp1h = (unsigned short*)vbx_shared_malloc(width*sizeof(unsigned short));
    unsigned short* lbp2h = (unsigned short*)vbx_shared_malloc(width*sizeof(unsigned short));
    unsigned short* lbp4h = (unsigned short*)vbx_shared_malloc(width*sizeof(unsigned short));

    unsigned char* lbp1b = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));
    unsigned char* lbp2b = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));
    unsigned char* lbp4b = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));

    img = img + width;

    vbx_dma_to_vector(v_a1, img,         width*sizeof(unsigned short));
    vbx_dma_to_vector(v_b1, img + width, width*sizeof(unsigned short));
    vbx_dma_to_vector(v_a2, img,         width*sizeof(unsigned short));
    vbx_dma_to_vector(v_b2, img + width, width*sizeof(unsigned short));
    vbx_dma_to_vector(v_a4, img,         width*sizeof(unsigned short));
    vbx_dma_to_vector(v_b4, img + width, width*sizeof(unsigned short));
    vbx_sync();

    int i;
    int m = 48;
    for(i=0; i<m; i++){
        v_a1[i] = 0;
        v_b1[i] = 0;
        v_a2[i] = 0;
        v_b2[i] = 0;
        v_a4[i] = 0;
        v_b4[i] = 0;
    }
    int n = 12;
    int src_a1[] = {0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    int src_b1[] = {0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    int src_a2[] = {0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    int src_b2[] = {0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    int src_a4[] = {0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0};
    int src_b4[] = {0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0};
    
    for(i=0; i<16; i++){
        v_a1[i] = src_a1[i];
        v_b1[i] = src_b1[i];
        v_a2[i] = src_a2[i];
        v_b2[i] = src_b2[i];
        v_a4[i] = src_a4[i];
        v_b4[i] = src_b4[i];
    }

    vbx_set_vl(width);
    vbx(VVHU, VCUSTOM1, v_1h, v_a1, v_b1); 
    vbx(VVHU, VCUSTOM2, v_2h, v_a2, v_b2); 
    vbx(VVHU, VCUSTOM3, v_4h, v_a4, v_b4); 
    vbx(VVHB, VADD, v_1b, v_1h, ((vbx_byte_t*)v_1h) + 1);
    vbx(VVHB, VADD, v_2b, v_2h, ((vbx_byte_t*)v_2h) + 1);
    vbx(VVHB, VADD, v_4b, v_4h, ((vbx_byte_t*)v_4h) + 1);
    vbx_dma_to_host(lbp1h, v_1h, width*sizeof(unsigned short));
    vbx_dma_to_host(lbp2h, v_2h, width*sizeof(unsigned short));
    vbx_dma_to_host(lbp4h, v_4h, width*sizeof(unsigned short));
    vbx_dma_to_host(lbp1b, v_1b, width*sizeof(unsigned char));
    vbx_dma_to_host(lbp2b, v_2b, width*sizeof(unsigned char));
    vbx_dma_to_host(lbp4b, v_4b, width*sizeof(unsigned char));
    vbx_sync();

    test_print_array_half(v_a1, n);
    test_print_array_half(v_b1, n);
    test_print_hex_array_half(lbp1h, n);
    test_print_hex_array_byte(lbp1b, n);

    test_print_array_half(v_a2, n);
    test_print_array_half(v_b2, n);
    test_print_hex_array_half(lbp2h, n);
    test_print_hex_array_byte(lbp2b, n);

    test_print_array_half(v_a4, n);
    test_print_array_half(v_b4, n);
    test_print_hex_array_half(lbp4h, n);
    test_print_hex_array_byte(lbp4b, n);

    vbx_sp_free();
    vbx_shared_free(lbp1h);
    vbx_shared_free(lbp2h);
    vbx_shared_free(lbp4h);
    vbx_shared_free(lbp1b);
    vbx_shared_free(lbp2b);
    vbx_shared_free(lbp4b);
    return 0;
}
Example #17
File: test.c Project: 8l/mxp
//FIXME stride for match not implemented
int compare_LBPPassStage_to_restricted(unsigned short *vbx_img, int log, lbp_stage_t lbp_stage, int window, int width, int height, int max_print_errors)
{
    int l, i, j, cell, errors = 0;

    unsigned char** scalar_patterns = test_scalar_patterns(vbx_img, log, width, height);

    unsigned char *pass, *vbx_pass;
    pass = (unsigned char*)vbx_shared_malloc(width*height*sizeof(unsigned char));
    vbx_pass = (unsigned char*)vbx_shared_malloc(width*height*sizeof(unsigned char));
    
    vbx_byte_t** v_lbp =(vbx_byte_t**)vbx_shared_malloc((log+1)*sizeof(vbx_byte_t*));
    for (l=0; l<log+1; l++) {
        v_lbp[l] = (vbx_byte_t*)vbx_sp_malloc((window+1)*width*sizeof(vbx_byte_t)); 
    }
    vbx_byte_t* v_lut = (vbx_byte_t*)vbx_sp_malloc(width*sizeof(vbx_byte_t)); 
    vbx_byte_t* v_stage = (vbx_byte_t*)vbx_sp_malloc(width*sizeof(vbx_byte_t)); 
    vbx_byte_t* v_pattern;
    lbp_feat_t feat;
    int dx, dy, dw, f;

    for (l=0; l<log+1; l++) {
        vbx_dma_to_vector(v_lbp[l]+width, scalar_patterns[l], (window)*width*sizeof(unsigned char));
    }
    vbx_sync();
    for(j=0; j < height-(window+1); j++) {
        for (l=0; l<log+1; l++) {
            vbx_set_vl(width * window);
            vbx(VVB, VMOV, v_lbp[l], v_lbp[l]+width, NULL);
            vbx_dma_to_vector(v_lbp[l] + window*width, scalar_patterns[l]+(j+window)*width, width*sizeof(unsigned char));
        }

        vbx_set_vl(width-(window+1));
        vbx(SVB, VMOV, v_stage, 0, NULL);
        for (f = 0; f < lbp_stage.count; f++) {
            feat = lbp_stage.feats[f];
            dx = feat.pos.src.x;
            dy = feat.pos.src.y;
            dw = feat.pos.size.x;
            v_pattern = v_lbp[dw>>1]+(dy*width+dx);

            vbx(SVBU, VLBPLUT, v_lut, f, v_pattern);
            vbx(VVB, VADD, v_stage, v_stage, v_lut);
        }
        vbx(SVB, VMOV, v_lut, 0, NULL);
        vbx(SVB, VCMV_GEZ, v_lut, 1, v_stage);
        vbx_dma_to_host(vbx_pass + j*width, v_lut, (width-(window+1))*sizeof(unsigned char));
        vbx_sync();
    }


    unsigned int *iImg, *iiImg;
    iImg = (unsigned int *)vbx_shared_malloc(width*height*sizeof(unsigned int));
    iiImg = (unsigned int *)vbx_shared_malloc(width*height*sizeof(unsigned int));

    gen_integrals(vbx_img, iImg, iiImg, width, height);

    image_t lbp_img = {iImg, {width, height}};
    for (j = 0; j < height - (window + 1); j++) {
        for (i = 0; i < width - (window + 1); i++) {
            pair_t lbp_p = {i, j};
            pass[j*width+i] = LBPPassStage(lbp_img, lbp_stage, lbp_p);
        }
    }

    /* test pass vs vbx pass */
    for (j = 0; j < height - (window + 1); j++) {
        errors += match_array_byte(vbx_pass + j*width, pass + j*width, "pass stage", width - (window + 1), 1, 0, max_print_errors, 1, j);
        if (errors > max_print_errors){
            max_print_errors = 0;
        }
    }
    vbx_sp_free();
    vbx_shared_free(pass);
    vbx_shared_free(vbx_pass);
    vbx_shared_free(iImg);
    vbx_shared_free(iiImg);
    return errors;
}
Example #18
int VectorBlox_MXP_Initialize(const char* mxp_dev,const char* cma_dev)
{
	PAGE_SIZE=sysconf(_SC_PAGESIZE);
	PAGE_SHIFT=0;
	int page_size=PAGE_SIZE;
	while((page_size>>=1)){
		PAGE_SHIFT++;
	}
	char filename[256];
	sprintf(filename,"/dev/%s",mxp_dev);
	the_mxp.mxp_fd=open(filename,O_RDWR);
	assert(the_mxp.mxp_fd >= 0);

	the_mxp.scratchpad_size = get_attr_from_file(mxp_dev,"SCRATCHPAD_KB") * 1024;
	the_mxp.scratchpad_addr = (void*)get_attr_from_file(mxp_dev,"C_S_AXI_BASEADDR");
	void* scratchpad_mmap = mmap(the_mxp.scratchpad_addr, //for the mapping to be the same as the physical mapping
	                             the_mxp.scratchpad_size,
	                             PROT_READ|PROT_WRITE,MAP_SHARED|MAP_FIXED,
	                             the_mxp.mxp_fd,4096);
	assert(scratchpad_mmap == the_mxp.scratchpad_addr);
	the_mxp.scratchpad_end  = (void*)get_attr_from_file(mxp_dev,"C_S_AXI_HIGHADDR")+1;

	//M_AXI_DATA_WIDTH is in bits, convert to bytes
	the_mxp.dma_alignment_bytes = get_attr_from_file(mxp_dev,"C_M_AXI_DATA_WIDTH")/8;
	the_mxp.vector_lanes = get_attr_from_file(mxp_dev,"VECTOR_LANES");
	the_mxp.scratchpad_alignment_bytes = the_mxp.vector_lanes * 4;

	the_mxp.vcustom0_lanes = get_attr_from_file(mxp_dev, "VCI_0_LANES");
	the_mxp.vcustom1_lanes = get_attr_from_file(mxp_dev, "VCI_1_LANES");
	the_mxp.vcustom2_lanes = get_attr_from_file(mxp_dev, "VCI_2_LANES");
	the_mxp.vcustom3_lanes = get_attr_from_file(mxp_dev, "VCI_3_LANES");
	the_mxp.vcustom4_lanes = get_attr_from_file(mxp_dev, "VCI_4_LANES");
	the_mxp.vcustom5_lanes = get_attr_from_file(mxp_dev, "VCI_5_LANES");
	the_mxp.vcustom6_lanes = get_attr_from_file(mxp_dev, "VCI_6_LANES");
	the_mxp.vcustom7_lanes = get_attr_from_file(mxp_dev, "VCI_7_LANES");
	the_mxp.vcustom8_lanes = get_attr_from_file(mxp_dev, "VCI_8_LANES");
	the_mxp.vcustom9_lanes = get_attr_from_file(mxp_dev, "VCI_9_LANES");
	the_mxp.vcustom10_lanes = get_attr_from_file(mxp_dev, "VCI_10_LANES");
	the_mxp.vcustom11_lanes = get_attr_from_file(mxp_dev, "VCI_11_LANES");
	the_mxp.vcustom12_lanes = get_attr_from_file(mxp_dev, "VCI_12_LANES");
	the_mxp.vcustom13_lanes = get_attr_from_file(mxp_dev, "VCI_13_LANES");
	the_mxp.vcustom14_lanes = get_attr_from_file(mxp_dev, "VCI_14_LANES");
	the_mxp.vcustom15_lanes = get_attr_from_file(mxp_dev, "VCI_15_LANES");

	the_mxp.mask_partitions = get_attr_from_file(mxp_dev,"MASK_PARTITIONS");

	the_mxp.max_masked_vector_length = get_attr_from_file(mxp_dev,"MAX_MASKED_WAVES")* the_mxp.vector_lanes * 4;
	the_mxp.fxp_word_frac_bits = get_attr_from_file(mxp_dev,"MULFXP_WORD_FRACTION_BITS");
	the_mxp.fxp_half_frac_bits = get_attr_from_file(mxp_dev,"MULFXP_HALF_FRACTION_BITS");
	the_mxp.fxp_byte_frac_bits = get_attr_from_file(mxp_dev,"MULFXP_BYTE_FRACTION_BITS");
	the_mxp.core_freq = get_attr_from_file(mxp_dev,"CLOCK_FREQ_HZ");
	the_mxp.instr_port_addr = mmap(NULL,PAGE_SIZE,PROT_READ|PROT_WRITE,MAP_SHARED,the_mxp.mxp_fd,0);


	sprintf(filename,"/dev/%s",cma_dev);
	the_mxp.cma_fd = open(filename,O_RDWR);
	assert(the_mxp.cma_fd >= 0);
	the_mxp.init = 0;

	the_mxp.sp = the_mxp.scratchpad_addr;

	the_mxp.spstack = (vbx_void_t **) NULL;
	the_mxp.spstack_top = (int) 0;
	the_mxp.spstack_max = (int) 0;

	_vbx_init(&the_mxp);
	//clear scratchpad
	vbx_set_vl(the_mxp.scratchpad_size);
	vbx(SVB,VMOV,(vbx_byte_t*)the_mxp.scratchpad_addr,0,0);

	return 0;
}
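A hypothetical usage sketch; the device-node names below are placeholders, not taken from the original:

/* Assumes an MXP device node and a CMA allocator node exist under /dev;
 * the real names depend on the platform. */
int main(void)
{
	VectorBlox_MXP_Initialize("mxp0", "cma");
	vbx_mxp_t *mxp = VBX_GET_THIS_MXP();
	/* mxp->scratchpad_size, mxp->vector_lanes, ... are now populated */
	return 0;
}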
Example #19
File: test.c Project: 8l/mxp
int VBX_T(vbw_vec_reverse_test)()
{
	unsigned int aN[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 20, 25, 31, 32, 33, 35, 40, 48, 60, 61, 62, 63, 64, 64, 65,
	                      66, 67, 68, 70, 80, 90, 99, 100, 101, 110, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 224, 224,
	                      256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 400, 450, 512, 550, 600, 650, 700, 768, 768, 900,
	                      900, 1023, 1024, 1200, 1400, 1600, 1800, 2048, 2048, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800,
	                      2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4096, 4096, 4100, 4200, 4300,
	                      4400, 4500, 4600, 4700, 4800, 4900, 5000, 6000, 7000, 8000, 8192, 8192, 9000, 10000, 11000, 12000,
	                      13000, 14000, 15000, 16000, 16384, 16384, 20000, 25000, 30000, 32767, 32768, 32768, 35000, 40000,
	                      45000, 50000, 55000, 60000, 65000, 65535, 65536, 65536 };

	int retval;
	unsigned int N;
	unsigned int NBYTES;
	unsigned int NREPS = 100;
	unsigned int i,k;

	vbx_timestamp_t start=0,finish=0;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const unsigned int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;

	for( i=0; i<sizeof(aN)/4; i++ ) {
		N = aN[i];
		//printf( "testing with vector size %d\n", N );

		NBYTES = sizeof(vbx_sp_t)*N;
		if( 2*NBYTES > VBX_SCRATCHPAD_SIZE ) continue;

		vbx_sp_t *vsrc = vbx_sp_malloc( NBYTES );
		vbx_sp_t *vdst = vbx_sp_malloc( NBYTES );
		//printf("bytes alloc: %d\n", NBYTES );

		if( !vsrc ) VBX_EXIT(-1);
		if( !vdst ) VBX_EXIT(-1);

		#if   ( VBX_TEMPLATE_T == BYTESIZE_DEF || VBX_TEMPLATE_T == UBYTESIZE_DEF )
			unsigned int mask = 0x007F;
		#elif ( VBX_TEMPLATE_T == HALFSIZE_DEF || VBX_TEMPLATE_T == UHALFSIZE_DEF )
			unsigned int mask = 0x7FFF;
		#else
			unsigned int mask = 0xFFFF;
		#endif

		vbx_set_vl( N );
		vbx( SV(T), VMOV, vdst,   -1, 0 );       // Fill the destination vector with -1
		vbx( SE(T), VAND, vsrc, mask, 0 );       // Fill the source vector with enumerated values
		//VBX_T(print_vector)( "vsrcInit", vsrc, N );
		//VBX_T(print_vector)( "vdstInit", vdst, N );

		/** measure performance of function call **/
		vbx_sync();
		start = vbx_timestamp();
		for(k=0; k<NREPS; k++ ) {
			retval = VBX_T(vbw_vec_reverse)( vdst, vsrc, N );
			vbx_sync();
		}
		finish = vbx_timestamp();
		printf( "length %d (%s):\tvbware sp f():\t%llu", N, VBX_EXPAND_AND_QUOTE(BYTEHALFWORD), (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) );
		//VBX_T(print_vector)( "vsrcPost", vsrc, N );
		//VBX_T(print_vector)( "vdstPost", vdst, N );

		#if VERIFY_VBWARE_ALGORITHM
			VBX_T(verify_vector)( vsrc, vdst, N );
		#else
			printf(" [VERIFY OFF]");
		#endif

		printf("\treturn value: %X", retval);

		vbx_set_vl( N );
		vbx( SE(T), VAND, vsrc, mask, 0 );       // Reset the source vector

		/** measure performance of simple algorithm **/
		vbx_sync();
		vbx_set_vl( 1 );
		vbx_set_2D( N, -sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );

		start = vbx_timestamp();
		for(k=0; k<NREPS; k++ ) {
			vbx_2D( VV(T), VMOV, vdst+N-1, vsrc, 0 );
			vbx_sync();
		}
		finish = vbx_timestamp();

		printf( "\tsimple (vl=1):\t%llu", (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) );

		#if VERIFY_SIMPLE_ALGORITHM
			VBX_T(verify_vector)( vsrc, vdst, N );
		#else
			printf(" [VERIFY OFF]");
		#endif
			printf("\tcycles\n");

		vbx_sp_free();
	}

	vbx_sp_free();
	printf("All tests passed successfully.\n");

	return 0;
}
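A scalar model of the "simple" reversal timed above (illustration only):

/* With VL = 1 and a destination stride of -sizeof(vbx_sp_t), the 2D move
 * issues N single-element operations that walk the destination backwards. */
for (k = 0; k < N; k++)
	vdst[N-1-k] = vsrc[k];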
Example #20
int vbw_mtx_median_ext_argb32( unsigned *output, unsigned *input, const int filter_height, const int filter_width,
                           const int image_height, const int image_width, const int image_pitch )
{

	const int FREE_BYTES = vbx_sp_getfree();
	int l,k;
	int filter_mid, filter_size;
	int rows_per_l,vl,temp_vl, temp_vl_byte;
	int j,i;
	int partial_row = 0;

	filter_size = filter_height*filter_width;
	filter_mid = filter_size/2;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const int VBX_WIDTH_BYTES = this_mxp->scratchpad_alignment_bytes;

	// Could possibly check for low SP here (less than 6*VBX_WIDTH_BYTES) and assign vl differently

	// During allocation, max additional SP bytes needed due to alignment is one VBX_WIDTH_BYTES per vector
	// Taking that off the top simplifies calculation and will always be correct, but sacrifices a little SP space

	vl = (FREE_BYTES-3*VBX_WIDTH_BYTES)/((filter_size+2)*sizeof(vbx_uword_t));

	if( vl < 1 ) {
		return VBW_ERROR_SP_ALLOC_FAILED;
	}

	if(vl < image_width){
		rows_per_l = 1;
		partial_row = 1;
	} else {
		rows_per_l = vl/image_width;
		vl = image_width*rows_per_l;
	}

	vbx_sp_push();

	vbx_uword_t *v_input = (vbx_uword_t *)vbx_sp_malloc(filter_size*vl*sizeof(vbx_uword_t));
	vbx_ubyte_t *v_sub   = (vbx_ubyte_t *)vbx_sp_malloc(vl*sizeof(vbx_uword_t));
	vbx_ubyte_t *v_temp  = (vbx_ubyte_t *)vbx_sp_malloc(vl*sizeof(vbx_uword_t));
	vbx_ubyte_t *v_min, *v_max;
	vbx_ubyte_t *v_input_byte = (vbx_ubyte_t *)v_input;
	if( v_temp == NULL ){
		vbx_sp_pop();
		return VBW_ERROR_SP_ALLOC_FAILED;
	}
	for(l = 0; l < image_height-filter_height; l+= rows_per_l){
		// detect last pass
		if(l+rows_per_l > image_height-filter_height){
			rows_per_l = (image_height-filter_height)-l;
			vl = image_width*rows_per_l;
		}
		temp_vl = vl;
		for(k = 0; k < image_width; k += temp_vl){
			if(partial_row){
				if(k + temp_vl > image_width){
					temp_vl = image_width - k;
				}
			}

			for(j = 0; j < filter_height; j++){
				vbx_dma_to_vector_2D(v_input+temp_vl*j,
									 input+(l+j)*image_pitch+k,
									 temp_vl/rows_per_l*sizeof(vbx_uword_t),
									 rows_per_l,
									 image_width*sizeof(vbx_uword_t),
									 image_pitch*sizeof(vbx_uword_t));
			}

			// arrange all pixels within a filter window into single columns, separated by temp_vl
			//
			// ex. vl = 5, filter = 3
			// vinput before         vinput after
			//
			// a00 a01 a02 a03 a04 | a00 a01 a02 a03 a04 |
			// a10 a11 a12 a13 a14 | a10 a11 a12 a13 a14 |
			// a20 a21 a22 a23 a24 | a20 a21 a22 a23 a24 |
			// ??? ??? ??? ??? ??? | a01 a02 a03 a04 a10 |
			// ??? ??? ??? ??? ??? | a11 a12 a13 a14 a20 |
			// ??? ??? ??? ??? ??? | a21 a22 a23 a24 a30 |
			// ??? ??? ??? ??? ??? | a02 a03 a04 a10 a11 |
			// ??? ??? ??? ??? ??? | a12 a13 a14 a20 a21 |
			// ??? ??? ??? ??? ??? | a22 a23 a24 a30 a31 |
			//
			vbx_set_vl(temp_vl);
			for(j = 1; j < filter_height; j++){
				for(i = 0; i < filter_width; i++){
					vbx(VVWU, VMOV, v_input+(j*filter_height+i)*temp_vl,
									v_input+i*temp_vl+j,
									0);
				}
			}

			//Bubble-sort up to the (filter_size/2)-th element of each gathered column

			// work on individual color channels
			temp_vl_byte = temp_vl*sizeof(vbx_uword_t)/sizeof(vbx_ubyte_t);
			vbx_set_vl(temp_vl_byte);

			// sort lower half of the values in the window
			for(j = 0; j < filter_mid; j++){
				v_min = v_input_byte+j*temp_vl_byte;

				for(i = j+1; i < filter_size; i++){
					v_max = v_input_byte+i*temp_vl_byte;

					vbx(VVBU, VMOV,     v_temp, v_min,  0);
					vbx(VVBU, VSUB,     v_sub,  v_max,  v_min);
					vbx(VVBU, VCMV_LTZ, v_min,  v_max,  v_sub);
					vbx(VVBU, VCMV_LTZ, v_max,  v_temp, v_sub);
				}
			}

			// grab next smallest value, the median, don't sort the rest
			v_min = v_input_byte+filter_mid*temp_vl_byte;
			for(i = filter_mid+1; i < filter_size; i++){
				v_max = v_input_byte+i*temp_vl_byte;

				vbx(VVBU, VSUB,     v_sub, v_max, v_min);
				vbx(VVBU, VCMV_LTZ, v_min, v_max, v_sub);
			}

			// dma out median value
			// back to pixels
			vbx_dma_to_host_2D(output+(l*image_pitch)+k,
							   v_input+temp_vl*filter_mid,
							   temp_vl/rows_per_l*sizeof(vbx_uword_t),
							   rows_per_l,
							   image_pitch*sizeof(vbx_uword_t),
							   image_width*sizeof(vbx_uword_t));
		}
	}

	vbx_sp_pop();
	vbx_sync();
	return VBW_SUCCESS;
}
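A scalar model of the conditional-move network above, for one window win[] of filter_size gathered channel values (illustration only):

/* Selection-sort only up to the middle element; after these passes
 * win[filter_mid] holds the median and the rest stay unsorted, which is
 * exactly what the VSUB/VCMV_LTZ pairs compute per lane. */
for (j = 0; j <= filter_mid; j++)
	for (i = j + 1; i < filter_size; i++)
		if (win[i] < win[j]) {
			unsigned char t = win[j]; win[j] = win[i]; win[i] = t;
		}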
Example #21
int vbw_bifilt_argb32_3x3(unsigned *output, unsigned *input, short image_width, const short image_height, const short image_pitch, const short renorm)
{

//return vbw_sobel_argb32_3x3( output, input, image_width, image_height, image_pitch, renorm);

	int y;
	int xx, yy, sharp;

	vbx_uword_t *v_row_in;
	vbx_ubyte_t *v_luma_top, *v_luma_mid, *v_luma_bot;
	vbx_ubyte_t *v_luma_hii,              *v_luma_low;
	vbx_ubyte_t *v_src[W][W];

	vbx_uword_t *v_row_out;

	vbx_ubyte_t *v00, *v01, *v02, *v10, *v11, *v12, *v20, *v21, *v22;
#if W==5
	vbx_ubyte_t *v03, *v04,       *v13, *v14,       *v23, *v24;
	vbx_ubyte_t *v30, *v31, *v32, *v40, *v41, *v42;
	vbx_ubyte_t *v33, *v34,       *v43, *v44;
#endif
	vbx_ubyte_t *v[W][W];

	vbx_uhalf_t *vI, *vW, *vT;  // vT== temporary


	vbx_sp_push();

	// Allocate space in scratchpad for vectors
	struct rotating_prefetcher_t v_row_db=rotating_prefetcher(1,image_width*sizeof(vbx_uword_t),
	                                                          input,input+image_height*image_pitch,
	                                                          image_pitch*sizeof(vbx_uword_t));

	v_row_out  = (vbx_uword_t*)vbx_sp_malloc(image_width*sizeof(vbx_uword_t));
	vT         = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
#if 1
	// save some space by overlapping with v_row_out
	vW         = (vbx_uhalf_t*)v_row_out;
	vI         = (vbx_uhalf_t*)v_row_out + image_width;
#else
	vW         = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	vI         = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
#endif

#if W==3
	v_luma_top      = (vbx_ubyte_t*)vbx_sp_malloc( 3 * image_width*sizeof(vbx_ubyte_t));
	v_luma_mid      = v_luma_top + 1 * image_width*sizeof(vbx_ubyte_t) ;
	v_luma_bot      = v_luma_top + 2 * image_width*sizeof(vbx_ubyte_t) ;
#else
	v_luma_top      = (vbx_ubyte_t*)vbx_sp_malloc( 5 * image_width*sizeof(vbx_ubyte_t));
	v_luma_hii      = v_luma_top + 1 * image_width*sizeof(vbx_ubyte_t) ;
	v_luma_mid      = v_luma_top + 2 * image_width*sizeof(vbx_ubyte_t) ;
	v_luma_low      = v_luma_top + 3 * image_width*sizeof(vbx_ubyte_t) ;
	v_luma_bot      = v_luma_top + 4 * image_width*sizeof(vbx_ubyte_t) ;
#endif


	if(v_luma_bot==NULL){
		vbx_sp_pop();
		return VBW_ERROR_SP_ALLOC_FAILED;
	}

	// Transfer the first 3 input rows and interleave first 2 rgb2luma and first 2 sobel row calculations
#if W==3
	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db,0);
	vbw_rgb2luma(vW, v_row_in, vT, image_width);                                // 1st luma row
	vbx( SVHBU, VSHR, v_luma_top, 8, vW );                                     // convert to byte

	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db,0);
	vbw_rgb2luma( vW, v_row_in, vT, image_width);                               // 2nd luma row
	vbx( SVHBU, VSHR, v_luma_mid, 8,  vW );                                    // convert to byte

#else
	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db,0);
	vbw_rgb2luma(vW, v_row_in, vT, image_width);                                // 1st luma row
	vbx( SVHBU, VSHR, v_luma_top, 8, vW );                                     // convert to byte

	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db,0);
	vbw_rgb2luma( vW, v_row_in, vT, image_width);                               // 2nd luma row
	vbx( SVHBU, VSHR, v_luma_hii, 8,  vW );                                    // convert to byte

	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db,0);
	vbw_rgb2luma( vW, v_row_in, vT, image_width);                               // 3rd luma row
	vbx( SVHBU, VSHR, v_luma_mid, 8,  vW );                                    // convert to byte

	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db,0);
	vbw_rgb2luma( vW, v_row_in, vT, image_width);                               // 4th luma row
	vbx( SVHBU, VSHR, v_luma_low, 8,  vW );                                    // convert to byte
#endif


	// blank out the top and bottom rows
	unsigned *out;
	vbx_set_vl(image_width);
	unsigned COLOUR = ( 200 | (128<<8) | (244<<16) );
	vbx(SVWU, VMOV, v_row_out, COLOUR, 0);
	for( y=0; y<W/2; y++ ) {
		// Set top output rows to 0
		out = output + image_width*y;
		vbx_dma_to_host( out, v_row_out, image_width*sizeof(vbx_uword_t) );
		// Set bottom rows to 0
		out = output + image_width*(image_height-1-y);
		vbx_dma_to_host( out, v_row_out, image_width*sizeof(vbx_uword_t) );
	}



	// Calculate edges
	for (y = 0; y < image_height-(W-1); y++) {

		vbx_set_vl(image_width);
		// Transfer the next input row while processing
		rp_fetch(&v_row_db);
		v_row_in = rp_get_buffer(&v_row_db,0);
		// Convert aRGB input to luma
		vbw_rgb2luma( vW, v_row_in, vT, image_width);
		vbx( SVHBU, VSHR, v_luma_bot, 8,  vW );                                     // convert to byte

		vbx_sp_push();
		image_width = image_width/2;
		vbx_set_vl(image_width);

		v[0][0] = v00   = (vbx_ubyte_t*)vbx_sp_malloc( 25 * image_width*sizeof(vbx_ubyte_t));
		v[0][1] = v01   = v00 +  1 * image_width*sizeof(vbx_ubyte_t) ;
		v[0][2] = v02   = v00 +  2 * image_width*sizeof(vbx_ubyte_t) ;
		v[1][0] = v10   = v00 +  3 * image_width*sizeof(vbx_ubyte_t) ;
		v[1][1] = v11   = v00 +  4 * image_width*sizeof(vbx_ubyte_t) ;
		v[1][2] = v12   = v00 +  5 * image_width*sizeof(vbx_ubyte_t) ;
		v[2][0] = v20   = v00 +  6 * image_width*sizeof(vbx_ubyte_t) ;
		v[2][1] = v21   = v00 +  7 * image_width*sizeof(vbx_ubyte_t) ;
		v[2][2] = v22   = v00 +  8 * image_width*sizeof(vbx_ubyte_t) ;

	#if W==5
		v[0][3] = v03   = v00 +  9 * image_width*sizeof(vbx_ubyte_t) ;
		v[0][4] = v04   = v00 + 10 * image_width*sizeof(vbx_ubyte_t) ;
		v[1][3] = v13   = v00 + 11 * image_width*sizeof(vbx_ubyte_t) ;
		v[1][4] = v14   = v00 + 12 * image_width*sizeof(vbx_ubyte_t) ;
		v[2][3] = v23   = v00 + 13 * image_width*sizeof(vbx_ubyte_t) ;
		v[2][4] = v24   = v00 + 14 * image_width*sizeof(vbx_ubyte_t) ;

		v[3][0] = v30   = v00 + 15 * image_width*sizeof(vbx_ubyte_t) ;
		v[3][1] = v31   = v00 + 16 * image_width*sizeof(vbx_ubyte_t) ;
		v[3][2] = v32   = v00 + 17 * image_width*sizeof(vbx_ubyte_t) ;
		v[3][3] = v33   = v00 + 18 * image_width*sizeof(vbx_ubyte_t) ;
		v[3][4] = v34   = v00 + 19 * image_width*sizeof(vbx_ubyte_t) ;

		v[4][0] = v40   = v00 + 20 * image_width*sizeof(vbx_ubyte_t) ;
		v[4][1] = v41   = v00 + 21 * image_width*sizeof(vbx_ubyte_t) ;
		v[4][2] = v42   = v00 + 22 * image_width*sizeof(vbx_ubyte_t) ;
		v[4][3] = v43   = v00 + 23 * image_width*sizeof(vbx_ubyte_t) ;
		v[4][4] = v44   = v00 + 24 * image_width*sizeof(vbx_ubyte_t) ;
	#endif

		if(v00==NULL){
printf("mem alloc failed\n"); fflush(stdout);
			vbx_sp_pop();
			vbx_sp_pop();
			return VBW_ERROR_SP_ALLOC_FAILED;
		}


//FIXME -- how to manage row buffers with 5 rows?  3 rows are shown below:
#if W==3
		for( xx=0; xx<W; xx++ ) v_src[0][xx] = v_luma_top+xx;
		for( xx=0; xx<W; xx++ ) v_src[1][xx] = v_luma_mid+xx;
		for( xx=0; xx<W; xx++ ) v_src[2][xx] = v_luma_bot+xx;
#else
		for( xx=0; xx<W; xx++ ) v_src[0][xx] = v_luma_top+xx;
		for( xx=0; xx<W; xx++ ) v_src[1][xx] = v_luma_hii+xx;
		for( xx=0; xx<W; xx++ ) v_src[2][xx] = v_luma_mid+xx;
		for( xx=0; xx<W; xx++ ) v_src[3][xx] = v_luma_low+xx;
		for( xx=0; xx<W; xx++ ) v_src[4][xx] = v_luma_bot+xx;
#endif

		vbx_set_vl( image_width - W + 1 );

		// compute error (absdiff) in pixel colour with neighbours
		for( yy=0; yy<W; yy++ ) {
			for( xx=0; xx<W; xx++ ) {
				vbx( VVBU, VABSDIFF, v[yy][xx], v_luma_mid+(W/2), v_src[yy][xx] );
			}
		}


		// v[][] holds the errors (differences) between pixels
		// efficiently compute a function that looks approximately something like exp(-x):
		//     large value for small errors, small value for big errors
		for( yy=0; yy<W; yy++ ) {
			for( xx=0; xx<W; xx++ ) {
				vbx( SVBU, VABSDIFF, v[yy][xx], 255, v[yy][xx] );  // 255 - img_err
				// 11 or more iterations is mathematically equivalent to a pure gaussian blur // FIXME is this true?
#define NUM_SHARPEN_ITERATIONS  3   // 0 to 10 iterations, practical max is 7 or 8
				for( sharp=0; sharp < NUM_SHARPEN_ITERATIONS; sharp++ ) {
					vbx( VVBU, VMULHI, v[yy][xx], v[yy][xx], v[yy][xx] ); // v*v;
				}
			}
		}

		// with right decimal place, could do the next two instructions using MULFXP and do as BYTES
		// convolve errors with gaussian blur kernel
		for( yy=0; yy<W; yy++ ) {
			for( xx=0; xx<W; xx++ ) {
				vbx( SVBU, VMULHI, v[yy][xx], gauss[yy][xx], v[yy][xx] );
			}
		}

		// sum up the weights for normalization later
		vbx( VVBHU, VADD, vW, v[0][0], v[0][1] );
		vbx( VVBHU, VADD, vT, v[0][2], v[1][0] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VADD, vT, v[1][1], v[1][2] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VADD, vT, v[2][0], v[2][1] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VMOV, vT, v[2][2], 0 );
		vbx( VVHU,  VADD, vW, vW, vT );
	#if (W==5)
		vbx( VVBHU, VADD, vT, v[3][0], v[3][1] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VADD, vT, v[3][2], v[4][0] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VADD, vT, v[4][1], v[4][2] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VMOV, vT, v[0][3], v[0][4] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VMOV, vT, v[1][3], v[1][4] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VMOV, vT, v[2][3], v[2][4] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VMOV, vT, v[3][3], v[3][4] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VMOV, vT, v[4][3], v[4][4] );
		vbx( VVHU,  VADD, vW, vW, vT );
	#endif


		// convolve image with new weights
		for( yy=0; yy<W; yy++ ) {
			for( xx=0; xx<W; xx++ ) {
				vbx( VVBU, VMULHI, v[yy][xx], v_src[yy][xx], v[yy][xx] );
				//vbx( SVBU, VMULHI, v[yy][xx], gauss[yy][xx], v_src[yy][xx] );
				//vbx( SVBU, VMUL  , v[yy][xx],         1      , v_src[yy][xx] );
			}
		}



		// sum up the weighted pixels
		vbx( VVBHU, VADD, vI, v[0][0], v[0][1] );
		vbx( VVBHU, VADD, vT, v[0][2], v[1][0] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VADD, vT, v[1][1], v[1][2] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VADD, vT, v[2][0], v[2][1] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VMOV, vT, v[2][2], 0 );
		vbx( VVHU,  VADD, vI, vI, vT );

	#if (W==5)
		vbx( VVBHU, VADD, vT, v[3][0], v[3][1] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VADD, vT, v[3][2], v[4][0] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VADD, vT, v[4][1], v[4][2] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VMOV, vT, v[0][3], v[0][4] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VMOV, vT, v[1][3], v[1][4] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VMOV, vT, v[2][3], v[2][4] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VMOV, vT, v[3][3], v[3][4] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VMOV, vT, v[4][3], v[4][4] );
		vbx( VVHU,  VADD, vI, vI, vT );
	#endif


		// keep RHS of image as original grayscale
		image_width = image_width*2;
		vbx_set_vl( image_width/2 );
		//vbx( VVWHU, VMOV, vT+image_width/2, (v_row_in       ) + image_width/2+1, 0 );
		vbx( VVBHU, VMOV, vT+image_width/2, (v_src[ 0 ][ 0 ]) + image_width/2+1, 0 );
		vbx_sp_pop(); // don't need v[][] data any more

		// compute LHS of image
#if 0
		vbx( VVBHU, VMOV, vT, v_src[2][2], 0 );
		//vbx( SVHU, VSHR, vI,  3, vI );
		//vbx( SVHU, VSHR, vW,  3, vW );
		//vbx( VVHU, VMUL, vT, vI, vW );
		//vbx( SVHU, VSHR, vT,  8, vT );
#else
		uint32_t h = image_width/2;
		vbx( SVHU, VADD, vW, 0x80, vW ); // round
		vbx( SVHU, VSHR, vW,    8, vW );
		vbw_vec_divide_uhalf( vT  , vI  , vW  , h                 );
		//vbw_vec_divide_uhalf( vT+h, vI+h, vW+h, image_width-W+1-h );
#endif
		// ensure LHS doesn't overflow
		vbx( SVHU, VAND, vT, 0xff, vT );

		// Copy the result to the low byte of the output row
		// Trick to copy the low byte (b) to the middle two bytes as well
		// Note that first and last columns are 0
		vbx_set_vl(image_width-W+1);
		vbx(SVHWU, VMULLO, v_row_out+W/2, 0x00010101, vT);

		// blank out left and right edges
		// then DMA the result to the output
		vbx_set_vl(W/2);
		vbx(SVWU, VMOV, v_row_out, COLOUR, 0 );
		vbx(SVWU, VMOV, v_row_out + image_width - (W/2), COLOUR, 0 );
		vbx_dma_to_host( output+(y+1)*image_pitch, v_row_out, image_width*sizeof(vbx_uword_t) );

		// Rotate luma buffers
		vbx_ubyte_t *tmp_ptr;
		tmp_ptr      = v_luma_top;
#if W==3
		v_luma_top   = v_luma_mid;
		v_luma_mid   = v_luma_bot;
		v_luma_bot   = tmp_ptr;
#else
		v_luma_top   = v_luma_hii;
		v_luma_hii   = v_luma_mid;
		v_luma_mid   = v_luma_low;
		v_luma_low   = v_luma_bot;
		v_luma_bot   = tmp_ptr;
#endif

	}

	vbx_sync();
	vbx_sp_pop();

	return VBW_SUCCESS;
}
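A scalar model of the edge-stopping weight used above (illustration only):

/* Start from w = 255 - |err| and square it in 0.8 fixed point each iteration
 * (VMULHI keeps the high byte), so small errors keep large weights and big
 * errors decay steeply -- a cheap stand-in for exp(-err). */
unsigned w = 255 - err;
for (int s = 0; s < NUM_SHARPEN_ITERATIONS; s++)
	w = (w * w) >> 8;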
Example #22
/** Luma Edge Detection.
 *
 * @brief 3x3 Sobel edge detection with 8-bit luma image
 *
 * @param[out] output      32-bit aRGB edge-intensity output
 * @param[in] input        8-bit luma input
 * @param[in] image_width  Image width in pixels
 * @param[in] image_height Image height in pixels
 * @param[in] image_pitch  Distance in pixels between the start of subsequent rows; usually equal to image_width
 * @param[in] renorm       Number of bits to right-shift the final intensity
 * @returns Negative on error condition. See vbw_exit_codes.h
 */
int vbw_sobel_luma8_3x3(unsigned *output, unsigned char *input, const short image_width, const short image_height, const short image_pitch, const short renorm)
{
	int y;

	vbx_ubyte_t *v_luma_top, *v_luma_mid, *v_luma_bot;
	vbx_uword_t *v_row_out;

	vbx_uhalf_t *v_sobel_row_top, *v_sobel_row_mid, *v_sobel_row_bot;
	vbx_uhalf_t *v_gradient_x, *v_gradient_y;
	vbx_uhalf_t *v_tmp;

	void *tmp_ptr;

	vbx_sp_push();

	// Allocate space in scratchpad for vectors
	rotating_prefetcher_t v_luma=rotating_prefetcher(3,image_width*sizeof(vbx_ubyte_t),
	                                                 input,input+image_height*image_pitch,
	                                                 image_pitch*sizeof(vbx_ubyte_t));
	v_sobel_row_top = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_sobel_row_mid = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_sobel_row_bot = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_gradient_x    = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_gradient_y    = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_row_out       = (vbx_uword_t *)vbx_sp_malloc(image_width*sizeof(vbx_uword_t));
	if(v_row_out==NULL) {
		vbx_sp_pop();
		return VBW_ERROR_SP_ALLOC_FAILED;
	}

	// Transfer the first 3 input rows and interleave first 2 sobel row calculations
	rp_fetch(&v_luma);
	rp_fetch(&v_luma);
	v_luma_top=rp_get_buffer(&v_luma, 0);
	vbw_sobel_3x3_row(v_sobel_row_top, v_luma_top,image_width);
	rp_fetch(&v_luma);
	v_luma_mid=rp_get_buffer(&v_luma, 1);
	vbw_sobel_3x3_row(v_sobel_row_mid, v_luma_mid, image_width);

	// Set top output row to 0
	vbx_set_vl(image_width);
	vbx(SVWU, VMOV, v_row_out, 0, 0);
	vbx_dma_to_host(output, v_row_out, image_width*sizeof(vbx_uword_t));

	// Calculate edges
	for (y = 0; y < image_height-(FILTER_HEIGHT-1); y++) {
		// Transfer the next input row while processing
		rp_fetch(&v_luma);
		v_luma_top=rp_get_buffer(&v_luma,0);
		v_luma_mid=rp_get_buffer(&v_luma,1);
		v_luma_bot=rp_get_buffer(&v_luma,2);
		// Start calculating gradient_x
		vbx_set_vl(image_width);
		vbx(SVBHU, VSHL, v_gradient_x, 1, v_luma_mid); // multiply by 2

		// Calculate gradient_y
		// Apply [1 2 1] matrix to last row in window and calculate absolute difference with pre-computed first row
		vbw_sobel_3x3_row(v_sobel_row_bot, v_luma_bot, image_width);
		vbx(VVH, VABSDIFF, (vbx_half_t*)v_gradient_y, (vbx_half_t*)v_sobel_row_top, (vbx_half_t*)v_sobel_row_bot);

		// Re-use v_sobel_row_top
		v_tmp = v_sobel_row_top;

		// Finish calculating gradient_x
		// Apply [1 2 1]T matrix to all columns
		vbx_set_vl(image_width);
		vbx(VVBHU, VADD, v_tmp, v_luma_top, v_luma_bot);
		vbx(VVHU,  VADD, v_tmp, v_tmp,      v_gradient_x);
		// For each column, calculate absolute difference with 2nd column to the right
		vbx_set_vl(image_width-2);
		vbx(VVH, VABSDIFF, (vbx_half_t*)v_gradient_x, (vbx_half_t*)v_tmp, (vbx_half_t*)v_tmp+2);

		// sum of absolute gradients
		//vbx_set_vl(image_width-2);
		vbx(VVHU, VADD, v_tmp, v_gradient_x,  v_gradient_y);
		vbx(SVHU, VSHR, v_tmp, renorm, v_tmp);

		// Threshold
		vbx(SVHU, VSUB,     v_gradient_y, 255, v_tmp);
		vbx(SVHU, VCMV_LTZ, v_tmp,        255, v_gradient_y);

		// Copy the result to the low byte of the output row
		// Trick to copy the low byte (b) to the middle two bytes as well
		// Note that first and last columns are 0
		//vbx_set_vl(image_width-2);
		vbx(SVHWU, VMULLO, v_row_out+1, 0x00010101, v_tmp);

		// DMA the result to the output
		vbx_dma_to_host(output+(y+1)*image_pitch, v_row_out, image_width*sizeof(vbx_uword_t));


		// Rotate v_sobel_row buffers (for gradient_y)
		tmp_ptr         = (void *)v_sobel_row_top;
		v_sobel_row_top = v_sobel_row_mid;
		v_sobel_row_mid = v_sobel_row_bot;
		v_sobel_row_bot = (vbx_uhalf_t *)tmp_ptr;
	}

	// Set bottom row to 0
	vbx_set_vl(image_width);
	vbx(SVWU, VMOV, v_row_out, 0, 0);
	vbx_dma_to_host(output+(image_height-1)*image_pitch, v_row_out, image_width*sizeof(vbx_uword_t));

	vbx_sync();
	vbx_sp_pop();

	return VBW_SUCCESS;
}
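For reference, the kernels the gradient code above composes (a restatement of the code, not an addition to it):

/*        | -1 0 +1 |           | -1 -2 -1 |
 *   Gx = | -2 0 +2 |      Gy = |  0  0  0 |
 *        | -1 0 +1 |           | +1 +2 +1 |
 * Gy = |[1 2 1]-filtered top row - [1 2 1]-filtered bottom row|;
 * Gx = [1 2 1]^T down the columns, then |difference 2 columns apart|.
 * Output pixel = min(255, (|Gx| + |Gy|) >> renorm), replicated into the
 * low three bytes of the aRGB word by the 0x00010101 multiply. */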
Example #23
int vbw_vec_reverse( vbx_sp_t *v_dst, vbx_sp_t *v_src, const unsigned int N )
{
	const int VBW_ROT16 = sizeof(vbx_sp_t) <= sizeof(vbx_half_t);
	const int VBW_ROT8  = sizeof(vbx_sp_t) == sizeof(vbx_byte_t);
	const int VBW_RSHIFT_T_TO_W = (sizeof(vbx_sp_t)==sizeof(vbx_word_t)? 0:
	                               sizeof(vbx_sp_t)==sizeof(vbx_half_t)? 1: /*byte_sized*/ 2);
	const int VBW_LSHIFT_W_TO_T = VBW_RSHIFT_T_TO_W;

	vbx_mxp_t *this_mxp            = VBX_GET_THIS_MXP();
	const unsigned int NUM_LANES   = this_mxp->vector_lanes;

	// Can the whole vector fit in the scratchpad width?
	if( N < (NUM_LANES << VBW_LSHIFT_W_TO_T) ){
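		// Reverse one element at a time: with VL=1, the negative destination
		// stride in the 2D setup writes v_dst back-to-front while v_src is
		// read front-to-back.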
		vbx_set_vl( 1 );
		vbx_set_2D( N, (int)-sizeof(vbx_sp_t), (int)sizeof(vbx_sp_t), 0 );
		vbxx_2D(VMOV, v_dst+N-1, v_src);
		return VBW_SUCCESS;
	}

	unsigned int threshold_w = (NUM_LANES >= 32 ? VL1_THRESHOLD_V32_UP :
	                            NUM_LANES == 16 ? VL1_THRESHOLD_V16    :
	                            NUM_LANES == 8  ? VL1_THRESHOLD_V8     : UINT_MAX);

	unsigned int N_w          = N >> VBW_RSHIFT_T_TO_W;                  // Equivalent number of words in the vector

	if( N_w && N_w <= threshold_w ) {
		if( VBW_ROT16){
			// remainder of elements that don't add up to a whole word
			unsigned int stub_t = N - (N_w << VBW_LSHIFT_W_TO_T);
			if( stub_t ) {
				vbx_set_vl( 1 );
				vbx_set_2D( stub_t, (int)-sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
				vbxx_2D(VMOV, v_dst+stub_t-1, v_src+N-stub_t);
				v_dst += stub_t;
			}
			vec_rev_rot16_w(v_dst, v_src, N_w);
		}else{
			vec_rev_w(v_dst, v_src, N_w);
		}

		if( VBW_ROT8){
			vec_rot8_h(v_dst, v_dst, N_w*2);
		}
		return VBW_SUCCESS;
	}


	const unsigned int SP_WIDTH_B       = this_mxp->scratchpad_alignment_bytes;
	const unsigned int FREE_BYTES       = vbx_sp_getfree();
	const unsigned int ODD_LOG_SEL      = NUM_LANES & 0x55555555 ? 1 : 0;

	vbx_word_t *v_mask, *v_result;
	vbx_word_t *v_scratch[2] = {0,0};

	unsigned int num_rows_w    = N_w / NUM_LANES;
	unsigned int working_set_w = num_rows_w * NUM_LANES;
	unsigned int tail_t        = N - (working_set_w << VBW_LSHIFT_W_TO_T);
	unsigned int remaining_w   = working_set_w;

	if( tail_t ) {
		vbx_set_vl( 1 );
		vbx_set_2D( tail_t, (int)-sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
		vbxx_2D(VMOV, v_dst+tail_t-1, v_src+N-tail_t);
		v_dst += tail_t;
	}

	vbx_word_t *v_src_w = (vbx_word_t *)v_src;
	vbx_word_t *v_dst_w = (vbx_word_t *)v_dst;

	if(!num_rows_w) {
		return VBW_SUCCESS;
	}

	while( remaining_w*sizeof(vbx_word_t) + SP_WIDTH_B > FREE_BYTES ) {
		if( remaining_w <= threshold_w*2 ) {
			if( VBW_ROT16){
				vec_rev_rot16_w(v_dst_w, v_src_w, remaining_w);
			}else{
				vec_rev_w(v_dst_w, v_src_w, remaining_w);
			}

			if( VBW_ROT8){
				vec_rot8_h(v_dst_w, v_dst_w, remaining_w*2);
			}
			return VBW_SUCCESS;
		}

		working_set_w = VBX_PAD_DN( (remaining_w - NUM_LANES)/2, NUM_LANES );
		v_mask = v_dst_w + (working_set_w*2);
		remaining_w -= working_set_w;

		v_scratch[0] = v_dst_w;
		v_scratch[1] = v_dst_w + working_set_w;
		num_rows_w = working_set_w / NUM_LANES;
		v_result = vec_rev_merge_w( v_scratch[ODD_LOG_SEL], v_src_w + remaining_w, working_set_w,
		                            v_scratch[!ODD_LOG_SEL], v_mask, SP_WIDTH_B, num_rows_w, VBW_ROT16 );
#if !VBX_SKIP_ALL_CHECKS
		if( v_result != v_dst_w ) {
			VBX_PRINTF("Unexpected behavior: merge reverse returned the wrong vector. Parameter order was chosen based on NUM_LANES.");
			VBX_EXIT(-1);
		}
#endif

		if( VBW_ROT8){
			vec_rot8_h(v_result, v_result, working_set_w*2);
		}
		v_dst_w += working_set_w;
	}


	vbx_sp_push();

	v_scratch[0] = v_dst_w;
	v_scratch[1] = (vbx_word_t*)vbx_sp_malloc( remaining_w * sizeof(vbx_word_t) );
#if !VBX_SKIP_ALL_CHECKS
	if( !v_scratch[1] ) {
		VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
		VBX_EXIT(-1);
	}
#endif

	v_mask = (vbx_word_t*)vbx_sp_malloc( SP_WIDTH_B );
#if !VBX_SKIP_ALL_CHECKS
	if( !v_mask ) {
		VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
		VBX_EXIT(-1);
	}
#endif

	num_rows_w = remaining_w / NUM_LANES;
	v_result = vec_rev_merge_w( v_scratch[ODD_LOG_SEL], v_src_w, remaining_w, v_scratch[!ODD_LOG_SEL],
	                            v_mask, SP_WIDTH_B, num_rows_w, VBW_ROT16 );
#if !VBX_SKIP_ALL_CHECKS
	if( v_result != v_dst_w ) {
		VBX_PRINTF("Unexpected behavior: merge reverse returned the wrong vector. Parameter order was chosen based on NUM_LANES.");
		VBX_EXIT(-1);
	}
#endif

	if( VBW_ROT8){
		vec_rot8_h(v_result, v_result, remaining_w*2);
	}
	vbx_sp_pop();
	return VBW_SUCCESS;
}
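
/* A minimal usage sketch (hedged): reversing an N-element vector that is
 * DMA'd through the scratchpad. Assumes the word-sized instantiation of
 * vbx_sp_t; allocation checks and error handling are elided. */
void reverse_example(vbx_word_t *data, unsigned int N)
{
	vbx_sp_push();
	vbx_word_t *v_src = (vbx_word_t*)vbx_sp_malloc( N*sizeof(vbx_word_t) );
	vbx_word_t *v_dst = (vbx_word_t*)vbx_sp_malloc( N*sizeof(vbx_word_t) );
	vbx_dma_to_vector( v_src, data, N*sizeof(vbx_word_t) );
	vbw_vec_reverse( v_dst, v_src, N );            // v_dst[i] = v_src[N-1-i]
	vbx_dma_to_host( data, v_dst, N*sizeof(vbx_word_t) );
	vbx_sync();
	vbx_sp_pop();
}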
Example #24
void vbx_mtx_fdct( vbx_mtx_fdct_t *v, dt *block_v, dt *image,
                   int start_x, int start_y, int end_x, int end_y,
                   int num_tile_x, int num_tile_y )
{
//	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
//	const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
	const int BIG_TILE_SIZE = num_tile_x * num_tile_y * DCT_SIZE;

	int next_x=start_x+1;
	int next_y=start_y;
	int get_next=1;
	if( start_x == end_x   &&   start_y == end_y ) {
		get_next=0;
	}
	if( start_x == end_x ) {
		next_x = 0;
		next_y++;
	} 

	const vbx_half_t *vimageDMA = v->vimage[!v->db]; // dma
//	const vbx_half_t *vblockDMA = v->vblock[!v->db]; // dma // never used directly 

	const vbx_half_t *vimageVPU = v->vimage[ v->db]; // active
	const vbx_half_t *vblockVPU = v->vblock[ v->db]; // active

	const vbx_half_t *vblockTMP = v->vblock[ 2    ]; // temp

	const vbx_half_t *vcoeff    = v->vcoeff;
	const vbx_half_t *vprods    = v->vprods;
	const vbx_half_t *vaccum    = v->vaccum;
	const vbx_half_t *vflags    = v->vflags;

#if DMA
	// First, prefetch the next chunk of the next image for a future call to fdct_tile()
#if NUM_TILE_Y > 1
	if( get_next ) // get row 0
		getBigTileImageY( vimageDMA,
		        image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, 0 );
#else
	if( get_next ) // get row 0
		getBigTileImage( vimageDMA,
		        image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, 0 );
#endif
#endif

	int r;
	for( r=0; r < BLOCK_SIZE; r++ ) {
		// perform multiply of the whole BIG_TILE with row 'r' of the image matrix -- before had dct matrix switching
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE );                                                                                              // for the length of tiled rows
		vbx_set_2D( BLOCK_SIZE, NUM_TILE_X*BLOCK_SIZE*sizeof(dt),                                    0, NUM_TILE_X*BLOCK_SIZE*sizeof(dt) ); // for all rows of tiled coefficients
		vbx_set_3D( NUM_TILE_Y, NUM_TILE_X * DCT_SIZE*sizeof(dt),     NUM_TILE_X * DCT_SIZE*sizeof(dt),                               0  ); // for all groups Y
		vbx_3D( VVH, VMUL,                                vprods, vimageVPU + r*NUM_TILE_X*BLOCK_SIZE,                            vcoeff); // for all 'columns' of tiled data

#if ACCUMULATE
		// accumulate the multiply operations
#if 0 & USE_ACCUM_FLAGS 
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
		vbx( VVH, VADD, vaccum, vprods+0, vprods+1 );
		vbx_set_2D( BLOCK_SIZE-2, 0, 0, sizeof(dt) );
		vbx_2D( VVH, VADD, vaccum, vaccum, vprods+2 );
		vbx( VVH, VCMV_Z, vblockTMP+r, vaccum, vflags );
#elif BLOCK4
                //case DCT 4
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
		vbx( VVH, VADD, vaccum, vprods, vprods+1 );
		vbx( VVH, VADD, vaccum, vaccum, vprods+2 );
		vbx( VVH, VADD, vaccum, vaccum, vprods+3 );
		vbx( VVH, VCMV_Z, vblockTMP+r, vaccum, vflags );
#else
                //correct?
		vbx_set_vl( BLOCK_SIZE );
		vbx_set_2D( BLOCK_SIZE,   NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt) );
		vbx_set_3D( NUM_TILE_X,   BLOCK_SIZE*sizeof(dt),            BLOCK_SIZE*sizeof(dt),            BLOCK_SIZE*sizeof(dt) );
#if NUM_TILE_Y == 1
		vbx_acc_3D( VVH, VOR,   vblockTMP + r,      vprods ,  vprods );
#else
		int y; 
		for (y=0; y< NUM_TILE_Y; y++){
			vbx_acc_3D( VVH, VOR,   vblockTMP + r + y*NUM_TILE_X*DCT_SIZE,      vprods+ y*NUM_TILE_X*DCT_SIZE,  vprods+ y*NUM_TILE_X*DCT_SIZE );
		}
#endif
#endif
#endif

#if 0
// don't do DMA reads here yet; a DMA write may still be in progress, give it a chance to finish
#if DMA
		// every other iteration, prefetch the next row of the next image
		// NB: with 2D DMA, we could issue this as a single DMA request at the top of the file
		// instead, we must intersperse these 1D DMA strips to ensure they don't block the instruction queue
#if NUM_TILE_Y > 1
		if( !(r&1) && get_next )
			getBigTileImageY( vimageDMA,
			                  image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE,
			                  (1+((r-1)>>1)) ); //BLOCK_SIZE/2 rows added
#else
		if( !(r&1) && get_next )
			getBigTileImage( vimageDMA,
			                 image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE,
			                 (1+((r-1)>>1)) ); //BLOCK_SIZE/2 rows added
#endif
#endif
#endif
	}

	vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE );
	vbx( SVH, VSHR, vblockTMP, SHIFT_AMOUNT, vblockTMP );

	// now do the transposed version

	for( r=0; r < BLOCK_SIZE; r++ ) {
		// perform multiply of the whole BIG_TILE with row 'r' of the image matrix -- before had dct matrix switching
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE );                                                                                              // for the length of tiled rows
		vbx_set_2D( BLOCK_SIZE, NUM_TILE_X * BLOCK_SIZE*sizeof(dt),     NUM_TILE_X * BLOCK_SIZE*sizeof(dt),                            0 ); // for all 'columns' of tiled data 
		vbx_set_3D( NUM_TILE_Y, NUM_TILE_X * DCT_SIZE*sizeof(dt),       NUM_TILE_X * DCT_SIZE*sizeof(dt),                              0 ); // for all groups Y
		vbx_3D( VVH, VMUL,                             vprods,                        vblockTMP,  vcoeff + r*NUM_TILE_X*BLOCK_SIZE); // for all rows of tiled coefficients

#if ACCUMULATE
		// accumulate the multiply operations
#if 0 & USE_ACCUM_FLAGS
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
		vbx( VVH, VADD, vaccum, vprods+0, vprods+1 );
		vbx_set_2D( BLOCK_SIZE-2, 0, 0, sizeof(dt) );
		vbx_2D( VVH, VADD, vaccum, vaccum, vprods+2 );
		vbx( VVH, VCMV_Z, vblockVPU+r, vaccum,   vflags );

#elif BLOCK4
		//case DCT 4
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
		vbx( VVH, VADD, vaccum, vprods, vprods+1 );
		vbx( VVH, VADD, vaccum, vaccum, vprods+2 );
		vbx( VVH, VADD, vaccum, vaccum, vprods+3 );
		//vbx( VVH, VCMV_Z, vblockVPU+r, vaccum, vflags );
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE - (BLOCK_SIZE-1) );                    // for the length of a tiled row
		vbx_set_2D( BLOCK_SIZE, 1*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), 0);// for all tiled rows 
#if NUM_TILE_Y == 1
		vbx_2D(VVH, VCMV_Z, vblockVPU+r*NUM_TILE_X*BLOCK_SIZE, vaccum, vflags  );  // 
#else
		int y;
		for (y=0; y< NUM_TILE_Y; y++){
			vbx_2D(VVH, VCMV_Z, vblockVPU+r*NUM_TILE_X*BLOCK_SIZE + y*NUM_TILE_X*DCT_SIZE , vaccum+y*NUM_TILE_X*DCT_SIZE, vflags  );  // 
		}
#endif
#else
		//correct?
		vbx_set_vl( BLOCK_SIZE );                                                                                              // for the length of a row
		vbx_set_2D( BLOCK_SIZE,   sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt) );            // for all rows in that block
		vbx_set_3D( NUM_TILE_X,   BLOCK_SIZE*sizeof(dt),            BLOCK_SIZE*sizeof(dt),            BLOCK_SIZE*sizeof(dt) ); // for all tiled blocks horizontally(x)
#if NUM_TILE_Y == 1
		vbx_acc_3D( VVH, VOR,   vblockVPU + r*NUM_TILE_X*BLOCK_SIZE ,    vprods ,  vprods );
#else
		int y;
		for (y=0; y< NUM_TILE_Y; y++){ 
			vbx_acc_3D( VVH, VOR,   vblockVPU + r*NUM_TILE_X*BLOCK_SIZE + y*NUM_TILE_X*DCT_SIZE,      vprods+ y*NUM_TILE_X*DCT_SIZE,  vprods+ y*NUM_TILE_X*DCT_SIZE );
		}
#endif
#endif
#endif

#if DMA
		// every other iteration, prefetch the next row of the next image
		// NB: with 2D DMA, we could issue this as a single DMA request at the top of the file
		// instead, we must intersperse these 1D DMA strips to ensure they don't block the instruction queue
#if NUM_TILE_Y > 1
		//if( !(r&1) && r<(BLOCK_SIZE-1)  && get_next )
		if( get_next )
			getBigTileImageY( 
			                  vimageDMA,
			                  image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE,
			                  r );
			                  //(BLOCK_SIZE/2 +1+((r-1)>>1)) ); // BLOCK/2 -1 rows
#else
		//if( !(r&1) && r<(BLOCK_SIZE-1)  && get_next )
		if( get_next )
			getBigTileImage( vimageDMA,
			                 image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE,
			                 r );
			                 //(BLOCK_SIZE/2 +1+((r-1)>>1)) ); // BLOCK/2 -1 rows
#endif
#endif
	}

	vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE );
	vbx( SVH, VSHR, vblockVPU, SHIFT_AMOUNT, vblockVPU );
#if DMA2
	// Write result back to memory as one big block
	vbx_dma_to_host( block_v, vblockVPU, BIG_TILE_SIZE*sizeof(dt) );
#endif 

	v->db = !v->db;
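	// (double buffering: the buffer just filled by DMA becomes the active
	//  compute buffer on the next call, and vice versa)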
#ifdef DEBUG 
	{
		vbx_sync();
		int i,j;
		printf("%d\n", !db);
		for(i=0;i<BLOCK_SIZE*NUM_TILE_Y;i++){
			for(j=0;j<BLOCK_SIZE*NUM_TILE_X;j++){
				printf(" %4d", block_v[i*BLOCK_SIZE*NUM_TILE_X+j]);
			}
			printf("\n");
		}
	}
#endif
}
Example #25
/** Luma Edge Detection
 *
 * @brief 3x3 Sobel edge detection with 32-bit aRGB image
 *
 * @param[out] output      32-bit aRGB edge-intensity output
 * @param[in] input        32-bit aRGB input
 * @param[in] image_width  Image width in pixels
 * @param[in] image_height Image height in pixels
 * @param[in] image_pitch  Distance in pixels between the starts of subsequent rows; usually equal to image_width
 * @param[in] renorm       Number of bits to shift the final intensity to the right
 * @returns Negative on error condition. See vbw_exit_codes.h
 */
int vbw_sobel_argb32_3x3_partial(unsigned *output, unsigned *input, const short image_width, const short image_height, const short image_pitch, const short renorm)
{

	int y;

	vbx_uword_t *v_row_in;
	vbx_uhalf_t *v_luma_top, *v_luma_mid, *v_luma_bot;
	vbx_uword_t *v_row_out;

	vbx_uhalf_t *v_sobel_row_top, *v_sobel_row_mid, *v_sobel_row_bot;
	vbx_uhalf_t *v_gradient_x, *v_gradient_y;
	vbx_uhalf_t *v_tmp;

	void *tmp_ptr;

	vbx_sp_push();

	// Allocate space in scratchpad for vectors
	struct rotating_prefetcher_t v_row_db=rotating_prefetcher(1,image_width*sizeof(vbx_uword_t),
	                                                          input,input+image_pitch*image_width,
	                                                          image_pitch*sizeof(vbx_uword_t));

	v_luma_top      = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_luma_mid      = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_luma_bot      = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_sobel_row_top = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_sobel_row_mid = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_sobel_row_bot = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_row_out       = (vbx_uword_t*)vbx_sp_malloc(image_width*sizeof(vbx_uword_t));

	if(v_row_out==NULL){
		vbx_sp_pop();
		return VBW_ERROR_SP_ALLOC_FAILED;
	}

	// Re-use v_sobel_row_bot as v_tmp
	v_tmp = v_sobel_row_bot;

	// Transfer the first 3 input rows and interleave first 2 rgb2luma and first 2 sobel row calculations
	rp_fetch(&v_row_db);
	rp_fetch(&v_row_db);
	v_row_in=rp_get_buffer(&v_row_db,0);
	vbw_rgb2luma(v_luma_top, v_row_in, v_tmp, image_width);         // 1st luma row
	vbw_sobel_3x3_row(v_sobel_row_top, v_luma_top, image_width);    // 1st partial sobel row
	rp_fetch(&v_row_db);
	v_row_in=rp_get_buffer(&v_row_db,0);
	vbw_rgb2luma(v_luma_mid, v_row_in, v_tmp, image_width);         // 2nd luma row
	vbw_sobel_3x3_row(v_sobel_row_mid, v_luma_mid, image_width);    // 2nd partial sobel row

	// Set top output row to 0
	vbx_set_vl(image_width);
	vbx(SVWU, VMOV, v_row_out, 0, 0);
	vbx_dma_to_host(output, v_row_out, image_width*sizeof(vbx_uword_t));

	// Calculate edges
	for (y = 0; y < image_height-(FILTER_HEIGHT-1); y++) {
		// Transfer the next input row while processing
		rp_fetch(&v_row_db);
		v_row_in=rp_get_buffer(&v_row_db,0);
		// Re-use v_sobel_row_bot as v_tmp
		v_tmp = v_sobel_row_bot;

		// Convert aRGB input to luma
		vbw_rgb2luma(v_luma_bot, v_row_in, v_tmp, image_width);
		// Done with v_row_in; re-use for v_gradient_x and v_gradient_y (be careful!)
		v_gradient_x = (vbx_uhalf_t *)v_row_in;
		v_gradient_y = (vbx_uhalf_t *)v_row_in + image_width;

		// Calculate gradient_x
		// Apply [1 2 1]T matrix to all columns
		vbx_set_vl(image_width);
		vbx(SVHU, VSHL, v_gradient_x, 1,          v_luma_mid); // multiply by 2
		vbx(VVHU, VADD, v_tmp,        v_luma_top, v_luma_bot);
		vbx(VVHU, VADD, v_tmp,        v_tmp,      v_gradient_x);
		// For each column, calculate absolute difference with 2nd column to the right
		vbx_set_vl(image_width-2);
		vbx(VVH, VABSDIFF, (vbx_half_t*)v_gradient_x, (vbx_half_t*)v_tmp, (vbx_half_t*)v_tmp+2);

		// Calculate gradient_y
		// Apply [1 2 1] matrix to last row in window and calculate absolute difference with pre-computed first row
		vbw_sobel_3x3_row(v_sobel_row_bot, v_luma_bot, image_width);
		vbx(VVH, VABSDIFF, (vbx_half_t*)v_gradient_y, (vbx_half_t*)v_sobel_row_top, (vbx_half_t*)v_sobel_row_bot);

		// Re-use v_sobel_row_top as v_tmp
		v_tmp = v_sobel_row_top;

		// Sum of absolute gradients
		vbx_set_vl(image_width-2);
		vbx(VVHU, VADD, v_tmp, v_gradient_x,  v_gradient_y);
		vbx(SVHU, VSHR, v_tmp, renorm, v_tmp);

		// Threshold
		vbx(SVHU, VSUB,     v_gradient_y, 255, v_tmp);
		vbx(SVHU, VCMV_LTZ, v_tmp,        255, v_gradient_y);

		// Copy the result to the low byte of the output row
		// Trick to copy the low byte (b) to the middle two bytes as well
		// Note that first and last columns are 0
		vbx_set_vl(image_width-2);
		vbx(SVHWU, VMULLO, v_row_out+1, 0x00010101, v_tmp);

		// DMA the result to the output (minus the outside two pixels)
		vbx_dma_to_host(output+(y+1)*image_pitch+1, v_row_out+1, (image_width-2)*sizeof(vbx_uword_t));

		// Rotate luma buffers
		tmp_ptr      = (void *)v_luma_top;
		v_luma_top   = v_luma_mid;
		v_luma_mid   = v_luma_bot;
		v_luma_bot   = (vbx_uhalf_t *)tmp_ptr;

		// Rotate v_sobel_row buffers (for gradient_y)
		tmp_ptr         = (void *)v_sobel_row_top;
		v_sobel_row_top = v_sobel_row_mid;
		v_sobel_row_mid = v_sobel_row_bot;
		v_sobel_row_bot = (vbx_uhalf_t *)tmp_ptr;
	}

	// Set bottom row to 0
	vbx_set_vl(image_width);
	vbx(SVWU, VMOV, v_row_out, 0, 0);
	vbx_dma_to_host(output+(image_height-1)*image_pitch, v_row_out, image_width*sizeof(vbx_uword_t));

	vbx_sp_pop();

	return VBW_SUCCESS;
}
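
/* A minimal usage sketch (hedged): running the detector on a WIDTH x HEIGHT
 * aRGB frame in shared memory. WIDTH, HEIGHT and the renorm value are
 * illustrative; a final vbx_sync() is added because the function above
 * returns while its last DMA writes may still be in flight. */
void sobel_example(void)
{
	const short WIDTH = 640, HEIGHT = 480;
	unsigned *in  = (unsigned*)vbx_shared_malloc( WIDTH*HEIGHT*sizeof(unsigned) );
	unsigned *out = (unsigned*)vbx_shared_malloc( WIDTH*HEIGHT*sizeof(unsigned) );
	/* ... fill 'in' with aRGB pixels ... */
	if( vbw_sobel_argb32_3x3_partial( out, in, WIDTH, HEIGHT, WIDTH, 3 ) != VBW_SUCCESS ) {
		/* scratchpad allocation failed */
	}
	vbx_sync();
	vbx_shared_free( in );
	vbx_shared_free( out );
}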
Example #26
int vbw_mtx_xp_ext(vbx_mm_t *out, vbx_mm_t *in, const int INROWS, const int INCOLS )
{
	typedef vbx_mm_t vbx_sp_t;

	int elements = INROWS * INCOLS;

	if(elements < SCALAR_THRESHOLD) {
		vbx_sync();  // in case the input is waiting on a DMA transfer
		int i,j;
		for(i = 0; i < INROWS; i++) {
			for(j = 0; j < INCOLS; j++) {
				out[j*INROWS+i] = in[i*INCOLS+j];
			}
		}
		return VBW_SUCCESS;
	}

	vbx_sp_push();

	vbx_sp_t *v_in;
	vbx_sp_t *v_out;

	int tile_height     = 0;
	int tile_width      = 0;
	int prev_tile_width = 0;
	int tile_y          = 0;
	int tile_x          = 0;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	int SP_WIDTH_B = this_mxp->scratchpad_alignment_bytes;
	int SP_SIZE = vbx_sp_getfree();
	int max_sp_elements   = vbx_sp_getfree() / sizeof(vbx_sp_t);
	int max_tile_elements = VBX_PAD_DN( SP_SIZE/2, SP_WIDTH_B ) / sizeof(vbx_sp_t);


	if( INROWS == 1 || INCOLS == 1 ) {           // 1D transpose becomes a simple copy operation
		if( elements <= max_sp_elements ) {      // We can use the whole scratchpad for this
			v_in = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
			vbx_dma_to_vector( v_in, in, elements*sizeof(vbx_mm_t) );
			v_out = v_in;
			vbx_dma_to_host( out, v_out, elements*sizeof(vbx_mm_t) );
		} else {                                 // To test this, you'll need a very large 1D matrix (or a small SP)
			tile_width = max_sp_elements;
			v_in = (vbx_sp_t*)vbx_sp_malloc( tile_width * sizeof(vbx_sp_t) );
			for (tile_x = 0; tile_x < elements; tile_x += tile_width) {
				if( tile_x + tile_width > elements) tile_width = elements - tile_x;
				vbx_dma_to_vector( v_in, in + tile_x, tile_width*sizeof(vbx_mm_t) );
				v_out = v_in;
				vbx_dma_to_host( out+tile_x, v_out, tile_width*sizeof(vbx_mm_t) );
			}
		}
	} else if( elements < max_tile_elements ) {  // Matrix is small enough to handle entirely in SP
		v_in  = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
		v_out = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );

		vbx_dma_to_vector( v_in, in, elements*sizeof(vbx_mm_t) );

		vbw_mtx_xp(v_out,v_in,INROWS,INCOLS);

		vbx_dma_to_host( out, v_out, elements*sizeof(vbx_mm_t) );
	} else {                                     // At this point we know at least one full tile will be needed
		#define QUICK_A_LANES_THRESHOLD 8        // Use merge transpose if there are at least this many lanes
		#define QUICK_A_TILE_WIDTH 128
		#define QUICK_A_TILE_ELEMENTS (QUICK_A_TILE_WIDTH*QUICK_A_TILE_WIDTH)
		#define QUICK_A_VF_ELEMENTS (QUICK_A_TILE_ELEMENTS/2)
		#define QUICK_A_REQ_ELEMENTS (2*VBX_PAD_UP(QUICK_A_TILE_ELEMENTS,SP_WIDTH_B/sizeof(vbx_sp_t)) + VBX_PAD_UP(QUICK_A_VF_ELEMENTS,sizeof(vbx_sp_t)))

		#define QUICK_B_LANES_THRESHOLD 16        // Use smaller merge transpose tile only if there are a lot of lanes
		#define QUICK_B_TILE_WIDTH 64             //     and only if larger tile A size cannot be used.
		#define QUICK_B_TILE_ELEMENTS (QUICK_B_TILE_WIDTH*QUICK_B_TILE_WIDTH)
		#define QUICK_B_VF_ELEMENTS (QUICK_B_TILE_ELEMENTS/2)
		#define QUICK_B_REQ_ELEMENTS (2*VBX_PAD_UP(QUICK_B_TILE_ELEMENTS,SP_WIDTH_B/sizeof(vbx_sp_t)) + VBX_PAD_UP(QUICK_B_VF_ELEMENTS,sizeof(vbx_sp_t)))

		int NUM_LANES = this_mxp->vector_lanes;
		int DMA_BYTES = this_mxp->dma_alignment_bytes;
		int min_tile_dim = DMA_BYTES / sizeof(vbx_sp_t);

		vbx_sp_t *v_out_sel;
		vbx_sp_t *vf = 0;

		if( NUM_LANES >= QUICK_A_LANES_THRESHOLD       // Check for appropriate conditions to use merge transpose tiles
					&& INCOLS >= QUICK_A_TILE_WIDTH
					&& INROWS >= QUICK_A_TILE_WIDTH
			&& (unsigned)max_sp_elements >= QUICK_A_REQ_ELEMENTS ) {
			tile_width = tile_height = QUICK_A_TILE_WIDTH;
			vf = (vbx_sp_t *)vbx_sp_malloc( QUICK_A_VF_ELEMENTS * sizeof(vbx_sp_t));
		} else if( NUM_LANES >= QUICK_B_LANES_THRESHOLD
					&& INCOLS >= QUICK_B_TILE_WIDTH
					&& INROWS >= QUICK_B_TILE_WIDTH
			&& (unsigned)max_sp_elements >= QUICK_B_REQ_ELEMENTS ) {
			tile_width = tile_height = QUICK_B_TILE_WIDTH;
			vf = (vbx_sp_t *)vbx_sp_malloc( QUICK_B_VF_ELEMENTS * sizeof(vbx_sp_t));
		} else {
			findTileSize( &tile_height, &tile_width, INROWS, INCOLS, max_tile_elements, min_tile_dim );
		}

		prev_tile_width = tile_width;

		v_in  = (vbx_sp_t*)vbx_sp_malloc( tile_height*tile_width * sizeof(vbx_sp_t) );
		v_out = (vbx_sp_t*)vbx_sp_malloc( tile_height*tile_width * sizeof(vbx_sp_t) );


		if( v_out==NULL ) {
			vbx_sp_pop();
			return VBW_ERROR_SP_ALLOC_FAILED;
		}

		vbx_sp_t *v[2] = { v_in, v_out };

		tile_y = 0;                              // Reset y position for new col
		while( tile_y < INROWS ) {
			vbx_set_2D( tile_width, tile_height*sizeof(vbx_sp_t), sizeof(vbx_sp_t), sizeof(vbx_sp_t) );
			vbx_set_3D( tile_height, sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t) );
			tile_x = 0;                          // Reset x position for new row
			while( tile_x < INCOLS ) {

				vbx_dma_to_vector_2D(
						v_in,
						in+(tile_y*INCOLS)+tile_x,
						tile_width*sizeof(vbx_mm_t),
						tile_height,
						tile_width*sizeof(vbx_sp_t),
						INCOLS*sizeof(vbx_mm_t) );

				v_out_sel = v_out;                         // select v_out as default vector to DMA to MM

				/* *** merge transpose (matrix must be square and a power of 2 wide) *** */
				if( vf && tile_width == tile_height
							&& (tile_width==QUICK_A_TILE_WIDTH || tile_width==QUICK_B_TILE_WIDTH) ) {
					int src = 0;
					int n;
					for( n=1; n<tile_width; n *= 2 ) {     // can't do 1st iteration until entire tile is DMA'd in
						const int nn = 2*n;

						// copy the destination matrix
						vbx_set_vl( tile_width*tile_width );    // use v_in & v_out as working matrices (clobber v_in)
						vbxx(  VMOV, v[!src], v[src]);

						// do the work
						vbx_set_vl( n*tile_width );
						vbxx( VAND, vf, n, (vbx_enum_t*)0 );           // mask for merging: 0101010... then 00110011...
						vbx_set_2D( tile_width/nn, nn*tile_width*sizeof(vbx_sp_t), nn*tile_width*sizeof(vbx_sp_t), 0 );
						vbxx_2D( VCMV_Z, v[!src]+n*tile_width, v[src]+n           , vf );
						vbxx_2D( VCMV_Z, v[!src]+n,            v[src]+n*tile_width, vf );

						src = !src;
					}

					v_out_sel = v[src];     // depending on the size of the mtx, the final result may be in v_in or v_out
				} else {
					vbx_set_vl( 1 );        // 2D and 3D will be set by the x and y edge conditions, even using merge
					vbxx_3D(VMOV, v_out, v_in );
				}

				vbx_dma_to_host_2D(
						out+(tile_x*INROWS)+tile_y,
						v_out_sel,
						tile_height*sizeof(vbx_mm_t),
						tile_width,
						INROWS*sizeof(vbx_mm_t),
						tile_height*sizeof(vbx_sp_t) );

				tile_x += tile_width;                 // Set up width for next tile
				if( tile_x + tile_width > INCOLS ) {  // Temporarily reduce tile width when reaching right edge of matrix
					tile_width = INCOLS - tile_x;
					vbx_set_2D( tile_width, tile_height*sizeof(vbx_sp_t), sizeof(vbx_sp_t), sizeof(vbx_sp_t) );
					vbx_set_3D( tile_height, sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t) );
				}
			}
			tile_y += tile_height;                    // Set up width and height for next row of tiles
			tile_width = prev_tile_width;             // Restore original tile width for next row of tiles

			/* *** Permanently reduce tile height when reaching bottom of matrix *** */
			tile_height = ( tile_y + tile_height > INROWS ) ? INROWS - tile_y : tile_height;
		}
	}
	vbx_sp_pop();
	vbx_sync();
	return VBW_SUCCESS;
}
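
/* A minimal usage sketch (hedged): transposing a ROWS x COLS matrix of
 * vbx_mm_t elements held in shared memory. Dimensions are illustrative. */
void transpose_example(void)
{
	const int ROWS = 512, COLS = 384;
	vbx_mm_t *in  = (vbx_mm_t*)vbx_shared_malloc( ROWS*COLS*sizeof(vbx_mm_t) );
	vbx_mm_t *out = (vbx_mm_t*)vbx_shared_malloc( ROWS*COLS*sizeof(vbx_mm_t) );
	/* ... fill 'in' in row-major order ... */
	if( vbw_mtx_xp_ext( out, in, ROWS, COLS ) == VBW_SUCCESS ) {
		/* out is now COLS x ROWS: out[j*ROWS+i] == in[i*COLS+j] */
	}
	vbx_shared_free( in );
	vbx_shared_free( out );
}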
Example #27
// Vector version of the image blend (weighted average of two input images)
void vector_blend(
    output_pointer img_out, input_pointer img_in1, input_pointer img_in2,
    unsigned int num_row, unsigned int num_column, intermediate_type blending_const )
{
    intermediate_type *v_img1[2];
    input_type        *v_img2[2];
    intermediate_type *v_temp;

    intermediate_type blending_const_bar = 256-blending_const;
    int j;

    vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
    const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
    const int VBX_WIDTH_BYTES     = this_mxp->vector_lanes * sizeof(int);
    const int VBX_DMA_ALIGNMENT   = this_mxp->dma_alignment_bytes;

    unsigned int chunk_size = VBX_SCRATCHPAD_SIZE/((3*sizeof(intermediate_type))+(2*sizeof(input_type)));
    chunk_size = VBX_PAD_UP( chunk_size-(VBX_WIDTH_BYTES-1), VBX_DMA_ALIGNMENT );

    unsigned int chunk_size_old    = chunk_size;
    unsigned int vector_length     = chunk_size;
    unsigned int vector_length_old = vector_length;

    v_img1[0] = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) );
    v_img1[1] = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) );
    v_img2[0] = (input_type        *)vbx_sp_malloc( chunk_size*sizeof(input_type) );
    v_img2[1] = (input_type        *)vbx_sp_malloc( chunk_size*sizeof(input_type) );
    v_temp    = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) );

    if( v_temp == NULL ) {
        VBX_EXIT(0xBADDEAD);
    }

    int bufselect = 0;

    vbx_dma_to_vector( v_img1[bufselect], img_in1, chunk_size*sizeof(input_type) );
    vbx_dma_to_vector( v_img2[bufselect], img_in2, chunk_size*sizeof(input_type) );

    for( j=0; j<num_row*num_column; j+=vector_length_old ) {
        vbx_set_vl(vector_length);

        if( j > 0 ) {
            vbx_dma_to_host( img_out+j-vector_length_old, v_img1[1-bufselect], chunk_size_old*sizeof(output_type) );
        }

        if( (j+vector_length_old) < (num_row*num_column-1) ) {
            if( (j+vector_length_old*2) >= num_row*num_column ) {
                vector_length =  num_row*num_column - j - vector_length_old;
                chunk_size = vector_length;
            }
            vbx_dma_to_vector( v_img1[1-bufselect], img_in1+j+vector_length_old, chunk_size*sizeof(input_type) );
            vbx_dma_to_vector( v_img2[1-bufselect], img_in2+j+vector_length_old, chunk_size*sizeof(input_type) );
        }

        vbx( SVBHU, VMULLO, v_temp,            blending_const,     v_img1[bufselect] );
        vbx( SVBHU, VMULLO, v_img1[bufselect], blending_const_bar, v_img2[bufselect] );
        vbx( VVHU,  VADD,   v_img1[bufselect], v_img1[bufselect],  v_temp );
        vbx( SVHBU, VSHR,   v_img1[bufselect], 8,                  v_img1[bufselect] );

        bufselect = 1-bufselect;
    }

    vbx_dma_to_host( img_out+j-vector_length_old, v_img1[1-bufselect], chunk_size*sizeof(output_type) );
    vbx_sp_free();
    vbx_sync();
}
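
/* For reference: a scalar model of the per-pixel blend computed above
 * (assumption: 8-bit pixels and a blending constant in [0,256]). */
static inline unsigned char scalar_blend( unsigned char a, unsigned char b,
                                          unsigned int blending_const )
{
    /* out = (c*a + (256-c)*b) / 256, a fixed-point weighted average */
    return (unsigned char)(( blending_const*a + (256-blending_const)*b ) >> 8);
}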
Example #28
template<typename T>
inline vector_mask_obj::vector_mask_obj(const VBX::Vector<T>& msk)
{
	vbx_set_vl(msk.size);
	constructor(msk);
}
Example #29
File: test.c Project: 8l/mxp
int compare_vbx_lut_to_vbx_lut_ci(int stage, int max_print_errors)
{
	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	int vci_lanes = this_mxp->vcustom0_lanes;
    int sz = this_mxp->scratchpad_size/(16*sizeof(vbx_ubyte_t));

    vbx_byte_t* v_pass = (vbx_byte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_pattern = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_lutc = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_group = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_sel = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_lut = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t));
    vbx_ubyte_t* v_idx = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t));
    if(v_idx == NULL) {
        printf("failed to allocate in compare_vbx_lut_to_vbx_lut_ci\n");
    }

    unsigned char* lut = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char));
    unsigned char* lut_c = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char));

    int f, n, s, errors = 0;
    for (n = 0; n < sz; n++) {
        v_pattern[n] = (n & 0xff);
    }

    for (f = 0; f < face_lbp[stage].count; f++) {
        lbp_feat_t feat = face_lbp[stage].feats[f];

        vbx_set_vl(sz);
        int total = f;
        s = 0;
        while(s < stage){
            total += face_lbp[s].count;
            s++;
        }

        if(total < 256) {
            vbx(SVBU, VLBPLUT, v_lutc, total, v_pattern);
        } else {
            vbx(SVBS, VLBPLUT, v_lutc, total-256, v_pattern);
        }

        vbx(SVB, VMOV, v_pass, feat.fail, 0);
        /* check if pattern is in lut */
        vbx(SVBU, VSHR, v_group, 5, v_pattern);
        for (n = 0; n < 8; n++) {
            vbx(SVB, VADD, v_sel, -n, v_group);
            vbx(SVBW, VCMV_Z, v_lut, feat.lut[n], v_sel);
        }

        vbx(SVBWU, VAND, v_idx, 0x1f, v_pattern);
        vbx(VVWB, VSHR, v_lut, v_idx, v_lut);
        vbx(SVB, VAND, v_lut, 1, v_lut);
        vbx(SVB, VCMV_LEZ, v_pass, feat.pass, v_lut);

        vbx_dma_to_host(lut_c, v_lutc, sz*sizeof(unsigned char));
        vbx_dma_to_host(lut, v_pass, sz*sizeof(unsigned char));
        vbx_sync();

        errors += match_array_byte(lut, lut_c, "custom_lut", sz, 1, 0, max_print_errors, 0, 0);

    }
    vbx_sp_free();
    vbx_shared_free(lut);
    vbx_shared_free(lut_c);
    return errors;
}
Example #30
/* takes in precomputed bfly */
static int vector_fix_fft_dif_long_fly(short fr[], short fi[], short fr2[], short fi2[], short tw_r[], short tw_i[], short m, short inverse, short real)
{
	int i, j, l, k, scale, shift, a1,a2,bfly,mul,flight,swap,row_num;
	short  wr, wi;

	vptr_half v_fr, v_fi, v_fr2, v_fi2, v_tmp;
	vptr_half v_twr, v_twi;
	vptr_half v_arp, v_aip, v_brp, v_bip, v_crp, v_cip;
	vptr_half v_temp;
	vptr_half v_twr2, v_twi2;
	const int n = 1 << m;
	const int half = n >> 1;

	scale = 0;
	mul = 0;
	swap = m >> 1;

	l = m-1;
	flight = 1;
	bfly = half;

	const int INROWS = 1<<swap;
	const int INCOLS = 1<<(m-swap);

	if ( !(m%2) ){
		swap--;
	}

	// allocate space in vector memory for vectors
	v_fr  = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
	v_fi  = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
	v_fr2 = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
	v_fi2 = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );

	v_twr   = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	v_twi   = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	v_temp  = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );

	if( v_fr  == NULL || v_fi == NULL  || v_fr2 == NULL || v_fi2== NULL  || \
	    v_twr == NULL || v_twi == NULL || v_temp == NULL) {
	 	VBX_EXIT(-1);
	}

	v_twr2  = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	v_twi2  = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	if( v_twr2 == NULL || v_twi2 == NULL) {
	 	VBX_EXIT(-1);
	}
	vbx_dma_to_vector( v_fr, fr, n*sizeof(vbx_half_t) );
	vbx_dma_to_vector( v_fi, fi, n*sizeof(vbx_half_t) );
	vbx_dma_to_vector( v_twr, tw_r, half*sizeof(vbx_half_t) );
	vbx_dma_to_vector( v_twi, tw_i, half*sizeof(vbx_half_t) );

#if 1
        if(real){
            vector_fix_fft_untangle_real_scratch( v_fr, v_fi, v_fr2, v_fi2, v_twr,v_twi, m, inverse);
        }
#endif

	while (l > swap) {
		if (inverse) {
			// variable scaling, depending upon data
			shift = 0;
			if( isAbsOutOfRangeV(v_fr,v_fi,v_temp,n) ) {
				shift = 1;
				scale++;
			}
		} else {
			// fixed scaling, for proper normalization
			// -- overall factor of 1/n, distributed to maximize arithmetic accuracy
			shift = 1;
		}
		// shift will be performed on each data point exactly once during pass

		SWAP( v_fr, v_fr2, v_tmp );
		SWAP( v_fi, v_fi2, v_tmp );

		if (shift){
			vbx_set_vl( n );
			vbx(SVH,VSHR,  v_fr2, 1,  v_fr2 );
			vbx(SVH,VSHR,  v_fi2, 1,  v_fi2 );
		}

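		// Decimation-in-frequency butterfly, applied across all 'flight' groups:
		//   first half:  a = x + y
		//   second half: b = (x - y) * w   (complex multiply by the twiddle)
		// The real and imaginary parts are computed by the 2D ops below.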
		vbx_set_vl( 1<<l );
		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l+1), sizeof(vbx_half_t)*(1<<l+1) );
		vbx_2D( VVH, VADD, v_fr, v_fr2, v_fr2 + (1<<l) );
		vbx_2D( VVH, VADD, v_fi, v_fi2, v_fi2 + (1<<l) );

		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l+1), sizeof(vbx_half_t)*(1<<l+1), sizeof(vbx_half_t)*(1<<l+1) );
		vbx_2D( VVH, VSUB, v_fr2, v_fr2, v_fr2 + (1<<l) );
		vbx_2D( VVH, VSUB, v_fi2, v_fi2, v_fi2 + (1<<l) );

		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l+1), 0 );
		vbx_2D( VVH, VMULFXP, &v_fr[n>>1],  v_fr2,      v_twr );
		vbx_2D( VVH, VMULFXP,  v_temp,      v_fi2,      v_twi );

		vbx_set_vl( n>>1 ); // vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l) );
		vbx( VVH, VSUB, &v_fr[n>>1], &v_fr[n>>1], v_temp );

		vbx_set_vl( 1<<l );
		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l+1), 0 );
		vbx_2D( VVH, VMULFXP, &v_fi[n>>1],  v_fi2,      v_twr );
		vbx_2D( VVH, VMULFXP,  v_temp,      v_fr2,      v_twi );

		vbx_set_vl( n>>1 ); //vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l) );
		vbx( VVH, VADD,    &v_fi[n>>1], &v_fi[n>>1], v_temp );

		l--;
		mul++;
		flight <<= 1;

		if( l > swap ) {
			vbx_set_vl( 1<<l );
			vbx( VVWH, VMOV, v_twr, v_twr, 0 );
			vbx( VVWH, VMOV, v_twi, v_twi, 0 );
		}
	}

	if ( !(m%2) ) {
		l++;
		flight >>=1;
	}