Example #1
/** VBX Motion Estimation, using vbx_3d ops.
 * vbw_mtx_motest_3D_byte_setup should be run prior to running this function.
 * Using bytes as input data. block_height must be an even number.
 *
 * @param[out] result  SAD results, search_width values per search row
 * @param[in]  x       reference block (row pitch of image_width elements)
 * @param[in]  y       search region (row pitch of image_width elements)
 * @param[in]  m       motion estimation state prepared by the setup function
 * @returns negative on error condition. See vbw_exit_codes.h
 */
int vbw_mtx_motest_3d_byte(output_type *result, input_type* x, input_type *y, vbw_motest_t *m)
{

	int  l,j;
	int sub_block_width      = m->block_width+m->search_width;

	for( j = 0; j < m->block_height; j++ ) {
		vbx_dma_to_vector( m->v_block+j*m->block_width, x+j*m->image_width, m->block_width*sizeof(input_type) );
	}
	for( j = 0; j < m->block_height+m->search_height; j++ ) {
		vbx_dma_to_vector( m->v_img+j*sub_block_width, y+j*m->image_width, sub_block_width*sizeof(input_type) );
	}

	vbx_set_3D( m->search_width, m->block_height*sizeof(intermediate_type), sizeof(input_type), 0 );

	for( l = 0; l < m->search_height; l++ ) {
		//Accumulate each row into a vector of row SADs
		vbx_set_vl( m->block_width );
		vbx_set_2D( m->block_height, sizeof(intermediate_type), sub_block_width*sizeof(input_type), m->block_width*sizeof(input_type) );
		vbx_acc_3D( VVBHU, VABSDIFF, m->v_row_sad, m->v_img+l*sub_block_width, m->v_block );

		//Accumulate the SADs
		vbx_set_vl( m->block_height/2 );
		vbx_set_2D( m->search_width, sizeof(output_type), m->block_height*sizeof(intermediate_type), m->block_height*sizeof(intermediate_type) );
		vbx_acc_2D( VVHWU, VADD, (vbx_uword_t*)m->v_result+l*m->search_width, m->v_row_sad, m->v_row_sad+(m->block_height/2) );

		//Transfer the line to host
		vbx_dma_to_host( result+l*m->search_width, m->v_result+l*m->search_width, m->search_width*sizeof(output_type) );

	}

	return VBW_SUCCESS;
}
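
A minimal calling sketch, not part of the library: it assumes m has already been initialized by vbw_mtx_motest_3D_byte_setup() as the header comment requires, and that the host-side buffers are sized by the caller.

/* Illustrative wrapper (assumed name): run the 3D motion estimation and wait
 * for the queued DMA-to-host before the caller reads the SADs. */
static int run_motest_3d( output_type *sads, input_type *ref, input_type *search, vbw_motest_t *m )
{
	int rc = vbw_mtx_motest_3d_byte( sads, ref, search, m );
	vbx_sync();   /* the routine queues vbx_dma_to_host() without syncing */
	return rc;    /* negative on error, see vbw_exit_codes.h */
}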
Example #2
int main(void)
{
	vbx_test_init();

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
	const int required_vectors = 4;

	int N = VBX_SCRATCHPAD_SIZE / sizeof(vbx_mm_t) / required_vectors;

	int PRINT_LENGTH = min( N, MAX_PRINT_LENGTH );

	double scalar_time, vector_time;
	int errors=0;

	vbx_mxp_print_params();
	printf( "\nAdd test...\n" );
	printf( "Vector length: %d\n", N );

	vbx_mm_t *scalar_in1 = malloc( N*sizeof(vbx_mm_t) );
	vbx_mm_t *scalar_in2 = malloc( N*sizeof(vbx_mm_t) );
	vbx_mm_t *scalar_out = malloc( N*sizeof(vbx_mm_t) );

	vbx_mm_t *vector_in1 = vbx_shared_malloc( N*sizeof(vbx_mm_t) );
	vbx_mm_t *vector_in2 = vbx_shared_malloc( N*sizeof(vbx_mm_t) );
	vbx_mm_t *vector_out = vbx_shared_malloc( N*sizeof(vbx_mm_t) );
//	vbx_mm_t *vector_out = vector_in2 - 5;


	vbx_sp_t *v_in1 = vbx_sp_malloc( N*sizeof(vbx_sp_t) );
	vbx_sp_t *v_in2 = vbx_sp_malloc( N*sizeof(vbx_sp_t) );
	vbx_sp_t *v_out = vbx_sp_malloc( N*sizeof(vbx_sp_t) );
//	vbx_sp_t *v_out = v_in2-5;

	VBX_T(test_zero_array)( scalar_out, N );
	VBX_T(test_zero_array)( vector_out, N );

	VBX_T(test_init_array)( scalar_in1, N, 1 );
	VBX_T(test_copy_array)( vector_in1, scalar_in1, N );
	VBX_T(test_init_array)( scalar_in2, N, 1 );
	VBX_T(test_copy_array)( vector_in2, scalar_in2, N );

	VBX_T(test_print_array)( scalar_in1, PRINT_LENGTH );
	VBX_T(test_print_array)( scalar_in2, PRINT_LENGTH );

	scalar_time = test_scalar( scalar_out, scalar_in1, scalar_in2, N );
	VBX_T(test_print_array)( scalar_out, PRINT_LENGTH);

	vbx_dma_to_vector( v_in1, (void *)vector_in1, N*sizeof(vbx_sp_t) );
	vbx_dma_to_vector( v_in2, (void *)vector_in2, N*sizeof(vbx_sp_t) );
	vector_time = test_vector( v_out, v_in1, v_in2, N, scalar_time );
	vbx_dma_to_host( (void *)vector_out, v_out, N*sizeof(vbx_sp_t) );
	vbx_sync();
	VBX_T(test_print_array)( vector_out, PRINT_LENGTH );

	errors += VBX_T(test_verify_array)( scalar_out, vector_out, N );

	VBX_TEST_END(errors);
	return 0;
}
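
The scalar and vector kernels (test_scalar, test_vector) are defined elsewhere; given the "Add test" banner, a hypothetical scalar reference would be a plain element-wise sum:

/* Hypothetical reference for the add test above (illustrative only). */
static void sketch_scalar_add( vbx_mm_t *out, const vbx_mm_t *in1, const vbx_mm_t *in2, int N )
{
	int i;
	for( i = 0; i < N; i++ ) {
		out[i] = in1[i] + in2[i];
	}
}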
Example #3
double test_vector_sp(vbx_mm_t *vector_out, vbx_mm_t  *vector_in1, int IN1ROWS, int IN1COLS, vbx_mm_t  *vector_in2, int IN2ROWS, int IN2COLS, double scalar_time )
{
	typedef vbx_mm_t vbx_sp_t;
	int retval=-1;
	vbx_timestamp_t time_start, time_stop;
	printf( "\nExecuting MXP matrix multiply... src1[%dx%d] src2[%dx%d]\n",IN1ROWS, IN1COLS,IN2ROWS, IN2COLS );

	vbx_timestamp_start();
	time_start = vbx_timestamp();
	vbx_sp_push();
	vbx_sp_t* v_in1=(vbx_sp_t*)vbx_sp_malloc(sizeof(vbx_sp_t)*IN1ROWS*IN1COLS);
	vbx_sp_t* v_in2=(vbx_sp_t*)vbx_sp_malloc(sizeof(vbx_sp_t)*IN2ROWS*IN2COLS);
	vbx_sp_t* v_out=(vbx_sp_t*)vbx_sp_malloc(sizeof(vbx_sp_t)*IN1ROWS*IN2COLS);
	if(v_out!=NULL){
		vbx_dma_to_vector(v_in1,vector_in1,sizeof(vbx_sp_t)*IN1ROWS*IN1COLS);
		vbx_dma_to_vector(v_in2,vector_in2,sizeof(vbx_sp_t)*IN2ROWS*IN2COLS);
		retval = vbw_mtx_mul( v_out, v_in1, IN1ROWS, IN1COLS, v_in2, IN2ROWS, IN2COLS );
		vbx_dma_to_host(vector_out,v_out,sizeof(vbx_sp_t)*IN1ROWS*IN2COLS);
		vbx_sync();
	}else{
		printf("not enough scratchpad space for the SP test\n");
	}
	vbx_sp_pop();
	time_stop = vbx_timestamp();
	printf( "...done. retval:0x%08X\n", retval );
	return vbx_print_vector_time( time_start, time_stop, scalar_time );
}
Example #4
int compare_vbx_lbp_ci_to_scalar_patterns(unsigned short* img, int width, int height, int max_print_errors)
{
    int j, errors = 0;
    unsigned char** scalar_patterns = test_scalar_patterns(img, 0, width, height);

    vbx_uhalf_t* v_in = (vbx_uhalf_t*)vbx_sp_malloc(3*width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_top = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_bot = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_ubyte_t* v_lbp = (vbx_ubyte_t*)v_bot;

    unsigned char* lbp = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));

    vbx_set_vl(width);
    for(j=0; j < height - 2; j++){
        vbx_dma_to_vector(v_in, img+j*width, 3*width*sizeof(unsigned short));
        vbx(VVHU, VCUSTOM1, v_top, v_in, v_in+width); 
        vbx(VVHU, VCUSTOM1, v_bot, v_in+width, v_in+2*width); 
        vbx(SVHBU, VAND, (vbx_ubyte_t*)v_top, 0xf0, v_top);
        vbx(SVHBU, VAND, (vbx_ubyte_t*)v_bot, 0x0f, v_bot);
        vbx(VVBU, VADD, v_lbp, v_bot, v_top); 
        vbx_dma_to_host(lbp, v_lbp, width*sizeof(unsigned char));
        vbx_sync();

        errors += match_array_byte(lbp, scalar_patterns[0]+j*width, "custom_lbp", width-2, 1, max_print_errors, 1, j);

    }
    vbx_sp_free();
    vbx_shared_free(lbp);
    return errors;
}
Example #5
void vector_rectangle_to_luma(
	pixel *input_buffer,
	vbx_uhalf_t *v_luma_buffer, vbx_uhalf_t *v_row_temp, vbx_uword_t *v_row,
	int startx, int starty, int width, int height, const int image_pitch )
{
	int y;
	vbx_uhalf_t *v_luma;

	vbx_set_vl(width);

	for(y = 0; y < height; y++){
		v_luma = v_luma_buffer+(y*width);
		vbx_dma_to_vector(v_row, input_buffer+((y+starty)*image_pitch)+startx, width*sizeof(vbx_uword_t));

		//Move the b component into v_luma
		vbx(SVWHU, VAND, v_luma, 0xFF, v_row);
		vbx(SVHU,  VMUL, v_luma, 25,   v_luma);

		//Move g into v_row_temp and add it to v_luma
		vbx(SVWHU, VAND, v_row_temp, 0xFF,  (vbx_uword_t*)(((vbx_ubyte_t*)v_row)+1));
		vbx(SVHU,  VMUL, v_row_temp, 129,    v_row_temp);
		vbx(VVHU,  VADD, v_luma,     v_luma, v_row_temp);

		//Move r into v_row_temp and add it to v_luma
		vbx(SVWHU, VAND, v_row_temp, 0xFF,  (vbx_uword_t*)(((vbx_ubyte_t*)v_row)+2));
		vbx(SVHU,  VMUL, v_row_temp, 66,     v_row_temp);
		vbx(VVHU,  VADD, v_luma,     v_luma, v_row_temp);

		//divide by 2^8
		vbx(SVHU,  VSHR, v_luma, 8,  v_luma);
	}
}
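
The weights above (66, 129, 25 with a final shift by 8) are an integer approximation of the BT.601 luma transform, Y ≈ (66*R + 129*G + 25*B) >> 8. A scalar sketch of the same per-pixel computation, assuming the pixel struct stores 8-bit b, g, r fields at byte offsets 0, 1, 2 as the casts above imply:

/* Scalar equivalent of the vector loop above (illustrative). */
static void scalar_rectangle_to_luma( pixel *input_buffer, unsigned short *luma,
                                      int startx, int starty, int width, int height,
                                      const int image_pitch )
{
	int x, y;
	for( y = 0; y < height; y++ ) {
		for( x = 0; x < width; x++ ) {
			pixel p = input_buffer[(y+starty)*image_pitch + startx + x];
			luma[y*width + x] = (unsigned short)( (25*p.b + 129*p.g + 66*p.r) >> 8 );
		}
	}
}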
Example #6
int dma_bandwidth_test()
{
	const int num_iter = 64;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	int scratchpad_size = this_mxp->scratchpad_size;

	uint8_t *buf = vbx_shared_malloc(scratchpad_size);
	vbx_ubyte_t *v_buf = vbx_sp_malloc(scratchpad_size);

	vbx_timestamp_t time_start, time_stop;

	int i;
	int len;
	int to_host;
	int errors = 0;

	vbx_mxp_print_params();

	// dma_alignment_bytes gives DMA master data bus width in bytes.
	double bytes_per_sec =
		((double) this_mxp->core_freq) * this_mxp->dma_alignment_bytes;
	double max_megabytes_per_sec = bytes_per_sec/(1024*1024);
	printf("\nMax available bandwidth = %s Megabytes/s\n",
	       vbx_eng(max_megabytes_per_sec, 4));

	printf("\n");

	for (to_host = 0; to_host < 2; to_host++) {
		for (len = 32; len <= scratchpad_size ; len *= 2) {
			printf("DMA %s, %d bytes\n", to_host ? "write" : "read", len);
			vbx_timestamp_start();
			if (to_host) {
				time_start = vbx_timestamp();
				for (i = 0; i < num_iter; i++) {
					vbx_dma_to_host(buf, v_buf, len);
				}
				vbx_sync();
				time_stop = vbx_timestamp();
			} else {
				time_start = vbx_timestamp();
				for (i = 0; i < num_iter; i++) {
					vbx_dma_to_vector(v_buf, buf, len);
				}
				vbx_sync();
				time_stop = vbx_timestamp();
			}
			print_dma_bandwidth(time_start, time_stop, len, num_iter,
			                    max_megabytes_per_sec);
			printf("\n");
		}
		printf("\n");
	}

	vbx_shared_free(buf);
	vbx_sp_free();

	return errors;
}
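
print_dma_bandwidth() is defined elsewhere; a hypothetical sketch of the arithmetic it presumably performs, with the timestamp frequency passed in so that no particular timer API is assumed:

/* Hypothetical stand-in for print_dma_bandwidth(): converts a timed run of
 * num_iter transfers of len bytes each into MB/s and a fraction of peak. */
static void sketch_dma_bandwidth( vbx_timestamp_t time_start, vbx_timestamp_t time_stop,
                                  int len, int num_iter,
                                  double timestamp_freq_hz, double max_megabytes_per_sec )
{
	double seconds = (double)(time_stop - time_start) / timestamp_freq_hz;
	double megabytes_per_sec = ((double)len * num_iter) / seconds / (1024.0*1024.0);
	printf( "%f MB/s (%.1f%% of peak %f MB/s)\n",
	        megabytes_per_sec, 100.0*megabytes_per_sec/max_megabytes_per_sec, max_megabytes_per_sec );
}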
Example #7
vbx_mtx_fdct_t *
vbx_mtx_fdct_init( dt *coeff_v, dt *image )
{
	const int BIG_TILE_SIZE = NUM_TILE_X * NUM_TILE_Y * DCT_SIZE;
	const int num_bytes = BIG_TILE_SIZE * sizeof(dt);
	const int co_bytes = NUM_TILE_X* DCT_SIZE *sizeof(dt);

	//compute the coefficient matrix in double precision and truncate to dt
	int i, j;
	double s;
	for (i = 0; i < BLOCK_SIZE; i++) {
		s = (i == 0) ? sqrt(0.125) : 0.5;
		for (j = 0; j < BLOCK_SIZE; j++) {
			c2[i][j] = s * cos( (PI / 8.0) * i * (j + 0.5) );
			cs[i][j] = (dt) (c2[i][j] * SHIFT_DOUBLE + 0.499999);
		}
	}

	vbx_sp_push();

	vbx_mtx_fdct_t *v = vbx_shared_malloc( sizeof(vbx_mtx_fdct_t) );

	v->vcoeff    = (vbx_half_t *)vbx_sp_malloc( co_bytes );
	v->vprods    = (vbx_half_t *)vbx_sp_malloc( num_bytes );
#if USE_ACCUM_FLAGS
	v->vaccum    = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vflags    = (vbx_half_t *)vbx_sp_malloc( num_bytes );
#endif

	// interleave ordering to ensure no false hazards
	v->vblock[2] = (vbx_half_t *)vbx_sp_malloc( num_bytes );

	v->vimage[0] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vblock[0] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vimage[1] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vblock[1] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	if( !v->vblock[1] ) {
		VBX_PRINTF( "ERROR: out of memory.\n" );
		VBX_EXIT(-1);
	}
	vbx_dma_to_vector( v->vcoeff, coeff_v, co_bytes );

	int row;
	for( row=0; row < BLOCK_SIZE; row++ ) {
		getBigTileImageY(v->vimage[v->db],image,row);
	}
#if USE_ACCUM_FLAGS 
	// create a flag vector: first element 0, next 'BLOCK_SIZE-1' elements non-zero, and so on
	vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
	vbx( SEH, VAND,   v->vflags,       BLOCK_SIZE-1,      0 );
#endif

	return v;
}
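
The table built above is the 8-point DCT-II basis, c[i][j] = s_i * cos((PI/8) * i * (j + 0.5)) with s_0 = sqrt(1/8) and s_i = 0.5 otherwise, scaled by SHIFT_DOUBLE into fixed point. For reference, a scalar sketch of how one row would be transformed with that table (illustrative only; the real pipeline does this with the 2D/3D vector ops elsewhere in the fdct code, and it assumes cs and BLOCK_SIZE are visible at file scope as in the function above):

/* Scalar 1-D DCT of one BLOCK_SIZE row using the fixed-point table cs built above.
 * Illustrative: the results still carry the SHIFT_DOUBLE scaling and must be
 * shifted back down, as the vector implementation does. */
static void scalar_dct_row( const dt *in, int *out )
{
	int i, j;
	for( i = 0; i < BLOCK_SIZE; i++ ) {
		int acc = 0;
		for( j = 0; j < BLOCK_SIZE; j++ ) {
			acc += (int)cs[i][j] * (int)in[j];
		}
		out[i] = acc;
	}
}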
Example #8
File: test.c Project: 8l/mxp
double test_vector_power( vbx_word_t *vector_out, vbx_word_t *vector_in1, vbx_word_t *vector_in2, int N, double scalar_time )
{
	int retval;
	vbx_timestamp_t time_start, time_stop;
	printf("\nExecuting MXP vector software power...");
 	vbx_word_t *v_out = vbx_sp_malloc( N*sizeof(vbx_word_t) );
	vbx_word_t *v_in1 = vbx_sp_malloc( N*sizeof(vbx_word_t) );
	vbx_word_t *v_in2 = vbx_sp_malloc( N*sizeof(vbx_word_t) );
	vbx_dma_to_vector( v_in1, vector_in1, N*sizeof(vbx_word_t) );
	vbx_dma_to_vector( v_in2, vector_in2, N*sizeof(vbx_word_t) );
	vbx_timestamp_start();

	time_start = vbx_timestamp();
	retval = vbw_vec_power_word( v_out, v_in1, v_in2, N );
	vbx_sync();
	time_stop = vbx_timestamp();
	vbx_dma_to_host( vector_out, v_out, N*sizeof(vbx_word_t) );
	vbx_sync();

	printf("done. retval:%X\n",retval);
	return vbx_print_vector_time(time_start, time_stop, scalar_time);
}
Example #9
/** VBX Motion Estimation.
 *  Similar to the scalar version, but scans vertically, which makes it easier to align vectors.
 *  vbw_mtx_motest_byte_setup should be run prior to running this function.
 *
 *  @param[out] result  SAD results for each search position
 *  @param[in]  x       reference block (row pitch of image_width elements)
 *  @param[in]  y       search region (row pitch of image_width elements)
 *  @param[in]  m       motion estimation state prepared by the setup function
 *  @returns negative on error condition. See vbw_exit_codes.h
 */
int vbw_mtx_motest_byte(output_type *result, input_type *x, input_type *y, vbw_motest_t *m)
{
	int  j;

	int sub_block_width      = m->block_width+m->search_width;

	for( j = 0; j < m->block_height; j++ ) {
		vbx_dma_to_vector( m->v_block+j*sub_block_width, x+j*m->image_width, sub_block_width*sizeof(input_type) );
	}

	for( j = 0; j < m->block_height+m->search_height; j++ ) {
		vbx_dma_to_vector( m->v_img  +j*sub_block_width, y+j*m->image_width, sub_block_width*sizeof(input_type) );
	}

	// column-ize the reference block
	vbx_set_vl( m->block_width );
	vbx_set_2D( m->block_height, m->block_width*sizeof(input_type), sub_block_width*sizeof(input_type), 0 );
	vbx_2D( VVB, VMOV, (vbx_byte_t*)m->v_block, (vbx_byte_t*)m->v_block, 0 );

	//Do column by column

	for( j=0; j < m->search_width; j++ )
	{
		// column-ize the search image
		vbx_set_vl( m->block_width );
		vbx_set_2D( m->block_height+m->search_height,  m->block_width*sizeof(input_type), sub_block_width*sizeof(input_type), 0 );
		vbx_2D( VVBU, VMOV, m->v_img_sub, m->v_img+j, 0 );

		// search the image columnwise
		vbx_set_vl( m->block_width*m->block_height );
		vbx_set_2D( m->search_height, m->search_width*sizeof(output_type), 0,  m->block_width*sizeof(input_type) );
		vbx_acc_2D( VVBWU, VABSDIFF, (vbx_uword_t*)m->v_result+j, m->v_block, m->v_img_sub );
	}

	// Write back result
	vbx_dma_to_host( result, m->v_result, m->result_size );

	return VBW_SUCCESS;
}
Example #10
File: test.c Project: 8l/mxp
int compare_vbx_lbp_ci_to_scalar_patterns(unsigned short* img, int log, int width, int height, int max_print_errors)
{
    int j, l, cell, max_cell, errors = 0;
    unsigned char** scalar_patterns = test_scalar_patterns(img, log, width, height);

    max_cell = 1<<log;
    vbx_uhalf_t* v_in = (vbx_uhalf_t*)vbx_sp_malloc((1+2*max_cell)*width*sizeof(vbx_half_t));
    vbx_uhalf_t* v_top = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_half_t));
    vbx_uhalf_t* v_bot = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_half_t));
    vbx_ubyte_t* v_lbp = (vbx_ubyte_t*)v_bot;

    unsigned char* lbp = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));

    vbx_set_vl(width);
    for(l = 0; l < 1; l++){
        cell = 1<<l;
        for(j=0; j < height - 2*cell; j++){
            vbx_dma_to_vector(v_in, img+j*width, (1+2*cell)*width*sizeof(unsigned short));
            vbx(VVHU, VCUSTOM1, v_top, v_in, v_in+(1*cell)*width); 
            vbx(VVHU, VCUSTOM1, v_bot, v_in+(1*cell)*width, v_in+(2*cell)*width); 
            vbx(SVHBU, VAND, (vbx_ubyte_t*)v_top, 0xf0, v_top);
            vbx(SVHBU, VAND, (vbx_ubyte_t*)v_bot, 0x0f, v_bot);
            vbx(VVBU, VADD, v_lbp, v_bot, v_top); 
            vbx_dma_to_host(lbp, v_lbp, width*sizeof(unsigned char));
            vbx_sync();

            errors += match_array_byte(lbp, scalar_patterns[l]+j*width, "custom_lbp", width-2*cell, 1, 0, max_print_errors, 1, j);
            if (errors > max_print_errors){
                max_print_errors = 0;
            }

        }
    }
    vbx_sp_free();
    vbx_shared_free(lbp);
    return errors;
}
Example #11
File: test.c Project: 8l/mxp
//FIXME stride for match not implemented
int compare_LBPPassStage_to_restricted(unsigned short *vbx_img, int log, lbp_stage_t lbp_stage, int window, int width, int height, int max_print_errors)
{
    int l, i, j, cell, errors = 0;

    unsigned char** scalar_patterns = test_scalar_patterns(vbx_img, log, width, height);

    unsigned char *pass, *vbx_pass;
    pass = (unsigned char*)vbx_shared_malloc(width*height*sizeof(unsigned char));
    vbx_pass = (unsigned char*)vbx_shared_malloc(width*height*sizeof(unsigned char));
    
    vbx_byte_t** v_lbp =(vbx_byte_t**)vbx_shared_malloc((log+1)*sizeof(vbx_byte_t*));
    for (l=0; l<log+1; l++) {
        v_lbp[l] = (vbx_byte_t*)vbx_sp_malloc((window+1)*width*sizeof(vbx_byte_t)); 
    }
    vbx_byte_t* v_lut = (vbx_byte_t*)vbx_sp_malloc(width*sizeof(vbx_byte_t)); 
    vbx_byte_t* v_stage = (vbx_byte_t*)vbx_sp_malloc(width*sizeof(vbx_byte_t)); 
    vbx_byte_t* v_pattern;
    lbp_feat_t feat;
    int dx, dy, dw, f;

    for (l=0; l<log+1; l++) {
        vbx_dma_to_vector(v_lbp[l]+width, scalar_patterns[l], (window)*width*sizeof(unsigned char));
    }
    vbx_sync();
    for(j=0; j < height-(window+1); j++) {
        for (l=0; l<log+1; l++) {
            vbx_set_vl(width * window);
            vbx(VVB, VMOV, v_lbp[l], v_lbp[l]+width, NULL);
            vbx_dma_to_vector(v_lbp[l] + window*width, scalar_patterns[l]+(j+window)*width, width*sizeof(unsigned char));
        }

        vbx_set_vl(width-(window+1));
        vbx(SVB, VMOV, v_stage, 0, NULL);
        for (f = 0; f < lbp_stage.count; f++) {
            feat = lbp_stage.feats[f];
            dx = feat.pos.src.x;
            dy = feat.pos.src.y;
            dw = feat.pos.size.x;
            v_pattern = v_lbp[dw>>1]+(dy*width+dx);

            vbx(SVBU, VLBPLUT, v_lut, f, v_pattern);
            vbx(VVB, VADD, v_stage, v_stage, v_lut);
        }
        vbx(SVB, VMOV, v_lut, 0, NULL);
        vbx(SVB, VCMV_GEZ, v_lut, 1, v_stage);
        vbx_dma_to_host(vbx_pass + j*width, v_lut, (width-(window+1))*sizeof(unsigned char));
        vbx_sync();
    }


    unsigned int *iImg, *iiImg;
    iImg = (unsigned int *)vbx_shared_malloc(width*height*sizeof(unsigned int));
    iiImg = (unsigned int *)vbx_shared_malloc(width*height*sizeof(unsigned int));

    gen_integrals(vbx_img, iImg, iiImg, width, height);

    image_t lbp_img = {iImg, {width, height}};
    for (j = 0; j < height - (window + 1); j++) {
        for (i = 0; i < width - (window + 1); i++) {
            pair_t lbp_p = {i, j};
            pass[j*width+i] = LBPPassStage(lbp_img, lbp_stage, lbp_p);
        }
    }

    /* test pass vs vbx pass */
    for (j = 0; j < height - (window + 1); j++) {
        errors += match_array_byte(vbx_pass + j*width, pass + j*width, "pass stage", width - (window + 1), 1, 0, max_print_errors, 1, j);
        if (errors > max_print_errors){
            max_print_errors = 0;
        }
    }
    vbx_sp_free();
    vbx_shared_free(pass);
    vbx_shared_free(vbx_pass);
    vbx_shared_free(v_lbp);
    vbx_shared_free(iImg);
    vbx_shared_free(iiImg);
    return errors;
}
Example #12
int vbw_vec_reverse_ext( vbx_mm_t *dst, vbx_mm_t *src, const unsigned int N )
{

	typedef vbx_mm_t vbx_sp_t;
	const int VBW_ROT16= sizeof(vbx_sp_t) <=sizeof(vbx_half_t);
	const int VBW_ROT8= sizeof(vbx_sp_t)== sizeof(vbx_byte_t);
	const int VBW_RSHIFT_T_TO_W= (sizeof(vbx_sp_t)==sizeof(vbx_word_t)? 0:
	                              sizeof(vbx_sp_t)==sizeof(vbx_half_t)? 1:/*byte_sized*/2);
	const int VBW_LSHIFT_W_TO_T= VBW_RSHIFT_T_TO_W;
	// Catch when N is very small
	if( N<4 ) {
		unsigned int i = 0;
		while(i<N) {
			dst[N-i-1]=src[i];
			i++;
		}
		return VBW_SUCCESS;
	}

	vbx_mxp_t *this_mxp          = VBX_GET_THIS_MXP();
	unsigned int SP_WIDTH_B      = this_mxp->scratchpad_alignment_bytes;
	unsigned int FREE_BYTES      = vbx_sp_getfree();


	// Catch when N is small enough that cached scalar does a better job
	if( N <= MM_CACHED_SCALAR_THRESHOLD || FREE_BYTES < SP_WIDTH_B*5 ){
		unsigned int i;
		vbx_mm_t *A = (vbx_mm_t*)vbx_remap_cached(src,N*sizeof(vbx_mm_t));
		vbx_mm_t *B = (vbx_mm_t*)vbx_remap_cached(dst,N*sizeof(vbx_mm_t));
		for( i=0; i<N; i++ ) {
			B[N-i-1]=A[i];
		}
		vbx_dcache_flush(B,N*sizeof(vbx_mm_t));
		return VBW_SUCCESS;
	}

	unsigned int NUM_LANES   = this_mxp->vector_lanes;
	unsigned int tile_size_b = VBX_PAD_DN(((FREE_BYTES-SP_WIDTH_B)/2),SP_WIDTH_B);
	unsigned int tile_size_w = tile_size_b/4;
	unsigned int tile_size_t = tile_size_w << VBW_LSHIFT_W_TO_T;


	unsigned int num_tiles = N / tile_size_t;
	unsigned int rows_per_tile = tile_size_b / SP_WIDTH_B;

	unsigned int tile_part_t = N - num_tiles * tile_size_t;
	unsigned int threshold_w = NUM_LANES >= 32 ? VL1_THRESHOLD_V32_UP :
		NUM_LANES == 16 ? VL1_THRESHOLD_V16    :
		NUM_LANES == 8  ? VL1_THRESHOLD_V8     : UINT_MAX;


	if(tile_part_t){
		vbx_sp_push();
		vbx_sp_t *v_0 = (vbx_sp_t *)vbx_sp_malloc(tile_part_t*sizeof(vbx_sp_t));
		vbx_sp_t *v_1 = (vbx_sp_t *)vbx_sp_malloc(tile_part_t*sizeof(vbx_sp_t));

#if !VBX_SKIP_ALL_CHECKS
		if( !v_0 || !v_1) {
			VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
			VBX_EXIT(-1);
		}
#endif

		vbx_dma_to_vector(v_0, src+N-tile_part_t, tile_part_t*sizeof(vbx_mm_t));
		vbw_vec_reverse(v_1, v_0, tile_part_t);
		vbx_dma_to_host(dst, v_1, tile_part_t*sizeof(vbx_sp_t));
		dst += tile_part_t;
		vbx_sp_pop();
	}

	if(!num_tiles) {
		return VBW_SUCCESS;
	}

	vbx_sp_push();
	vbx_word_t *v_mask = (vbx_word_t *)vbx_sp_malloc(SP_WIDTH_B);
	vbx_word_t *v_scratch[2] = { (vbx_word_t *)vbx_sp_malloc(tile_size_b), (vbx_word_t *)vbx_sp_malloc(tile_size_b) };
	vbx_word_t *result;

#if !VBX_SKIP_ALL_CHECKS
	if( !v_scratch[0] || !v_scratch[1] || !v_mask ) {
		VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
		VBX_EXIT(-1);
	}
#endif

	src += (num_tiles - 1) * tile_size_t;

	if( tile_size_w <= threshold_w) {
		while( num_tiles ) {
			vbx_dma_to_vector( v_scratch[0], src, tile_size_b );
			if(VBW_ROT16){
				vec_rev_rot16_w(v_scratch[1], v_scratch[0], tile_size_w);
			}else{
				vec_rev_w(v_scratch[1], v_scratch[0], tile_size_w);
			}
			if( VBW_ROT8){
				vec_rot8_h( v_scratch[1], v_scratch[1], tile_size_w*2 );
			}
			vbx_dma_to_host( dst, v_scratch[1], tile_size_b );
			dst += tile_size_t;
			src -= tile_size_t;
			num_tiles--;
		}
	} else {
		while( num_tiles ) {
			vbx_dma_to_vector( v_scratch[0], src, tile_size_b );
			result = vec_rev_merge_w( v_scratch[1], v_scratch[0], tile_size_w, v_scratch[0], v_mask, SP_WIDTH_B,
			                          rows_per_tile, VBW_ROT16 );
			if(VBW_ROT8){
				vec_rot8_h( result, result, tile_size_w*2 );
			}
			vbx_dma_to_host( dst, result, tile_size_b );
			dst += tile_size_t;
			src -= tile_size_t;
			num_tiles--;
		}
	}

	vbx_sp_pop();
	return VBW_SUCCESS;
}
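
A minimal calling sketch, not from the library: the source and destination live in main memory, and since the tiled paths above queue their final DMA-to-host without syncing, the caller must vbx_sync() before reading dst.

/* Illustrative usage: reverse N elements held in DMA-visible shared memory. */
static void reverse_example( const unsigned int N )
{
	vbx_mm_t *src = (vbx_mm_t*)vbx_shared_malloc( N*sizeof(vbx_mm_t) );
	vbx_mm_t *dst = (vbx_mm_t*)vbx_shared_malloc( N*sizeof(vbx_mm_t) );
	unsigned int i;

	for( i = 0; i < N; i++ ) src[i] = (vbx_mm_t)i;

	vbw_vec_reverse_ext( dst, src, N );
	vbx_sync();                      /* wait for the queued DMA before reading dst */

	/* now dst[0] == N-1, ..., dst[N-1] == 0 */
	vbx_shared_free( src );
	vbx_shared_free( dst );
}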
Example #13
//vector version of rgb converter
void vector_blend(
    output_pointer img_out, input_pointer img_in1, input_pointer img_in2,
    unsigned int num_row, unsigned int num_column, intermediate_type blending_const )
{
    intermediate_type *v_img1[2];
    input_type        *v_img2[2];
    intermediate_type *v_temp;

    intermediate_type blending_const_bar = 256-blending_const;
    int j;

    vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
    const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
    const int VBX_WIDTH_BYTES     = this_mxp->vector_lanes * sizeof(int);
    const int VBX_DMA_ALIGNMENT   = this_mxp->dma_alignment_bytes;

    unsigned int chunk_size = VBX_SCRATCHPAD_SIZE/((3*sizeof(intermediate_type))+(2*sizeof(input_type)));
    chunk_size = VBX_PAD_UP( chunk_size-(VBX_WIDTH_BYTES-1), VBX_DMA_ALIGNMENT );

    unsigned int chunk_size_old    = chunk_size;
    unsigned int vector_length     = chunk_size;
    unsigned int vector_length_old = vector_length;

    v_img1[0] = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) );
    v_img1[1] = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) );
    v_img2[0] = (input_type        *)vbx_sp_malloc( chunk_size*sizeof(input_type) );
    v_img2[1] = (input_type        *)vbx_sp_malloc( chunk_size*sizeof(input_type) );
    v_temp    = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) );

    if( v_temp == NULL ) {
        VBX_EXIT(0xBADDEAD);
    }

    int bufselect = 0;

    vbx_dma_to_vector( v_img1[bufselect], img_in1, chunk_size*sizeof(input_type) );
    vbx_dma_to_vector( v_img2[bufselect], img_in2, chunk_size*sizeof(input_type) );

    for( j=0; j<num_row*num_column; j+=vector_length_old ) {
        vbx_set_vl(vector_length);

        if( j > 0 ) {
            vbx_dma_to_host( img_out+j-vector_length_old, v_img1[1-bufselect], chunk_size_old*sizeof(output_type) );
        }

        if( (j+vector_length_old) < (num_row*num_column-1) ) {
            if( (j+vector_length_old*2) >= num_row*num_column ) {
                vector_length =  num_row*num_column - j - vector_length_old;
                chunk_size = vector_length;
            }
            vbx_dma_to_vector( v_img1[1-bufselect], img_in1+j+vector_length_old, chunk_size*sizeof(input_type) );
            vbx_dma_to_vector( v_img2[1-bufselect], img_in2+j+vector_length_old, chunk_size*sizeof(input_type) );
        }

        vbx( SVBHU, VMULLO, v_temp,            blending_const,     v_img1[bufselect] );
        vbx( SVBHU, VMULLO, v_img1[bufselect], blending_const_bar, v_img2[bufselect] );
        vbx( VVHU,  VADD,   v_img1[bufselect], v_img1[bufselect],  v_temp );
        vbx( SVHBU, VSHR,   v_img1[bufselect], 8,                  v_img1[bufselect] );

        bufselect = 1-bufselect;
    }

    vbx_dma_to_host( img_out+j-vector_length_old, v_img1[1-bufselect], chunk_size*sizeof(output_type) );
    vbx_sp_free();
    vbx_sync();
}
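
Per element, the double-buffered loop above computes out = (blending_const*in1 + (256 - blending_const)*in2) >> 8, a fixed-point alpha blend with an 8-bit blend factor. A scalar sketch of the same arithmetic, assuming 8-bit input and output samples as the SVBHU/SVHBU operand sizes imply:

/* Scalar reference for the blend performed by the vector loop above (illustrative). */
static void scalar_blend( unsigned char *img_out, const unsigned char *img_in1, const unsigned char *img_in2,
                          unsigned int num_elements, unsigned int blending_const )
{
	unsigned int i;
	for( i = 0; i < num_elements; i++ ) {
		img_out[i] = (unsigned char)(( blending_const*img_in1[i]
		                             + (256-blending_const)*img_in2[i] ) >> 8);
	}
}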
Example #14
int vbw_mtx_xp_ext(vbx_mm_t *out, vbx_mm_t *in, const int INROWS, const int INCOLS )
{
	typedef vbx_mm_t vbx_sp_t;

	int elements = INROWS * INCOLS;

	if(elements < SCALAR_THRESHOLD) {
		vbx_sync();  // in case the input is waiting on a DMA transfer
		int i,j;
		for(i = 0; i < INROWS; i++) {
			for(j = 0; j < INCOLS; j++) {
				out[j*INROWS+i] = in[i*INCOLS+j];
			}
		}
		return VBW_SUCCESS;
	}

	vbx_sp_push();

	vbx_sp_t *v_in;
	vbx_sp_t *v_out;

	int tile_height     = 0;
	int tile_width      = 0;
	int prev_tile_width = 0;
	int tile_y          = 0;
	int tile_x          = 0;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	int SP_WIDTH_B = this_mxp->scratchpad_alignment_bytes;
	int SP_SIZE = vbx_sp_getfree();
	int max_sp_elements   = vbx_sp_getfree() / sizeof(vbx_sp_t);
	int max_tile_elements = VBX_PAD_DN( SP_SIZE/2, SP_WIDTH_B ) / sizeof(vbx_sp_t);


	if( INROWS == 1 || INCOLS == 1 ) {           // 1D transpose becomes a simple copy operation
		if( elements <= max_sp_elements ) {      // We can use the whole scratchpad for this
			v_in = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
			vbx_dma_to_vector( v_in, in, elements*sizeof(vbx_mm_t) );
			v_out = v_in;
			vbx_dma_to_host( out, v_out, elements*sizeof(vbx_mm_t) );
		} else {                                 // To test this, you'll need a very large 1D matrix (or a small SP)
			tile_width = max_sp_elements;
			v_in = (vbx_sp_t*)vbx_sp_malloc( tile_width * sizeof(vbx_sp_t) );
			for (tile_x = 0; tile_x < elements; tile_x += tile_width) {
				if( tile_x + tile_width > elements) tile_width = elements - tile_x;
				vbx_dma_to_vector( v_in, in + tile_x, tile_width*sizeof(vbx_mm_t) );
				v_out = v_in;
				vbx_dma_to_host( out+tile_x, v_out, tile_width*sizeof(vbx_mm_t) );
			}
		}
	} else if( elements < max_tile_elements ) {  // Matrix is small enough to handle entirely in SP
		v_in  = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
		v_out = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );

		vbx_dma_to_vector( v_in, in, elements*sizeof(vbx_mm_t) );

		vbw_mtx_xp(v_out,v_in,INROWS,INCOLS);

		vbx_dma_to_host( out, v_out, elements*sizeof(vbx_mm_t) );
	} else {                                     // At this point we know at least one full tile will be needed
		#define QUICK_A_LANES_THRESHOLD 8        // Use merge transpose if there are at least this many lanes
		#define QUICK_A_TILE_WIDTH 128
		#define QUICK_A_TILE_ELEMENTS (QUICK_A_TILE_WIDTH*QUICK_A_TILE_WIDTH)
		#define QUICK_A_VF_ELEMENTS (QUICK_A_TILE_ELEMENTS/2)
		#define QUICK_A_REQ_ELEMENTS (2*VBX_PAD_UP(QUICK_A_TILE_ELEMENTS,SP_WIDTH_B/sizeof(vbx_sp_t)) + VBX_PAD_UP(QUICK_A_VF_ELEMENTS,sizeof(vbx_sp_t)))

		#define QUICK_B_LANES_THRESHOLD 16        // Use smaller merge transpose tile only if there are a lot of lanes
		#define QUICK_B_TILE_WIDTH 64             //     and only if larger tile A size cannot be used.
		#define QUICK_B_TILE_ELEMENTS (QUICK_B_TILE_WIDTH*QUICK_B_TILE_WIDTH)
		#define QUICK_B_VF_ELEMENTS (QUICK_B_TILE_ELEMENTS/2)
		#define QUICK_B_REQ_ELEMENTS (2*VBX_PAD_UP(QUICK_B_TILE_ELEMENTS,SP_WIDTH_B/sizeof(vbx_sp_t)) + VBX_PAD_UP(QUICK_B_VF_ELEMENTS,sizeof(vbx_sp_t)))

		int NUM_LANES = this_mxp->vector_lanes;
		int DMA_BYTES = this_mxp->dma_alignment_bytes;
		int min_tile_dim = DMA_BYTES / sizeof(vbx_sp_t);

		vbx_sp_t *v_out_sel;
		vbx_sp_t *vf = 0;

		if( NUM_LANES >= QUICK_A_LANES_THRESHOLD       // Check for appropriate conditions to use merge transpose tiles
					&& INCOLS >= QUICK_A_TILE_WIDTH
					&& INROWS >= QUICK_A_TILE_WIDTH
			&& (unsigned)max_sp_elements >= QUICK_A_REQ_ELEMENTS ) {
			tile_width = tile_height = QUICK_A_TILE_WIDTH;
			vf = (vbx_sp_t *)vbx_sp_malloc( QUICK_A_VF_ELEMENTS * sizeof(vbx_sp_t));
		} else if( NUM_LANES >= QUICK_B_LANES_THRESHOLD
					&& INCOLS >= QUICK_B_TILE_WIDTH
					&& INROWS >= QUICK_B_TILE_WIDTH
			&& (unsigned)max_sp_elements >= QUICK_B_REQ_ELEMENTS ) {
			tile_width = tile_height = QUICK_B_TILE_WIDTH;
			vf = (vbx_sp_t *)vbx_sp_malloc( QUICK_B_VF_ELEMENTS * sizeof(vbx_sp_t));
		} else {
			findTileSize( &tile_height, &tile_width, INROWS, INCOLS, max_tile_elements, min_tile_dim );
		}

		prev_tile_width = tile_width;

		v_in  = (vbx_sp_t*)vbx_sp_malloc( tile_height*tile_width * sizeof(vbx_sp_t) );
		v_out = (vbx_sp_t*)vbx_sp_malloc( tile_height*tile_width * sizeof(vbx_sp_t) );


		if( v_out==NULL ) {
			vbx_sp_pop();
			return VBW_ERROR_SP_ALLOC_FAILED;
		}

		vbx_sp_t *v[2] = { v_in, v_out };

		tile_y = 0;                              // Reset y position for new col
		while( tile_y < INROWS ) {
			vbx_set_2D( tile_width, tile_height*sizeof(vbx_sp_t), sizeof(vbx_sp_t), sizeof(vbx_sp_t) );
			vbx_set_3D( tile_height, sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t) );
			tile_x = 0;                          // Reset x position for new row
			while( tile_x < INCOLS ) {

				vbx_dma_to_vector_2D(
						v_in,
						in+(tile_y*INCOLS)+tile_x,
						tile_width*sizeof(vbx_mm_t),
						tile_height,
						tile_width*sizeof(vbx_sp_t),
						INCOLS*sizeof(vbx_mm_t) );

				v_out_sel = v_out;                         // select v_out as default vector to DMA to MM

				/* *** merge transpose (matrix must be square and a power of 2 wide) *** */
				if( vf && tile_width == tile_height
							&& (tile_width==QUICK_A_TILE_WIDTH || tile_width==QUICK_B_TILE_WIDTH) ) {
					int src = 0;
					int n;
					for( n=1; n<tile_width; n *= 2 ) {     // can't do 1st iteration until entire tile is DMA'd in
						const int nn = 2*n;

						// copy the destination matrix
						vbx_set_vl( tile_width*tile_width );    // use v_in & v_out as working matrices (clobber v_in)
						vbxx(  VMOV, v[!src], v[src]);

						// do the work
						vbx_set_vl( n*tile_width );
						vbxx( VAND, vf, n, (vbx_enum_t*)0 );           // mask for merging: 0101010... then 00110011...
						vbx_set_2D( tile_width/nn, nn*tile_width*sizeof(vbx_sp_t), nn*tile_width*sizeof(vbx_sp_t), 0 );
						vbxx_2D( VCMV_Z, v[!src]+n*tile_width, v[src]+n           , vf );
						vbxx_2D( VCMV_Z, v[!src]+n,            v[src]+n*tile_width, vf );

						src = !src;
					}

					v_out_sel = v[src];     // depending on the size of the mtx, the final result may be in v_in or v_out
				} else {
					vbx_set_vl( 1 );        // 2D and 3D will be set by the x and y edge conditions, even using merge
					vbxx_3D(VMOV, v_out, v_in );
				}

				vbx_dma_to_host_2D(
						out+(tile_x*INROWS)+tile_y,
						v_out_sel,
						tile_height*sizeof(vbx_mm_t),
						tile_width,
						INROWS*sizeof(vbx_mm_t),
						tile_height*sizeof(vbx_sp_t) );

				tile_x += tile_width;                 // Set up width for next tile
				if( tile_x + tile_width > INCOLS ) {  // Temporarily reduce tile width when reaching right edge of matrix
					tile_width = INCOLS - tile_x;
					vbx_set_2D( tile_width, tile_height*sizeof(vbx_sp_t), sizeof(vbx_sp_t), sizeof(vbx_sp_t) );
					vbx_set_3D( tile_height, sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t) );
				}
			}
			tile_y += tile_height;                    // Set up width and height for next row of tiles
			tile_width = prev_tile_width;             // Restore original tile width for next row of tiles

			/* *** Permanently reduce tile height when reaching bottom of matrix *** */
			tile_height = ( tile_y + tile_height > INROWS ) ? INROWS - tile_y : tile_height;
		}
	}
	vbx_sp_pop();
	vbx_sync();
	return VBW_SUCCESS;
}
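
A minimal calling sketch, not from the library: in and out are main-memory buffers of INROWS*INCOLS elements, and because the routine either works in scalar or synchronizes before returning, out can be read as soon as it returns.

/* Illustrative usage: transpose a ROWS x COLS matrix held in shared memory. */
static int transpose_example( const int ROWS, const int COLS )
{
	vbx_mm_t *in  = (vbx_mm_t*)vbx_shared_malloc( ROWS*COLS*sizeof(vbx_mm_t) );
	vbx_mm_t *out = (vbx_mm_t*)vbx_shared_malloc( ROWS*COLS*sizeof(vbx_mm_t) );
	int r, c, rc;

	for( r = 0; r < ROWS; r++ )
		for( c = 0; c < COLS; c++ )
			in[r*COLS + c] = (vbx_mm_t)(r*COLS + c);

	rc = vbw_mtx_xp_ext( out, in, ROWS, COLS );   /* out becomes COLS x ROWS */
	/* out[c*ROWS + r] now equals in[r*COLS + c] */

	vbx_shared_free( in );
	vbx_shared_free( out );
	return rc;
}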
Example #15
int vector_motest(pixel *input_buffer, luma_type **last_luma, int *motest_x, int *motest_y, int start_x, int start_y, int reset, const int image_width, const int image_height, const int image_pitch)
{
	int y, x, starty, startx;
	unsigned int sad, sad_min, y_min, x_min;
	vbx_uhalf_t *v_search_luma, *v_last_luma;
	vbx_uhalf_t *v_row_temp;
	vbx_uword_t *v_row;
	vbx_uword_t *v_sad;
	pixel color;

	if(*last_luma == NULL || reset){
		init_vector_motest(input_buffer, last_luma, motest_x, motest_y, start_x, start_y, image_pitch);
	}

	v_search_luma = vbx_sp_malloc( MOTEST_BUFFER_SIZE  * sizeof(vbx_uhalf_t) );
	v_last_luma   = vbx_sp_malloc( MOTEST_BLOCK_SIZE   * sizeof(vbx_uhalf_t) );
	v_row_temp    = vbx_sp_malloc( MOTEST_BUFFER_WIDTH * sizeof(vbx_uhalf_t) );
	v_row         = vbx_sp_malloc( MOTEST_BUFFER_WIDTH * sizeof(vbx_uword_t) );
	v_sad         = vbx_sp_malloc( MOTEST_SEARCH_SIZE  * sizeof(vbx_uword_t) );

	if(v_sad == NULL){
		printf("Not enough scratchpad for motest\n");
		while(1);
	}

	startx = *motest_x-(MOTEST_SEARCH_WIDTH/2);
	starty = *motest_y-(MOTEST_SEARCH_HEIGHT/2);
	if(startx < 0){
		startx = 0;
	}
	if(startx > image_width-MOTEST_BUFFER_WIDTH){
		startx = image_width-MOTEST_BUFFER_WIDTH;
	}
	if(starty < 0){
		starty = 0;
	}
	if(starty > image_height-MOTEST_BUFFER_HEIGHT){
		starty = image_height-MOTEST_BUFFER_HEIGHT;
	}

	vector_rectangle_to_luma(input_buffer, v_search_luma, v_row_temp, v_row, startx, starty, MOTEST_BUFFER_WIDTH, MOTEST_BUFFER_HEIGHT, image_pitch);
	vbx_dma_to_vector(v_last_luma, *last_luma, MOTEST_BLOCK_SIZE*sizeof(vbx_uhalf_t));

	//Vector compute sad here

	vbx_set_2D(MOTEST_BLOCK_HEIGHT, sizeof(vbx_uword_t), MOTEST_BUFFER_WIDTH*sizeof(vbx_uhalf_t), MOTEST_BLOCK_WIDTH*sizeof(vbx_uhalf_t));

	for(y = 0; y < MOTEST_SEARCH_HEIGHT; y++){
		for(x = 0; x < MOTEST_SEARCH_WIDTH; x++){
			vbx_set_vl(MOTEST_BLOCK_WIDTH);
			vbx_acc_2D(VVHWU, VABSDIFF, v_row, v_search_luma+(y*MOTEST_BUFFER_WIDTH)+x, v_last_luma);
			vbx_set_vl(MOTEST_BLOCK_HEIGHT/2);
			vbx_acc(VVWU, VADD, v_sad+(y*MOTEST_SEARCH_WIDTH)+x, v_row, v_row+MOTEST_BLOCK_HEIGHT/2);
		}

#if TOUCHSCREEN
#ifdef TOUCH_INTERRUPTS_VBX
		if (touchscreen_get_pen(pTouch)) {
			vbx_sp_free();
			return -1;
		}
#endif
#endif
	}

	vbx_sync();

	sad_min = INT_MAX;
	y_min = *motest_y;
	x_min = *motest_x;

	for(y = 0; y < MOTEST_SEARCH_HEIGHT; y++){
		for(x = 0; x < MOTEST_SEARCH_WIDTH; x++){
			sad = v_sad[y*MOTEST_SEARCH_WIDTH+x];
			if(sad < sad_min){
				sad_min = sad;
				x_min = x+startx;
				y_min = y+starty;
			} else if(sad == sad_min) {
				if( (abs( x             - MOTEST_SEARCH_WIDTH/2) + abs( y             - MOTEST_SEARCH_HEIGHT/2)) <
				    (abs((x_min-startx) - MOTEST_SEARCH_WIDTH/2) + abs((y_min-starty) - MOTEST_SEARCH_HEIGHT/2))) {
					x_min = x+startx;
					y_min = y+starty;
				}
			}
		}
	}

	color.r = 0;
	color.g = 255;
	color.b = 0;
	color.a = 0;
	scalar_draw_line(*motest_x+(MOTEST_BLOCK_WIDTH/2), *motest_y+(MOTEST_BLOCK_HEIGHT/2), x_min+(MOTEST_BLOCK_WIDTH/2), y_min+(MOTEST_BLOCK_HEIGHT/2), color, input_buffer, image_pitch);

	*motest_y = y_min;
	*motest_x = x_min;

	vbx_set_vl(MOTEST_BLOCK_WIDTH);
	for(y = 0; y < MOTEST_BLOCK_HEIGHT; y++){
		vbx(VVHU, VMOV, v_last_luma+(y*MOTEST_BLOCK_WIDTH), v_search_luma+((y+y_min-starty)*MOTEST_BUFFER_WIDTH)+(x_min-startx), 0);
	}
	vbx_dma_to_host(*last_luma, v_last_luma, MOTEST_BLOCK_SIZE*sizeof(luma_type));

	draw_motest(input_buffer, *motest_x, *motest_y, image_pitch);
	//simple hack to draw thicker
	draw_motest(input_buffer, *motest_x+1, *motest_y+1, image_pitch);

	vbx_sp_free();
	return 0;
}
Example #16
File: test.c Project: 8l/mxp
int test_lbp_ci(unsigned short* img, int width, int height)
{

    vbx_uhalf_t* v_a1  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_b1  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_1h = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));

    vbx_uhalf_t* v_a2  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_b2  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_2h  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));

    vbx_uhalf_t* v_a4  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_b4  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_4h = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));

    vbx_ubyte_t* v_1b  = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t));
    vbx_ubyte_t* v_2b  = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t));
    vbx_ubyte_t* v_4b  = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t));

    unsigned short* lbp1h = (unsigned short*)vbx_shared_malloc(width*sizeof(unsigned short));
    unsigned short* lbp2h = (unsigned short*)vbx_shared_malloc(width*sizeof(unsigned short));
    unsigned short* lbp4h = (unsigned short*)vbx_shared_malloc(width*sizeof(unsigned short));

    unsigned char* lbp1b = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));
    unsigned char* lbp2b = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));
    unsigned char* lbp4b = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));

    img = img + width;

    vbx_dma_to_vector(v_a1, img,         width*sizeof(unsigned short));
    vbx_dma_to_vector(v_b1, img + width, width*sizeof(unsigned short));
    vbx_dma_to_vector(v_a2, img,         width*sizeof(unsigned short));
    vbx_dma_to_vector(v_b2, img + width, width*sizeof(unsigned short));
    vbx_dma_to_vector(v_a4, img,         width*sizeof(unsigned short));
    vbx_dma_to_vector(v_b4, img + width, width*sizeof(unsigned short));
    vbx_sync();

    int i;
    int m = 48;
    for(i=0; i<m; i++){
        v_a1[i] = 0;
        v_b1[i] = 0;
        v_a2[i] = 0;
        v_b2[i] = 0;
        v_a4[i] = 0;
        v_b4[i] = 0;
    }
    int n = 12;
    int src_a1[] = {0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    int src_b1[] = {0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    int src_a2[] = {0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    int src_b2[] = {0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    int src_a4[] = {0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0};
    int src_b4[] = {0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0};
    
    for(i=0; i<16; i++){
        v_a1[i] = src_a1[i];
        v_b1[i] = src_b1[i];
        v_a2[i] = src_a2[i];
        v_b2[i] = src_b2[i];
        v_a4[i] = src_a4[i];
        v_b4[i] = src_b4[i];
    }

    vbx_set_vl(width);
    vbx(VVHU, VCUSTOM1, v_1h, v_a1, v_b1); 
    vbx(VVHU, VCUSTOM2, v_2h, v_a2, v_b2); 
    vbx(VVHU, VCUSTOM3, v_4h, v_a4, v_b4); 
    vbx(VVHB, VADD, v_1b, v_1h, ((vbx_byte_t*)v_1h) + 1);
    vbx(VVHB, VADD, v_2b, v_2h, ((vbx_byte_t*)v_2h) + 1);
    vbx(VVHB, VADD, v_4b, v_4h, ((vbx_byte_t*)v_4h) + 1);
    vbx_dma_to_host(lbp1h, v_1h, width*sizeof(unsigned short));
    vbx_dma_to_host(lbp2h, v_2h, width*sizeof(unsigned short));
    vbx_dma_to_host(lbp4h, v_4h, width*sizeof(unsigned short));
    vbx_dma_to_host(lbp1b, v_1b, width*sizeof(unsigned char));
    vbx_dma_to_host(lbp2b, v_2b, width*sizeof(unsigned char));
    vbx_dma_to_host(lbp4b, v_4b, width*sizeof(unsigned char));
    vbx_sync();

    test_print_array_half(v_a1, n);
    test_print_array_half(v_b1, n);
    test_print_hex_array_half(lbp1h, n);
    test_print_hex_array_byte(lbp1b, n);

    test_print_array_half(v_a2, n);
    test_print_array_half(v_b2, n);
    test_print_hex_array_half(lbp2h, n);
    test_print_hex_array_byte(lbp2b, n);

    test_print_array_half(v_a4, n);
    test_print_array_half(v_b4, n);
    test_print_hex_array_half(lbp4h, n);
    test_print_hex_array_byte(lbp4b, n);

    vbx_sp_free();
    vbx_shared_free(lbp1h);
    vbx_shared_free(lbp2h);
    vbx_shared_free(lbp4h);
    vbx_shared_free(lbp1b);
    vbx_shared_free(lbp2b);
    vbx_shared_free(lbp4b);
    return 0;
}
Example #17
/* takes in precomputed bfly */
static int vector_fix_fft_dif_long_fly(short fr[], short fi[], short fr2[], short fi2[], short tw_r[], short tw_i[], short m, short inverse, short real)
{
	int i, j, l, k, scale, shift, a1,a2,bfly,mul,flight,swap,row_num;
	short  wr, wi;

	vptr_half v_fr, v_fi, v_fr2, v_fi2, v_tmp;
	vptr_half v_twr, v_twi;
	vptr_half v_arp, v_aip, v_brp, v_bip, v_crp, v_cip;
	vptr_half v_temp;
	vptr_half v_twr2, v_twi2;
	const int n = 1 << m;
	const int half = n >> 1;

	scale = 0;
	mul = 0;
	swap = m >> 1;

	l = m-1;
	flight = 1;
	bfly = half;

	const int INROWS = 1<<swap;
	const int INCOLS = 1<<(m-swap);

	if ( !(m%2) ){
		swap--;
	}

	// allocate space in vector memory for vectors
	v_fr  = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
	v_fi  = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
	v_fr2 = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
	v_fi2 = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );

	v_twr   = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	v_twi   = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	v_temp  = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );

	if( v_fr  == NULL || v_fi == NULL  || v_fr2 == NULL || v_fi2== NULL  || \
	    v_twr == NULL || v_twi == NULL || v_temp == NULL) {
	 	VBX_EXIT(-1);
	}

	v_twr2  = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	v_twi2  = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	if( v_twr2 == NULL || v_twi2 == NULL) {
	 	VBX_EXIT(-1);
	}
	vbx_dma_to_vector( v_fr, fr, n*sizeof(vbx_half_t) );
	vbx_dma_to_vector( v_fi, fi, n*sizeof(vbx_half_t) );
	vbx_dma_to_vector( v_twr, tw_r, half*sizeof(vbx_half_t) );
	vbx_dma_to_vector( v_twi, tw_i, half*sizeof(vbx_half_t) );

#if 1
        if(real){
            vector_fix_fft_untangle_real_scratch( v_fr, v_fi, v_fr2, v_fi2, v_twr,v_twi, m, inverse);
        }
#endif

	while (l > swap) {
		if (inverse) {
			// variable scaling, depending upon data
			shift = 0;
			if( isAbsOutOfRangeV(v_fr,v_fi,v_temp,n) ) {
				shift = 1;
				scale++;
			}
		} else {
			// fixed scaling, for proper normalization
			// -- overall factor of 1/n, distributed to maximize arithmetic accuracy
			shift = 1;
		}
		// shift will be performed on each data point exactly once during pass

		SWAP( v_fr, v_fr2, v_tmp );
		SWAP( v_fi, v_fi2, v_tmp );

		if (shift){
			vbx_set_vl( n );
			vbx(SVH,VSHR,  v_fr2, 1,  v_fr2 );
			vbx(SVH,VSHR,  v_fi2, 1,  v_fi2 );
		}

		vbx_set_vl( 1<<l );
		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<(l+1)), sizeof(vbx_half_t)*(1<<(l+1)) );
		vbx_2D( VVH, VADD, v_fr, v_fr2, v_fr2 + (1<<l) );
		vbx_2D( VVH, VADD, v_fi, v_fi2, v_fi2 + (1<<l) );

		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<(l+1)), sizeof(vbx_half_t)*(1<<(l+1)), sizeof(vbx_half_t)*(1<<(l+1)) );
		vbx_2D( VVH, VSUB, v_fr2, v_fr2, v_fr2 + (1<<l) );
		vbx_2D( VVH, VSUB, v_fi2, v_fi2, v_fi2 + (1<<l) );

		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<(l+1)), 0 );
		vbx_2D( VVH, VMULFXP, &v_fr[n>>1],  v_fr2,      v_twr );
		vbx_2D( VVH, VMULFXP,  v_temp,      v_fi2,      v_twi );

		vbx_set_vl( n>>1 ); // vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l) );
		vbx( VVH, VSUB, &v_fr[n>>1], &v_fr[n>>1], v_temp );

		vbx_set_vl( 1<<l );
		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<(l+1)), 0 );
		vbx_2D( VVH, VMULFXP, &v_fi[n>>1],  v_fi2,      v_twr );
		vbx_2D( VVH, VMULFXP,  v_temp,      v_fr2,      v_twi );

		vbx_set_vl( n>>1 ); //vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l) );
		vbx( VVH, VADD,    &v_fi[n>>1], &v_fi[n>>1], v_temp );

		l--;
		mul++;
		flight <<= 1;

		if( l > swap ) {
			vbx_set_vl( 1<<l );
			vbx( VVWH, VMOV, v_twr, v_twr, 0 );
			vbx( VVWH, VMOV, v_twi, v_twi, 0 );
		}
	}

	if ( !(m%2) ) {
		l++;
		flight >>=1;
	}
Example #18
File: test.c Project: 8l/mxp
int main(void)
{
	vbx_test_init();

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
	const int required_vectors = 4;

	int N = VBX_PAD_DN(VBX_SCRATCHPAD_SIZE / sizeof(vbx_mm_t) / required_vectors, this_mxp->scratchpad_alignment_bytes);

	int PRINT_LENGTH = min( N, MAX_PRINT_LENGTH );

	double scalar_time, vector_time;
	int errors=0;

	vbx_mxp_print_params();
	printf( "\nVector copy test...\n" );
	printf( "Vector length: %d\n", N );

	vbx_mm_t *scalar_in  = malloc( N*sizeof(vbx_mm_t) );
	vbx_mm_t *scalar_out = malloc( N*sizeof(vbx_mm_t) );

	vbx_mm_t *vector_in  = vbx_shared_malloc( N*sizeof(vbx_mm_t) );
	vbx_mm_t *vector_out = vbx_shared_malloc( N*sizeof(vbx_mm_t) );

	vbx_sp_t *v_out = vbx_sp_malloc( N*sizeof(vbx_sp_t) );
	vbx_sp_t *v_in = vbx_sp_malloc( N*sizeof(vbx_sp_t) );

	VBX_T(test_zero_array)( scalar_in, N );
	VBX_T(test_zero_array)( vector_in, N );

	VBX_T(test_init_array)( scalar_in, N, 1 );
	VBX_T(test_copy_array)( vector_in, scalar_in, N );

	scalar_time = test_scalar( scalar_out, scalar_in, N );
	VBX_T(test_print_array)( scalar_out, PRINT_LENGTH );

	vbx_dma_to_vector( v_in, vector_in, N*sizeof(vbx_sp_t) );
	vector_time = test_vector( v_out, v_in, N, scalar_time );
	vbx_dma_to_host(vector_out, v_out, N*sizeof(vbx_sp_t) );
	vbx_sync();
	VBX_T(test_print_array)( vector_out, PRINT_LENGTH );

	errors += VBX_T(test_verify_array)( scalar_out, vector_out, N );

	vbx_sp_free();

#if TEST_DEEP_SP
	errors += deep_vector_copy_test();
#endif

#if DEBUG_MAKE_SP_FULL
	vbx_sp_malloc(vbx_sp_getfree());
#endif

#if TEST_DEEP_MM
	errors += deep_vector_copy_ext_test();
#endif

	VBX_TEST_END(errors);

	return 0;
}