예제 #1
0
/** VBX Motion Estimation, using vbx_3d ops.
 * vbw_mtx_motest_3D_byte_setup should be run prior to running this function.
 * Using bytes as input data. block_height must be an even number.
 *
 * @param[out] result
 * @param[in] x
 * @param[in] y
 * @param[in] m
 * @returns negative on error condition. See vbw_exit_codes.h
 */
int vbw_mtx_motest_3d_byte(output_type *result, input_type* x, input_type *y, vbw_motest_t *m)
{

	int  l,j;
	int sub_block_width      = m->block_width+m->search_width;

	for( j = 0; j < m->block_height; j++ ) {
		vbx_dma_to_vector( m->v_block+j*m->block_width, x+j*m->image_width, m->block_width*sizeof(input_type) );
	}
	for( j = 0; j < m->block_height+m->search_height; j++ ) {
		vbx_dma_to_vector( m->v_img+j*sub_block_width, y+j*m->image_width, sub_block_width*sizeof(input_type) );
	}

	vbx_set_3D( m->search_width, m->block_height*sizeof(intermediate_type), sizeof(input_type), 0 );

	for( l = 0; l < m->search_height; l++ ) {
		//Accumulate each row into a vbx of row SADs
		vbx_set_vl( m->block_width );
		vbx_set_2D( m->block_height, sizeof(intermediate_type), sub_block_width*sizeof(input_type), m->block_width*sizeof(input_type) );
		vbx_acc_3D( VVBHU, VABSDIFF, m->v_row_sad, m->v_img+l*sub_block_width, m->v_block );

		//Accumulate the SADs
		vbx_set_vl( m->block_height/2 );
		vbx_set_2D( m->search_width, sizeof(output_type), m->block_height*sizeof(intermediate_type), m->block_height*sizeof(intermediate_type) );
		vbx_acc_2D( VVHWU, VADD, (vbx_uword_t*)m->v_result+l*m->search_width, m->v_row_sad, m->v_row_sad+(m->block_height/2) );

		//Transfer the line to host
		vbx_dma_to_host( result+l*m->search_width, m->v_result+l*m->search_width, m->search_width*sizeof(output_type) );

	}

	return VBW_SUCCESS;
}
예제 #2
0
int vbw_mtx_xp(vbx_sp_t *v_dst, vbx_sp_t *v_src, const int INROWS, const int INCOLS )
{
	vbx_set_vl( 1 );
	vbx_set_2D(  INCOLS, INROWS*sizeof(vbx_sp_t),       sizeof(vbx_sp_t), 0 );
	vbx_set_3D( INROWS,        sizeof(vbx_sp_t), INCOLS*sizeof(vbx_sp_t), 0 );
	vbxx_3D( VMOV, v_dst, v_src);
	return VBW_SUCCESS;
}
예제 #3
0
파일: vbw_vec_rev.cpp 프로젝트: 8l/mxp
/** Internal helper function to reverse and optionally rotate a vector of words *in the scratchpad*.
 *  This function uses a merge reverse algorithm that is faster on large vectors.
 *  @pre v_src contains the elements to reverse.
 *  @pre v_src, v_scratch0, and v_scratch1 must all be the same length.
 *  @pre v_scratch1 and v_src must not overlap.
 *  @pre v_src *may* overlap v_scratch0 (will clobber v_src).
 *  @pre MXP must be 2 lanes or more.
 *  @pre N is a multiple of SP_WIDTH_B.
 *  @pre NUM_ROWS == N*4 / SP_WIDTH_B.
 *  @pre v_mask must be SP_WIDTH_B bytes long.
 *  @post v_scratch0 and v_scratch1 contents are modified, with one containing the result.
 *  @post v_src clobbered only if v_src overlaps v_scratch0.
 *
 *  @param[in]  v_scratch1 *in scratch*.
 *  @param[in]  v_src *in scratch*.
 *  @param[in]  N is the number of words to reverse.
 *  @param[in]  v_scratch0 *in scratch*.
 *  @param[in]  v_mask *in scratch*.
 *  @param[in]  SP_WIDTH_B typically the scratchpad width in bytes, it is the length of the data to be worked on at a time.
 *  @param[in]  NUM_ROWS is the number of rows of length SP_WIDTH_B bytes.
 *  @param[in]  rot16 TRUE to swap upper and lower half-words of each word in result.
 *  @returns    the scratchpad address where the result resides. This will be equal to either v_scratch0 or v_scratch1,
 *              and will depend on log2(MXP vector lanes).
 */
static vbx_word_t *vec_rev_merge_w( vbx_word_t *v_scratch1, vbx_word_t *v_src, const unsigned int N, vbx_word_t *v_scratch0,
                                    vbx_word_t *v_mask, const unsigned int SP_WIDTH_B, const unsigned int NUM_ROWS, const unsigned int rot16 )
{
#if !VBX_SKIP_ALL_CHECKS
	if( !N || !v_scratch0 || !v_src || !v_scratch1 || !v_mask || SP_WIDTH_B < 8) {
		VBX_PRINTF("Helper function vec_rev_merge_w: null pointer or row length (vector lanes) too short.");
		VBX_EXIT(-1);
	}
#endif

	vbx_word_t *v_scratch[2] = { v_scratch0, v_scratch1 };
	unsigned int W = SP_WIDTH_B/4/2;                                         // half the number of words in a row
	unsigned int sel = 1;

	if( rot16 ) {
		vbx_set_vl( W );
		vbx_set_2D( NUM_ROWS, -SP_WIDTH_B, 0, SP_WIDTH_B );
		vbx_2D( SVWU, VROTL, (vbx_uword_t *)(v_scratch[sel]+N-W), 16, (vbx_uword_t *)v_src );
		vbx_2D( SVWU, VROTL, (vbx_uword_t *)(v_scratch[sel]+N-(W*2)), 16, (vbx_uword_t *)(v_src+W) );
	} else {
		vbx_set_vl( W );
		vbx_set_2D( NUM_ROWS, -SP_WIDTH_B, SP_WIDTH_B, 0 );
		vbx_2D( VVW, VMOV, v_scratch[sel]+N-W, v_src, 0 );
		vbx_2D( VVW, VMOV, v_scratch[sel]+N-(W*2), v_src+W, 0 );
	}

	vbx_set_vl( SP_WIDTH_B/4 );
	vbx_set_2D( NUM_ROWS, SP_WIDTH_B, SP_WIDTH_B, 0 );

	while( W > 1 ) {
		// set up odd/even mask register
		W /= 2;
		vbx( SEW, VAND, v_mask, W, 0 );
		vbx_2D( VVW, VCMV_NZ, v_scratch[!sel], v_scratch[sel]-W, v_mask );
		vbx_2D( VVW, VCMV_Z , v_scratch[!sel], v_scratch[sel]+W, v_mask );
		sel = !sel;
	}

	return v_scratch[sel];
}
예제 #4
0
/** VBX Motion Estimation.
 *  Similar to the scalar version but scans vertically as it makes it easier to align vectors.
 *  vbw_mtx_motest_byte_setup should be run prior to running this function.
 *
 *  @param[out] result
 *  @param[in] x
 *  @param[in] y
 *  @param[in] m
 *  @returns negative on error condition. See vbw_exit_codes.h
 */
int vbw_mtx_motest_byte(output_type *result, input_type *x, input_type *y, vbw_motest_t *m)
{
	int  j;

	int sub_block_width      = m->block_width+m->search_width;

	for( j = 0; j < m->block_height; j++ ) {
		vbx_dma_to_vector( m->v_block+j*sub_block_width, x+j*m->image_width, sub_block_width*sizeof(input_type) );
	}

	for( j = 0; j < m->block_height+m->search_height; j++ ) {
		vbx_dma_to_vector( m->v_img  +j*sub_block_width, y+j*m->image_width, sub_block_width*sizeof(input_type) );
	}

	// column-ize the reference block
	vbx_set_vl( m->block_width );
	vbx_set_2D( m->block_height, m->block_width*sizeof(input_type), sub_block_width*sizeof(input_type), 0 );
	vbx_2D( VVB, VMOV, (vbx_byte_t*)m->v_block, (vbx_byte_t*)m->v_block, 0 );

	//Do column by column

	for( j=0; j < m->search_width; j++ )
	{
		// column-ize the search image
		vbx_set_vl( m->block_width );
		vbx_set_2D( m->block_height+m->search_height,  m->block_width*sizeof(input_type), sub_block_width*sizeof(input_type), 0 );
		vbx_2D( VVBU, VMOV, m->v_img_sub, m->v_img+j, 0 );

		// search the image columnwise
		vbx_set_vl( m->block_width*m->block_height );
		vbx_set_2D( m->search_height, m->search_width*sizeof(output_type), 0,  m->block_width*sizeof(input_type) );
		vbx_acc_2D( VVBWU, VABSDIFF, (vbx_uword_t*)m->v_result+j, m->v_block, m->v_img_sub );
	}

	// Write back result
	vbx_dma_to_host( result, m->v_result, m->result_size );

	return VBW_SUCCESS;
}
예제 #5
0
파일: test.c 프로젝트: 8l/mxp
int VBX_T(vbw_vec_reverse_test)()
{
	unsigned int aN[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 20, 25, 31, 32, 33, 35, 40, 48, 60, 61, 62, 63, 64, 64, 65,
	                      66, 67, 68, 70, 80, 90, 99, 100, 101, 110, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 224, 224,
	                      256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 400, 450, 512, 550, 600, 650, 700, 768, 768, 900,
	                      900, 1023, 1024, 1200, 1400, 1600, 1800, 2048, 2048, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800,
	                      2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4096, 4096, 4100, 4200, 4300,
	                      4400, 4500, 4600, 4700, 4800, 4900, 5000, 6000, 7000, 8000, 8192, 8192, 9000, 10000, 11000, 12000,
	                      13000, 14000, 15000, 16000, 16384, 16384, 20000, 25000, 30000, 32767, 32768, 32768, 35000, 40000,
	                      45000, 50000, 55000, 60000, 65000, 65535, 65536, 65536 };

	int retval;
	unsigned int N;
	unsigned int NBYTES;
	unsigned int NREPS = 100;
	unsigned int i,k;

	vbx_timestamp_t start=0,finish=0;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const unsigned int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;

	for( i=0; i<sizeof(aN)/4; i++ ) {
		N = aN[i];
		//printf( "testing with vector size %d\n", N );

		NBYTES = sizeof(vbx_sp_t)*N;
		if( 2*NBYTES > VBX_SCRATCHPAD_SIZE ) continue;

		vbx_sp_t *vsrc = vbx_sp_malloc( NBYTES );
		vbx_sp_t *vdst = vbx_sp_malloc( NBYTES );
		//printf("bytes alloc: %d\n", NBYTES );

		if( !vsrc ) VBX_EXIT(-1);
		if( !vdst ) VBX_EXIT(-1);

		#if   ( VBX_TEMPLATE_T == BYTESIZE_DEF | VBX_TEMPLATE_T == UBYTESIZE_DEF )
			unsigned int mask = 0x007F;
		#elif ( VBX_TEMPLATE_T == HALFSIZE_DEF | VBX_TEMPLATE_T == UHALFSIZE_DEF )
			unsigned int mask = 0x7FFF;
		#else
			unsigned int mask = 0xFFFF;
		#endif

		vbx_set_vl( N );
		vbx( SV(T), VMOV, vdst,   -1, 0 );       // Fill the destination vector with -1
		vbx( SE(T), VAND, vsrc, mask, 0 );       // Fill the source vector with enumerated values
		//VBX_T(print_vector)( "vsrcInit", vsrc, N );
		//VBX_T(print_vector)( "vdstInit", vdst, N );

		/** measure performance of function call **/
		vbx_sync();
		start = vbx_timestamp();
		for(k=0; k<NREPS; k++ ) {
			retval = VBX_T(vbw_vec_reverse)( vdst, vsrc, N );
			vbx_sync();
		}
		finish = vbx_timestamp();
		printf( "length %d (%s):\tvbware sp f():\t%llu", N, VBX_EXPAND_AND_QUOTE(BYTEHALFWORD), (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) );
		//VBX_T(print_vector)( "vsrcPost", vsrc, N );
		//VBX_T(print_vector)( "vdstPost", vdst, N );

		#if VERIFY_VBWARE_ALGORITHM
			VBX_T(verify_vector)( vsrc, vdst, N );
		#else
			printf(" [VERIFY OFF]");
		#endif

		printf("\treturn value: %X", retval);

		vbx_set_vl( N );
		vbx( SE(T), VAND, vsrc, mask, 0 );       // Reset the source vector

		/** measure performance of simple algorithm **/
		vbx_sync();
		vbx_set_vl( 1 );
		vbx_set_2D( N, -sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );

		start = vbx_timestamp();
		for(k=0; k<NREPS; k++ ) {
			vbx_2D( VV(T), VMOV, vdst+N-1, vsrc, 0 );
			vbx_sync();
		}
		finish = vbx_timestamp();

		printf( "\tsimple (vl=1):\t%llu", (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) );

		#if VERIFY_SIMPLE_ALGORITHM
			VBX_T(verify_vector)( vsrc, vdst, N );
		#else
			printf(" [VERIFY OFF]");
		#endif
			printf("\tcycles\n");

		vbx_sp_free();
	}

	vbx_sp_free();
	printf("All tests passed successfully.\n");

	return 0;
}
예제 #6
0
파일: vbw_vec_rev.cpp 프로젝트: 8l/mxp
int vbw_vec_reverse( vbx_sp_t *v_dst, vbx_sp_t *v_src, const unsigned int N )
{
	const int VBW_ROT16= sizeof(vbx_sp_t) <=sizeof(vbx_half_t);
	const int VBW_ROT8= sizeof(vbx_sp_t)== sizeof(vbx_byte_t);
	const int VBW_RSHIFT_T_TO_W= (sizeof(vbx_sp_t)==sizeof(vbx_word_t)? 0:
	                              sizeof(vbx_sp_t)==sizeof(vbx_half_t)? 1:/*byte_sized*/2);
	const int VBW_LSHIFT_W_TO_T= VBW_RSHIFT_T_TO_W;

	vbx_mxp_t *this_mxp            = VBX_GET_THIS_MXP();
	const unsigned int NUM_LANES   = this_mxp->vector_lanes;

	//printf("\n%d\n",VBX_SKIP_ALL_CHECKS);

	// Can the whole vector fit in the scratchpad width?
	if( N < (NUM_LANES << VBW_LSHIFT_W_TO_T) ){
		vbx_set_vl( 1 );
		vbx_set_2D( N, (int)-sizeof(vbx_sp_t), (int)sizeof(vbx_sp_t), 0 );
		vbxx_2D(VMOV, v_dst+N-1, v_src);
		return VBW_SUCCESS;
	}

	unsigned int threshold_w = (NUM_LANES >= 32 ? VL1_THRESHOLD_V32_UP :
	                            NUM_LANES == 16 ? VL1_THRESHOLD_V16    :
	                            NUM_LANES == 8  ? VL1_THRESHOLD_V8     : UINT_MAX);

	unsigned int N_w          = N >> VBW_RSHIFT_T_TO_W;                  // Equivalent number of words in the vector

	if( N_w && N_w <= threshold_w ) {
		if( VBW_ROT16){
			// remainder of elements that can't add to a whole word
			unsigned int stub_t = N - (N_w << VBW_LSHIFT_W_TO_T);
			if( stub_t ) {
				vbx_set_vl( 1 );
				vbx_set_2D( stub_t, (int)-sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
				vbxx_2D(VMOV, v_dst+stub_t-1, v_src+N-stub_t);
				v_dst += stub_t;
			}
			vec_rev_rot16_w(v_dst, v_src, N_w);
		}else{
			vec_rev_w(v_dst, v_src, N_w);
		}

		if( VBW_ROT8){
			vec_rot8_h(v_dst, v_dst, N_w*2);
		}
		return VBW_SUCCESS;
	}


	const unsigned int SP_WIDTH_B       = this_mxp->scratchpad_alignment_bytes;
	const unsigned int FREE_BYTES       = vbx_sp_getfree();
	const unsigned int ODD_LOG_SEL      = NUM_LANES & 0x55555555 ? 1 : 0;

	vbx_word_t *v_mask, *v_result;
	vbx_word_t *v_scratch[2] = {0,0};

	unsigned int num_rows_w    = N_w / NUM_LANES;
	unsigned int working_set_w = num_rows_w * NUM_LANES;
	unsigned int tail_t        = N - (working_set_w << VBW_LSHIFT_W_TO_T);
	unsigned int remaining_w   = working_set_w;

	if( tail_t ) {
		vbx_set_vl( 1 );
		vbx_set_2D( tail_t, (int)-sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
		vbxx_2D(VMOV, v_dst+tail_t-1, v_src+N-tail_t);
		v_dst += tail_t;
	}

	vbx_word_t *v_src_w = (vbx_word_t *)v_src;
	vbx_word_t *v_dst_w = (vbx_word_t *)v_dst;

	if(!num_rows_w) {
		return VBW_SUCCESS;
	}

	remaining_w = working_set_w;
	while( remaining_w*sizeof(vbx_word_t) + SP_WIDTH_B > FREE_BYTES ) {
		if( remaining_w <= threshold_w*2 ) {
			if( VBW_ROT16){
				vec_rev_rot16_w(v_dst_w, v_src_w, remaining_w);
			}else{
				vec_rev_w(v_dst_w, v_src_w, remaining_w);
			}

			if( VBW_ROT8){
				vec_rot8_h(v_dst_w, v_dst_w, remaining_w*2);
			}
			return VBW_SUCCESS;
		}

		working_set_w = VBX_PAD_DN( (remaining_w - NUM_LANES)/2, NUM_LANES );
		v_mask = v_dst_w + (working_set_w*2);
		remaining_w -= working_set_w;

		v_scratch[0] = v_dst_w;
		v_scratch[1] = v_dst_w + working_set_w;
		num_rows_w = working_set_w / NUM_LANES;
		v_result = vec_rev_merge_w( v_scratch[ODD_LOG_SEL], v_src_w + remaining_w, working_set_w,
		                            v_scratch[!ODD_LOG_SEL], v_mask, SP_WIDTH_B, num_rows_w, VBW_ROT16 );
#if !VBX_SKIP_ALL_CHECKS
		if( v_result != v_dst_w ) {
			VBX_PRINTF("Unexpected behavior: merge reverse returned the wrong vector. Parameter order was chosen based on NUM_LANES.");
			VBX_EXIT(-1);
		}
#endif

		if( VBW_ROT8){
			vec_rot8_h(v_result, v_result, working_set_w*2);
		}
		v_dst_w += working_set_w;
	}


	vbx_sp_push();

	v_scratch[0] = v_dst_w;
	v_scratch[1] = (vbx_word_t*)vbx_sp_malloc( remaining_w * sizeof(vbx_word_t) );
#if !VBX_SKIP_ALL_CHECKS
	if( !v_scratch[1] ) {
		VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
		VBX_EXIT(-1);
	}
#endif

	v_mask = (vbx_word_t*)vbx_sp_malloc( SP_WIDTH_B );
#if !VBX_SKIP_ALL_CHECKS
	if( !v_mask ) {
		VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
		VBX_EXIT(-1);
	}
#endif

	num_rows_w = remaining_w / NUM_LANES;
	v_result = vec_rev_merge_w( v_scratch[ODD_LOG_SEL], v_src_w, remaining_w, v_scratch[!ODD_LOG_SEL],
	                            v_mask, SP_WIDTH_B, num_rows_w, VBW_ROT16 );
#if !VBX_SKIP_ALL_CHECKS
	if( v_result != v_dst_w ) {
		VBX_PRINTF("Unexpected behavior: merge reverse returned the wrong vector. Parameter order was chosen based on NUM_LANES.");
		VBX_EXIT(-1);
	}
#endif

	if( VBW_ROT8){
		vec_rot8_h(v_result, v_result, remaining_w*2);
	}
	vbx_sp_pop();
	return VBW_SUCCESS;
}
예제 #7
0
/* takes in precomputed bfly */
static int vector_fix_fft_dif_long_fly(short fr[], short fi[], short fr2[], short fi2[], short tw_r[], short tw_i[], short m, short inverse, short real)
{
	int i, j, l, k, scale, shift, a1,a2,bfly,mul,flight,swap,row_num;
	short  wr, wi;

	vptr_half v_fr, v_fi, v_fr2, v_fi2, v_tmp;
	vptr_half v_twr, v_twi;
	vptr_half v_arp, v_aip, v_brp, v_bip, v_crp, v_cip;
	vptr_half v_temp;
	vptr_half v_twr2, v_twi2;
	const int n = 1 << m;
	const int half = n >> 1;

	scale = 0;
	mul = 0;
	swap = m >> 1;

	l = m-1;
	flight = 1;
	bfly = half;

	const int INROWS = 1<<swap;
	const int INCOLS = 1<<(m-swap);

	if ( !(m%2) ){
		swap--;
	}

	// allocate space in vector memory for vectors
	v_fr  = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
	v_fi  = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
	v_fr2 = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );
	v_fi2 = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );

	v_twr   = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	v_twi   = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	v_temp  = (vptr_half)vbx_sp_malloc( n*sizeof(vbx_half_t) );

	if( v_fr  == NULL || v_fi == NULL  || v_fr2 == NULL || v_fi2== NULL  || \
	    v_twr == NULL || v_twi == NULL || v_temp == NULL) {
	 	VBX_EXIT(-1);
	}

	v_twr2  = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	v_twi2  = (vptr_half)vbx_sp_malloc( half*sizeof(vbx_half_t) );
	if( v_twr2 == NULL || v_twi2 == NULL) {
	 	VBX_EXIT(-1);
	}
	vbx_dma_to_vector( v_fr, fr, n*sizeof(vbx_half_t) );
	vbx_dma_to_vector( v_fi, fi, n*sizeof(vbx_half_t) );
	vbx_dma_to_vector( v_twr, tw_r, half*sizeof(vbx_half_t) );
	vbx_dma_to_vector( v_twi, tw_i, half*sizeof(vbx_half_t) );

#if 1
        if(real){
            vector_fix_fft_untangle_real_scratch( v_fr, v_fi, v_fr2, v_fi2, v_twr,v_twi, m, inverse);
        }
#endif

	while (l > swap) {
		if (inverse) {
			// variable scaling, depending upon data
			shift = 0;
			if( isAbsOutOfRangeV(v_fr,v_fi,v_temp,n) ) {
				shift = 1;
				scale++;
			}
		} else {
			// fixed scaling, for proper normalization
			// -- overall factor of 1/n, distributed to maximize arithmetic accuracy
			shift = 1;
		}
		// shift will be performed on each data point exactly once during pass

		SWAP( v_fr, v_fr2, v_tmp );
		SWAP( v_fi, v_fi2, v_tmp );

		if (shift){
			vbx_set_vl( n );
			vbx(SVH,VSHR,  v_fr2, 1,  v_fr2 );
			vbx(SVH,VSHR,  v_fi2, 1,  v_fi2 );
		}

		vbx_set_vl( 1<<l );
		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l+1), sizeof(vbx_half_t)*(1<<l+1) );
		vbx_2D( VVH, VADD, v_fr, v_fr2, v_fr2 + (1<<l) );
		vbx_2D( VVH, VADD, v_fi, v_fi2, v_fi2 + (1<<l) );

		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l+1), sizeof(vbx_half_t)*(1<<l+1), sizeof(vbx_half_t)*(1<<l+1) );
		vbx_2D( VVH, VSUB, v_fr2, v_fr2, v_fr2 + (1<<l) );
		vbx_2D( VVH, VSUB, v_fi2, v_fi2, v_fi2 + (1<<l) );

		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l+1), 0 );
		vbx_2D( VVH, VMULFXP, &v_fr[n>>1],  v_fr2,      v_twr );
		vbx_2D( VVH, VMULFXP,  v_temp,      v_fi2,      v_twi );

		vbx_set_vl( n>>1 ); // vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l) );
		vbx( VVH, VSUB, &v_fr[n>>1], &v_fr[n>>1], v_temp );

		vbx_set_vl( 1<<l );
		vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l+1), 0 );
		vbx_2D( VVH, VMULFXP, &v_fi[n>>1],  v_fi2,      v_twr );
		vbx_2D( VVH, VMULFXP,  v_temp,      v_fr2,      v_twi );

		vbx_set_vl( n>>1 ); //vbx_set_2D( flight, sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l), sizeof(vbx_half_t)*(1<<l) );
		vbx( VVH, VADD,    &v_fi[n>>1], &v_fi[n>>1], v_temp );

		l--;
		mul++;
		flight <<= 1;

		if( l > swap ) {
			vbx_set_vl( 1<<l );
			vbx( VVWH, VMOV, v_twr, v_twr, 0 );
			vbx( VVWH, VMOV, v_twi, v_twi, 0 );
		}
	}

	if ( !(m%2) ) {
		l++;
		flight >>=1;
	}
예제 #8
0
int vbw_mtx_xp_ext(vbx_mm_t *out, vbx_mm_t *in, const int INROWS, const int INCOLS )
{
	typedef vbx_mm_t vbx_sp_t;

	int elements = INROWS * INCOLS;

	if(elements < SCALAR_THRESHOLD) {
		vbx_sync();  //in case we input is waiting on a DMA transfer
		int i,j;
		for(i = 0; i < INROWS; i++) {
			for(j = 0; j < INCOLS; j++) {
				out[j*INROWS+i] = in[i*INCOLS+j];
			}
		}
		return VBW_SUCCESS;
	}

	vbx_sp_push();

	vbx_sp_t *v_in;
	vbx_sp_t *v_out;

	int tile_height     = 0;
	int tile_width      = 0;
	int prev_tile_width = 0;
	int tile_y          = 0;
	int tile_x          = 0;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	int SP_WIDTH_B = this_mxp->scratchpad_alignment_bytes;
	int SP_SIZE = vbx_sp_getfree();
	int max_sp_elements   = vbx_sp_getfree() / sizeof(vbx_sp_t);
	int max_tile_elements = VBX_PAD_DN( SP_SIZE/2, SP_WIDTH_B ) / sizeof(vbx_sp_t);


	if( INROWS == 1 || INCOLS == 1 ) {           // 1D transpose becomes a simple copy operation
		if( elements <= max_sp_elements ) {      // We can use the whole scratchpad for this
			v_in = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
			vbx_dma_to_vector( v_in, in, elements*sizeof(vbx_mm_t) );
			v_out = v_in;
			vbx_dma_to_host( out, v_out, elements*sizeof(vbx_mm_t) );
		} else {                                 // To test this, you'll need a very large 1D matrix (or a small SP)
			tile_width = max_sp_elements;
			v_in = (vbx_sp_t*)vbx_sp_malloc( tile_width * sizeof(vbx_sp_t) );
			for (tile_x = 0; tile_x < elements; tile_x += tile_width) {
				if( tile_x + tile_width > elements) tile_width = elements - tile_x;
				vbx_dma_to_vector( v_in, in + tile_x, tile_width*sizeof(vbx_mm_t) );
				v_out = v_in;
				vbx_dma_to_host( out+tile_x, v_out, tile_width*sizeof(vbx_mm_t) );
			}
		}
	} else if( elements < max_tile_elements ) {  // Matrix is small enough to handle entirely in SP
		v_in  = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
		v_out = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );

		vbx_dma_to_vector( v_in, in, elements*sizeof(vbx_mm_t) );

		vbw_mtx_xp(v_out,v_in,INROWS,INCOLS);

		vbx_dma_to_host( out, v_out, elements*sizeof(vbx_mm_t) );
	} else {                                     // At this point we know at least one full tile will be needed
		#define QUICK_A_LANES_THRESHOLD 8        // Use merge transpose if there are at least this many lanes
		#define QUICK_A_TILE_WIDTH 128
		#define QUICK_A_TILE_ELEMENTS (QUICK_A_TILE_WIDTH*QUICK_A_TILE_WIDTH)
		#define QUICK_A_VF_ELEMENTS (QUICK_A_TILE_ELEMENTS/2)
		#define QUICK_A_REQ_ELEMENTS (2*VBX_PAD_UP(QUICK_A_TILE_ELEMENTS,SP_WIDTH_B/sizeof(vbx_sp_t)) + VBX_PAD_UP(QUICK_A_VF_ELEMENTS,sizeof(vbx_sp_t)))

		#define QUICK_B_LANES_THRESHOLD 16        // Use smaller merge transpose tile only if there are a lot of lanes
		#define QUICK_B_TILE_WIDTH 64             //     and only if larger tile A size cannot be used.
		#define QUICK_B_TILE_ELEMENTS (QUICK_B_TILE_WIDTH*QUICK_B_TILE_WIDTH)
		#define QUICK_B_VF_ELEMENTS (QUICK_B_TILE_ELEMENTS/2)
		#define QUICK_B_REQ_ELEMENTS (2*VBX_PAD_UP(QUICK_B_TILE_ELEMENTS,SP_WIDTH_B/sizeof(vbx_sp_t)) + VBX_PAD_UP(QUICK_B_VF_ELEMENTS,sizeof(vbx_sp_t)))

		int NUM_LANES = this_mxp->vector_lanes;
		int DMA_BYTES = this_mxp->dma_alignment_bytes;
		int min_tile_dim = DMA_BYTES / sizeof(vbx_sp_t);

		vbx_sp_t *v_out_sel;
		vbx_sp_t *vf = 0;

		if( NUM_LANES >= QUICK_A_LANES_THRESHOLD       // Check for appropriate conditions to use merge transpose tiles
					&& INCOLS >= QUICK_A_TILE_WIDTH
					&& INROWS >= QUICK_A_TILE_WIDTH
			&& (unsigned)max_sp_elements >= QUICK_A_REQ_ELEMENTS ) {
			tile_width = tile_height = QUICK_A_TILE_WIDTH;
			vf = (vbx_sp_t *)vbx_sp_malloc( QUICK_A_VF_ELEMENTS * sizeof(vbx_sp_t));
		} else if( NUM_LANES >= QUICK_B_LANES_THRESHOLD
					&& INCOLS >= QUICK_B_TILE_WIDTH
					&& INROWS >= QUICK_B_TILE_WIDTH
			&& (unsigned)max_sp_elements >= QUICK_B_REQ_ELEMENTS ) {
			tile_width = tile_height = QUICK_B_TILE_WIDTH;
			vf = (vbx_sp_t *)vbx_sp_malloc( QUICK_B_VF_ELEMENTS * sizeof(vbx_sp_t));
		} else {
			findTileSize( &tile_height, &tile_width, INROWS, INCOLS, max_tile_elements, min_tile_dim );
		}

		prev_tile_width = tile_width;

		v_in  = (vbx_sp_t*)vbx_sp_malloc( tile_height*tile_width * sizeof(vbx_sp_t) );
		v_out = (vbx_sp_t*)vbx_sp_malloc( tile_height*tile_width * sizeof(vbx_sp_t) );


		if( v_out==NULL ) {
			vbx_sp_pop();
			return VBW_ERROR_SP_ALLOC_FAILED;
		}

		vbx_sp_t *v[2] = { v_in, v_out };

		tile_y = 0;                              // Reset y position for new col
		while( tile_y < INROWS ) {
		vbx_set_2D( tile_width, tile_height*sizeof(vbx_sp_t), sizeof(vbx_sp_t), sizeof(vbx_sp_t) );
		vbx_set_3D( tile_height, sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t) );
			tile_x = 0;                          // Reset x position for new row
			while( tile_x < INCOLS ) {

				vbx_dma_to_vector_2D(
						v_in,
						in+(tile_y*INCOLS)+tile_x,
						tile_width*sizeof(vbx_mm_t),
						tile_height,
						tile_width*sizeof(vbx_sp_t),
						INCOLS*sizeof(vbx_mm_t) );

				v_out_sel = v_out;                         // select v_out as default vector to DMA to MM

				/* *** merge transpose (matrix must be square and a power of 2 wide) *** */
				if( vf && tile_width == tile_height
							&& (tile_width==QUICK_A_TILE_WIDTH || tile_width==QUICK_B_TILE_WIDTH) ) {
					int src = 0;
					int n;
					for( n=1; n<tile_width; n *= 2 ) {     // can't do 1st iteration until entire tile is DMA'd in
						const int nn = 2*n;

						// copy the destination matrix
						vbx_set_vl( tile_width*tile_width );    // use v_in & v_out as working matrices (clobber v_in)
						vbxx(  VMOV, v[!src], v[src]);

						// do the work
						vbx_set_vl( n*tile_width );
						vbxx( VAND, vf, n, (vbx_enum_t*)0 );           // mask for merging: 0101010... then 00110011...
						vbx_set_2D( tile_width/nn, nn*tile_width*sizeof(vbx_sp_t), nn*tile_width*sizeof(vbx_sp_t), 0 );
						vbxx_2D( VCMV_Z, v[!src]+n*tile_width, v[src]+n           , vf );
						vbxx_2D( VCMV_Z, v[!src]+n,            v[src]+n*tile_width, vf );

						src = !src;
					}

					v_out_sel = v[src];     // depending on the size of the mtx, the final result may be in v_in or v_out
				} else {
					vbx_set_vl( 1 );        // 2D and 3D will be set by the x and y edge conditions, even using merge
					vbxx_3D(VMOV, v_out, v_in );
				}

				vbx_dma_to_host_2D(
						out+(tile_x*INROWS)+tile_y,
						v_out_sel,
						tile_height*sizeof(vbx_mm_t),
						tile_width,
						INROWS*sizeof(vbx_mm_t),
						tile_height*sizeof(vbx_sp_t) );

				tile_x += tile_width;                 // Set up width for next tile
				if( tile_x + tile_width > INCOLS ) {  // Temporarily reduce tile width when reaching right edge of matrix
					tile_width = INCOLS - tile_x;
					vbx_set_2D( tile_width, tile_height*sizeof(vbx_sp_t), sizeof(vbx_sp_t), sizeof(vbx_sp_t) );
					vbx_set_3D( tile_height, sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t) );
				}
			}
			tile_y += tile_height;                    // Set up width and height for next row of tiles
			tile_width = prev_tile_width;             // Restore original tile width for next row of tiles

			/* *** Permanently reduce tile height when reaching bottom of matrix *** */
			tile_height = ( tile_y + tile_height > INROWS ) ? INROWS - tile_y : tile_height;		}
	}
	vbx_sp_pop();
	vbx_sync();
	return VBW_SUCCESS;
}
예제 #9
0
파일: vbx_mtx_fdct.c 프로젝트: 8l/mxp
void vbx_mtx_fdct( vbx_mtx_fdct_t *v, dt *block_v, dt *image,
	int start_x, int start_y, int end_x, int end_y,int num_tile_x, int num_tile_y )
{
//	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
//	const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
	const int BIG_TILE_SIZE = num_tile_x * num_tile_y * DCT_SIZE;

	int next_x=start_x+1;
	int next_y=start_y;
	int get_next=1;
	if( start_x == end_x   &&   start_y == end_y ) {
		get_next=0;
	}
	if( start_x == end_x ) {
		next_x = 0;
		next_y++;
	} 

	const vbx_half_t *vimageDMA = v->vimage[!v->db]; // dma
//	const vbx_half_t *vblockDMA = v->vblock[!v->db]; // dma // never used directly 

	const vbx_half_t *vimageVPU = v->vimage[ v->db]; // active
	const vbx_half_t *vblockVPU = v->vblock[ v->db]; // active

	const vbx_half_t *vblockTMP = v->vblock[ 2    ]; // temp

	const vbx_half_t *vcoeff    = v->vcoeff;
	const vbx_half_t *vprods    = v->vprods;
	const vbx_half_t *vaccum    = v->vaccum;
	const vbx_half_t *vflags    = v->vflags;

#if DMA
	// First, prefetch the next chunk of the next image for a future call to fdct_tile()
#if NUM_TILE_Y > 1
	if( get_next ) // get row 0
		getBigTileImageY( vimageDMA,
		        image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, 0 );
#else
	if( get_next ) // get row 0
		getBigTileImage( vimageDMA,
		        image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE, 0 );
#endif
#endif

	int r;
	for( r=0; r < BLOCK_SIZE; r++ ) {
		// perform multiply of the whole BIG_TILE with row 'r' of the image matrix -- before had dct matrix switching
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE );                                                                                              // for the length of tiled rows
		vbx_set_2D( BLOCK_SIZE, NUM_TILE_X*BLOCK_SIZE*sizeof(dt),                                    0, NUM_TILE_X*BLOCK_SIZE*sizeof(dt) ); // for all rows of tiled coeffiencents
		vbx_set_3D( NUM_TILE_Y, NUM_TILE_X * DCT_SIZE*sizeof(dt),     NUM_TILE_X * DCT_SIZE*sizeof(dt),                               0  ); // for all groups Y
		vbx_3D( VVH, VMUL,                                vprods, vimageVPU + r*NUM_TILE_X*BLOCK_SIZE,                            vcoeff); // for all 'columns' of tiled data

#if ACCUMULATE
		// accumulate the multiply operations
#if 0 & USE_ACCUM_FLAGS 
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
		vbx( VVH, VADD, vaccum, vprods+0, vprods+1 );
		vbx_set_2D( BLOCK_SIZE-2, 0, 0, sizeof(dt) );
		vbx_2D( VVH, VADD, vaccum, vaccum, vprods+2 );
		vbx( VVH, VCMV_Z, vblockTMP+r, vaccum, vflags );
#elif BLOCK4
                //case DCT 4
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
		vbx( VVH, VADD, vaccum, vprods, vprods+1 );
		vbx( VVH, VADD, vaccum, vaccum, vprods+2 );
		vbx( VVH, VADD, vaccum, vaccum, vprods+3 );
		vbx( VVH, VCMV_Z, vblockTMP+r, vaccum, vflags );
#else
                //correct?
		vbx_set_vl( BLOCK_SIZE );
		vbx_set_2D( BLOCK_SIZE,   NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt) );
		vbx_set_3D( NUM_TILE_X,   BLOCK_SIZE*sizeof(dt),            BLOCK_SIZE*sizeof(dt),            BLOCK_SIZE*sizeof(dt) );
#if NUM_TILE_Y == 1
		vbx_acc_3D( VVH, VOR,   vblockTMP + r,      vprods ,  vprods );
#else
		int y; 
		for (y=0; y< NUM_TILE_Y; y++){
			vbx_acc_3D( VVH, VOR,   vblockTMP + r + y*NUM_TILE_X*DCT_SIZE,      vprods+ y*NUM_TILE_X*DCT_SIZE,  vprods+ y*NUM_TILE_X*DCT_SIZE );
		}
#endif
#endif
#endif

#if 0
// dont do DMA READS here yet. a DMA WRITE may still be in progress, give it chance to finish
#if DMA
		// every other iteration, prefetch the next row of the next image
		// NB: with 2D DMA, we could issue this as a single DMA request at the top of the file
		// instead, we must intersperse these 1D DMA strips to ensure they don't block the instruction queue
#if NUM_TILE_Y > 1
		if( !(r&1) && get_next )
			getBigTileImageY( vimageDMA,
			                  image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE,
			                  (1+((r-1)>>1)) ); //BLOCK_SIZE/2 rows added
#else
		if( !(r&1) && get_next )
			getBigTileImage( vimageDMA,
			                 image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE,
			                 (1+((r-1)>>1)) ); //BLOCK_SIZE/2 rows added
#endif
#endif
#endif
	}

	vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE );
	vbx( SVH, VSHR, vblockTMP, SHIFT_AMOUNT, vblockTMP );

	// now do the transposed version

	for( r=0; r < BLOCK_SIZE; r++ ) {
		// perform multiply of the whole BIG_TILE with row 'r' of the image matrix -- before had dct matrix switching
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE );                                                                                              // for the length of tiled rows
		vbx_set_2D( BLOCK_SIZE, NUM_TILE_X * BLOCK_SIZE*sizeof(dt),     NUM_TILE_X * BLOCK_SIZE*sizeof(dt),                            0 ); // for all 'columns' of tiled data 
		vbx_set_3D( NUM_TILE_Y, NUM_TILE_X * DCT_SIZE*sizeof(dt),       NUM_TILE_X * DCT_SIZE*sizeof(dt),                              0 ); // for all groups Y
		vbx_3D( VVH, VMUL,                             vprods,                        vblockTMP,  vcoeff + r*NUM_TILE_X*BLOCK_SIZE); // for all rows of tiled coeffients 

#if ACCUMULATE
		// accumulate the multiply operations
#if 0 & USE_ACCUM_FLAGS
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
		vbx( VVH, VADD, vaccum, vprods+0, vprods+1 );
		vbx_set_2D( BLOCK_SIZE-2, 0, 0, sizeof(dt) );
		vbx_2D( VVH, VADD, vaccum, vaccum, vprods+2 );
		vbx( VVH, VCMV_Z, vblockVPU+r, vaccum,   vflags );

#elif BLOCK4
		//case DCT 4
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
		vbx( VVH, VADD, vaccum, vprods, vprods+1 );
		vbx( VVH, VADD, vaccum, vaccum, vprods+2 );
		vbx( VVH, VADD, vaccum, vaccum, vprods+3 );
		//vbx( VVH, VCMV_Z, vblockVPU+r, vaccum, vflags );
		vbx_set_vl( NUM_TILE_X * BLOCK_SIZE - (BLOCK_SIZE-1) );                    // for the length of a tiled row
		vbx_set_2D( BLOCK_SIZE, 1*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), 0);// for all tiled rows 
#if NUM_TILE_Y == 1
		vbx_2D(VVH, VCMV_Z, vblockVPU+r*NUM_TILE_X*BLOCK_SIZE, vaccum, vflags  );  // 
#else
		int y;
		for (y=0; y< NUM_TILE_Y; y++){
			vbx_2D(VVH, VCMV_Z, vblockVPU+r*NUM_TILE_X*BLOCK_SIZE + y*NUM_TILE_X*DCT_SIZE , vaccum+y*NUM_TILE_X*DCT_SIZE, vflags  );  // 
		}
#endif
#else
		//correct?
		vbx_set_vl( BLOCK_SIZE );                                                                                              // for the length of a row
		vbx_set_2D( BLOCK_SIZE,   sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt), NUM_TILE_X*BLOCK_SIZE*sizeof(dt) );            // for all rows in that block
		vbx_set_3D( NUM_TILE_X,   BLOCK_SIZE*sizeof(dt),            BLOCK_SIZE*sizeof(dt),            BLOCK_SIZE*sizeof(dt) ); // for all tiled blocks horizontally(x)
#if NUM_TILE_Y == 1
		vbx_acc_3D( VVH, VOR,   vblockVPU + r*NUM_TILE_X*BLOCK_SIZE ,    vprods ,  vprods );
#else
		int y;
		for (y=0; y< NUM_TILE_Y; y++){ 
			vbx_acc_3D( VVH, VOR,   vblockVPU + r*NUM_TILE_X*BLOCK_SIZE + y*NUM_TILE_X*DCT_SIZE,      vprods+ y*NUM_TILE_X*DCT_SIZE,  vprods+ y*NUM_TILE_X*DCT_SIZE );
		}
#endif
#endif
#endif

#if DMA
		// every other iteration, prefetch the next row of the next image
		// NB: with 2D DMA, we could issue this as a single DMA request at the top of the file
		// instead, we must intersperse these 1D DMA strips to ensure they don't block the instruction queue
#if NUM_TILE_Y > 1
		//if( !(r&1) && r<(BLOCK_SIZE-1)  && get_next )
		if( get_next )
			getBigTileImageY( 
			                  vimageDMA,
			                  image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE,
			                  r );
			                  //(BLOCK_SIZE/2 +1+((r-1)>>1)) ); // BLOCK/2 -1 rows
#else
		//if( !(r&1) && r<(BLOCK_SIZE-1)  && get_next )
		if( get_next )
			getBigTileImage( vimageDMA,
			                 image+next_x*NUM_TILE_X*BLOCK_SIZE+next_y*IMAGE_WIDTH*NUM_TILE_Y*BLOCK_SIZE,
			                 r );
			                 //(BLOCK_SIZE/2 +1+((r-1)>>1)) ); // BLOCK/2 -1 rows
#endif
#endif
	}

	vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE );
	vbx( SVH, VSHR, vblockVPU, SHIFT_AMOUNT, vblockVPU );
#if DMA2
	// Write result back to memory as one big block
	vbx_dma_to_host( block_v, vblockVPU, BIG_TILE_SIZE*sizeof(dt) );
#endif 

	v->db = !v->db;
#ifdef DEBUG 
	{
		vbx_sync();
		int i,j;
		printf("%d\n", !db);
		for(i=0;i<BLOCK_SIZE*NUM_TILE_Y;i++){
			for(j=0;j<BLOCK_SIZE*NUM_TILE_X;j++){
				printf(" %4d", block_v[i*BLOCK_SIZE*NUM_TILE_X+j]);
			}
			printf("\n");
		}
	}
#endif
}
예제 #10
0
int vector_motest(pixel *input_buffer, luma_type **last_luma, int *motest_x, int *motest_y, int start_x, int start_y, int reset, const int image_width, const int image_height, const int image_pitch)
{
	int y, x, starty, startx;
	unsigned int sad, sad_min, y_min, x_min;
	vbx_uhalf_t *v_search_luma, *v_last_luma;
	vbx_uhalf_t *v_row_temp;
	vbx_uword_t *v_row;
	vbx_uword_t *v_sad;
	pixel color;

	if(*last_luma == NULL || reset){
		init_vector_motest(input_buffer, last_luma, motest_x, motest_y, start_x, start_y, image_pitch);
	}

	v_search_luma = vbx_sp_malloc( MOTEST_BUFFER_SIZE  * sizeof(vbx_uhalf_t) );
	v_last_luma   = vbx_sp_malloc( MOTEST_BLOCK_SIZE   * sizeof(vbx_uhalf_t) );
	v_row_temp    = vbx_sp_malloc( MOTEST_BUFFER_WIDTH * sizeof(vbx_uhalf_t) );
	v_row         = vbx_sp_malloc( MOTEST_BUFFER_WIDTH * sizeof(vbx_uword_t) );
	v_sad         = vbx_sp_malloc( MOTEST_SEARCH_SIZE  * sizeof(vbx_uword_t) );

	if(v_sad == NULL){
		printf("Not enough scratchpad for motest\n");
		while(1);
	}

	startx = *motest_x-(MOTEST_SEARCH_WIDTH/2);
	starty = *motest_y-(MOTEST_SEARCH_HEIGHT/2);
	if(startx < 0){
		startx = 0;
	}
	if(startx > image_width-MOTEST_BUFFER_WIDTH){
		startx = image_width-MOTEST_BUFFER_WIDTH;
	}
	if(starty < 0){
		starty = 0;
	}
	if(starty > image_height-MOTEST_BUFFER_HEIGHT){
		starty = image_height-MOTEST_BUFFER_HEIGHT;
	}

	vector_rectangle_to_luma(input_buffer, v_search_luma, v_row_temp, v_row, startx, starty, MOTEST_BUFFER_WIDTH, MOTEST_BUFFER_HEIGHT, image_pitch);
	vbx_dma_to_vector(v_last_luma, *last_luma, MOTEST_BLOCK_SIZE*sizeof(vbx_uhalf_t));

	//Vector compute sad here

	vbx_set_2D(MOTEST_BLOCK_HEIGHT, sizeof(vbx_uword_t), MOTEST_BUFFER_WIDTH*sizeof(vbx_uhalf_t), MOTEST_BLOCK_WIDTH*sizeof(vbx_uhalf_t));

	for(y = 0; y < MOTEST_SEARCH_HEIGHT; y++){
		for(x = 0; x < MOTEST_SEARCH_WIDTH; x++){
			vbx_set_vl(MOTEST_BLOCK_WIDTH);
			vbx_acc_2D(VVHWU, VABSDIFF, v_row, v_search_luma+(y*MOTEST_BUFFER_WIDTH)+x, v_last_luma);
			vbx_set_vl(MOTEST_BLOCK_HEIGHT/2);
			vbx_acc(VVWU, VADD, v_sad+(y*MOTEST_SEARCH_WIDTH)+x, v_row, v_row+MOTEST_BLOCK_HEIGHT/2);
		}

#if TOUCHSCREEN
#ifdef TOUCH_INTERRUPTS_VBX
		if (touchscreen_get_pen(pTouch)) {
			vbx_sp_free();
			return -1;
		}
#endif
#endif
	}

	vbx_sync();

	sad_min = INT_MAX;
	y_min = *motest_y;
	x_min = *motest_x;

	for(y = 0; y < MOTEST_SEARCH_HEIGHT; y++){
		for(x = 0; x < MOTEST_SEARCH_WIDTH; x++){
			sad = v_sad[y*MOTEST_SEARCH_WIDTH+x];
			if(sad < sad_min){
				sad_min = sad;
				x_min = x+startx;
				y_min = y+starty;
			} else if(sad == sad_min) {
				if( (abs( x             - MOTEST_SEARCH_WIDTH/2) + abs( y             - MOTEST_SEARCH_HEIGHT/2)) <
				    (abs((x_min-startx) - MOTEST_SEARCH_WIDTH/2) + abs((y_min-starty) - MOTEST_SEARCH_HEIGHT/2))) {
					x_min = x+startx;
					y_min = y+starty;
				}
			}
		}
	}

	color.r = 0;
	color.g = 255;
	color.b = 0;
	color.a = 0;
	scalar_draw_line(*motest_x+(MOTEST_BLOCK_WIDTH/2), *motest_y+(MOTEST_BLOCK_HEIGHT/2), x_min+(MOTEST_BLOCK_WIDTH/2), y_min+(MOTEST_BLOCK_HEIGHT/2), color, input_buffer, image_pitch);

	*motest_y = y_min;
	*motest_x = x_min;

	vbx_set_vl(MOTEST_BLOCK_WIDTH);
	for(y = 0; y < MOTEST_BLOCK_HEIGHT; y++){
		vbx(VVHU, VMOV, v_last_luma+(y*MOTEST_BLOCK_WIDTH), v_search_luma+((y+y_min-starty)*MOTEST_BUFFER_WIDTH)+(x_min-startx), 0);
	}
	vbx_dma_to_host(*last_luma, v_last_luma, MOTEST_BLOCK_SIZE*sizeof(luma_type));

	draw_motest(input_buffer, *motest_x, *motest_y, image_pitch);
	//simple hack to draw thicker
	draw_motest(input_buffer, *motest_x+1, *motest_y+1, image_pitch);

	vbx_sp_free();
	return 0;
}