Exemplo n.º 1
0
/*
 * Allocate num_bytes from the scratchpad, printing a diagnostic on failure.
 *
 * LINE, FNAME : call-site location, forwarded to VBX_FATAL.
 * num_bytes   : requested size; it is rounded up to
 *               scratchpad_alignment_bytes before being carved off.
 *
 * Returns the scratchpad pointer as it was before the allocation.
 * Every failure (MXP not initialised, zero-byte request, not enough free
 * space) ends in VBX_FATAL; NULL is only returned if VBX_FATAL itself returns.
 */
vbx_void_t *vbx_sp_malloc_debug( int LINE,const char *FNAME, size_t num_bytes )
{
	// print pretty error messages
	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	if( !this_mxp || !this_mxp->init ) {
		VBX_PRINTF( "ERROR: failed to call _vbx_init().\n" );
		VBX_FATAL(LINE,FNAME,-1);
		// Should VBX_FATAL return, never touch the unusable handle below.
		return NULL;
	}

	// pad to scratchpad width to reduce occurrence of false hazards
	size_t padded = VBX_PAD_UP( num_bytes, this_mxp->scratchpad_alignment_bytes );
	size_t freesp = (size_t)(this_mxp->scratchpad_end - this_mxp->sp);

	// padded < num_bytes means the round-up wrapped around; treat it as "does not fit".
	const int fits = (padded >= num_bytes) && (padded <= freesp);

	vbx_void_t  *result = NULL;
	if( VBX_DEBUG_LEVEL && (num_bytes==0) ) {
		print_sp_malloc_null();
	} else if( VBX_DEBUG_LEVEL && !fits ) {
		print_sp_malloc_full( num_bytes, padded );
	} else if( num_bytes > 0  &&  fits ) {
		result        = this_mxp->sp;
		this_mxp->sp += padded;
#if VBX_DEBUG_SP_MALLOC
		// Cast explicitly so the arguments match the conversion specifiers.
		printf("sp_malloc %lu bytes padded to %lu, sp=0x%08lx\n",
		       (unsigned long)num_bytes, (unsigned long)padded, (unsigned long)this_mxp->sp);
#endif
	}

	if( !result ) {
		VBX_FATAL(LINE,FNAME,-1);
	}
	return result;
}
Exemplo n.º 2
0
Arquivo: test.c Projeto: cirqueit/mxp
/*
 * Vector add test: computes in1 + in2 once on the host and once on the
 * vector engine, then compares the two result arrays.
 *
 * The vector length N is chosen so that required_vectors buffers of N
 * elements fit in the scratchpad. Always returns 0; the error count is
 * reported through VBX_TEST_END.
 */
int main(void)
{
	vbx_test_init();

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
	const int required_vectors = 4;

	int N = VBX_SCRATCHPAD_SIZE / sizeof(vbx_mm_t) / required_vectors;

	int PRINT_LENGTH = min( N, MAX_PRINT_LENGTH );

	double scalar_time, vector_time;
	int errors=0;

	vbx_mxp_print_params();
	printf( "\nAdd test...\n" );
	printf( "Vector length: %d\n", N );

	// Host-side operands and reference result.
	vbx_mm_t *scalar_in1 = malloc( N*sizeof(vbx_mm_t) );
	vbx_mm_t *scalar_in2 = malloc( N*sizeof(vbx_mm_t) );
	vbx_mm_t *scalar_out = malloc( N*sizeof(vbx_mm_t) );

	// Buffers the DMA engine reads from / writes to.
	vbx_mm_t *vector_in1 = vbx_shared_malloc( N*sizeof(vbx_mm_t) );
	vbx_mm_t *vector_in2 = vbx_shared_malloc( N*sizeof(vbx_mm_t) );
	vbx_mm_t *vector_out = vbx_shared_malloc( N*sizeof(vbx_mm_t) );

	// Scratchpad operands and result.
	vbx_sp_t *v_in1 = vbx_sp_malloc( N*sizeof(vbx_sp_t) );
	vbx_sp_t *v_in2 = vbx_sp_malloc( N*sizeof(vbx_sp_t) );
	vbx_sp_t *v_out = vbx_sp_malloc( N*sizeof(vbx_sp_t) );

	if( !scalar_in1 || !scalar_in2 || !scalar_out ||
	    !vector_in1 || !vector_in2 || !vector_out ||
	    !v_in1 || !v_in2 || !v_out ) {
		printf( "Malloc failed\n" );
		VBX_TEST_END(1);
		return 0;
	}

	VBX_T(test_zero_array)( scalar_out, N );
	VBX_T(test_zero_array)( vector_out, N );

	VBX_T(test_init_array)( scalar_in1, N, 1 );
	VBX_T(test_copy_array)( vector_in1, scalar_in1, N );
	VBX_T(test_init_array)( scalar_in2, N, 1 );
	VBX_T(test_copy_array)( vector_in2, scalar_in2, N );

	VBX_T(test_print_array)( scalar_in1, PRINT_LENGTH );
	VBX_T(test_print_array)( scalar_in2, PRINT_LENGTH );

	scalar_time = test_scalar( scalar_out, scalar_in1, scalar_in2, N );
	VBX_T(test_print_array)( scalar_out, PRINT_LENGTH);

	vbx_dma_to_vector( v_in1, (void *)vector_in1, N*sizeof(vbx_sp_t) );
	// The second operand must come from vector_in2; the original loaded
	// vector_in1 twice, which only went unnoticed because both inputs are
	// initialised identically above.
	vbx_dma_to_vector( v_in2, (void *)vector_in2, N*sizeof(vbx_sp_t) );
	vector_time = test_vector( v_out, v_in1, v_in2, N, scalar_time );
	vbx_dma_to_host( (void *)vector_out, v_out, N*sizeof(vbx_sp_t) );
	vbx_sync();
	VBX_T(test_print_array)( vector_out, PRINT_LENGTH );
	(void)vector_time;

	errors += VBX_T(test_verify_array)( scalar_out, vector_out, N );

	// Release everything before reporting, in case VBX_TEST_END does not return.
	vbx_sp_free();
	vbx_shared_free( vector_out );
	vbx_shared_free( vector_in2 );
	vbx_shared_free( vector_in1 );
	free( scalar_out );
	free( scalar_in2 );
	free( scalar_in1 );

	VBX_TEST_END(errors);
	return 0;
}
Exemplo n.º 3
0
Arquivo: test.c Projeto: cirqueit/mxp
/*
 * Counter smoke test: issues one vector instruction and dumps the
 * instruction / DMA counters through debug(). Always returns 0.
 */
int main(void)
{
	vbx_test_init();
	vbx_mxp_print_params();

	// Variable names are left untouched because they are handed to debug().
	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	int lanes = this_mxp->vector_lanes;
	int dma_width = this_mxp->dma_alignment_bytes / 4;
	debug(lanes);
	debug(dma_width);

	// Clear the counters, run a single instruction and wait for it.
	vbx_set_vl(-1);
	VBX_COUNTER_RESET();
	vbx(SVW,VMOV,0,0,0);
	vbx_sync();

	printf( "%s", VBX_SIMULATOR ? "simulator\n" : "not simulator\n" );

	// Sample the counters in the same order as before.
	unsigned instr_cycles = VBX_GET_WRITEBACK_CYCLES();
	unsigned dma_cycles   = VBX_GET_DMA_CYCLES();
	unsigned dma_count    = VBX_GET_DMAS();
	unsigned instr_count  = VBX_GET_INSTRUCTIONS();

	debug(instr_cycles);
	debug(dma_cycles);
	debug(dma_count);
	debug(instr_count );

	int errors = 0;
	VBX_TEST_END(errors);
	return 0;
}
Exemplo n.º 4
0
/*
 * Allocate num_bytes from the scratchpad without printing diagnostics.
 *
 * The request is rounded up to scratchpad_alignment_bytes. Returns the
 * scratchpad pointer as it was before the allocation, or NULL when there is
 * no MXP instance, num_bytes is 0, or the padded request does not fit.
 * The scratchpad pointer is left untouched on failure.
 *
 * The previously present "VBX_DEBUG_LEVEL && 0" delegation to
 * vbx_sp_malloc_debug() could never execute and has been dropped.
 */
vbx_void_t *vbx_sp_malloc_nodebug( size_t num_bytes )
{
	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();

	// check for valid argument values
	if( !this_mxp  ||  num_bytes==0 )
		return NULL;

	// A scratchpad pointer already past the end leaves nothing to hand out.
	if( this_mxp->sp > this_mxp->scratchpad_end )
		return NULL;

	// pad to scratchpad width to reduce occurrence of false hazards
	size_t padded = VBX_PAD_UP( num_bytes, this_mxp->scratchpad_alignment_bytes );
	size_t freesp = (size_t)(this_mxp->scratchpad_end - this_mxp->sp);

	// Compare sizes instead of advancing the pointer first: bumping sp by a
	// huge value could wrap and make a full scratchpad look free.
	// padded < num_bytes catches a round-up that wrapped around.
	if( padded < num_bytes  ||  padded > freesp )
		return NULL;

	// success
	vbx_void_t *old_sp = this_mxp->sp;
	this_mxp->sp += padded;
	return old_sp;
}
Exemplo n.º 5
0
/*
 * Measure DMA throughput between a shared-memory buffer and the scratchpad.
 *
 * For each direction (to_host == 0: main memory -> scratchpad, printed as
 * "read"; to_host == 1: scratchpad -> main memory, printed as "write") and
 * each power-of-two length from 32 bytes up to the scratchpad size, num_iter
 * transfers are issued back to back and timed.
 *
 * Returns 0 normally, or 1 when the test buffers cannot be allocated.
 */
int dma_bandwidth_test()
{
	const int num_iter = 64;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	int scratchpad_size = this_mxp->scratchpad_size;

	uint8_t *buf = vbx_shared_malloc(scratchpad_size);
	vbx_ubyte_t *v_buf = vbx_sp_malloc(scratchpad_size);
	if (!buf || !v_buf) {
		printf("ERROR: failed to allocate %d byte DMA test buffers\n", scratchpad_size);
		if (buf) {
			vbx_shared_free(buf);
		}
		vbx_sp_free();
		return 1;
	}

	vbx_timestamp_t time_start, time_stop;

	int i;
	int len;
	int to_host;
	int errors = 0;

	vbx_mxp_print_params();

	// dma_alignment_bytes gives DMA master data bus width in bytes.
	double bytes_per_sec = \
		(((double) this_mxp->core_freq) * this_mxp->dma_alignment_bytes);
	double max_megabytes_per_sec = bytes_per_sec/(1024*1024);
	printf("\nMax available bandwidth = %s Megabytes/s\n",
	       vbx_eng(max_megabytes_per_sec, 4));

	printf("\n");

	for (to_host = 0; to_host < 2; to_host++) {
		for (len = 32; len <= scratchpad_size ; len *= 2) {
			printf("DMA %s, %d bytes\n", to_host ? "write" : "read", len);
			vbx_timestamp_start();

			// The direction is selected outside the timed loops so the
			// measured region contains nothing but DMA requests.
			time_start = vbx_timestamp();
			if (to_host) {
				for (i = 0; i < num_iter; i++) {
					vbx_dma_to_host(buf, v_buf, len);
				}
			} else {
				for (i = 0; i < num_iter; i++) {
					vbx_dma_to_vector(v_buf, buf, len);
				}
			}
			vbx_sync();
			time_stop = vbx_timestamp();

			print_dma_bandwidth(time_start, time_stop, len, num_iter,
			                    max_megabytes_per_sec);
			printf("\n");

			// Stop before "len *= 2" could overflow int; for every size where
			// doubling is safe this is the same exit as the loop condition.
			if (len > scratchpad_size / 2) {
				break;
			}
		}
		printf("\n");
	}

	vbx_shared_free(buf);
	vbx_sp_free();

	return errors;
}
Exemplo n.º 6
0
/*
 * Release every scratchpad allocation at once by rewinding the scratchpad
 * pointer to scratchpad_addr and clearing spstack_top.
 * Does nothing, silently, when there is no MXP instance.
 */
void vbx_sp_free_nodebug()
{
	vbx_mxp_t *mxp = VBX_GET_THIS_MXP();
	if( !mxp )
		return;

	mxp->spstack_top = 0;
	mxp->sp = mxp->scratchpad_addr;
}
Exemplo n.º 7
0
// --------------------------------------------------------
// Scratchpad manipulation routines
/*
 * Distance from scratchpad_addr to the current scratchpad pointer, i.e. how
 * much of the scratchpad has been handed out. Returns 0 without an MXP instance.
 */
int vbx_sp_getused()
{
	vbx_mxp_t *mxp = VBX_GET_THIS_MXP();
	return mxp ? (int)(mxp->sp - mxp->scratchpad_addr) : 0;
}
Exemplo n.º 8
0
/*
 * Distance from the current scratchpad pointer to scratchpad_end, i.e. how
 * much of the scratchpad is still available. Returns 0 without an MXP instance.
 */
int vbx_sp_getfree()
{
	vbx_mxp_t *mxp = VBX_GET_THIS_MXP();
	return mxp ? (int)(mxp->scratchpad_end - mxp->sp) : 0;
}
Exemplo n.º 9
0
Arquivo: test.c Projeto: 8l/mxp
/*
 * Randomised stress test for vbw_vec_copy_ext on possibly overlapping
 * source/destination ranges inside one shared buffer ten times the size of
 * the scratchpad.
 *
 * Each iteration refills the buffer with a known pattern, copies NN elements
 * from offset N2 to offset N1, and checks the head, the copied window and the
 * tail with verify_copy().
 *
 * Returns the accumulated error count, or 1 if the buffer cannot be allocated.
 */
int deep_vector_copy_ext_test()
{
	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	int retval;
	int num_test;
	int total_errors = 0;
	const int NUM_TESTS = TEST_DEEP_MM_NUM_TESTS;
	int NB = this_mxp->scratchpad_size * 10;
	int NT = NB / sizeof(vbx_mm_t);

	vbx_mm_t *v = vbx_shared_malloc( NB );
	if( !v ) {
		printf("ERROR: failed to allocate %d bytes for the deep copy test\n", NB);
		return 1;
	}

	srand( 0x1a84c92a );

	int i;

	for( num_test=0; num_test < NUM_TESTS ; num_test++ ) {

		//	initialize the whole working space
		for( i=0; i<NT; i++ ) {
			v[i] = i & MSK;
		}

		// choose random src/dest/length:
		// -- randomly pick the dest
		// -- set a window size of 2*K around the dest
		// -- randomly pick the src within the window
		// -- randomly pick the length, subject to end-of-scratchpad
		// -- this 'window' rule increases probability of overlaps
		// -- rough distribution: 30% short (pipeline) overlaps, 20% long overlaps, 50% no overlap

		int K, N1, N2, NN;
		N1 = rand() % NT;
		K  = 1 + rand() % ((N1 > 0)? min(min(N1, NT-N1), 1024): min(NT, 1024));
		N2 = N1 - K + rand() % (2*K);
		// With N1 == 0 the window reaches below the start of the buffer;
		// clamp so the source never points in front of v.
		if( N2 < 0 ) {
			N2 = 0;
		}
		NN = rand() % (NT - max(N1,N2));
		vbx_mm_t *dst = v + N1;
		vbx_mm_t *src = v + N2;
		// N2 is the source offset and N1 the destination offset
		// (the labels used to be printed the wrong way round).
		printf("test:%d src:0x%08x dst:0x%08x len:%08d", num_test, (unsigned)N2, (unsigned)N1, NN );

		// do the copy
		retval = VBX_T(vbw_vec_copy_ext)( dst, src, NN );
		vbx_sync();
		printf(" retval:0x%04x\n",retval);

		// ensure the copy was done properly
		int errors = verify_copy(v,     0,    N1,       0, "head")
		           + verify_copy(v,    N1, NN+N1, (N2-N1), "copy")
		           + verify_copy(v, NN+N1,    NT,       0, "tail");
		// Keep going after a failure so every mismatch is counted.
		total_errors += errors;
	}

	vbx_shared_free( v );

	return total_errors;
}
Exemplo n.º 10
0
/*
 * Release every scratchpad allocation, complaining loudly when the MXP has
 * not been set up.
 *
 * LINE, FNAME : call-site location, forwarded to VBX_FATAL.
 */
void vbx_sp_free_debug( int LINE, const char *FNAME )
{
	vbx_mxp_t *mxp = VBX_GET_THIS_MXP();

	if( mxp )  {
		// Rewind the allocator and forget any pushed scratchpad pointers.
		mxp->spstack_top = 0;
		mxp->sp = mxp->scratchpad_addr;
		return;
	}

	VBX_PRINTF( "ERROR: failed to call _vbx_init().\n" );
	VBX_FATAL(LINE,FNAME,-1);
}
Exemplo n.º 11
0
Arquivo: test.c Projeto: 8l/mxp
/*
 * Vector power test: runs test_scalar_power and test_vector_power on the same
 * inputs and compares the results.
 *
 * Returns 1 if any buffer cannot be allocated, otherwise 0; the error count
 * is reported through VBX_TEST_END.
 */
int main(void)
{
	vbx_test_init();

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;


	int N = VBX_SCRATCHPAD_SIZE/sizeof(vbx_word_t)/12;
	// The scratchpad-derived length is deliberately overridden with a fixed size.
	N=1024;
	int PRINT_LENGTH = min(N, MAX_PRINT_LENGTH);

	double scalar_time, vector_time;
	int errors=0;

	vbx_mxp_print_params();
	printf("\nVector power test...\n");
	printf("Vector length: %d\n", N);

	vbx_word_t *scalar_in1 = malloc( N*sizeof(vbx_word_t) );
	vbx_word_t *scalar_in2 = malloc( N*sizeof(vbx_word_t) );
	vbx_word_t *scalar_out = malloc( N*sizeof(vbx_word_t) );

	vbx_word_t *vector_in1 = vbx_shared_malloc( N*sizeof(vbx_word_t) );
	vbx_word_t *vector_in2 = vbx_shared_malloc( N*sizeof(vbx_word_t) );
	vbx_word_t *vector_out = vbx_shared_malloc( N*sizeof(vbx_word_t) );

	// Every buffer is written below, so every one of them has to be valid
	// (previously only vector_out was checked).
	if( scalar_in1==NULL || scalar_in2==NULL || scalar_out==NULL ||
	    vector_in1==NULL || vector_in2==NULL || vector_out==NULL ){
		printf("malloc_failed\n");
		return 1;
	}

	test_zero_array_word( scalar_out, N );
	test_zero_array_word( vector_out, N );

	test_init_array_word( scalar_in1, N, 5 );
	test_copy_array_word( vector_in1, scalar_in1, N );
	test_init_array_word( scalar_in2, N, 112 );
	test_copy_array_word( vector_in2, scalar_in2, N );

	test_print_array_word( scalar_in1, PRINT_LENGTH );
	test_print_array_word( scalar_in2, PRINT_LENGTH );

	scalar_time = test_scalar_power( scalar_out, scalar_in1, scalar_in2, N);
	test_print_array_word( scalar_out, PRINT_LENGTH );

	vector_time = test_vector_power( vector_out, vector_in1, vector_in2, N, scalar_time );
	test_print_array_word( vector_out, PRINT_LENGTH );
	(void)vector_time;
	errors += test_verify_array_word( scalar_out, vector_out, N );

	// Release the buffers before reporting, in case VBX_TEST_END does not return.
	vbx_shared_free( vector_out );
	vbx_shared_free( vector_in2 );
	vbx_shared_free( vector_in1 );
	free( scalar_out );
	free( scalar_in2 );
	free( scalar_in1 );

	VBX_TEST_END(errors);
	return 0;
}
Exemplo n.º 12
0
/*
 * Double the capacity of the scratchpad-pointer stack (spstack).
 *
 * The new block is only installed, and spstack_max only updated, once
 * realloc has succeeded, so a failed attempt leaves the existing stack and
 * its recorded capacity intact. Failure is reported through VBX_FATAL.
 */
void vbx_sp_push_realloc(){
	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	//double the stack space
	size_t new_max = (size_t)this_mxp->spstack_max * 2;
	size_t spstack_size = new_max*sizeof(void*);

	printf("realloc sp_stack %d\n",(int)new_max);

	// Never assign realloc's result straight to the only pointer we hold:
	// on failure it returns NULL and the original block would be lost.
	void **grown = (void**)realloc((void*)this_mxp->spstack,spstack_size);
	if ( !grown ) {
		VBX_PRINTF("ERROR: Failed to malloc %d bytes for spstack.\n", (int)spstack_size);
		VBX_FATAL(__LINE__, __FILE__, -1);
		return;
	}

	this_mxp->spstack     = grown;
	this_mxp->spstack_max = new_max;
}
Exemplo n.º 13
0
/*
 * Move the scratchpad pointer to new_sp.
 *
 * With VBX_DEBUG_LEVEL enabled the work is handed to vbx_sp_set_debug(),
 * which also reports illegal addresses. Otherwise the pointer is updated only
 * when new_sp lies within [scratchpad_addr, scratchpad_end] and is 4-byte
 * aligned; an invalid request is ignored silently.
 */
void vbx_sp_set_nodebug( vbx_void_t *new_sp )
{
	if( VBX_DEBUG_LEVEL ) {
		// print pretty error messages
		vbx_sp_set_debug( __LINE__, __FILE__, new_sp );
		// The debug variant has already validated and applied the change;
		// return here (as vbx_sp_malloc_nodebug does) instead of repeating it.
		return;
	}

	// do it, but do not print pretty error messages
	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	if( this_mxp
	           && (this_mxp->scratchpad_addr <= new_sp && new_sp <= this_mxp->scratchpad_end)
	           && VBX_IS_ALIGNED(new_sp, 4) ) {
		this_mxp->sp = new_sp;
	}
}
Exemplo n.º 14
0
/*
 * Move the scratchpad pointer to new_sp, reporting every problem.
 *
 * LINE, FNAME : call-site location, forwarded to VBX_FATAL.
 * new_sp      : must lie within [scratchpad_addr, scratchpad_end] and be
 *               4-byte aligned, otherwise an error is printed and VBX_FATAL
 *               is invoked.
 */
void vbx_sp_set_debug( int LINE, const char *FNAME, vbx_void_t *new_sp )
{
	vbx_mxp_t *mxp = VBX_GET_THIS_MXP();

	// No MXP instance: nothing can be set.
	if( !mxp )  {
		VBX_PRINTF( "ERROR: failed to call _vbx_init().\n" );
		VBX_FATAL(LINE,FNAME,-1);
		return;
	}

	// Accept the address only if it is inside the scratchpad and aligned.
	if( (mxp->scratchpad_addr <= new_sp && new_sp <= mxp->scratchpad_end)
	           && VBX_IS_ALIGNED(new_sp, 4) ) {
		mxp->sp = new_sp;
		return;
	}

	VBX_PRINTF( "ERROR: attempt to set scratchpad to illegal or unaligned address 0x%08lx.\n", (long int)new_sp );
	VBX_FATAL(LINE,FNAME,-1);
}
Exemplo n.º 15
0
Arquivo: test.c Projeto: 8l/mxp
/*
 * Benchmark vbw_vec_reverse against a simple vl=1 2D-move reversal for a
 * range of vector lengths, printing the average cycle count of each.
 *
 * Lengths whose source and destination vectors do not both fit in the
 * scratchpad are skipped. Exits through VBX_EXIT(-1) if a scratchpad
 * allocation fails. Always returns 0.
 *
 * NOTE(review): the results of the verify_vector calls are not inspected
 * here, so the closing "All tests passed" message is unconditional - confirm
 * whether verify_vector aborts on mismatch.
 */
int VBX_T(vbw_vec_reverse_test)()
{
	static const unsigned int aN[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 20, 25, 31, 32, 33, 35, 40, 48, 60, 61, 62, 63, 64, 64, 65,
	                      66, 67, 68, 70, 80, 90, 99, 100, 101, 110, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 224, 224,
	                      256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 400, 450, 512, 550, 600, 650, 700, 768, 768, 900,
	                      900, 1023, 1024, 1200, 1400, 1600, 1800, 2048, 2048, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800,
	                      2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4096, 4096, 4100, 4200, 4300,
	                      4400, 4500, 4600, 4700, 4800, 4900, 5000, 6000, 7000, 8000, 8192, 8192, 9000, 10000, 11000, 12000,
	                      13000, 14000, 15000, 16000, 16384, 16384, 20000, 25000, 30000, 32767, 32768, 32768, 35000, 40000,
	                      45000, 50000, 55000, 60000, 65000, 65535, 65536, 65536 };

	int retval = 0;
	unsigned int N;
	unsigned int NBYTES;
	unsigned int NREPS = 100;
	unsigned int i,k;

	vbx_timestamp_t start=0,finish=0;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const unsigned int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;

	// Derive the element count from the array itself rather than assuming
	// that unsigned int is 4 bytes wide.
	for( i=0; i<sizeof(aN)/sizeof(aN[0]); i++ ) {
		N = aN[i];
		//printf( "testing with vector size %d\n", N );

		NBYTES = sizeof(vbx_sp_t)*N;
		if( 2*NBYTES > VBX_SCRATCHPAD_SIZE ) continue;

		vbx_sp_t *vsrc = vbx_sp_malloc( NBYTES );
		vbx_sp_t *vdst = vbx_sp_malloc( NBYTES );
		//printf("bytes alloc: %d\n", NBYTES );

		if( !vsrc ) VBX_EXIT(-1);
		if( !vdst ) VBX_EXIT(-1);

		// Keep the enumerated source values non-negative for the element type.
		#if   ( VBX_TEMPLATE_T == BYTESIZE_DEF || VBX_TEMPLATE_T == UBYTESIZE_DEF )
			unsigned int mask = 0x007F;
		#elif ( VBX_TEMPLATE_T == HALFSIZE_DEF || VBX_TEMPLATE_T == UHALFSIZE_DEF )
			unsigned int mask = 0x7FFF;
		#else
			unsigned int mask = 0xFFFF;
		#endif

		vbx_set_vl( N );
		vbx( SV(T), VMOV, vdst,   -1, 0 );       // Fill the destination vector with -1
		vbx( SE(T), VAND, vsrc, mask, 0 );       // Fill the source vector with enumerated values
		//VBX_T(print_vector)( "vsrcInit", vsrc, N );
		//VBX_T(print_vector)( "vdstInit", vdst, N );

		/** measure performance of function call **/
		vbx_sync();
		start = vbx_timestamp();
		for(k=0; k<NREPS; k++ ) {
			retval = VBX_T(vbw_vec_reverse)( vdst, vsrc, N );
			vbx_sync();
		}
		finish = vbx_timestamp();
		printf( "length %u (%s):\tvbware sp f():\t%llu", N, VBX_EXPAND_AND_QUOTE(BYTEHALFWORD), (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) );
		//VBX_T(print_vector)( "vsrcPost", vsrc, N );
		//VBX_T(print_vector)( "vdstPost", vdst, N );

		#if VERIFY_VBWARE_ALGORITHM
			VBX_T(verify_vector)( vsrc, vdst, N );
		#else
			printf(" [VERIFY OFF]");
		#endif

		printf("\treturn value: %X", (unsigned int)retval);

		vbx_set_vl( N );
		vbx( SE(T), VAND, vsrc, mask, 0 );       // Reset the source vector

		/** measure performance of simple algorithm **/
		vbx_sync();
		vbx_set_vl( 1 );
		// One element per row; the destination walks backwards through vdst.
		vbx_set_2D( N, -sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );

		start = vbx_timestamp();
		for(k=0; k<NREPS; k++ ) {
			vbx_2D( VV(T), VMOV, vdst+N-1, vsrc, 0 );
			vbx_sync();
		}
		finish = vbx_timestamp();

		printf( "\tsimple (vl=1):\t%llu", (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) );

		#if VERIFY_SIMPLE_ALGORITHM
			VBX_T(verify_vector)( vsrc, vdst, N );
		#else
			printf(" [VERIFY OFF]");
		#endif
			printf("\tcycles\n");

		vbx_sp_free();
	}

	vbx_sp_free();
	printf("All tests passed successfully.\n");

	return 0;
}
Exemplo n.º 16
0
/*
 * Current scratchpad pointer, or NULL when there is no MXP instance.
 */
vbx_void_t *vbx_sp_get()
{
	vbx_mxp_t *mxp = VBX_GET_THIS_MXP();
	if( !mxp ) {
		return NULL;
	}
	return mxp->sp;
}
Exemplo n.º 17
0
/*
 * Matrix multiply test: multiplies an N x M by an M x N word matrix on the
 * host (test_scalar) and with three vector implementations (test_vector,
 * test_vector_trans, test_vector_sp), comparing each vector result against
 * the scalar one.
 *
 * Always returns 0; the error count is reported through VBX_TEST_END.
 */
int main(void)
{
	vbx_test_init();
	typedef vbx_word_t vbx_mm_t;
	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
	int N = VBX_SCRATCHPAD_SIZE / sizeof(vbx_mm_t );
	// The scratchpad-derived size is deliberately overridden with a fixed one.
	N = 20;
	int M = 20;

	int PRINT_LENGTH =  N<MAX_PRINT_LENGTH ? N : MAX_PRINT_LENGTH ;
	//	int PRINT_ROWS = PRINT_LENGTH;
	// Clamp M itself; the original selected N here, which is only correct
	// while N == M.
	int PRINT_ROWS = M<MAX_PRINT_LENGTH ? M : MAX_PRINT_LENGTH;
	int PRINT_COLS = PRINT_LENGTH;

	double scalar_time, vector_time,vector2_time;
	int errors=0;

	vbx_mxp_print_params();
	printf( "\nMatrix multiply test...\n" );
	printf( "Matrix dimensions: %d,%d\n", N, M );


	vbx_mm_t  *scalar_in1 = (vbx_mm_t*)malloc( M*N*sizeof(vbx_mm_t ) );
	vbx_mm_t  *scalar_in2 = (vbx_mm_t*)malloc( M*N*sizeof(vbx_mm_t ) );
	vbx_mm_t  *scalar_out = (vbx_mm_t*)malloc( N*N*sizeof(vbx_mm_t ) );
	vbx_mm_t  *vector_in1 = (vbx_mm_t*)vbx_shared_malloc( M*N*sizeof(vbx_mm_t ) );
	vbx_mm_t  *vector_in2 = (vbx_mm_t*)vbx_shared_malloc( M*N*sizeof(vbx_mm_t ) );
	vbx_mm_t  *vector_out = (vbx_mm_t*)vbx_shared_malloc( N*N*sizeof(vbx_mm_t ) );
	if ( scalar_in1 == NULL ||
	     scalar_in2 == NULL ||
	     scalar_out == NULL ||
	     vector_in1 == NULL ||
	     vector_in2 == NULL ||
	     vector_out == NULL ){
		printf("Malloc failed\n");
		VBX_TEST_END(1);
		return 0;
	}



	test_zero_array_word(scalar_out, N*N );
	test_zero_array_word(vector_out, N*N );

	test_init_array_word( scalar_in1, M*N, 1 );
	test_copy_array_word( vector_in1, scalar_in1, M*N );
	test_init_array_word( scalar_in2, M*N, 999 );
	//scalar_mtx_xp_MN_word( vector_in2, scalar_in2, N, N );
	test_copy_array_word( vector_in2, scalar_in2, M*N );

	test_print_matrix_word( scalar_in1, PRINT_COLS, PRINT_ROWS, M );
	test_print_matrix_word( scalar_in2, PRINT_ROWS, PRINT_COLS, N );

	//change print sizes for outputs
	PRINT_ROWS=PRINT_COLS=N<PRINT_LENGTH?N:PRINT_LENGTH;

	// Reference result.
	scalar_time = test_scalar( scalar_out, scalar_in1, N, M, scalar_in2, M, N);
	test_print_matrix_word( scalar_out, PRINT_COLS, PRINT_ROWS, N );


	// Each vector variant overwrites vector_out and is verified in turn.
	vector_time = test_vector( vector_out, vector_in1, N, M, vector_in2, M, N, scalar_time );
	test_print_matrix_word( vector_out, PRINT_COLS, PRINT_ROWS, N );
	errors += test_verify_array_word( scalar_out, vector_out, N*N);

	vector2_time = test_vector_trans( vector_out, vector_in1, N, M, vector_in2, M, N, scalar_time );
	test_print_matrix_word( vector_out, PRINT_COLS, PRINT_ROWS, N );
	errors += test_verify_array_word( scalar_out, vector_out, N*N);

	vector2_time = test_vector_sp( vector_out, vector_in1, N, M, vector_in2, M, N, scalar_time );
	test_print_matrix_word( vector_out, PRINT_COLS, PRINT_ROWS, N );
	errors += test_verify_array_word( scalar_out, vector_out, N*N);

	vbx_shared_free(vector_out);
	vbx_shared_free(vector_in2);
	vbx_shared_free(vector_in1);
	free(scalar_out);
	free(scalar_in2);
	free(scalar_in1);

	//errors += orig_test();

	VBX_TEST_END(errors);
	return 0;
}
Exemplo n.º 18
0
// Vector (MXP) version of the image blend.
//
// Blends img_in1 and img_in2 into img_out over num_row*num_column pixels.
// Per chunk the vector engine computes
//     v_temp = blending_const * in1
//     v_img1 = blending_const_bar * in2      (blending_const_bar = 256 - blending_const)
//     v_img1 = (v_img1 + v_temp) >> 8
// The image is streamed through the scratchpad in chunks using two buffer
// sets: while one chunk is being computed, the previous result is written
// back and the next chunk is fetched.
//
// Exits through VBX_EXIT(0xBADDEAD) if the scratchpad buffers cannot be
// allocated. Returns immediately for an empty image.
void vector_blend(
    output_pointer img_out, input_pointer img_in1, input_pointer img_in2,
    unsigned int num_row, unsigned int num_column, intermediate_type blending_const )
{
    intermediate_type *v_img1[2];
    input_type        *v_img2[2];
    intermediate_type *v_temp;

    intermediate_type blending_const_bar = 256-blending_const;
    const unsigned int num_pixels = num_row*num_column;
    unsigned int j;

    // Nothing to blend; the write-back after the loop would otherwise
    // address memory in front of img_out.
    if( num_pixels == 0 ) {
        return;
    }

    vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
    const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
    const int VBX_WIDTH_BYTES     = this_mxp->vector_lanes * sizeof(int);
    const int VBX_DMA_ALIGNMENT   = this_mxp->dma_alignment_bytes;

    // Split the scratchpad between three intermediate buffers and two input buffers.
    unsigned int chunk_size = VBX_SCRATCHPAD_SIZE/((3*sizeof(intermediate_type))+(2*sizeof(input_type)));
    chunk_size = VBX_PAD_UP( chunk_size-(VBX_WIDTH_BYTES-1), VBX_DMA_ALIGNMENT );

    // An image smaller than one chunk must not be read or written past its end.
    if( chunk_size > num_pixels ) {
        chunk_size = num_pixels;
    }

    // The "_old" values keep the full chunk size: they are the loop stride and
    // the size of every write-back except the last.
    unsigned int chunk_size_old    = chunk_size;
    unsigned int vector_length     = chunk_size;
    unsigned int vector_length_old = vector_length;

    v_img1[0] = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) );
    v_img1[1] = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) );
    v_img2[0] = (input_type        *)vbx_sp_malloc( chunk_size*sizeof(input_type) );
    v_img2[1] = (input_type        *)vbx_sp_malloc( chunk_size*sizeof(input_type) );
    v_temp    = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) );

    if( v_temp == NULL ) {
        VBX_EXIT(0xBADDEAD);
    }

    int bufselect = 0;

    // Prime the first buffer set.
    vbx_dma_to_vector( v_img1[bufselect], img_in1, chunk_size*sizeof(input_type) );
    vbx_dma_to_vector( v_img2[bufselect], img_in2, chunk_size*sizeof(input_type) );

    for( j=0; j<num_pixels; j+=vector_length_old ) {
        vbx_set_vl(vector_length);

        // Write back the result of the previous (always full) chunk.
        if( j > 0 ) {
            vbx_dma_to_host( img_out+j-vector_length_old, v_img1[1-bufselect], chunk_size_old*sizeof(output_type) );
        }

        // Prefetch the next chunk if any pixel remains after this one.
        // (The original compared against num_pixels-1, which skipped the fetch
        // when exactly one pixel was left and then processed stale data.)
        if( (j+vector_length_old) < num_pixels ) {
            if( (j+vector_length_old*2) >= num_pixels ) {
                // The next chunk is the last one: shrink it to what is left.
                vector_length =  num_pixels - j - vector_length_old;
                chunk_size = vector_length;
            }
            vbx_dma_to_vector( v_img1[1-bufselect], img_in1+j+vector_length_old, chunk_size*sizeof(input_type) );
            vbx_dma_to_vector( v_img2[1-bufselect], img_in2+j+vector_length_old, chunk_size*sizeof(input_type) );
        }

        vbx( SVBHU, VMULLO, v_temp,            blending_const,     v_img1[bufselect] );
        vbx( SVBHU, VMULLO, v_img1[bufselect], blending_const_bar, v_img2[bufselect] );
        vbx( VVHU,  VADD,   v_img1[bufselect], v_img1[bufselect],  v_temp );
        vbx( SVHBU, VSHR,   v_img1[bufselect], 8,                  v_img1[bufselect] );

        bufselect = 1-bufselect;
    }

    // Write back the final (possibly shortened) chunk, then wait for all
    // outstanding work before handing the scratchpad back.
    vbx_dma_to_host( img_out+j-vector_length_old, v_img1[1-bufselect], chunk_size*sizeof(output_type) );
    vbx_sync();
    vbx_sp_free();
}
Exemplo n.º 19
0
Arquivo: test.c Projeto: 8l/mxp
int compare_vbx_lut_to_vbx_lut_ci(int stage, int max_print_errors)
{
	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	int vci_lanes = this_mxp->vcustom0_lanes;
    int sz = this_mxp->scratchpad_size/(16*sizeof(vbx_ubyte_t));

    vbx_byte_t* v_pass = (vbx_byte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_pattern = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_lutc = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_group = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_sel = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_lut = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t));
    vbx_ubyte_t* v_idx = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t));
    if(v_idx == NULL) {
        printf("failed to allocate in compare_vbx_lut_to_vbx_lut_ci\n");
    }

    unsigned char* lut = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char));
    unsigned char* lut_c = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char));

    int f, n, s, errors = 0;
    for (n = 0; n < sz; n++) {
        v_pattern[n] = (n & 0xff);
    }

    for (f = 0; f < face_lbp[stage].count; f++) {
        lbp_feat_t feat = face_lbp[stage].feats[f];

        vbx_set_vl(sz);
        int total = f;
        s = 0;
        while(s < stage){
            total += face_lbp[s].count;
            s++;
        }

        if(total < 256) {
            vbx(SVBU, VLBPLUT, v_lutc, total, v_pattern);
        } else {
            vbx(SVBS, VLBPLUT, v_lutc, total-256, v_pattern);
        }

        vbx(SVB, VMOV, v_pass, feat.fail, 0);
        /* check if pattern is in lut */
        vbx(SVBU, VSHR, v_group, 5, v_pattern);
        for (n = 0; n < 8; n++) {
            vbx(SVB, VADD, v_sel, -n, v_group);
            vbx(SVBW, VCMV_Z, v_lut, feat.lut[n], v_sel);
        }

        vbx(SVBWU, VAND, v_idx, 0x1f, v_pattern);
        vbx(VVWB, VSHR, v_lut, v_idx, v_lut);
        vbx(SVB, VAND, v_lut, 1, v_lut);
        vbx(SVB, VCMV_LEZ, v_pass, feat.pass, v_lut);

        vbx_dma_to_host(lut_c, v_lutc, sz*sizeof(unsigned char));
        vbx_dma_to_host(lut, v_pass, sz*sizeof(unsigned char));
        vbx_sync();

        errors += match_array_byte(lut, lut_c, "custom_lut", sz, 1, 0, max_print_errors, 0, 0);

    }
    vbx_sp_free();
    vbx_shared_free(lut);
    vbx_shared_free(lut_c);
    return errors;
}
Exemplo n.º 20
0
Arquivo: test.c Projeto: 8l/mxp
/*
 * Vector copy test: copies N elements on the host (test_scalar) and through
 * the scratchpad (test_vector) and compares the two outputs, then optionally
 * runs the deep scratchpad / main-memory copy tests.
 *
 * Always returns 0; the error count is reported through VBX_TEST_END.
 */
int main(void)
{
	vbx_test_init();

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
	const int required_vectors = 4;

	int N = VBX_PAD_DN(VBX_SCRATCHPAD_SIZE / sizeof(vbx_mm_t) / required_vectors, this_mxp->scratchpad_alignment_bytes);

	int PRINT_LENGTH = min( N, MAX_PRINT_LENGTH );

	double scalar_time, vector_time;
	int errors=0;

	vbx_mxp_print_params();
	printf( "\nVector copy test...\n" );
	printf( "Vector length: %d\n", N );

	vbx_mm_t *scalar_in  = malloc( N*sizeof(vbx_mm_t) );
	vbx_mm_t *scalar_out = malloc( N*sizeof(vbx_mm_t) );

	vbx_mm_t *vector_in  = vbx_shared_malloc( N*sizeof(vbx_mm_t) );
	vbx_mm_t *vector_out = vbx_shared_malloc( N*sizeof(vbx_mm_t) );

	vbx_sp_t *v_out = vbx_sp_malloc( N*sizeof(vbx_sp_t) );
	vbx_sp_t *v_in = vbx_sp_malloc( N*sizeof(vbx_sp_t) );

	if( !scalar_in || !scalar_out || !vector_in || !vector_out || !v_out || !v_in ) {
		printf( "Malloc failed\n" );
		VBX_TEST_END(1);
		return 0;
	}

	// Clear the OUTPUT buffers so stale data can never pass verification.
	// (The inputs were being zeroed here, only to be re-initialised below.)
	VBX_T(test_zero_array)( scalar_out, N );
	VBX_T(test_zero_array)( vector_out, N );

	VBX_T(test_init_array)( scalar_in, N, 1 );
	VBX_T(test_copy_array)( vector_in, scalar_in, N );

	scalar_time = test_scalar( scalar_out, scalar_in, N );
	VBX_T(test_print_array)( scalar_out, PRINT_LENGTH );

	vbx_dma_to_vector( v_in, vector_in, N*sizeof(vbx_sp_t) );
	vector_time = test_vector( v_out, v_in, N, scalar_time );
	vbx_dma_to_host(vector_out, v_out, N*sizeof(vbx_sp_t) );
	vbx_sync();
	VBX_T(test_print_array)( vector_out, PRINT_LENGTH );
	(void)vector_time;

	errors += VBX_T(test_verify_array)( scalar_out, vector_out, N );

	vbx_sp_free();

	// The basic test is finished; its buffers are not used by the deep tests.
	vbx_shared_free( vector_out );
	vbx_shared_free( vector_in );
	free( scalar_out );
	free( scalar_in );

#if TEST_DEEP_SP
	errors += deep_vector_copy_test();
#endif

#if DEBUG_MAKE_SP_FULL
	vbx_sp_malloc(vbx_sp_getfree());
#endif

#if TEST_DEEP_MM
	errors += deep_vector_copy_ext_test();
#endif

	VBX_TEST_END(errors);

	return 0;
}
Exemplo n.º 21
0
/*
 * Tiled forward-DCT benchmark: runs the scalar and the MXP implementation
 * over a randomly generated image, prints timing for both and compares the
 * last computed tile of each.
 *
 * Always returns 0; the mismatch count is reported through VBX_TEST_END.
 */
int main_tile()
{
	int i, j, k, l, base, block_num;
	int x, y;

	// Timestamps keep their native type; they used to be truncated into int.
	vbx_timestamp_t time_start, time_stop;
	unsigned int cycles;
	double vbx_time, scalar_time;
	int wrong;

	int total_errors = 0;

	printf("\nGenerating initial data...\n");

	dt *image  = (dt *) malloc( IMAGE_WIDTH * IMAGE_HEIGHT * sizeof(dt) );

	// Allocate memory to store results.
	// Results are computed BIGTILE_SIZE halfwords at a time.
	const int BIGTILE_SIZE = NUM_TILE_X * NUM_TILE_Y * DCT_SIZE;
	dt *block_s =                   malloc( BIGTILE_SIZE * sizeof(dt) );
	dt *block_v = (dt *) vbx_shared_malloc( BIGTILE_SIZE * sizeof(dt) );
	dt *coeff_v = (dt *) vbx_shared_malloc( BIGTILE_SIZE * sizeof(dt) );

	if( !image || !block_s || !block_v || !coeff_v ) {
		printf("Malloc failed\n");
		free(image);
		free(block_s);
		if( block_v ) vbx_shared_free(block_v);
		if( coeff_v ) vbx_shared_free(coeff_v);
		VBX_TEST_END(1);
		return (0);
	}

	GenerateRandomImage( image, IMAGE_WIDTH, IMAGE_HEIGHT, 0/*seed*/ );

	//Make an uncached 1D version of the coeff matrix
	for (i = 0; i < NUM_TILE_Y; i++) {             // row
		for (j = 0; j < BLOCK_SIZE; j++) {         // row
			for (k = 0; k < NUM_TILE_X; k++) {     // col
				for (l = 0; l < BLOCK_SIZE; l++) { // col
					coeff_v[i*NUM_TILE_X*DCT_SIZE + j*DCT_SIZE + k*BLOCK_SIZE + l] = cs[j][l];
				}
			}
		}
	}

	// Initialise the vector DCT only now that the coefficient table and the
	// image exist. The call used to sit at the top of the function, before the
	// locals coeff_v and image were even declared.
	// NOTE(review): if file-scope objects with these names exist and were the
	// intended arguments, confirm this ordering.
	vbx_mtx_fdct_t *v = vbx_mtx_fdct_init( coeff_v, image );
	vbx_timestamp_start();

#ifdef DEBUG
	// NOTE(review): block_s has not been written yet at this point, although
	// the label says "input matrix" - confirm what should be printed.
	printf("input matrix is:\n");
	for (i = 0; i < BLOCK_SIZE; i++) {
		base = i * BLOCK_SIZE;
		for (j = 0; j < BLOCK_SIZE; j++) {
			printf("%d ", (int) block_s[base + j]);
		}
		printf("\n");
	}
#endif

	printf("\nRunning DCT...\n");

	// Scalar reference run.
	time_start = vbx_timestamp();
	for( y = 0; y < IMG_DOWN; y++ ) {
		for( x = 0; x < IMG_ACROSS; x++ ) {
			vbx_mtx_fdct_scalar( block_s, (dt*)cs, image, x/*start_x*/, y/*start_y*/, NUM_TILE_X, NUM_TILE_Y );
		}
	}
	time_stop = vbx_timestamp();

	cycles = (unsigned int)(time_stop - time_start);
	scalar_time = (double) cycles;
	scalar_time /= (double) vbx_timestamp_freq();
	scalar_time *= 1000.0;		//ms
	vbx_timestamp_t mxp_cycles = vbx_mxp_cycles(cycles);

	printf("%dx%d Block Size\n", BLOCK_SIZE, BLOCK_SIZE);
	printf("Finished, scalar CPU took %0.3f ms \n", scalar_time);
	printf(" CPU Cycles: %d\n", (int) mxp_cycles);
	printf(" CPU Cycles per block: %f\n", mxp_cycles / ((double) (NUM_BLOCKS)));

	vbx_sync(); // wait for image to be prefetched

	// Vector run.
	time_start = vbx_timestamp();
	for( y = 0; y < IMG_DOWN; y++ ) {
		for( x = 0; x < IMG_ACROSS; x++ ) {
			vbx_mtx_fdct( v, block_v, image, x/*start_x*/, y/*start_y*/, IMG_ACROSS-1,IMG_DOWN-1,NUM_TILE_X, NUM_TILE_Y );
		}
	}
	time_stop = vbx_timestamp();

	cycles = (unsigned int)(time_stop - time_start);
	vbx_time = (double) cycles;
	vbx_time /= (double) vbx_timestamp_freq();
	vbx_time *= 1000.0;			//ms
	mxp_cycles = vbx_mxp_cycles(cycles);

	printf("Finished, MXP took %0.3f ms \n", vbx_time);
	printf(" CPU Cycles: %d\n", (int) mxp_cycles);
	printf(" CPU Cycles per block: %f\n", mxp_cycles / ((double) (NUM_BLOCKS)));
	printf(" Speedup: %f\n", scalar_time / vbx_time);

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	double vbx_mbps = (double) (NUM_BLOCKS) * 1000 / vbx_time;	// blocks per second
	printf("V%d@%dMHz: %dx%d tile, %dx%d blocks, %f blocks/s, %f megapixel/s\n",
	       this_mxp->vector_lanes, this_mxp->core_freq / 1000000,
	       NUM_TILE_Y, NUM_TILE_X,
	       BLOCK_SIZE, BLOCK_SIZE,
	       vbx_mbps, (vbx_mbps * DCT_SIZE) / 1000000);

	printf("\nChecking results...\n");

	// Element-wise comparison of the scalar and vector outputs; only the
	// first few mismatches are printed.
	wrong = 0;
	for (block_num = 0; block_num < NUM_BLOCKS; block_num++) {
		for (i = 0; i < BLOCK_SIZE; i++) {
			base = i * BLOCK_SIZE;
			for (j = 0; j < BLOCK_SIZE; j++) {
				if (block_s[block_num * DCT_SIZE + base + j] != block_v[block_num * DCT_SIZE + base + j]) {
					if (wrong < 5) {
						printf("\nError at %d [%d,%d], result is %d, should be %d\n",
							   block_num, i, j, (int) block_v[block_num * DCT_SIZE + base + j],
							   (int) block_s[block_num * DCT_SIZE + base + j]);
					}
					wrong++;
				}
			}
		}
	}

	printf("wrong is %d\n\n", wrong);
	total_errors += wrong;

	free(image);
	free(block_s);
	vbx_shared_free(block_v);
	vbx_shared_free(coeff_v);

	vbx_mtx_fdct_free( v );

	VBX_TEST_END(total_errors);

	return (0);
}
Exemplo n.º 22
0
/*
 * Reverse the N elements at src into dst (dst[N-1-i] = src[i]).
 *
 * Three strategies are used, chosen by size:
 *   - N < 4: a plain element-by-element loop on dst/src.
 *   - N <= MM_CACHED_SCALAR_THRESHOLD, or fewer than 5 scratchpad rows free:
 *     a scalar loop through vbx_remap_cached() views, followed by a flush of
 *     the destination.
 *   - otherwise: the data is streamed through the scratchpad in tiles that
 *     are reversed with the vector engine.
 *
 * Returns VBW_SUCCESS on every return path. Unless VBX_SKIP_ALL_CHECKS is
 * set, a failed scratchpad allocation ends in VBX_EXIT(-1).
 */
int vbw_vec_reverse_ext( vbx_mm_t *dst, vbx_mm_t *src, const unsigned int N )
{

	// Inside this function the scratchpad element type is the same as the
	// main-memory element type.
	typedef vbx_mm_t vbx_sp_t;
	// Flags derived from the element size: halfword-or-smaller / byte-sized.
	const int VBW_ROT16= sizeof(vbx_sp_t) <=sizeof(vbx_half_t);
	const int VBW_ROT8= sizeof(vbx_sp_t)== sizeof(vbx_byte_t);
	// log2 of the number of elements per word: 0 for words, 1 for halfwords, 2 for bytes.
	const int VBW_RSHIFT_T_TO_W= (sizeof(vbx_sp_t)==sizeof(vbx_word_t)? 0:
	                              sizeof(vbx_sp_t)==sizeof(vbx_half_t)? 1:/*byte_sized*/2);
	const int VBW_LSHIFT_W_TO_T= VBW_RSHIFT_T_TO_W;
	// Catch when N is very small
	if( N<4 ) {
		unsigned int i = 0;
		while(i<N) {
			dst[N-i-1]=src[i];
			i++;
		}
		return VBW_SUCCESS;
	}

	vbx_mxp_t *this_mxp          = VBX_GET_THIS_MXP();
	unsigned int SP_WIDTH_B      = this_mxp->scratchpad_alignment_bytes;
	unsigned int FREE_BYTES      = vbx_sp_getfree();


	// Catch when N is small enough that cached scalar does a better job
	// (also taken when fewer than five scratchpad rows are free)
	if( N <= MM_CACHED_SCALAR_THRESHOLD || FREE_BYTES < SP_WIDTH_B*5 ){
		unsigned int i;
		vbx_mm_t *A = (vbx_mm_t*)vbx_remap_cached(src,N*sizeof(vbx_mm_t));
		vbx_mm_t *B = (vbx_mm_t*)vbx_remap_cached(dst,N*sizeof(vbx_mm_t));
		for( i=0; i<N; i++ ) {
			B[N-i-1]=A[i];
		}
		vbx_dcache_flush(B,N*sizeof(vbx_mm_t));
		return VBW_SUCCESS;
	}

	unsigned int NUM_LANES   = this_mxp->vector_lanes;
	// Tile size: half of the free space after setting one row aside (used
	// for v_mask below), rounded down to whole rows. Expressed in bytes,
	// words and elements.
	unsigned int tile_size_b = VBX_PAD_DN(((FREE_BYTES-SP_WIDTH_B)/2),SP_WIDTH_B);
	unsigned int tile_size_w = tile_size_b/4;
	unsigned int tile_size_t = tile_size_w << VBW_LSHIFT_W_TO_T;


	unsigned int num_tiles = N / tile_size_t;
	unsigned int rows_per_tile = tile_size_b / SP_WIDTH_B;

	// Elements left over after the whole tiles.
	unsigned int tile_part_t = N - num_tiles * tile_size_t;
	// Tile size (in words) up to which the vec_rev_w / vec_rev_rot16_w path
	// is used instead of vec_rev_merge_w; depends on the lane count.
	unsigned int threshold_w = NUM_LANES >= 32 ? VL1_THRESHOLD_V32_UP :
		NUM_LANES == 16 ? VL1_THRESHOLD_V16    :
		NUM_LANES == 8  ? VL1_THRESHOLD_V8     : UINT_MAX;


	// The partial tile is taken from the END of src and, reversed, becomes
	// the START of dst.
	if(tile_part_t){
		vbx_sp_push();
		vbx_sp_t *v_0 = (vbx_sp_t *)vbx_sp_malloc(tile_part_t*sizeof(vbx_sp_t));
		vbx_sp_t *v_1 = (vbx_sp_t *)vbx_sp_malloc(tile_part_t*sizeof(vbx_sp_t));

#if !VBX_SKIP_ALL_CHECKS
		if( !v_0 || !v_1) {
			VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
			VBX_EXIT(-1);
		}
#endif

		vbx_dma_to_vector(v_0, src+N-tile_part_t, tile_part_t*sizeof(vbx_mm_t));
		vbw_vec_reverse(v_1, v_0, tile_part_t);
		vbx_dma_to_host(dst, v_1, tile_part_t*sizeof(vbx_sp_t));
		dst += tile_part_t;
		vbx_sp_pop();
	}

	// Fewer elements than one tile: the partial tile covered everything.
	if(!num_tiles) {
		return VBW_SUCCESS;
	}

	vbx_sp_push();
	// One row for the mask plus two tile-sized work buffers.
	vbx_word_t *v_mask = (vbx_word_t *)vbx_sp_malloc(SP_WIDTH_B);
	vbx_word_t *v_scratch[2] = { (vbx_word_t *)vbx_sp_malloc(tile_size_b), (vbx_word_t *)vbx_sp_malloc(tile_size_b) };
	vbx_word_t *result;

#if !VBX_SKIP_ALL_CHECKS
	if( !v_scratch[0] || !v_scratch[1] || !v_mask ) {
		VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
		VBX_EXIT(-1);
	}
#endif

	// Start at the last whole tile of src; src walks backwards while dst
	// walks forwards.
	src += (num_tiles - 1) * tile_size_t;

	if( tile_size_w <= threshold_w) {
		while( num_tiles ) {
			vbx_dma_to_vector( v_scratch[0], src, tile_size_b );
			if(VBW_ROT16){
				vec_rev_rot16_w(v_scratch[1], v_scratch[0], tile_size_w);
			}else{
				vec_rev_w(v_scratch[1], v_scratch[0], tile_size_w);
			}
			// Extra pass for byte-sized elements, over tile_size_w*2 halfwords.
			if( VBW_ROT8){
				vec_rot8_h( v_scratch[1], v_scratch[1], tile_size_w*2 );
			}
			vbx_dma_to_host( dst, v_scratch[1], tile_size_b );
			dst += tile_size_t;
			src -= tile_size_t;
			num_tiles--;
		}
	} else {
		while( num_tiles ) {
			vbx_dma_to_vector( v_scratch[0], src, tile_size_b );
			// vec_rev_merge_w returns the buffer that holds the reversed tile.
			result = vec_rev_merge_w( v_scratch[1], v_scratch[0], tile_size_w, v_scratch[0], v_mask, SP_WIDTH_B,
			                          rows_per_tile, VBW_ROT16 );
			if(VBW_ROT8){
				vec_rot8_h( result, result, tile_size_w*2 );
			}
			vbx_dma_to_host( dst, result, tile_size_b );
			dst += tile_size_t;
			src -= tile_size_t;
			num_tiles--;
		}
	}

	vbx_sp_pop();
	return VBW_SUCCESS;
}
Exemplo n.º 23
0
/*
 * Reverse the element order of the N-element vector v_src into v_dst.
 * Both pointers are handed directly to vector instructions and helpers
 * (vbxx_2D, vec_rev_w, vec_rev_merge_w, ...).
 *
 * Strategy, in the order tested below:
 *   1. N smaller than NUM_LANES << VBW_LSHIFT_W_TO_T: one 2D VMOV with vector
 *      length 1 whose destination starts at the last element.
 *   2. Word count N_w at or below a lane-count dependent threshold: a direct
 *      vec_rev_w / vec_rev_rot16_w call (plus a leading "stub" move for
 *      elements that do not fill a whole word).
 *   3. Otherwise: vec_rev_merge_w over whole rows of NUM_LANES words. While
 *      the free scratchpad is too small to hold a second buffer plus a mask
 *      row, pieces are processed using buffers carved out of v_dst itself;
 *      the remainder is done with buffers obtained from vbx_sp_malloc.
 *
 * Returns VBW_SUCCESS on every visible return path. Unless
 * VBX_SKIP_ALL_CHECKS is set, an allocation failure or an unexpected result
 * buffer from vec_rev_merge_w ends in VBX_EXIT(-1).
 */
int vbw_vec_reverse( vbx_sp_t *v_dst, vbx_sp_t *v_src, const unsigned int N )
{
	// Element-size dependent switches (vbx_sp_t is the element type)
	const int VBW_ROT16= sizeof(vbx_sp_t) <=sizeof(vbx_half_t);
	const int VBW_ROT8= sizeof(vbx_sp_t)== sizeof(vbx_byte_t);
	// log2(elements per word): 0 for words, 1 for halfwords, 2 for bytes
	const int VBW_RSHIFT_T_TO_W= (sizeof(vbx_sp_t)==sizeof(vbx_word_t)? 0:
	                              sizeof(vbx_sp_t)==sizeof(vbx_half_t)? 1:/*byte_sized*/2);
	const int VBW_LSHIFT_W_TO_T= VBW_RSHIFT_T_TO_W;

	vbx_mxp_t *this_mxp            = VBX_GET_THIS_MXP();
	const unsigned int NUM_LANES   = this_mxp->vector_lanes;

	//printf("\n%d\n",VBX_SKIP_ALL_CHECKS);

	// Can the whole vector fit in the scratchpad width?
	if( N < (NUM_LANES << VBW_LSHIFT_W_TO_T) ){
		// Element-by-element copy: destination starts at the last element.
		// NOTE(review): presumably vbx_set_2D takes (rows, dst stride, srcA
		// stride, srcB stride), making dst walk backwards - confirm.
		vbx_set_vl( 1 );
		vbx_set_2D( N, (int)-sizeof(vbx_sp_t), (int)sizeof(vbx_sp_t), 0 );
		vbxx_2D(VMOV, v_dst+N-1, v_src);
		return VBW_SUCCESS;
	}

	// Lane-count dependent limit (in words) below which the direct reversal is used
	unsigned int threshold_w = (NUM_LANES >= 32 ? VL1_THRESHOLD_V32_UP :
	                            NUM_LANES == 16 ? VL1_THRESHOLD_V16    :
	                            NUM_LANES == 8  ? VL1_THRESHOLD_V8     : UINT_MAX);

	unsigned int N_w          = N >> VBW_RSHIFT_T_TO_W;                  // Equivalent number of words in the vector

	if( N_w && N_w <= threshold_w ) {
		if( VBW_ROT16){
			// remainder of elements that can't add to a whole word
			unsigned int stub_t = N - (N_w << VBW_LSHIFT_W_TO_T);
			if( stub_t ) {
				// The last stub_t source elements go to the front of v_dst
				vbx_set_vl( 1 );
				vbx_set_2D( stub_t, (int)-sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
				vbxx_2D(VMOV, v_dst+stub_t-1, v_src+N-stub_t);
				v_dst += stub_t;
			}
			vec_rev_rot16_w(v_dst, v_src, N_w);
		}else{
			vec_rev_w(v_dst, v_src, N_w);
		}

		if( VBW_ROT8){
			vec_rot8_h(v_dst, v_dst, N_w*2);
		}
		return VBW_SUCCESS;
	}


	const unsigned int SP_WIDTH_B       = this_mxp->scratchpad_alignment_bytes;
	const unsigned int FREE_BYTES       = vbx_sp_getfree();
	// 1 when NUM_LANES has a bit set in an even bit position. Used below to
	// pick the order in which the two scratch buffers are passed to
	// vec_rev_merge_w, so that the result is expected to land in v_dst_w.
	const unsigned int ODD_LOG_SEL      = NUM_LANES & 0x55555555 ? 1 : 0;

	vbx_word_t *v_mask, *v_result;
	vbx_word_t *v_scratch[2] = {0,0};

	// Split the vector into whole rows of NUM_LANES words plus a tail
	unsigned int num_rows_w    = N_w / NUM_LANES;
	unsigned int working_set_w = num_rows_w * NUM_LANES;
	unsigned int tail_t        = N - (working_set_w << VBW_LSHIFT_W_TO_T);
	unsigned int remaining_w   = working_set_w;

	if( tail_t ) {
		// The last tail_t source elements go to the front of v_dst
		vbx_set_vl( 1 );
		vbx_set_2D( tail_t, (int)-sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
		vbxx_2D(VMOV, v_dst+tail_t-1, v_src+N-tail_t);
		v_dst += tail_t;
	}

	// From here on the data is handled as words
	vbx_word_t *v_src_w = (vbx_word_t *)v_src;
	vbx_word_t *v_dst_w = (vbx_word_t *)v_dst;

	if(!num_rows_w) {
		return VBW_SUCCESS;
	}

	remaining_w = working_set_w;
	// Not enough free scratchpad for a buffer of remaining_w words plus one
	// mask row: process a piece using space inside the destination instead.
	while( remaining_w*sizeof(vbx_word_t) + SP_WIDTH_B > FREE_BYTES ) {
		if( remaining_w <= threshold_w*2 ) {
			// Small enough to finish with the direct reversal
			if( VBW_ROT16){
				vec_rev_rot16_w(v_dst_w, v_src_w, remaining_w);
			}else{
				vec_rev_w(v_dst_w, v_src_w, remaining_w);
			}

			if( VBW_ROT8){
				vec_rot8_h(v_dst_w, v_dst_w, remaining_w*2);
			}
			return VBW_SUCCESS;
		}

		// Take a little less than half of what is left (row aligned); the
		// not-yet-written part of v_dst then holds two buffers of that size
		// followed by the mask.
		working_set_w = VBX_PAD_DN( (remaining_w - NUM_LANES)/2, NUM_LANES );
		v_mask = v_dst_w + (working_set_w*2);
		remaining_w -= working_set_w;

		v_scratch[0] = v_dst_w;
		v_scratch[1] = v_dst_w + working_set_w;
		num_rows_w = working_set_w / NUM_LANES;
		// Source is the last working_set_w words that are still unprocessed
		v_result = vec_rev_merge_w( v_scratch[ODD_LOG_SEL], v_src_w + remaining_w, working_set_w,
		                            v_scratch[!ODD_LOG_SEL], v_mask, SP_WIDTH_B, num_rows_w, VBW_ROT16 );
#if !VBX_SKIP_ALL_CHECKS
		if( v_result != v_dst_w ) {
			VBX_PRINTF("Unexpected behavior: merge reverse returned the wrong vector. Parameter order was chosen based on NUM_LANES.");
			VBX_EXIT(-1);
		}
#endif

		if( VBW_ROT8){
			vec_rot8_h(v_result, v_result, working_set_w*2);
		}
		v_dst_w += working_set_w;
	}


	// Enough scratchpad is free now: allocate the second buffer and the mask
	vbx_sp_push();

	v_scratch[0] = v_dst_w;
	v_scratch[1] = (vbx_word_t*)vbx_sp_malloc( remaining_w * sizeof(vbx_word_t) );
#if !VBX_SKIP_ALL_CHECKS
	if( !v_scratch[1] ) {
		VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
		VBX_EXIT(-1);
	}
#endif

	v_mask = (vbx_word_t*)vbx_sp_malloc( SP_WIDTH_B );
#if !VBX_SKIP_ALL_CHECKS
	if( !v_mask ) {
		VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
		VBX_EXIT(-1);
	}
#endif

	num_rows_w = remaining_w / NUM_LANES;
	v_result = vec_rev_merge_w( v_scratch[ODD_LOG_SEL], v_src_w, remaining_w, v_scratch[!ODD_LOG_SEL],
	                            v_mask, SP_WIDTH_B, num_rows_w, VBW_ROT16 );
#if !VBX_SKIP_ALL_CHECKS
	if( v_result != v_dst_w ) {
		VBX_PRINTF("Unexpected behavior: merge reverse returned the wrong vector. Parameter order was chosen based on NUM_LANES.");
		VBX_EXIT(-1);
	}
#endif

	if( VBW_ROT8){
		vec_rot8_h(v_result, v_result, remaining_w*2);
	}
	vbx_sp_pop();
	return VBW_SUCCESS;
}
Exemplo n.º 24
0
/*
 * Image blend test: runs scalar_blend() and vector_blend() on identically
 * initialised image pairs, prints the timing of both, and compares the two
 * outputs sample by sample (only the first 5 mismatches are printed).
 *
 * The error count - or 1 if a buffer could not be allocated - is reported
 * through VBX_TEST_END(). Always returns 0.
 */
int main(void)
{
	vbx_timestamp_t time_start, time_stop;
	double scalar_time;

	input_pointer img1;
	input_pointer img2;
	input_pointer sc_img1;
	input_pointer sc_img2;
	output_pointer scalar_out;
	output_pointer vector_out;

	int i,j;

	int total_errors = 0;

	vbx_test_init();

	vbx_mxp_print_params();

	// Buffers for the vector run come from vbx_shared_malloc(), the scalar
	// copies from plain malloc()
	img1       = vbx_shared_malloc( NUM_OF_ROWS*NUM_OF_COLUMNS*sizeof(input_type)  );
	img2       = vbx_shared_malloc( NUM_OF_ROWS*NUM_OF_COLUMNS*sizeof(input_type)  );
	vector_out = vbx_shared_malloc( NUM_OF_ROWS*NUM_OF_COLUMNS*sizeof(output_type) );

	sc_img1    = malloc( NUM_OF_ROWS*NUM_OF_COLUMNS*sizeof(input_type)  );
	sc_img2    = malloc( NUM_OF_ROWS*NUM_OF_COLUMNS*sizeof(input_type)  );
	scalar_out = malloc( NUM_OF_ROWS*NUM_OF_COLUMNS*sizeof(output_type) );

	if( !img1 || !img2 || !vector_out || !sc_img1 || !sc_img2 || !scalar_out ) {
		// Never touch a buffer that was not allocated; report as a failed test
		printf("ERROR: failed to allocate image buffers\n");
		total_errors = 1;
	} else {
		init_img( img1, img2 );
		init_img( sc_img1, sc_img2 );

		vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
		const int VBX_VECTOR_BYTE_LANES = this_mxp->vector_lanes * sizeof(int);

		printf("\n");
		printf("Num of byte lanes: %d\n", VBX_VECTOR_BYTE_LANES);

		printf("Initialized data\n\n");
		printf("Executing Scalar Image Blend...\n");

		vbx_timestamp_start();
		time_start = vbx_timestamp();
		scalar_blend( scalar_out, sc_img1, sc_img2, NUM_OF_ROWS, NUM_OF_COLUMNS, CONST_BLEND );
		time_stop = vbx_timestamp();

		printf("Finished Scalar Image Blend\n");
		scalar_time = vbx_print_scalar_time(time_start, time_stop);

		printf("\nExecuting Vector Image Blend...\n");

		vbx_timestamp_start();
		time_start = vbx_timestamp();
		vector_blend( vector_out, img1, img2, NUM_OF_ROWS, NUM_OF_COLUMNS, CONST_BLEND);
		time_stop = vbx_timestamp();

		printf("Finished Vector Image Blend\n");

		// Called for its printed report; the returned time is not needed here
		(void)vbx_print_vector_time(time_start, time_stop, scalar_time);

		int errors = 0;
		for( j=0; j<NUM_OF_ROWS; j++ ) {
			for( i = 0; i < NUM_OF_COLUMNS; i++ ) {
				if( vector_out[j*NUM_OF_COLUMNS+i] != scalar_out[j*NUM_OF_COLUMNS+i] ) {
					// Count every mismatch but print only the first few
					if(errors < 5) {
						printf( "\nFail at sample [%3d,%3d].  Scalar: %3d Vector: %3d Img1: %3d Img2: %3d",
							j, i, scalar_out[j*NUM_OF_COLUMNS+i],
							vector_out[j*NUM_OF_COLUMNS+i], img1[j*NUM_OF_COLUMNS+i], img2[j*NUM_OF_COLUMNS+i] );
					}
					errors++;
				}
			}
		}
		printf("\n%d errors\n", errors);
		total_errors += errors;
	}

	VBX_TEST_END(total_errors);

	return 0;
}
Exemplo n.º 25
0
Arquivo: test.c Projeto: 8l/mxp
int main(void)
{

	vbx_test_init();

#if 0
	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
	int N = VBX_SCRATCHPAD_SIZE/sizeof(vbx_mm_t)/8;
#endif

	int TEST_LENGTH = TEST_ROWS*TEST_COLS;
	int NTAP_LENGTH = NTAP_ROWS*NTAP_COLS;

	int PRINT_COLS = min( TEST_COLS, MAX_PRINT_LENGTH );
	int PRINT_ROWS = min( TEST_ROWS, MAX_PRINT_LENGTH );

	double scalar_time, vector_time;
	int errors=0;

	vbx_mxp_print_params();
	printf( "\nMatrix FIR test...\n" );
	printf( "Matrix dimensions: %d,%d\n", TEST_ROWS, TEST_COLS );

	vbx_mm_t  *scalar_in   = malloc( TEST_LENGTH*sizeof(vbx_mm_t) );
	vbx_mm_t  *vector_in   = vbx_shared_malloc( TEST_LENGTH*sizeof(vbx_mm_t) );

	int32_t *scalar_filt = malloc( NTAP_LENGTH*sizeof(int32_t) );
	int32_t *vector_filt = vbx_shared_malloc( NTAP_LENGTH*sizeof(int32_t) );

	vbx_mm_t  *scalar_out  = malloc( TEST_LENGTH*sizeof(vbx_mm_t) );
	vbx_mm_t  *vector_out  = vbx_shared_malloc( TEST_LENGTH*sizeof(vbx_mm_t) );

	VBX_T(test_zero_array)( scalar_out, TEST_LENGTH );
	VBX_T(test_zero_array)( vector_out, TEST_LENGTH );

	VBX_T(test_init_array)( scalar_in, TEST_LENGTH, 1 );
	VBX_T(test_copy_array)( vector_in, scalar_in, TEST_LENGTH );

	test_init_array_word( scalar_filt, NTAP_LENGTH, 1 );
	test_copy_array_word( vector_filt, scalar_filt, NTAP_LENGTH );

	VBX_T(test_print_matrix)( scalar_in, PRINT_ROWS, PRINT_COLS, TEST_COLS );
	test_print_matrix_word( scalar_filt, NTAP_ROWS, NTAP_COLS, NTAP_COLS );

	scalar_time = test_scalar( scalar_out, scalar_in, scalar_filt,
			TEST_ROWS, TEST_COLS, NTAP_ROWS, NTAP_COLS);
	VBX_T(test_print_matrix)( scalar_out, PRINT_COLS, PRINT_ROWS, TEST_COLS );

	vector_time = test_vector( vector_out, vector_in, vector_filt,
			TEST_ROWS, TEST_COLS, NTAP_ROWS, NTAP_COLS, scalar_time );
	VBX_T(test_print_matrix)( vector_out, PRINT_COLS, PRINT_ROWS, TEST_COLS );

	int i;
	for(i=0; i<TEST_ROWS-NTAP_ROWS; i++){
		errors += VBX_T(test_verify_array)( scalar_out+i*TEST_COLS, vector_out+i*TEST_COLS, TEST_COLS-NTAP_COLS );
	}

	VBX_TEST_END(errors);
	return 0;
}
Exemplo n.º 26
0
/*
 * Median filter over an image of 32-bit pixels, applied independently to each
 * byte of a pixel (the sort runs with byte-sized vector operations).
 *
 * output, input  - images moved with vbx_dma_to_host_2D / vbx_dma_to_vector_2D
 * filter_height, filter_width - window size; expected to be positive (not
 *                  validated here)
 * image_height, image_width   - image size in pixels
 * image_pitch    - distance between image rows, in pixels
 *
 * Rows 0 .. image_height-filter_height-1 are produced. For every pixel the
 * window values are partially bubble-sorted up to the middle element
 * (filter_size/2), which is then written out.
 *
 * Returns VBW_SUCCESS, or VBW_ERROR_SP_ALLOC_FAILED when the free scratchpad
 * cannot hold even one column of working data or an allocation fails.
 */
int vbw_mtx_median_ext_argb32( unsigned *output, unsigned *input, const int filter_height, const int filter_width,
                           const int image_height, const int image_width, const int image_pitch )
{

	const int FREE_BYTES = vbx_sp_getfree();
	int l,k;
	int filter_mid, filter_size;
	int rows_per_l,vl,temp_vl, temp_vl_byte;
	int j,i;
	int partial_row = 0;

	filter_size = filter_height*filter_width;
	filter_mid = filter_size/2;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const int VBX_WIDTH_BYTES = this_mxp->scratchpad_alignment_bytes;

	// Could possibly check for low SP here (less than 6*VBX_WIDTH_BYTES) and assign vl differently

	// During allocation, max additional SP bytes needed due to alignment is one VBX_WIDTH_BYTES per vector
	// Taking that off the top simplifies calculation and will always be correct, but sacrifices a little SP space

	// Keep this arithmetic signed: mixing in the size_t from sizeof would turn
	// a negative budget into a huge positive vector length.
	const int usable_bytes    = FREE_BYTES - 3*VBX_WIDTH_BYTES;
	const int bytes_per_pixel = (filter_size+2)*(int)sizeof(vbx_uword_t); // window copies + v_sub + v_temp

	if( usable_bytes < bytes_per_pixel ) {
		return VBW_ERROR_SP_ALLOC_FAILED;
	}
	vl = usable_bytes/bytes_per_pixel;

	if(vl < image_width){
		// Not even one full image row fits: process each row in pieces
		rows_per_l = 1;
		partial_row = 1;
	} else {
		// Process as many whole rows per pass as fit
		rows_per_l = vl/image_width;
		vl = image_width*rows_per_l;
	}

	vbx_sp_push();

	vbx_uword_t *v_input = (vbx_uword_t *)vbx_sp_malloc(filter_size*vl*sizeof(vbx_uword_t));
	vbx_ubyte_t *v_sub   = (vbx_ubyte_t *)vbx_sp_malloc(vl*sizeof(vbx_uword_t));
	vbx_ubyte_t *v_temp  = (vbx_ubyte_t *)vbx_sp_malloc(vl*sizeof(vbx_uword_t));
	vbx_ubyte_t *v_min, *v_max;
	vbx_ubyte_t *v_input_byte = (vbx_ubyte_t *)v_input;
	if( v_input == NULL || v_sub == NULL || v_temp == NULL ){
		vbx_sp_pop();
		return VBW_ERROR_SP_ALLOC_FAILED;
	}
	for(l = 0; l < image_height-filter_height; l+= rows_per_l){
		// detect last pass
		if(l+rows_per_l > image_height-filter_height){
			rows_per_l = (image_height-filter_height)-l;
			vl = image_width*rows_per_l;
		}
		temp_vl = vl;
		for(k = 0; k < image_width; k += temp_vl){
			if(partial_row){
				// clip the last piece of a row
				if(k + temp_vl > image_width){
					temp_vl = image_width - k;
				}
			}

			// load filter_height consecutive image rows (per output row)
			for(j = 0; j < filter_height; j++){
				vbx_dma_to_vector_2D(v_input+temp_vl*j,
									 input+(l+j)*image_pitch+k,
									 temp_vl/rows_per_l*sizeof(vbx_uword_t),
									 rows_per_l,
									 image_width*sizeof(vbx_uword_t),
									 image_pitch*sizeof(vbx_uword_t));
			}

			// arrange all pixels within a filter window into single columns, separated by temp_vl
			//
			// ex. vl = 5, filter = 3
			// vinput before         vinput after
			//
			// a00 a01 a02 a03 a04 | a00 a01 a02 a03 a04 |
			// a10 a11 a12 a13 a14 | a10 a11 a12 a13 a14 |
			// a20 a21 a22 a23 a24 | a20 a21 a22 a23 a24 |
			// ??? ??? ??? ??? ??? | a01 a02 a03 a04 a10 |
			// ??? ??? ??? ??? ??? | a11 a12 a13 a14 a20 |
			// ??? ??? ??? ??? ??? | a21 a22 a23 a24 a30 |
			// ??? ??? ??? ??? ??? | a02 a03 a04 a10 a11 |
			// ??? ??? ??? ??? ??? | a12 a13 a14 a20 a21 |
			// ??? ??? ??? ??? ??? | a22 a23 a24 a30 a31 |
			//
			// j is the horizontal shift (one per filter column), i selects one
			// of the filter_height loaded rows. Using the bounds this way round
			// fills exactly filter_size rows from loaded data, also when the
			// filter is not square.
			vbx_set_vl(temp_vl);
			for(j = 1; j < filter_width; j++){
				for(i = 0; i < filter_height; i++){
					vbx(VVWU, VMOV, v_input+(j*filter_height+i)*temp_vl,
									v_input+i*temp_vl+j,
									0);
				}
			}

			//Do the bubble sort up to the filter_size/2^th element on each vbx

			// work on individual color channels
			temp_vl_byte = temp_vl*sizeof(vbx_uword_t)/sizeof(vbx_ubyte_t);
			vbx_set_vl(temp_vl_byte);

			// sort lower half of the values in the window
			for(j = 0; j < filter_mid; j++){
				v_min = v_input_byte+j*temp_vl_byte;

				for(i = j+1; i < filter_size; i++){
					v_max = v_input_byte+i*temp_vl_byte;

					// conditional swap so that v_min keeps the smaller value
					vbx(VVBU, VMOV,     v_temp, v_min,  0);
					vbx(VVBU, VSUB,     v_sub,  v_max,  v_min);
					vbx(VVBU, VCMV_LTZ, v_min,  v_max,  v_sub);
					vbx(VVBU, VCMV_LTZ, v_max,  v_temp, v_sub);
				}
			}

			// grab next smallest value, the median, don't sort the rest
			v_min = v_input_byte+filter_mid*temp_vl_byte;
			for(i = filter_mid+1; i < filter_size; i++){
				v_max = v_input_byte+i*temp_vl_byte;

				vbx(VVBU, VSUB,     v_sub, v_max, v_min);
				vbx(VVBU, VCMV_LTZ, v_min, v_max, v_sub);
			}

			// dma out median value
			// back to pixels
			vbx_dma_to_host_2D(output+(l*image_pitch)+k,
							   v_input+temp_vl*filter_mid,
							   temp_vl/rows_per_l*sizeof(vbx_uword_t),
							   rows_per_l,
							   image_pitch*sizeof(vbx_uword_t),
							   image_width*sizeof(vbx_uword_t));
		}
	}

	vbx_sp_pop();
	vbx_sync();
	return VBW_SUCCESS;
}
Exemplo n.º 27
0
Arquivo: test.c Projeto: 8l/mxp
/*
 * Face detection test driver.
 *
 * Allocates the image buffers, loads a test image into each of them and then,
 * depending on the UNIT compile-time switch, either
 *   - UNIT set: runs the unit comparisons selected by LUT_CI / DOUBLE_LUT /
 *     LBP_CI / BLIP and accumulates their error counts, or
 *   - UNIT clear: times the scalar, vector and masked-vector face detection
 *     and compares the vector result against the masked-vector result.
 *
 * The accumulated error count is passed to VBX_TEST_END() and returned.
 */
int main(void)
{

	vbx_timestamp_t time_start, time_stop;
	double scalar_time, vbx_time, vbx_time_masked;
	int i, j, k, l, m, n;
	int errors = 0;

	vbx_test_init();
	vbx_mxp_print_params();
    pixel *input, *scalar_input, *vbx_input, *vbx_input_masked;
    uint16_t *scalar_short;

	// scalar_input is obtained by remapping input through vbx_remap_cached()
	// rather than by a separate allocation
	input         = (pixel *)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(pixel));
	scalar_input  = (pixel *)vbx_remap_cached(input, IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(pixel));
	scalar_short  = (uint16_t *)malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(uint16_t));
	vbx_input    = (pixel *)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(pixel));
	vbx_input_masked  = (pixel *)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(pixel));

#if UNIT
    // Extra buffers used only by the unit comparisons
    unsigned char *vbx_img8;
    unsigned short *img, *vbx_img;
    unsigned int *iImg, *vbx_iImg;
    unsigned int *iiImg, *vbx_iiImg;
    img = (unsigned short*)malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(unsigned short));
    vbx_img = (unsigned short*)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(unsigned short));
    vbx_img8 = (unsigned char*)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(unsigned char));

    iImg = (unsigned int*)malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(unsigned int));
    vbx_iImg = (unsigned int*)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(unsigned int));

    iiImg = (unsigned int*)malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(unsigned int));
    vbx_iiImg = (unsigned int*)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(unsigned int));
#endif//UNIT

	printf("Resolution = %dx%d\n", IMAGE_WIDTH, IMAGE_HEIGHT);
    printf("Initializing data\n");
	vbx_timestamp_start();
    // NOTE: the bound is l < 1, so only the l == 0 (Lenna) case below runs;
    // the l == 1 and l == 2 branches are unreachable as written.
    for(l = 0; l < 1; l++){
        // Names passed to the print/detect helpers for this image
        char *src;
        char *sdst;
        char *vdst;
        char *mdst;
        if(l == 0){
            load_lenna(input, IMAGE_WIDTH, IMAGE_HEIGHT);
            load_lenna(vbx_input, IMAGE_WIDTH, IMAGE_HEIGHT);
            load_lenna(vbx_input_masked, IMAGE_WIDTH, IMAGE_HEIGHT);
            printf("\nLenna\n");
            src = "lenna";
            sdst = "s_lenna";
            vdst = "v_lenna";
            mdst = "m_lenna";
        }else if(l == 1){
            load_ms(input, IMAGE_WIDTH, IMAGE_HEIGHT);
            load_ms(vbx_input, IMAGE_WIDTH, IMAGE_HEIGHT);
            load_ms(vbx_input_masked, IMAGE_WIDTH, IMAGE_HEIGHT);
            printf("\nMicrosoft\n");
            src = "ms";
            sdst = "s_ms";
            vdst = "v_ms";
            mdst = "m_ms";
        }else if(l == 2){
            load_blank(input, IMAGE_WIDTH, IMAGE_HEIGHT);
            load_blank(vbx_input, IMAGE_WIDTH, IMAGE_HEIGHT);
            load_blank(vbx_input_masked, IMAGE_WIDTH, IMAGE_HEIGHT);
            printf("\nblank\n");
            src = "blank";
            sdst = "s_blank";
            vdst = "v_blank";
            mdst = "m_blank";
        }
#if UNIT
    // ---- unit-test mode ----
    int window = 20;
    // smallest log such that (window/3) >> log drops below 2
    int log=0;
    while(((window/3)>>log) >= 2) log++;


    errors += compare_scalar_rgb2luma_to_vbw_rgb2luma16(img, vbx_img, vbx_input, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_WIDTH, MAX_PRINT_ERRORS);
    vbw_rgb2luma8(vbx_img8, vbx_input, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_WIDTH);


    int s;
#if LUT_CI
#if DOUBLE_LUT
    printf("Testing double lut\n");

    printf("Assign lbp double lut\n");
    assign_lbp_lut_ci2();
    // prev remembers the error count before this group of checks
    int prev = errors;
    printf("Cascade check\n");
    /* errors += cascade_check_2w(face_lbp, face_lbp_max_stage, 256); */
    /* errors += cascade_check_2h(face_lbp, face_lbp_max_stage, 256); */
    errors += cascade_check_2b(face_lbp, face_lbp_max_stage, 256);
    if (errors) {
        printf("errors %d\n", errors-prev);
    }
#else
    assign_lbp_lut_ci();

    printf("Testing cascade\n");

    int prev = errors;

    printf("lut check\n");

    // The "#if 0" below disables every lut_check variant in this group; the
    // empty "#elif 1" branch further down is the one that is compiled.
#if 0
#if 0
    errors += lut_check(256, 0, 0, 0);
    if (errors) {
        printf("errors %d\n", errors-prev);
    }
#elif 1

    int print_errors = 0;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	int vci_lanes = this_mxp->vcustom0_lanes;
    int num_features = cascade_max_feature();
    int input_length = 10;
    int lut_length = num_features*vci_lanes;
    int lut_iterations = 15;
#if 1
    lut_length = input_length = 128;
    lut_iterations = 13;
    print_errors = 0;
    errors += lut_check2(input_length, lut_length, lut_iterations, print_errors);
    if (errors) {
        printf("errors %d\n", errors-prev);
    }
#elif 1
    input_length = 64;
    lut_length = input_length;
    lut_iterations = 13;
    print_errors = 1;
    errors += lut_check2(input_length, lut_length, lut_iterations, print_errors);
    if (errors) {
        printf("errors %d\n", errors-prev);
    }
#else
    for(s = 2; s < 100; s=s+10){
        errors += lut_check2(s, lut_length, lut_iterations, print_errors);
        if (errors - prev > 0) {
            printf("%d\terrors %d\n", s, errors-prev);
        } else {
            printf("%d\n", s);
        }
        prev = errors;
    }
#endif
#else
    for(s = 0; s < 2000; s=s+100){
        errors += lut_check(s, 0, 0, 0);
        if (errors - prev > 0) {
            printf("%d\terrors %d\n", s, errors-prev);
        } else {
            printf("%d\n", s);
        }
        prev = errors;
    }
#endif

#elif 1

#else
    printf("check cascade\n");
    prev = errors;
    errors += cascade_check(face_lbp, face_lbp_max_stage, 256);
    if (errors) {
        printf("errors %d\n", errors-prev);
    }

    printf("Testing LBP LUT CI\n");
    prev = errors;
    for(s = 0; s < face_lbp_max_stage; s++){
        errors += compare_vbx_lut_to_vbx_lut_ci(s, MAX_PRINT_ERRORS);
    }
    if (errors) {
        printf("errors %d\n", errors-prev);
        prev = errors;
    }
#endif
#endif
#endif

    // Disabled debug dump of the 8-bit grey image
#if 0
    printf("Printing grey scale img\n");
    printf("grey = [");
    for (j = 0; j < IMAGE_HEIGHT; j++) {
        printf("[");
        for (i = 0; i < IMAGE_WIDTH; i++) {
            printf("%d, ", vbx_img8[j*IMAGE_WIDTH+i]);
        }
        printf("],\n");
    }
    printf("]\n");
#endif
#if LBP_CI
    printf("Testing LBP Pattern CI\n");
    errors += compare_LBPRestrictedCI_to_test_scalar_patterns(vbx_img, vbx_img8, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
#endif

#if BLIP
    printf("Testing BLIP\n");
    for(s = 1; s < 10; s++){
        errors += compare_scalar_BLIP2_to_vector_BLIP(img, vbx_input, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS, s);
    }
#endif
    // Disabled set of further comparisons
#if 0
    errors += compare_LBPRestrictedSums_to_test_scalar_sums_byte(vbx_img, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
    errors += compare_LBPRestrictedSums2_to_test_scalar_sums_half(vbx_img, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
    errors += compare_ScalarLBPRestrictedSums_to_test_scalar_sums_half(vbx_img, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
    errors += compare_ScalarLBPRestrictedPatterns_to_test_scalar_patterns(vbx_img, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
    errors += compare_LBPRestrictedPatterns2_to_test_scalar_patterns(vbx_img, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
    errors += compare_LBPRestricted_to_test_scalar_patterns(vbx_img, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
    /* overflow issues -- using bytes changes lbp pattern */
    errors += compare_LBPRestrictedPatterns_to_test_scalar_patterns(vbx_img, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);

    /* requires SKIP_INTEGRALS 0 */
    errors += compare_gen_integrals_to_vector_get_img(img, iImg, iiImg, vbx_img, vbx_iImg, vbx_iiImg, vbx_input, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);


    /* redundant test, compare to test_scalar_patterns instead */
    errors += compare_ScalarLBPRestrictedPatterns_to_SATBinaryPattern(vbx_img, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);

    errors += compare_SATBinaryPattern_to_test_scalar_patterns(vbx_img, log, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);

    errors += compare_LBPPassStage_to_restricted(vbx_img, log, face_lbp[0], window, IMAGE_WIDTH, IMAGE_HEIGHT, MAX_PRINT_ERRORS);
#endif
#else // UNIT
        // ---- timing mode: scalar, vector, then masked vector ----

#if PRINT
        print_python_pixel(scalar_input, src, IMAGE_WIDTH, IMAGE_HEIGHT);
#endif

        time_start = vbx_timestamp();
        scalar_rgb2luma(scalar_short, input, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_WIDTH);
        scalar_face_detect_luma(scalar_short, input, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_WIDTH, sdst);
        time_stop = vbx_timestamp();
        scalar_time = vbx_print_scalar_time(time_start, time_stop);
#if PRINT
        print_python_pixel(scalar_input, sdst, IMAGE_WIDTH, IMAGE_HEIGHT);
#endif
        printf("\nVector");
        time_start = vbx_timestamp();
        vector_face_detect((pixel *)vbx_input, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_WIDTH, 0, vdst);
        time_stop = vbx_timestamp();
        vbx_time = vbx_print_vector_time(time_start, time_stop, scalar_time);
#if PRINT
        print_python_pixel(vbx_input, vdst, IMAGE_WIDTH, IMAGE_HEIGHT);
#endif

        printf("\nVector Masked");
        time_start = vbx_timestamp();
        vector_face_detect((pixel *)vbx_input_masked, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_WIDTH, 1, mdst);
        time_stop = vbx_timestamp();
        vbx_time_masked = vbx_print_vector_time(time_start, time_stop, scalar_time);
#if PRINT
        print_python_pixel(vbx_input_masked, mdst, IMAGE_WIDTH, IMAGE_HEIGHT);
#endif
        // Only vector vs. masked-vector output is compared; the comparisons
        // against the scalar result are commented out.
        /* errors += match_array_pixel(input, vbx_input, "vector", IMAGE_WIDTH, IMAGE_HEIGHT, 0, MAX_PRINT_ERRORS, 0); */
        /* errors += match_array_pixel(input, vbx_input_masked, "masked", IMAGE_WIDTH, IMAGE_HEIGHT, 0, MAX_PRINT_ERRORS, 0); */
        errors += match_array_pixel(vbx_input, vbx_input_masked, "masked", IMAGE_WIDTH, IMAGE_HEIGHT, 0, MAX_PRINT_ERRORS, 0);
#endif // UNIT
    }
	VBX_TEST_END(errors);
	return errors;
}
Exemplo n.º 28
0
/*
 * Transpose an INROWS x INCOLS matrix so that
 *     out[j*INROWS + i] = in[i*INCOLS + j].
 *
 * `in` and `out` are read/written directly by the scalar path and through
 * DMA (vbx_dma_to_vector* / vbx_dma_to_host*) by the vector paths; all
 * scratchpad buffers are allocated between vbx_sp_push() and vbx_sp_pop(),
 * so the caller's scratchpad allocation state is restored on every exit.
 *
 * Strategy, in order of preference:
 *   1. fewer than SCALAR_THRESHOLD elements: plain scalar loops;
 *   2. a single row or column: the transpose is a copy, streamed through
 *      the scratchpad (in one piece if it fits, otherwise in chunks);
 *   3. the matrix fits in half of the free scratchpad: one DMA in,
 *      vbw_mtx_xp() in scratchpad, one DMA out;
 *   4. otherwise: tile the matrix, transposing each tile either with the
 *      merge transpose (square 128x128 or 64x64 tiles, when enough lanes
 *      and scratchpad are available) or with a strided 3D VMOV.
 *
 * Returns VBW_SUCCESS, or VBW_ERROR_SP_ALLOC_FAILED if a required
 * scratchpad buffer could not be allocated.
 */
int vbw_mtx_xp_ext(vbx_mm_t *out, vbx_mm_t *in, const int INROWS, const int INCOLS )
{
	typedef vbx_mm_t vbx_sp_t;

	int elements = INROWS * INCOLS;

	if(elements < SCALAR_THRESHOLD) {
		vbx_sync();  // in case the input is still waiting on a DMA transfer
		int i,j;
		for(i = 0; i < INROWS; i++) {
			for(j = 0; j < INCOLS; j++) {
				out[j*INROWS+i] = in[i*INCOLS+j];
			}
		}
		return VBW_SUCCESS;
	}

	vbx_sp_push();

	vbx_sp_t *v_in;
	vbx_sp_t *v_out;

	int tile_height     = 0;
	int tile_width      = 0;
	int prev_tile_width = 0;
	int tile_y          = 0;
	int tile_x          = 0;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	int SP_WIDTH_B = this_mxp->scratchpad_alignment_bytes;
	int SP_SIZE = vbx_sp_getfree();
	int max_sp_elements   = vbx_sp_getfree() / sizeof(vbx_sp_t);
	// Half of the free scratchpad (rounded down to the alignment) per buffer,
	// since the 2D paths need an input and an output buffer.
	int max_tile_elements = VBX_PAD_DN( SP_SIZE/2, SP_WIDTH_B ) / sizeof(vbx_sp_t);


	if( INROWS == 1 || INCOLS == 1 ) {           // 1D transpose becomes a simple copy operation
		if( elements <= max_sp_elements ) {      // We can use the whole scratchpad for this
			v_in = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
			if( v_in==NULL ) {
				vbx_sp_pop();
				return VBW_ERROR_SP_ALLOC_FAILED;
			}
			vbx_dma_to_vector( v_in, in, elements*sizeof(vbx_mm_t) );
			v_out = v_in;
			vbx_dma_to_host( out, v_out, elements*sizeof(vbx_mm_t) );
		} else {                                 // To test this, you'll need a very large 1D matrix (or a small SP)
			tile_width = max_sp_elements;
			v_in = (vbx_sp_t*)vbx_sp_malloc( tile_width * sizeof(vbx_sp_t) );
			// A zero-element chunk would never advance tile_x below, so treat
			// an exhausted scratchpad as an allocation failure as well.
			if( v_in==NULL || tile_width <= 0 ) {
				vbx_sp_pop();
				return VBW_ERROR_SP_ALLOC_FAILED;
			}
			for (tile_x = 0; tile_x < elements; tile_x += tile_width) {
				// Shrink the last chunk to whatever is left.
				if( tile_x + tile_width > elements) tile_width = elements - tile_x;
				vbx_dma_to_vector( v_in, in + tile_x, tile_width*sizeof(vbx_mm_t) );
				v_out = v_in;
				vbx_dma_to_host( out+tile_x, v_out, tile_width*sizeof(vbx_mm_t) );
			}
		}
	} else if( elements < max_tile_elements ) {  // Matrix is small enough to handle entirely in SP
		v_in  = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
		v_out = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
		if( v_in==NULL || v_out==NULL ) {
			vbx_sp_pop();
			return VBW_ERROR_SP_ALLOC_FAILED;
		}

		vbx_dma_to_vector( v_in, in, elements*sizeof(vbx_mm_t) );

		vbw_mtx_xp(v_out,v_in,INROWS,INCOLS);

		vbx_dma_to_host( out, v_out, elements*sizeof(vbx_mm_t) );
	} else {                                     // At this point we know at least one full tile will be needed
		#define QUICK_A_LANES_THRESHOLD 8        // Use merge transpose if there are at least this many lanes
		#define QUICK_A_TILE_WIDTH 128
		#define QUICK_A_TILE_ELEMENTS (QUICK_A_TILE_WIDTH*QUICK_A_TILE_WIDTH)
		#define QUICK_A_VF_ELEMENTS (QUICK_A_TILE_ELEMENTS/2)
		#define QUICK_A_REQ_ELEMENTS (2*VBX_PAD_UP(QUICK_A_TILE_ELEMENTS,SP_WIDTH_B/sizeof(vbx_sp_t)) + VBX_PAD_UP(QUICK_A_VF_ELEMENTS,sizeof(vbx_sp_t)))

		#define QUICK_B_LANES_THRESHOLD 16        // Use smaller merge transpose tile only if there are a lot of lanes
		#define QUICK_B_TILE_WIDTH 64             //     and only if larger tile A size cannot be used.
		#define QUICK_B_TILE_ELEMENTS (QUICK_B_TILE_WIDTH*QUICK_B_TILE_WIDTH)
		#define QUICK_B_VF_ELEMENTS (QUICK_B_TILE_ELEMENTS/2)
		#define QUICK_B_REQ_ELEMENTS (2*VBX_PAD_UP(QUICK_B_TILE_ELEMENTS,SP_WIDTH_B/sizeof(vbx_sp_t)) + VBX_PAD_UP(QUICK_B_VF_ELEMENTS,sizeof(vbx_sp_t)))

		int NUM_LANES = this_mxp->vector_lanes;
		int DMA_BYTES = this_mxp->dma_alignment_bytes;
		int min_tile_dim = DMA_BYTES / sizeof(vbx_sp_t);

		vbx_sp_t *v_out_sel;
		vbx_sp_t *vf = 0;   // merge-mask buffer; stays NULL when the merge transpose is not used

		// If a vf allocation below fails, vf remains NULL and every tile simply
		// takes the generic strided path, so no explicit check is needed here.
		if( NUM_LANES >= QUICK_A_LANES_THRESHOLD       // Check for appropriate conditions to use merge transpose tiles
					&& INCOLS >= QUICK_A_TILE_WIDTH
					&& INROWS >= QUICK_A_TILE_WIDTH
			&& (unsigned)max_sp_elements >= QUICK_A_REQ_ELEMENTS ) {
			tile_width = tile_height = QUICK_A_TILE_WIDTH;
			vf = (vbx_sp_t *)vbx_sp_malloc( QUICK_A_VF_ELEMENTS * sizeof(vbx_sp_t));
		} else if( NUM_LANES >= QUICK_B_LANES_THRESHOLD
					&& INCOLS >= QUICK_B_TILE_WIDTH
					&& INROWS >= QUICK_B_TILE_WIDTH
			&& (unsigned)max_sp_elements >= QUICK_B_REQ_ELEMENTS ) {
			tile_width = tile_height = QUICK_B_TILE_WIDTH;
			vf = (vbx_sp_t *)vbx_sp_malloc( QUICK_B_VF_ELEMENTS * sizeof(vbx_sp_t));
		} else {
			findTileSize( &tile_height, &tile_width, INROWS, INCOLS, max_tile_elements, min_tile_dim );
		}

		prev_tile_width = tile_width;   // remembered so the width can be restored after a narrow right-edge tile

		v_in  = (vbx_sp_t*)vbx_sp_malloc( tile_height*tile_width * sizeof(vbx_sp_t) );
		v_out = (vbx_sp_t*)vbx_sp_malloc( tile_height*tile_width * sizeof(vbx_sp_t) );


		if( v_in==NULL || v_out==NULL ) {
			vbx_sp_pop();
			return VBW_ERROR_SP_ALLOC_FAILED;
		}

		// The merge transpose ping-pongs between the two tile buffers.
		vbx_sp_t *v[2] = { v_in, v_out };

		tile_y = 0;                              // Reset y position for new col
		while( tile_y < INROWS ) {
			// (Re)program the strides for the generic path; tile_height may have
			// shrunk for the bottom row of tiles.
			vbx_set_2D( tile_width, tile_height*sizeof(vbx_sp_t), sizeof(vbx_sp_t), sizeof(vbx_sp_t) );
			vbx_set_3D( tile_height, sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t) );
			tile_x = 0;                          // Reset x position for new row
			while( tile_x < INCOLS ) {

				// Fetch the tile at (tile_y, tile_x), packed row by row into v_in.
				vbx_dma_to_vector_2D(
						v_in,
						in+(tile_y*INCOLS)+tile_x,
						tile_width*sizeof(vbx_mm_t),
						tile_height,
						tile_width*sizeof(vbx_sp_t),
						INCOLS*sizeof(vbx_mm_t) );

				v_out_sel = v_out;                         // select v_out as default vector to DMA to MM

				/* *** merge transpose (matrix must be square and a power of 2 wide) *** */
				if( vf && tile_width == tile_height
							&& (tile_width==QUICK_A_TILE_WIDTH || tile_width==QUICK_B_TILE_WIDTH) ) {
					int src = 0;
					int n;
					for( n=1; n<tile_width; n *= 2 ) {     // can't do 1st iteration until entire tile is DMA'd in
						const int nn = 2*n;

						// copy the destination matrix
						vbx_set_vl( tile_width*tile_width );    // use v_in & v_out as working matrices (clobber v_in)
						vbxx(  VMOV, v[!src], v[src]);

						// do the work
						vbx_set_vl( n*tile_width );
						vbxx( VAND, vf, n, (vbx_enum_t*)0 );           // mask for merging: 0101010... then 00110011...
						vbx_set_2D( tile_width/nn, nn*tile_width*sizeof(vbx_sp_t), nn*tile_width*sizeof(vbx_sp_t), 0 );
						vbxx_2D( VCMV_Z, v[!src]+n*tile_width, v[src]+n           , vf );
						vbxx_2D( VCMV_Z, v[!src]+n,            v[src]+n*tile_width, vf );

						src = !src;
					}

					v_out_sel = v[src];     // depending on the size of the mtx, the final result may be in v_in or v_out
				} else {
					vbx_set_vl( 1 );        // 2D and 3D will be set by the x and y edge conditions, even using merge
					vbxx_3D(VMOV, v_out, v_in );
				}

				// Write the transposed tile to its mirrored position (tile_x, tile_y) in out.
				vbx_dma_to_host_2D(
						out+(tile_x*INROWS)+tile_y,
						v_out_sel,
						tile_height*sizeof(vbx_mm_t),
						tile_width,
						INROWS*sizeof(vbx_mm_t),
						tile_height*sizeof(vbx_sp_t) );

				tile_x += tile_width;                 // Set up width for next tile
				if( tile_x + tile_width > INCOLS ) {  // Temporarily reduce tile width when reaching right edge of matrix
					tile_width = INCOLS - tile_x;
					vbx_set_2D( tile_width, tile_height*sizeof(vbx_sp_t), sizeof(vbx_sp_t), sizeof(vbx_sp_t) );
					vbx_set_3D( tile_height, sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t) );
				}
			}
			tile_y += tile_height;                    // Set up width and height for next row of tiles
			tile_width = prev_tile_width;             // Restore original tile width for next row of tiles

			/* *** Permanently reduce tile height when reaching bottom of matrix *** */
			tile_height = ( tile_y + tile_height > INROWS ) ? INROWS - tile_y : tile_height;
		}
	}
	vbx_sp_pop();
	vbx_sync();
	return VBW_SUCCESS;
}