예제 #1
0
	/**
	 * FIR filter entry point for host-memory buffers.
	 *
	 * Forwards all arguments to vec_fir_tiler<vbx_mm_t,false> and hands back
	 * whatever it returns. The call is bracketed by vbx_sp_push()/vbx_sp_pop()
	 * (presumably so any scratchpad allocations made by the tiler are released
	 * before returning -- confirm against the tiler).
	 *
	 * @param output      destination samples
	 * @param input       source samples
	 * @param coeffs      filter coefficients
	 * @param sample_size number of samples, passed through unchanged
	 * @param num_taps    number of filter taps, passed through unchanged
	 * @return the tiler's return code
	 */
	inline int vec_fir_ext(vbx_mm_t *output, vbx_mm_t *input, vbx_mm_t *coeffs,  int sample_size,  int num_taps)
	{
		vbx_sp_push();
		const int status = vec_fir_tiler<vbx_mm_t, false>(output, input, coeffs, sample_size, num_taps);
		vbx_sp_pop();

		return status;
	}
예제 #2
0
파일: test.cpp 프로젝트: cirqueit/mxp
/**
 * Times a matrix multiply whose operands live in the scratchpad.
 *
 * Both inputs are DMA'd into freshly allocated scratchpad buffers, multiplied
 * with vbw_mtx_mul(), and the product is DMA'd back to vector_out. The timed
 * region covers allocation, both transfers and the multiply.
 *
 * @param vector_out  host buffer receiving the IN1ROWS x IN2COLS product
 * @param vector_in1  host buffer holding the IN1ROWS x IN1COLS left operand
 * @param vector_in2  host buffer holding the IN2ROWS x IN2COLS right operand
 * @param scalar_time reference time forwarded to vbx_print_vector_time()
 * @return the value returned by vbx_print_vector_time()
 *
 * If any of the three scratchpad buffers cannot be allocated the multiply is
 * skipped, a message is printed, and the reported retval stays at -1.
 */
double test_vector_sp(vbx_mm_t *vector_out, vbx_mm_t  *vector_in1, int IN1ROWS, int IN1COLS, vbx_mm_t  *vector_in2, int IN2ROWS, int IN2COLS, double scalar_time )
{
	typedef vbx_mm_t vbx_sp_t;
	int retval=-1;
	vbx_timestamp_t time_start, time_stop;
	printf( "\nExecuting MXP matrix multiply... src1[%dx%d] src2[%dx%d]\n",IN1ROWS, IN1COLS,IN2ROWS, IN2COLS );

	vbx_timestamp_start();
	time_start = vbx_timestamp();
	vbx_sp_push();
	vbx_sp_t* v_in1=(vbx_sp_t*)vbx_sp_malloc(sizeof(vbx_sp_t)*IN1ROWS*IN1COLS);
	vbx_sp_t* v_in2=(vbx_sp_t*)vbx_sp_malloc(sizeof(vbx_sp_t)*IN2ROWS*IN2COLS);
	vbx_sp_t* v_out=(vbx_sp_t*)vbx_sp_malloc(sizeof(vbx_sp_t)*IN1ROWS*IN2COLS);
	// All three buffers must exist: the inputs are DMA targets, so checking
	// only the last allocation is not enough when the operands differ in size.
	if(v_in1!=NULL && v_in2!=NULL && v_out!=NULL){
		vbx_dma_to_vector(v_in1,vector_in1,sizeof(vbx_sp_t)*IN1ROWS*IN1COLS);
		vbx_dma_to_vector(v_in2,vector_in2,sizeof(vbx_sp_t)*IN2ROWS*IN2COLS);
		retval = vbw_mtx_mul( v_out, v_in1, IN1ROWS, IN1COLS, v_in2, IN2ROWS, IN2COLS );
		vbx_dma_to_host(vector_out,v_out,sizeof(vbx_sp_t)*IN1ROWS*IN2COLS);
		vbx_sync();
	}else{
		printf("not enough sp space for sp test");
	}
	time_stop = vbx_timestamp();
	// Balance the vbx_sp_push() above so repeated calls do not leak scratchpad.
	// Done after the timestamp so the timed region is unchanged.
	vbx_sp_pop();
	printf( "...done. retval:0x%08X\n", retval );
	return vbx_print_vector_time( time_start, time_stop, scalar_time );
}
예제 #3
0
파일: test.c 프로젝트: 8l/mxp
/**
 * Randomized stress test for VBX_T(vbw_vec_copy) on overlapping regions.
 *
 * Claims all free scratchpad as one buffer, then for each of
 * TEST_DEEP_SP_NUM_TESTS iterations re-initializes the buffer, picks a random
 * destination, a source within a small window around it, and a random length,
 * performs the copy and checks head / copied / tail regions with verify_copy().
 *
 * @return total number of errors reported by verify_copy() over all
 *         iterations; 1 if no scratchpad buffer could be obtained.
 */
int deep_vector_copy_test()
{
	int retval;
	int num_test;
	int total_errors = 0;
	const int NUM_TESTS = TEST_DEEP_SP_NUM_TESTS;
	const int NB = vbx_sp_getfree();

	int NT = NB / sizeof(vbx_sp_t);

	vbx_sp_push();
	vbx_sp_t *v = vbx_sp_malloc( NB );

	// Nothing to test without a buffer; NT==0 would also make the
	// 'rand() % NT' below a division by zero.
	if( v == NULL || NT <= 0 ) {
		printf("deep_vector_copy_test: no scratchpad space available\n");
		vbx_sp_pop();
		return 1;
	}

	srand( 0x1a84c92a );

	for( num_test=0; num_test < NUM_TESTS ; num_test++ ) {

		// initialize entire available scratchpad
		vbx_set_vl( NT );
		vbx( SE(T), VAND, v, MSK, 0 );

		// choose random src/dest/length:
		// -- randomly pick the dest
		// -- set a window size of 2*K around the dest
		// -- randomly pick the src within the window
		// -- randomly pick the length, subject to end-of-scratchpad
		// -- this 'window' rule increases probability of overlaps
		// -- rough distribution: 30% short (pipeline) overlaps, 20% long overlaps, 50% no overlap

		int K, N1, N2, NN;
		N1 = rand() % NT;
		K  = 1 + rand() % ((N1 > 0)? min(min(N1, NT-N1), 1024): min(NT, 1024));
		N2 = N1 - K + rand() % (2*K);
		// When N1 == 0 the window extends below the buffer; clamp so the
		// source never points before the start of the allocation.
		if( N2 < 0 ) {
			N2 = 0;
		}
		NN = rand() % (NT - max(N1,N2));
		vbx_sp_t *dst = v + N1;
		vbx_sp_t *src = v + N2;

		// N2 is the source index and N1 the destination index
		printf("test:%d src:0x%08x dst:0x%08x len:%08d", num_test, N2, N1, NN );

		// do the copy
		retval = VBX_T(vbw_vec_copy)( dst, src, NN );
		vbx_sync();
		printf(" retval:0x%04x\n",retval);

		// ensure the copy was done properly
		int errors = verify_copy((vbx_mm_t *)v,     0,    N1,       0, "head")
		           + verify_copy((vbx_mm_t *)v,    N1, NN+N1, (N2-N1), "copy")
		           + verify_copy((vbx_mm_t *)v, NN+N1,    NT,       0, "tail");
		total_errors += errors;
		if( errors ) {
			// keep going so every failing case is reported
			//break;
		}
	}

	vbx_sp_pop();
	return total_errors;
}
예제 #4
0
파일: vbx_mtx_fdct.c 프로젝트: 8l/mxp
/**
 * Builds the forward-DCT coefficient tables and allocates the working state.
 *
 * Fills the c2 (double) and cs (fixed-point, scaled by SHIFT_DOUBLE) tables
 * with the DCT-II basis
 *     c[i][j] = s(i) * cos( (PI/8) * i * (j + 0.5) ),
 *     s(0) = sqrt(1/8), s(i>0) = 1/2,
 * then allocates a vbx_mtx_fdct_t plus its scratchpad vectors, DMAs the
 * caller's coefficients into the scratchpad and starts loading the first
 * tile of the image.
 *
 * @param coeff_v host coefficients copied into v->vcoeff
 * @param image   host image from which the first big tile is fetched
 * @return the newly allocated state; exits via VBX_EXIT(-1) on allocation
 *         failure.
 *
 * The scratchpad frame pushed here is not popped in this function because
 * the returned state keeps pointing into it (presumably a matching teardown
 * routine pops it -- not visible here).
 */
vbx_mtx_fdct_t *
vbx_mtx_fdct_init( dt *coeff_v, dt *image )
{
	const int BIG_TILE_SIZE = NUM_TILE_X * NUM_TILE_Y * DCT_SIZE;
	const int num_bytes = BIG_TILE_SIZE * sizeof(dt);
	const int co_bytes = NUM_TILE_X* DCT_SIZE *sizeof(dt);

	//compute coeffs matrix in double and truncated to dt
	int i, j;
	double s;
	for (i = 0; i < BLOCK_SIZE; i++) {
		s = (i == 0) ? sqrt(0.125) : 0.5;
		for (j = 0; j < BLOCK_SIZE; j++) {
			// The half-sample offset belongs to j, not to the whole angle
			c2[i][j] = s * cos((double) ((PI / 8.0) * i * (j + 0.5)));
			cs[i][j] = (dt) (c2[i][j] * SHIFT_DOUBLE + 0.499999);
		}
	}

	vbx_sp_push();

	// sizeof *v keeps the allocation tied to the pointer's actual type
	vbx_mtx_fdct_t *v = vbx_shared_malloc( sizeof *v );
	if( !v ) {
		VBX_PRINTF( "ERROR: out of memory.\n" );
		VBX_EXIT(-1);
	}

	// Start with buffer 0; db is used as the image-buffer index below and
	// was previously read before ever being written.
	v->db = 0;

	v->vcoeff    = (vbx_half_t *)vbx_sp_malloc( co_bytes );
	v->vprods    = (vbx_half_t *)vbx_sp_malloc( num_bytes );
#if USE_ACCUM_FLAGS
	v->vaccum    = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vflags    = (vbx_half_t *)vbx_sp_malloc( num_bytes );
#endif

	// interleave ordering to ensure no false hazards
	v->vblock[2] = (vbx_half_t *)vbx_sp_malloc( num_bytes );

	v->vimage[0] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vblock[0] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vimage[1] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	v->vblock[1] = (vbx_half_t *)vbx_sp_malloc( num_bytes );
	// Only the final allocation is tested; NOTE(review): this relies on
	// earlier requests succeeding whenever the last one does -- confirm.
	if( !v->vblock[1] ) {
		VBX_PRINTF( "ERROR: out of memory.\n" );
		VBX_EXIT(-1);
	}
	vbx_dma_to_vector( v->vcoeff, coeff_v, co_bytes );

	int row;
	for( row=0; row < BLOCK_SIZE; row++ ) {
		getBigTileImageY(v->vimage[v->db],image,row);
	}
#if USE_ACCUM_FLAGS 
	// create a flag vector first element 0, next 'BLOCK_SIZE-1' element non-zero, etc
	vbx_set_vl( NUM_TILE_X * BLOCK_SIZE * NUM_TILE_Y * BLOCK_SIZE - (BLOCK_SIZE-1) );
	vbx( SEH, VAND,   v->vflags,       BLOCK_SIZE-1,      0 );
#endif

	return v;
}
예제 #5
0
/**
 * 3x3 Sobel edge detection on a 32-bit aRGB image, tiled to fit the scratchpad.
 *
 * Estimates how many pixels of a row fit in the free scratchpad (budgeting
 * eight word-sized vectors per pixel). If the whole row fits, the image is
 * processed in one call to vbw_sobel_argb32_3x3_partial(); otherwise it is
 * processed in vertical strips that overlap by two columns. Finally the
 * first and last column of every output row are overwritten with zero.
 *
 * @param output       32-bit aRGB edge-intensity output
 * @param input        32-bit aRGB input
 * @param image_width  image width in pixels
 * @param image_height image height in pixels
 * @param image_pitch  distance in pixels between the start of subsequent rows
 * @param renorm       right shift applied to the final intensity
 * @return the code returned by the last strip on success, or the first
 *         negative code returned by vbw_sobel_argb32_3x3_partial().
 */
int vbw_sobel_argb32_3x3(unsigned *output, unsigned *input, const short image_width, const short image_height, const short image_pitch, const short renorm)
{
	size_t free_sp=vbx_sp_getfree();
	size_t vectors_needed=8;
	size_t partial_width=free_sp/(vectors_needed*sizeof(vbx_uword_t));
	int rc;

	if(partial_width>image_width || partial_width<3){
		// Either the whole row fits, or a strip would be too narrow to ever
		// advance (step = width-2 <= 0). In the latter case a full-width
		// attempt either still fits or reports the allocation failure,
		// instead of looping forever.
		rc=vbw_sobel_argb32_3x3_partial(output, input, image_width, image_height, image_pitch,renorm);
		if(rc<0){
			return rc;
		}
	}else{
		//can't do entire row at a time, so do partial_width at a time
		// Each strip only produces its interior columns, so consecutive
		// strips overlap by two pixels.
		size_t partial_step=partial_width-2;
		int i;
		for(i=0;;i+=partial_step){
			//account for last tile being smaller
			if(i+partial_width > image_width){
				partial_width=image_width-i;
			}

			rc=vbw_sobel_argb32_3x3_partial(output+i, input+i, partial_width, image_height, image_pitch,renorm);
			if(rc<0){
				return rc;
			}

			if(i+partial_width == image_width){
				//that was the last tile, so break,
				//I don't believe that this can be in the for statement
				break;
			}
		}
	}

	// Zero the left-most and right-most output columns: one zero word in the
	// scratchpad is written to every row via a 2D DMA with no scratchpad
	// increment.
	vbx_sp_push();
	// NOTE(review): allocation result is not checked; a single word is
	// requested right after the strips released their much larger buffers.
	vbx_word_t* side=vbx_sp_malloc(sizeof(vbx_word_t));
	vbx_set_vl(1);
	vbx(SVW,VMOV,side,0,0);
	vbx_dma_to_host_2D(output,/*host_ptr*/
	                   side,/*sp_ptr*/
	                   sizeof(vbx_word_t),/*row len*/
	                   image_height,/*num rows*/
	                   image_pitch*sizeof(vbx_word_t),/*host_incr*/
	                   0);/*sp incr*/
	vbx_dma_to_host_2D(output+image_width-1,/*host_ptr*/
	                   side,/*sp_ptr*/
	                   sizeof(vbx_word_t),/*row len*/
	                   image_height,/*num rows*/
	                   image_pitch*sizeof(vbx_word_t),/*host_incr*/
	                   0);/*sp incr*/
	// Wait for the transfers before releasing the buffer they read from
	vbx_sync();
	vbx_sp_pop();

	return rc;
}
예제 #6
0
파일: vbw_fix16.c 프로젝트: gplhegde/mxp
/*
 * Element-wise fixed-point square root of a scratchpad vector.
 *
 * Vectorized form of the scalar bit-by-bit square root quoted in the block
 * comments below (it looks like libfixmath's fix16_sqrt -- confirm). Every
 * data-dependent branch of the scalar code is replaced by computing both
 * outcomes into temporaries and selecting with conditional moves, and every
 * data-dependent loop by a fixed worst-case iteration count.
 *
 * v_out  : receives the result; for negative inputs the result is negated
 * v_x    : input vector
 * length : element count, used only to size and partition the temporaries
 *
 * NOTE(review): no vbx_set_vl() call appears here, so the vector length is
 * presumably expected to be set by the caller -- confirm.
 * NOTE(review): the vbx_sp_malloc() result is used without a NULL check.
 */
void vbw_fix16_sqrt( vbx_word_t* v_out, vbx_word_t* v_x, int length)
{
  vbx_sp_push();
  //vbx_word_t* v_tmp = (vbx_word_t *)vbx_sp_malloc(sizeof(vbx_word_t)*length*11);
  // One allocation carved into ten length-sized temporaries
  vbx_word_t* v_tmp = (vbx_word_t *)vbx_sp_malloc(sizeof(vbx_word_t)*length*10);
  vbx_word_t* v_result   = v_tmp + 0*length;
  vbx_uword_t* v_bit      = (vbx_uword_t*)v_tmp + 1*length;
  vbx_word_t* v_num      = v_tmp + 2*length;
  vbx_uword_t* v_else_num = (vbx_uword_t*)v_tmp + 3*length;

  vbx_uword_t* v_t_bit    = (vbx_uword_t*)v_tmp + 4*length;
  vbx_uword_t* v_t_num    = (vbx_uword_t*)v_tmp + 5*length;

  vbx_uword_t* v_t_add    = (vbx_uword_t*)v_tmp + 6*length;
  vbx_word_t* v_t_sub    = v_tmp + 7*length;
  vbx_uword_t* v_t_result = (vbx_uword_t*)v_tmp + 8*length;
  vbx_uword_t* v_if_num   = (vbx_uword_t*)v_tmp + 9*length;
  //vbx_word_t* v_neg   = v_tmp + 10*length;

  // The result is accumulated directly in the caller's output vector, so
  // slot 0 of v_tmp assigned above ends up unused.
  v_result = v_out;

	//uint8_t  neg = (inValue < 0);
  //vbx(SVW, VMOV, v_neg, 0, 0 );
  //vbx(SVW, VCMV_LTZ, v_neg, 1, v_x);

	//uint32_t num = (neg ? -inValue : inValue);
  vbx(SVW, VABSDIFF, v_num, 0, v_x);
	//uint32_t result = 0;
  vbx(SVW, VMOV, v_result, 0, 0 );
	//uint32_t bit;
  vbx(SVWU, VMOV, v_bit, (1<<30), 0 );

  //*
	// Many numbers will be less than 15, so
	// this gives a good balance between time spent
	// in if vs. time spent in the while loop
	// when searching for the starting value.
  /*
	if (num & 0xFFF00000)
		bit = (uint32_t)1 << 30;
	else
		bit = (uint32_t)1 << 18;
        */


//	while (bit > num) bit >>= 2;
  // Fixed-count replacement for the loop above: each pass computes bit>>2
  // and keeps it only in elements where (bit - num) > 0.

  int i, max_iter;
  max_iter = 16; //1<<30 and >>2 every iter, so max iter = 30/2 + 1
  for(i=0; i<max_iter; i++){
    vbx(VVW, VSUB, v_t_sub, (vbx_word_t*)v_bit, v_num);
    vbx(SVWU, VSHR, v_t_bit, 2, v_bit);
    vbx(VVW, VCMV_GTZ, (vbx_word_t*)v_bit, (vbx_word_t*)v_t_bit, v_t_sub);
  }

	// The main part is executed twice, in order to avoid
	// using 64 bit values in computations.
  /*
		while (bit)
		{
			if (num >= result + bit)
			{
				num -= result + bit;
				result = (result >> 1) + bit;
			}
			else
			{
				result = (result >> 1);
			}
			bit >>= 2;
		}
  */
  // First pass of the main loop, run for a fixed 16 iterations. The three
  // trailing conditional moves commit the new num/result/bit only where bit
  // is still > 0, which stands in for the scalar 'while (bit)' exit.
  max_iter = 16;
  for(i=0; i<max_iter; i++){

      //v_result + bit
    vbx(VVW, VADD, (vbx_word_t*)v_t_add, (vbx_word_t*)v_bit, v_result);
      //v_num - (v_result + bit)
    vbx(VVW, VSUB, v_t_sub, v_num, (vbx_word_t*)v_t_add);

    //if (v_num - (v_result + bit) >= 0) v_num = v_num - (v_result + bit)
    vbx(VVW, VCMV_GEZ, (vbx_word_t*)v_t_num, v_t_sub, v_t_sub);
    //else v_num stays
    vbx(VVW, VCMV_LTZ, (vbx_word_t*)v_t_num, v_num, v_t_sub);

    vbx(SVW, VSHR, (vbx_word_t*)v_t_result, 1, v_result);
    vbx(VVW, VADD, (vbx_word_t*)v_t_add, (vbx_word_t*)v_bit, (vbx_word_t*)v_t_result);
    //if (v_num - (v_result + bit) >= 0) v_result = v_result >> 1 + bit
    //else  v_result >> 1
    vbx(VVW, VCMV_GEZ, (vbx_word_t*)v_t_result, (vbx_word_t*)v_t_add, v_t_sub);

    vbx(SVW, VSHR, (vbx_word_t*)v_t_bit, 2, (vbx_word_t*)v_bit);

    vbx(VVW, VCMV_GTZ, v_num, (vbx_word_t*)v_t_num, (vbx_word_t*)v_bit);
    vbx(VVW, VCMV_GTZ, v_result, (vbx_word_t*)v_t_result, (vbx_word_t*)v_bit);
    vbx(VVW, VCMV_GTZ, (vbx_word_t*)v_bit, (vbx_word_t*)v_t_bit, (vbx_word_t*)v_bit);
  }

  //vbx(SVW, VSHL, v_result, 8, v_result);

//#if 0
 /*
  if (num > 65535)
  {
    // The remainder 'num' is too large to be shifted left
    // by 16, so we have to add 1 to result manually and
    // adjust 'num' accordingly.
    // num = a - (result + 0.5)^2
    //	 = num + result^2 - (result + 0.5)^2
    //	 = num - result - 0.5
    num -= result;
    num = (num << 16) - 0x8000;
    result = (result << 16) + 0x8000;
  }
  else
  {
    num <<= 16;
    result <<= 16;
  }

  bit = 1 << 14;
  */
  // Both branches of the scalar if/else above are computed into temporaries
  // (v_if_num / v_t_add for the 'if', v_else_num / v_t_result for the
  // 'else') and then selected on the sign of (65535 - num) held in v_t_sub.
  vbx(SVW, VSUB, v_t_sub, 65535, v_num);
  vbx(VVWU, VSUB, v_if_num, (vbx_uword_t*)v_num, (vbx_uword_t*)v_result);
  vbx(SVWU, VSHL, v_if_num, 16, v_if_num);
  vbx(SVWU, VADD, v_if_num, (-1*(0x8000)), v_if_num);

  vbx(SVWU, VSHL, v_t_result, 16, (vbx_uword_t*)v_result);
  vbx(SVWU, VADD, v_t_add, (0x8000), v_t_result);
  vbx(SVWU, VSHL, v_else_num, 16, (vbx_uword_t*)v_num);

  vbx(VVWU, VCMV_LTZ, (vbx_uword_t*)v_num, v_if_num, (vbx_uword_t*)v_t_sub);
  vbx(VVWU, VCMV_GEZ, (vbx_uword_t*)v_num, v_else_num, (vbx_uword_t*)v_t_sub);
  vbx(VVWU, VCMV_LTZ, (vbx_uword_t*)v_result, v_t_add, (vbx_uword_t*)v_t_sub);
  vbx(VVWU, VCMV_GEZ, (vbx_uword_t*)v_result, v_t_result, (vbx_uword_t*)v_t_sub);

  vbx(SVWU, VMOV, v_bit, (1<<14), 0);

  // Second pass of the main loop, same structure as the first but starting
  // from bit = 1<<14 and committing updates where bit is non-zero.
  max_iter = 8; //1<<14 and >>2 every iter, so 14/2 + 1
  for(i=0; i<max_iter; i++){

    vbx(VVWU, VADD, v_t_add, v_bit, (vbx_uword_t*)v_result);
    vbx(VVWU, VSUB, (vbx_uword_t*)v_t_sub, (vbx_uword_t*)v_num, v_t_add);

    vbx(VVW, VCMV_GEZ, (vbx_word_t*)v_t_num, v_t_sub, v_t_sub);
    vbx(VVW, VCMV_LTZ, (vbx_word_t*)v_t_num, v_num, v_t_sub);

    vbx(SVWU, VSHR, v_t_result, 1, (vbx_uword_t*)v_result);
    vbx(VVWU, VADD, v_t_add, v_bit, v_t_result);
    vbx(VVW, VCMV_GEZ, (vbx_word_t*)v_t_result, (vbx_word_t*)v_t_add, v_t_sub);

    vbx(SVWU, VSHR, v_t_bit, 2, v_bit);

    vbx(VVWU, VCMV_NZ, (vbx_uword_t*)v_num, v_t_num, v_bit);
    vbx(VVWU, VCMV_NZ, (vbx_uword_t*)v_result, v_t_result, v_bit);
    vbx(VVWU, VCMV_NZ, v_bit, v_t_bit, v_bit);
  }

#ifndef FIXMATH_NO_ROUNDING
  /*
	// Finally, if next bit would have been 1, round the result upwards.
	if (num > result)
	{
		result++;
	}
  */
  vbx(VVW, VSUB, v_t_sub, v_num, v_result);
  vbx(SVW, VADD, (vbx_word_t*)v_t_result, 1, v_result);
  vbx(VVW, VCMV_GTZ, v_result, (vbx_word_t*)v_t_result, v_t_sub);
#endif

  /*
	return (neg ? -result : result);
  */
  // Negate the result wherever the original input was negative
  vbx(SVW, VSUB, (vbx_word_t*)v_t_result, 0, v_result);
  vbx(VVW, VCMV_LTZ, v_result, (vbx_word_t*)v_t_result, v_x);

  vbx_sp_pop();
}
예제 #7
0
파일: vbw_vec_rev.cpp 프로젝트: 8l/mxp
/**
 * Reverses a vector of N elements held in host memory.
 *
 * Strategy, in order:
 *  - N < 4: plain element loop on the given pointers.
 *  - N at or below MM_CACHED_SCALAR_THRESHOLD, or fewer than five scratchpad
 *    rows free: element loop through cached aliases of src/dst, followed by
 *    a data-cache flush of the destination.
 *  - otherwise: the input is cut into equal tiles sized from the free
 *    scratchpad. The partial tile at the END of src is reversed first (it
 *    lands at the START of dst), then the full tiles are streamed through
 *    the scratchpad from the last one to the first.
 *
 * @param dst host destination, N elements
 * @param src host source, N elements
 * @param N   element count
 * @return VBW_SUCCESS (an unexpected allocation failure exits via VBX_EXIT)
 */
int vbw_vec_reverse_ext( vbx_mm_t *dst, vbx_mm_t *src, const unsigned int N )
{

	typedef vbx_mm_t vbx_sp_t;

	// Element-size dependent switches for the word-based reversal helpers
	const int use_rot16 = sizeof(vbx_sp_t) <= sizeof(vbx_half_t);
	const int use_rot8  = sizeof(vbx_sp_t) == sizeof(vbx_byte_t);
	// log2(elements per word): 0 for words, 1 for halfwords, 2 for bytes
	const int elem_shift = ( sizeof(vbx_sp_t) == sizeof(vbx_word_t) ) ? 0
	                     : ( sizeof(vbx_sp_t) == sizeof(vbx_half_t) ) ? 1
	                     : 2;

	// Trivially short input
	if( N < 4 ) {
		for( unsigned int k = 0; k < N; k++ ) {
			dst[N-k-1] = src[k];
		}
		return VBW_SUCCESS;
	}

	vbx_mxp_t *mxp               = VBX_GET_THIS_MXP();
	unsigned int sp_row_bytes    = mxp->scratchpad_alignment_bytes;
	unsigned int sp_free_bytes   = vbx_sp_getfree();

	// Short input, or too little scratchpad: reverse with the scalar core
	// through cached aliases
	if( N <= MM_CACHED_SCALAR_THRESHOLD || sp_free_bytes < sp_row_bytes*5 ){
		vbx_mm_t *cached_src = (vbx_mm_t*)vbx_remap_cached(src,N*sizeof(vbx_mm_t));
		vbx_mm_t *cached_dst = (vbx_mm_t*)vbx_remap_cached(dst,N*sizeof(vbx_mm_t));
		for( unsigned int k = 0; k < N; k++ ) {
			cached_dst[N-k-1] = cached_src[k];
		}
		vbx_dcache_flush(cached_dst,N*sizeof(vbx_mm_t));
		return VBW_SUCCESS;
	}

	// Two tile buffers plus one row for the mask must fit in the scratchpad
	unsigned int lanes      = mxp->vector_lanes;
	unsigned int tile_bytes = VBX_PAD_DN(((sp_free_bytes-sp_row_bytes)/2),sp_row_bytes);
	unsigned int tile_words = tile_bytes/4;
	unsigned int tile_elems = tile_words << elem_shift;

	unsigned int tiles_left = N / tile_elems;
	unsigned int tile_rows  = tile_bytes / sp_row_bytes;

	unsigned int leftover_elems = N - tiles_left * tile_elems;

	unsigned int vl1_limit_words;
	if( lanes >= 32 ) {
		vl1_limit_words = VL1_THRESHOLD_V32_UP;
	} else if( lanes == 16 ) {
		vl1_limit_words = VL1_THRESHOLD_V16;
	} else if( lanes == 8 ) {
		vl1_limit_words = VL1_THRESHOLD_V8;
	} else {
		vl1_limit_words = UINT_MAX;
	}

	// Partial tile: the tail of src becomes the head of dst
	if( leftover_elems ){
		vbx_sp_push();
		vbx_sp_t *v_tail_in  = (vbx_sp_t *)vbx_sp_malloc(leftover_elems*sizeof(vbx_sp_t));
		vbx_sp_t *v_tail_out = (vbx_sp_t *)vbx_sp_malloc(leftover_elems*sizeof(vbx_sp_t));

#if !VBX_SKIP_ALL_CHECKS
		if( !v_tail_in || !v_tail_out ) {
			VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
			VBX_EXIT(-1);
		}
#endif

		vbx_dma_to_vector(v_tail_in, src+N-leftover_elems, leftover_elems*sizeof(vbx_mm_t));
		vbw_vec_reverse(v_tail_out, v_tail_in, leftover_elems);
		vbx_dma_to_host(dst, v_tail_out, leftover_elems*sizeof(vbx_sp_t));
		dst += leftover_elems;
		vbx_sp_pop();
	}

	if( tiles_left == 0 ) {
		return VBW_SUCCESS;
	}

	vbx_sp_push();
	vbx_word_t *v_mask = (vbx_word_t *)vbx_sp_malloc(sp_row_bytes);
	vbx_word_t *v_load = (vbx_word_t *)vbx_sp_malloc(tile_bytes);
	vbx_word_t *v_work = (vbx_word_t *)vbx_sp_malloc(tile_bytes);

#if !VBX_SKIP_ALL_CHECKS
	if( !v_load || !v_work || !v_mask ) {
		VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
		VBX_EXIT(-1);
	}
#endif

	// Walk the full tiles from the last one down to the first
	src += (tiles_left - 1) * tile_elems;

	if( tile_words <= vl1_limit_words ) {
		// Tile small enough for the direct word-reversal helpers
		for( ; tiles_left; tiles_left-- ) {
			vbx_dma_to_vector( v_load, src, tile_bytes );
			if( use_rot16 ){
				vec_rev_rot16_w(v_work, v_load, tile_words);
			}else{
				vec_rev_w(v_work, v_load, tile_words);
			}
			if( use_rot8 ){
				vec_rot8_h( v_work, v_work, tile_words*2 );
			}
			vbx_dma_to_host( dst, v_work, tile_bytes );
			dst += tile_elems;
			src -= tile_elems;
		}
	} else {
		// Larger tiles use the merge-based reversal, which reports which
		// buffer ended up holding the result
		for( ; tiles_left; tiles_left-- ) {
			vbx_dma_to_vector( v_load, src, tile_bytes );
			vbx_word_t *v_done = vec_rev_merge_w( v_work, v_load, tile_words, v_load, v_mask, sp_row_bytes,
			                                      tile_rows, use_rot16 );
			if( use_rot8 ){
				vec_rot8_h( v_done, v_done, tile_words*2 );
			}
			vbx_dma_to_host( dst, v_done, tile_bytes );
			dst += tile_elems;
			src -= tile_elems;
		}
	}

	vbx_sp_pop();
	return VBW_SUCCESS;
}
예제 #8
0
파일: vbw_vec_rev.cpp 프로젝트: 8l/mxp
/**
 * Reverses N elements of a scratchpad vector.
 *
 * Picks one of several strategies depending on N, the number of vector
 * lanes, and how much scratchpad is free:
 *  - very short vectors: a single 2D VMOV that walks the destination
 *    backwards one element at a time;
 *  - vectors up to a lane-count dependent word threshold: the direct
 *    word-reversal helpers (vec_rev_w / vec_rev_rot16_w);
 *  - longer vectors: merge-based reversal (vec_rev_merge_w), first in-place
 *    on chunks while there is not enough free scratchpad for a full-size
 *    temporary, then once with an allocated temporary for what remains.
 *
 * @param v_dst destination vector (also used as scratch in the in-place phase)
 * @param v_src source vector
 * @param N     number of elements
 * @return VBW_SUCCESS; unexpected internal failures exit via VBX_EXIT(-1)
 *         unless VBX_SKIP_ALL_CHECKS is set.
 */
int vbw_vec_reverse( vbx_sp_t *v_dst, vbx_sp_t *v_src, const unsigned int N )
{
	// Element-size dependent switches: halfword-or-smaller elements need the
	// 16-bit rotate variant, byte elements additionally need an 8-bit rotate.
	const int VBW_ROT16= sizeof(vbx_sp_t) <=sizeof(vbx_half_t);
	const int VBW_ROT8= sizeof(vbx_sp_t)== sizeof(vbx_byte_t);
	// log2(elements per word): 0 for words, 1 for halfwords, 2 for bytes
	const int VBW_RSHIFT_T_TO_W= (sizeof(vbx_sp_t)==sizeof(vbx_word_t)? 0:
	                              sizeof(vbx_sp_t)==sizeof(vbx_half_t)? 1:/*byte_sized*/2);
	const int VBW_LSHIFT_W_TO_T= VBW_RSHIFT_T_TO_W;

	vbx_mxp_t *this_mxp            = VBX_GET_THIS_MXP();
	const unsigned int NUM_LANES   = this_mxp->vector_lanes;

	//printf("\n%d\n",VBX_SKIP_ALL_CHECKS);

	// Can the whole vector fit in the scratchpad width?
	if( N < (NUM_LANES << VBW_LSHIFT_W_TO_T) ){
		// One element per 2D row: destination steps backwards from the last
		// slot while the source steps forwards.
		vbx_set_vl( 1 );
		vbx_set_2D( N, (int)-sizeof(vbx_sp_t), (int)sizeof(vbx_sp_t), 0 );
		vbxx_2D(VMOV, v_dst+N-1, v_src);
		return VBW_SUCCESS;
	}

	unsigned int threshold_w = (NUM_LANES >= 32 ? VL1_THRESHOLD_V32_UP :
	                            NUM_LANES == 16 ? VL1_THRESHOLD_V16    :
	                            NUM_LANES == 8  ? VL1_THRESHOLD_V8     : UINT_MAX);

	unsigned int N_w          = N >> VBW_RSHIFT_T_TO_W;                  // Equivalent number of words in the vector

	if( N_w && N_w <= threshold_w ) {
		if( VBW_ROT16){
			// remainder of elements that can't add to a whole word
			unsigned int stub_t = N - (N_w << VBW_LSHIFT_W_TO_T);
			if( stub_t ) {
				// The trailing stub of the source becomes the head of the
				// destination, copied element by element.
				vbx_set_vl( 1 );
				vbx_set_2D( stub_t, (int)-sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
				vbxx_2D(VMOV, v_dst+stub_t-1, v_src+N-stub_t);
				v_dst += stub_t;
			}
			vec_rev_rot16_w(v_dst, v_src, N_w);
		}else{
			vec_rev_w(v_dst, v_src, N_w);
		}

		if( VBW_ROT8){
			vec_rot8_h(v_dst, v_dst, N_w*2);
		}
		return VBW_SUCCESS;
	}


	const unsigned int SP_WIDTH_B       = this_mxp->scratchpad_alignment_bytes;
	const unsigned int FREE_BYTES       = vbx_sp_getfree();
	// Selects which of the two scratch buffers is passed first to
	// vec_rev_merge_w so that its result lands in v_dst_w (verified by the
	// checks below).
	const unsigned int ODD_LOG_SEL      = NUM_LANES & 0x55555555 ? 1 : 0;

	vbx_word_t *v_mask, *v_result;
	vbx_word_t *v_scratch[2] = {0,0};

	// Split into whole scratchpad rows plus a tail of leftover elements
	unsigned int num_rows_w    = N_w / NUM_LANES;
	unsigned int working_set_w = num_rows_w * NUM_LANES;
	unsigned int tail_t        = N - (working_set_w << VBW_LSHIFT_W_TO_T);
	unsigned int remaining_w   = working_set_w;

	if( tail_t ) {
		// Tail of the source goes to the head of the destination
		vbx_set_vl( 1 );
		vbx_set_2D( tail_t, (int)-sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 );
		vbxx_2D(VMOV, v_dst+tail_t-1, v_src+N-tail_t);
		v_dst += tail_t;
	}

	vbx_word_t *v_src_w = (vbx_word_t *)v_src;
	vbx_word_t *v_dst_w = (vbx_word_t *)v_dst;

	if(!num_rows_w) {
		return VBW_SUCCESS;
	}

	remaining_w = working_set_w;
	// Not enough free scratchpad for a temporary of the remaining size plus
	// one mask row: reverse a chunk at a time, borrowing the not-yet-written
	// part of the destination as scratch and mask storage.
	while( remaining_w*sizeof(vbx_word_t) + SP_WIDTH_B > FREE_BYTES ) {
		if( remaining_w <= threshold_w*2 ) {
			// Small enough to finish with the direct helpers
			if( VBW_ROT16){
				vec_rev_rot16_w(v_dst_w, v_src_w, remaining_w);
			}else{
				vec_rev_w(v_dst_w, v_src_w, remaining_w);
			}

			if( VBW_ROT8){
				vec_rot8_h(v_dst_w, v_dst_w, remaining_w*2);
			}
			return VBW_SUCCESS;
		}

		// Chunk is just under half of what remains, so two chunk-sized
		// buffers and the mask all fit inside the destination.
		working_set_w = VBX_PAD_DN( (remaining_w - NUM_LANES)/2, NUM_LANES );
		v_mask = v_dst_w + (working_set_w*2);
		remaining_w -= working_set_w;

		v_scratch[0] = v_dst_w;
		v_scratch[1] = v_dst_w + working_set_w;
		num_rows_w = working_set_w / NUM_LANES;
		// Source chunk is taken from the end of the unprocessed source
		v_result = vec_rev_merge_w( v_scratch[ODD_LOG_SEL], v_src_w + remaining_w, working_set_w,
		                            v_scratch[!ODD_LOG_SEL], v_mask, SP_WIDTH_B, num_rows_w, VBW_ROT16 );
#if !VBX_SKIP_ALL_CHECKS
		if( v_result != v_dst_w ) {
			VBX_PRINTF("Unexpected behavior: merge reverse returned the wrong vector. Parameter order was chosen based on NUM_LANES.");
			VBX_EXIT(-1);
		}
#endif

		if( VBW_ROT8){
			vec_rot8_h(v_result, v_result, working_set_w*2);
		}
		v_dst_w += working_set_w;
	}


	// Enough scratchpad is free now: allocate a real temporary and mask for
	// the remaining words and finish in one merge.
	vbx_sp_push();

	v_scratch[0] = v_dst_w;
	v_scratch[1] = (vbx_word_t*)vbx_sp_malloc( remaining_w * sizeof(vbx_word_t) );
#if !VBX_SKIP_ALL_CHECKS
	if( !v_scratch[1] ) {
		VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
		VBX_EXIT(-1);
	}
#endif

	v_mask = (vbx_word_t*)vbx_sp_malloc( SP_WIDTH_B );
#if !VBX_SKIP_ALL_CHECKS
	if( !v_mask ) {
		VBX_PRINTF("vbx_sp_malloc failed when it was predetermined to have enough space.");
		VBX_EXIT(-1);
	}
#endif

	num_rows_w = remaining_w / NUM_LANES;
	v_result = vec_rev_merge_w( v_scratch[ODD_LOG_SEL], v_src_w, remaining_w, v_scratch[!ODD_LOG_SEL],
	                            v_mask, SP_WIDTH_B, num_rows_w, VBW_ROT16 );
#if !VBX_SKIP_ALL_CHECKS
	if( v_result != v_dst_w ) {
		VBX_PRINTF("Unexpected behavior: merge reverse returned the wrong vector. Parameter order was chosen based on NUM_LANES.");
		VBX_EXIT(-1);
	}
#endif

	if( VBW_ROT8){
		vec_rot8_h(v_result, v_result, remaining_w*2);
	}
	vbx_sp_pop();
	return VBW_SUCCESS;
}
예제 #9
0
/** Luma Edge Detection
 *
 * @brief 3x3 Sobel edge detection with 32-bit aRGB image
 *
 * Processes a region image_width pixels wide and image_height rows tall,
 * one row at a time, entirely within the scratchpad. The top and bottom
 * output rows are written as zero; for every other row only columns
 * 1 .. image_width-2 are written.
 *
 * @param[out] output      32-bit aRGB edge-intensity output
 * @param[in] input        32-bit aRGB input
 * @param[in] image_width  Image width in pixels
 * @param[in] image_height Image height in pixels
 * @param[in] image_pitch  Distance in pixels between the start of subsequent rows. usually equal to image_width
 * @param[in] renorm       Number of bits to shift the final intensity by to the right
 * @returns Negative on error condition. See vbw_exit_codes.h
 */
int vbw_sobel_argb32_3x3_partial(unsigned *output, unsigned *input, const short image_width, const short image_height, const short image_pitch, const short renorm)
{

	int y;

	vbx_uword_t *v_row_in;
	vbx_uhalf_t *v_luma_top, *v_luma_mid, *v_luma_bot;
	vbx_uword_t *v_row_out;

	vbx_uhalf_t *v_sobel_row_top, *v_sobel_row_mid, *v_sobel_row_bot;
	vbx_uhalf_t *v_gradient_x, *v_gradient_y;
	vbx_uhalf_t *v_tmp;

	void *tmp_ptr;

	vbx_sp_push();

	// Allocate space in scratchpad for vectors
	// The prefetcher walks the input one row (image_pitch words) at a time;
	// its end bound is the address just past the last row, i.e.
	// image_height rows of image_pitch pixels -- the same bound the 8-bit
	// luma variant uses. Using the width here was only right for square
	// regions.
	struct rotating_prefetcher_t v_row_db=rotating_prefetcher(1,image_width*sizeof(vbx_uword_t),
	                                                          input,input+image_height*image_pitch,
	                                                          image_pitch*sizeof(vbx_uword_t));

	v_luma_top      = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_luma_mid      = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_luma_bot      = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_sobel_row_top = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_sobel_row_mid = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_sobel_row_bot = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_row_out       = (vbx_uword_t*)vbx_sp_malloc(image_width*sizeof(vbx_uword_t));

	// Only the last (and largest) allocation is tested
	if(v_row_out==NULL){
		vbx_sp_pop();
		return VBW_ERROR_SP_ALLOC_FAILED;
	}

	// Re-use v_sobel_row_bot as v_tmp
	v_tmp = v_sobel_row_bot;

	// Transfer the first 3 input rows and interleave first 2 rgb2luma and first 2 sobel row calculations
	rp_fetch(&v_row_db);
	rp_fetch(&v_row_db);
	v_row_in=rp_get_buffer(&v_row_db,0);
	vbw_rgb2luma(v_luma_top, v_row_in, v_tmp, image_width);                                          // 1st luma row


	vbw_sobel_3x3_row(v_sobel_row_top, v_luma_top, image_width);                                     // 1st partial sobel row
	rp_fetch(&v_row_db);
	v_row_in=rp_get_buffer(&v_row_db,0);
	vbw_rgb2luma(v_luma_mid, v_row_in, v_tmp, image_width);                               // 2nd luma row
	vbw_sobel_3x3_row(v_sobel_row_mid, v_luma_mid, image_width);                                     // 2nd partial sobel row

	// Set top output row to 0
	vbx_set_vl(image_width);
	vbx(SVWU, VMOV, v_row_out, 0, 0);
	vbx_dma_to_host(output, v_row_out, image_width*sizeof(vbx_uword_t));

	// Calculate edges
	for (y = 0; y < image_height-(FILTER_HEIGHT-1); y++) {
		// Transfer the next input row while processing
		rp_fetch(&v_row_db);
		v_row_in=rp_get_buffer(&v_row_db,0);
		// Re-use v_sobel_row_bot as v_tmp
		v_tmp = v_sobel_row_bot;

		// Convert aRGB input to luma
		vbw_rgb2luma(v_luma_bot, v_row_in, v_tmp, image_width);
		// Done with v_row_in; re-use for v_gradient_x and v_gradient_y (be careful!)
		// A word row holds two halfword rows, so both gradients fit.
		v_gradient_x = (vbx_uhalf_t *)v_row_in;
		v_gradient_y = (vbx_uhalf_t *)v_row_in + image_width;

		// Calculate gradient_x
		// Apply [1 2 1]T matrix to all columns
		vbx_set_vl(image_width);
		vbx(SVHU, VSHL, v_gradient_x, 1,          v_luma_mid); // multiply by 2
		vbx(VVHU, VADD, v_tmp,        v_luma_top, v_luma_bot);
		vbx(VVHU, VADD, v_tmp,        v_tmp,      v_gradient_x);
		// For each column, calculate absolute difference with 2nd column to the right
		vbx_set_vl(image_width-2);
		vbx(VVH, VABSDIFF, (vbx_half_t*)v_gradient_x, (vbx_half_t*)v_tmp, (vbx_half_t*)v_tmp+2);

		// Calculate gradient_y
		// Apply [1 2 1] matrix to last row in window and calculate absolute difference with pre-computed first row
		vbw_sobel_3x3_row(v_sobel_row_bot, v_luma_bot, image_width);
		vbx(VVH, VABSDIFF, (vbx_half_t*)v_gradient_y, (vbx_half_t*)v_sobel_row_top, (vbx_half_t*)v_sobel_row_bot);

		// Re-use v_sobel_row_top as v_tmp
		v_tmp = v_sobel_row_top;

		// sum of absolute gradients
		vbx_set_vl(image_width-2);
		vbx(VVHU, VADD, v_tmp, v_gradient_x,  v_gradient_y);
		vbx(SVHU, VSHR, v_tmp, renorm, v_tmp);

		// Threshold: where (255 - value) is negative, replace the value with 255
		vbx(SVHU, VSUB,     v_gradient_y, 255, v_tmp);
		vbx(SVHU, VCMV_LTZ, v_tmp,        255, v_gradient_y);

		// Copy the result to the low byte of the output row
		// Trick to copy the low byte (b) to the middle two bytes as well
		// Note that first and last columns are 0
		vbx_set_vl(image_width-2);
		vbx(SVHWU, VMULLO, v_row_out+1, 0x00010101, v_tmp);

		// DMA the result to the output (minus the outside two pixels)
		vbx_dma_to_host(output+(y+1)*image_pitch+1, v_row_out+1, (image_width-2)*sizeof(vbx_uword_t));

		// Rotate luma buffers
		tmp_ptr      = (void *)v_luma_top;
		v_luma_top   = v_luma_mid;
		v_luma_mid   = v_luma_bot;
		v_luma_bot   = (vbx_uhalf_t *)tmp_ptr;

		// Rotate v_sobel_row buffers (for gradient_y)
		tmp_ptr         = (void *)v_sobel_row_top;
		v_sobel_row_top = v_sobel_row_mid;
		v_sobel_row_mid = v_sobel_row_bot;
		v_sobel_row_bot = (vbx_uhalf_t *)tmp_ptr;
	}

	// Set bottom row to 0
	vbx_set_vl(image_width);
	vbx(SVWU, VMOV, v_row_out, 0, 0);
	vbx_dma_to_host(output+(image_height-1)*image_pitch, v_row_out, image_width*sizeof(vbx_uword_t));

	vbx_sp_pop();

	return VBW_SUCCESS;
}
예제 #10
0
/** Luma Edge Detection.
 *
 * @brief 3x3 Sobel edge detection with 8-bit luma image
 *
 * Streams the image through the scratchpad one row at a time, keeping three
 * luma rows and three partial Sobel rows live. The first and last output
 * rows are written as zero, and so are the first and last pixel of every
 * other row.
 *
 * @param[out] output      32-bit aRGB edge-intensity output
 * @param[in] input        8-bit luma input
 * @param[in] image_width  Image width in pixels
 * @param[in] image_height Image height in pixels
 * @param[in] image_pitch  Distance in pixels between the start of subsequent rows. usually equal to image_width
 * @param[in] renorm       Number of bits to shift the final intensity by to the right
 * @returns Negative on error condition. See vbw_exit_codes.h
 */
int vbw_sobel_luma8_3x3(unsigned *output, unsigned char *input, const short image_width, const short image_height, const short image_pitch, const short renorm)
{
	vbx_ubyte_t *v_luma_top, *v_luma_mid, *v_luma_bot;
	vbx_uhalf_t *v_sobel_row_top, *v_sobel_row_mid, *v_sobel_row_bot;
	vbx_uhalf_t *v_gradient_x, *v_gradient_y;
	vbx_uhalf_t *v_tmp;
	vbx_uword_t *v_row_out;

	vbx_sp_push();

	// Scratchpad layout: three rotating luma rows (owned by the prefetcher),
	// three partial sobel rows, two gradient rows and one output row
	rotating_prefetcher_t v_luma=rotating_prefetcher(3,image_width*sizeof(vbx_ubyte_t),
	                                                 input,input+image_height*image_pitch,
	                                                 image_pitch*sizeof(vbx_ubyte_t));
	v_sobel_row_top = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_sobel_row_mid = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_sobel_row_bot = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_gradient_x    = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_gradient_y    = (vbx_uhalf_t *)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	v_row_out       = (vbx_uword_t *)vbx_sp_malloc(image_width*sizeof(vbx_uword_t));

	// Only the final allocation is tested
	if(v_row_out==NULL) {
		vbx_sp_pop();
		return VBW_ERROR_SP_ALLOC_FAILED;
	}

	// Prime the pipeline: start three row fetches and compute the partial
	// sobel rows for the first two of them in between
	rp_fetch(&v_luma);
	rp_fetch(&v_luma);
	v_luma_top=rp_get_buffer(&v_luma, 0);
	vbw_sobel_3x3_row(v_sobel_row_top, v_luma_top,image_width);
	rp_fetch(&v_luma);
	v_luma_mid=rp_get_buffer(&v_luma, 1);
	vbw_sobel_3x3_row(v_sobel_row_mid, v_luma_mid, image_width);

	// The top output row is all zero; this also leaves zeros in the first
	// and last slot of v_row_out for every later row
	vbx_set_vl(image_width);
	vbx(SVWU, VMOV, v_row_out, 0, 0);
	vbx_dma_to_host(output, v_row_out, image_width*sizeof(vbx_uword_t));

	// One iteration per interior output row
	const int interior_rows = image_height-(FILTER_HEIGHT-1);
	int y = 0;
	while (y < interior_rows) {
		// Kick off the next row transfer, then pick up the current window
		rp_fetch(&v_luma);
		v_luma_top=rp_get_buffer(&v_luma,0);
		v_luma_mid=rp_get_buffer(&v_luma,1);
		v_luma_bot=rp_get_buffer(&v_luma,2);

		// gradient_x, part 1: 2 * middle row
		vbx_set_vl(image_width);
		vbx(SVBHU, VSHL, v_gradient_x, 1, v_luma_mid);

		// gradient_y: |top partial row - bottom partial row|, where the
		// bottom partial row is computed fresh from the newest luma row
		vbw_sobel_3x3_row(v_sobel_row_bot, v_luma_bot, image_width);
		vbx(VVH, VABSDIFF, (vbx_half_t*)v_gradient_y, (vbx_half_t*)v_sobel_row_top, (vbx_half_t*)v_sobel_row_bot);

		// The top partial row is no longer needed; use it as scratch
		v_tmp = v_sobel_row_top;

		// gradient_x, part 2: add top and bottom rows to get the [1 2 1]T
		// column sums
		vbx_set_vl(image_width);
		vbx(VVBHU, VADD, v_tmp, v_luma_top, v_luma_bot);
		vbx(VVHU,  VADD, v_tmp, v_tmp,      v_gradient_x);
		// ... then take |column - column two to the right|
		vbx_set_vl(image_width-2);
		vbx(VVH, VABSDIFF, (vbx_half_t*)v_gradient_x, (vbx_half_t*)v_tmp, (vbx_half_t*)v_tmp+2);

		// Sum of absolute gradients, renormalized (vector length is still
		// image_width-2 from here on)
		vbx(VVHU, VADD, v_tmp, v_gradient_x,  v_gradient_y);
		vbx(SVHU, VSHR, v_tmp, renorm, v_tmp);

		// Saturate: where (255 - value) is negative, replace value by 255
		vbx(SVHU, VSUB,     v_gradient_y, 255, v_tmp);
		vbx(SVHU, VCMV_LTZ, v_tmp,        255, v_gradient_y);

		// Replicate the low byte into the three low bytes of each output
		// word; columns 0 and image_width-1 keep their zeros
		vbx(SVHWU, VMULLO, v_row_out+1, 0x00010101, v_tmp);

		// Send the finished row to the host
		vbx_dma_to_host(output+(y+1)*image_pitch, v_row_out, image_width*sizeof(vbx_uword_t));

		// Rotate the partial sobel rows: mid -> top, bot -> mid, and the old
		// top buffer is recycled as the next bot
		vbx_uhalf_t *v_recycled = v_sobel_row_top;
		v_sobel_row_top = v_sobel_row_mid;
		v_sobel_row_mid = v_sobel_row_bot;
		v_sobel_row_bot = v_recycled;

		y++;
	}

	// The bottom output row is all zero
	vbx_set_vl(image_width);
	vbx(SVWU, VMOV, v_row_out, 0, 0);
	vbx_dma_to_host(output+(image_height-1)*image_pitch, v_row_out, image_width*sizeof(vbx_uword_t));

	vbx_sync();
	vbx_sp_pop();

	return VBW_SUCCESS;
}
예제 #11
0
/**
 * Transpose an INROWS x INCOLS row-major matrix `in` into `out`, so that
 * out[j*INROWS + i] == in[i*INCOLS + j].
 *
 * `in` and `out` are dereferenced directly by the CPU on the scalar path and
 * are otherwise moved with vbx_dma_to_vector*() / vbx_dma_to_host*().
 *
 * One of four strategies is chosen:
 *   1. fewer than SCALAR_THRESHOLD elements: plain nested CPU loops;
 *   2. a single row or column: the transpose is a straight copy, bounced
 *      through the scratchpad (in one piece, or in chunks if it does not fit);
 *   3. the matrix fits in half of the free scratchpad: DMA in, call
 *      vbw_mtx_xp(), DMA out;
 *   4. otherwise: tile the matrix; each tile is transposed in the scratchpad
 *      either with the "merge transpose" (square 128x128 or 64x64 tiles) or
 *      with a 3D VMOV, and written to its transposed position in `out`.
 *
 * @param out     destination matrix, INCOLS x INROWS elements
 * @param in      source matrix, INROWS x INCOLS elements
 * @param INROWS  number of rows in `in`
 * @param INCOLS  number of columns in `in`
 * @return VBW_SUCCESS, or VBW_ERROR_SP_ALLOC_FAILED when the tile buffers of
 *         strategy 4 cannot be allocated.
 */
int vbw_mtx_xp_ext(vbx_mm_t *out, vbx_mm_t *in, const int INROWS, const int INCOLS )
{
	// Scratchpad elements have the same type as the host-memory elements.
	typedef vbx_mm_t vbx_sp_t;

	int elements = INROWS * INCOLS;

	// Strategy 1: small matrices are transposed by the CPU.
	if(elements < SCALAR_THRESHOLD) {
		vbx_sync();  // in case the input is still waiting on a DMA transfer
		int i,j;
		for(i = 0; i < INROWS; i++) {
			for(j = 0; j < INCOLS; j++) {
				out[j*INROWS+i] = in[i*INCOLS+j];
			}
		}
		return VBW_SUCCESS;
	}

	// Everything allocated below is released by the matching vbx_sp_pop().
	vbx_sp_push();

	vbx_sp_t *v_in;
	vbx_sp_t *v_out;

	int tile_height     = 0;
	int tile_width      = 0;
	int prev_tile_width = 0;   // full tile width, restored at the start of every row of tiles
	int tile_y          = 0;
	int tile_x          = 0;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	int SP_WIDTH_B = this_mxp->scratchpad_alignment_bytes;
	int SP_SIZE = vbx_sp_getfree();
	// Capacity of the whole free scratchpad, and of one of two equal halves
	// (input tile + output tile), both expressed in elements.
	int max_sp_elements   = vbx_sp_getfree() / sizeof(vbx_sp_t);
	int max_tile_elements = VBX_PAD_DN( SP_SIZE/2, SP_WIDTH_B ) / sizeof(vbx_sp_t);


	if( INROWS == 1 || INCOLS == 1 ) {           // 1D transpose becomes a simple copy operation
		if( elements <= max_sp_elements ) {      // We can use the whole scratchpad for this
			v_in = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
			vbx_dma_to_vector( v_in, in, elements*sizeof(vbx_mm_t) );
			v_out = v_in;                        // same buffer: data is only bounced through the scratchpad
			vbx_dma_to_host( out, v_out, elements*sizeof(vbx_mm_t) );
		} else {                                 // To test this, you'll need a very large 1D matrix (or a small SP)
			tile_width = max_sp_elements;
			v_in = (vbx_sp_t*)vbx_sp_malloc( tile_width * sizeof(vbx_sp_t) );
			// Copy in scratchpad-sized chunks; the last chunk is shortened to the remainder.
			for (tile_x = 0; tile_x < elements; tile_x += tile_width) {
				if( tile_x + tile_width > elements) tile_width = elements - tile_x;
				vbx_dma_to_vector( v_in, in + tile_x, tile_width*sizeof(vbx_mm_t) );
				v_out = v_in;
				vbx_dma_to_host( out+tile_x, v_out, tile_width*sizeof(vbx_mm_t) );
			}
		}
	} else if( elements < max_tile_elements ) {  // Matrix is small enough to handle entirely in SP
		v_in  = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );
		v_out = (vbx_sp_t*)vbx_sp_malloc( elements * sizeof(vbx_sp_t) );

		vbx_dma_to_vector( v_in, in, elements*sizeof(vbx_mm_t) );

		// In-scratchpad transpose (defined outside this block).
		vbw_mtx_xp(v_out,v_in,INROWS,INCOLS);

		vbx_dma_to_host( out, v_out, elements*sizeof(vbx_mm_t) );
	} else {                                     // At this point we know at least one full tile will be needed
		// Two fixed square tile sizes ("A" = 128, "B" = 64) are eligible for the
		// merge transpose. *_VF_ELEMENTS is the size of the mask vector `vf`;
		// *_REQ_ELEMENTS is the scratchpad needed for two tiles plus the mask.
		#define QUICK_A_LANES_THRESHOLD 8        // Use merge transpose if there are at least this many lanes
		#define QUICK_A_TILE_WIDTH 128
		#define QUICK_A_TILE_ELEMENTS (QUICK_A_TILE_WIDTH*QUICK_A_TILE_WIDTH)
		#define QUICK_A_VF_ELEMENTS (QUICK_A_TILE_ELEMENTS/2)
		#define QUICK_A_REQ_ELEMENTS (2*VBX_PAD_UP(QUICK_A_TILE_ELEMENTS,SP_WIDTH_B/sizeof(vbx_sp_t)) + VBX_PAD_UP(QUICK_A_VF_ELEMENTS,sizeof(vbx_sp_t)))

		#define QUICK_B_LANES_THRESHOLD 16        // Use smaller merge transpose tile only if there are a lot of lanes
		#define QUICK_B_TILE_WIDTH 64             //     and only if larger tile A size cannot be used.
		#define QUICK_B_TILE_ELEMENTS (QUICK_B_TILE_WIDTH*QUICK_B_TILE_WIDTH)
		#define QUICK_B_VF_ELEMENTS (QUICK_B_TILE_ELEMENTS/2)
		#define QUICK_B_REQ_ELEMENTS (2*VBX_PAD_UP(QUICK_B_TILE_ELEMENTS,SP_WIDTH_B/sizeof(vbx_sp_t)) + VBX_PAD_UP(QUICK_B_VF_ELEMENTS,sizeof(vbx_sp_t)))

		int NUM_LANES = this_mxp->vector_lanes;
		int DMA_BYTES = this_mxp->dma_alignment_bytes;
		int min_tile_dim = DMA_BYTES / sizeof(vbx_sp_t);

		vbx_sp_t *v_out_sel;   // buffer that holds the finished tile and gets DMA'd out
		vbx_sp_t *vf = 0;      // merge mask; stays 0 when the merge transpose is not used

		if( NUM_LANES >= QUICK_A_LANES_THRESHOLD       // Check for appropriate conditions to use merge transpose tiles
					&& INCOLS >= QUICK_A_TILE_WIDTH
					&& INROWS >= QUICK_A_TILE_WIDTH
			&& (unsigned)max_sp_elements >= QUICK_A_REQ_ELEMENTS ) {
			tile_width = tile_height = QUICK_A_TILE_WIDTH;
			vf = (vbx_sp_t *)vbx_sp_malloc( QUICK_A_VF_ELEMENTS * sizeof(vbx_sp_t));
		} else if( NUM_LANES >= QUICK_B_LANES_THRESHOLD
					&& INCOLS >= QUICK_B_TILE_WIDTH
					&& INROWS >= QUICK_B_TILE_WIDTH
			&& (unsigned)max_sp_elements >= QUICK_B_REQ_ELEMENTS ) {
			tile_width = tile_height = QUICK_B_TILE_WIDTH;
			vf = (vbx_sp_t *)vbx_sp_malloc( QUICK_B_VF_ELEMENTS * sizeof(vbx_sp_t));
		} else {
			// NOTE(review): findTileSize() is defined elsewhere; presumably it picks a
			// tile of at most max_tile_elements whose sides respect min_tile_dim -- confirm.
			findTileSize( &tile_height, &tile_width, INROWS, INCOLS, max_tile_elements, min_tile_dim );
		}

		prev_tile_width = tile_width;

		v_in  = (vbx_sp_t*)vbx_sp_malloc( tile_height*tile_width * sizeof(vbx_sp_t) );
		v_out = (vbx_sp_t*)vbx_sp_malloc( tile_height*tile_width * sizeof(vbx_sp_t) );


		// Only the last allocation is tested.
		// NOTE(review): this assumes an earlier allocation cannot fail while a
		// later one of the same size succeeds -- confirm against the allocator.
		if( v_out==NULL ) {
			vbx_sp_pop();
			return VBW_ERROR_SP_ALLOC_FAILED;
		}

		// Ping-pong pair used by the merge transpose: v[src] is read, v[!src] is written.
		vbx_sp_t *v[2] = { v_in, v_out };

		tile_y = 0;                              // Reset y position for new col
		while( tile_y < INROWS ) {
		// (Re)program the 2D/3D settings used by the vbxx_3D() copy for the current
		// tile shape; needed at the start of each row of tiles because tile_width was
		// restored, tile_height may have shrunk, and the merge path overwrites the 2D setting.
		// NOTE(review): the meaning of each vbx_set_2D/vbx_set_3D argument is not
		// visible here -- check the MXP API before changing them.
		vbx_set_2D( tile_width, tile_height*sizeof(vbx_sp_t), sizeof(vbx_sp_t), sizeof(vbx_sp_t) );
		vbx_set_3D( tile_height, sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t) );
			tile_x = 0;                          // Reset x position for new row
			while( tile_x < INCOLS ) {

				// Fetch the tile whose top-left element is in[tile_y][tile_x].
				// NOTE(review): argument order assumed to be (dst, src, row bytes, rows,
				// dst stride, src stride) -- confirm against the MXP API.
				vbx_dma_to_vector_2D(
						v_in,
						in+(tile_y*INCOLS)+tile_x,
						tile_width*sizeof(vbx_mm_t),
						tile_height,
						tile_width*sizeof(vbx_sp_t),
						INCOLS*sizeof(vbx_mm_t) );

				v_out_sel = v_out;                         // select v_out as default vector to DMA to MM

				/* *** merge transpose (matrix must be square and a power of 2 wide) *** */
				if( vf && tile_width == tile_height
							&& (tile_width==QUICK_A_TILE_WIDTH || tile_width==QUICK_B_TILE_WIDTH) ) {
					int src = 0;
					int n;
					// One pass per power of two below tile_width (n = 1, 2, 4, ...).
					for( n=1; n<tile_width; n *= 2 ) {     // can't do 1st iteration until entire tile is DMA'd in
						const int nn = 2*n;

						// copy the destination matrix
						vbx_set_vl( tile_width*tile_width );    // use v_in & v_out as working matrices (clobber v_in)
						vbxx(  VMOV, v[!src], v[src]);

						// do the work
						vbx_set_vl( n*tile_width );
						vbxx( VAND, vf, n, (vbx_enum_t*)0 );           // mask for merging: 0101010... then 00110011...
						vbx_set_2D( tile_width/nn, nn*tile_width*sizeof(vbx_sp_t), nn*tile_width*sizeof(vbx_sp_t), 0 );
						// Two conditional moves under the mask vf: source offset n goes to
						// destination offset n*tile_width, and vice versa.
						vbxx_2D( VCMV_Z, v[!src]+n*tile_width, v[src]+n           , vf );
						vbxx_2D( VCMV_Z, v[!src]+n,            v[src]+n*tile_width, vf );

						src = !src;                        // the buffer just written becomes the next source
					}

					v_out_sel = v[src];     // depending on the size of the mtx, the final result may be in v_in or v_out
				} else {
					// Generic path (edge tiles and non-merge tile sizes).
					vbx_set_vl( 1 );        // 2D and 3D will be set by the x and y edge conditions, even using merge
					vbxx_3D(VMOV, v_out, v_in );
				}

				// Store the tile at its transposed position: top-left element is out[tile_x][tile_y].
				vbx_dma_to_host_2D(
						out+(tile_x*INROWS)+tile_y,
						v_out_sel,
						tile_height*sizeof(vbx_mm_t),
						tile_width,
						INROWS*sizeof(vbx_mm_t),
						tile_height*sizeof(vbx_sp_t) );

				tile_x += tile_width;                 // Set up width for next tile
				if( tile_x + tile_width > INCOLS ) {  // Temporarily reduce tile width when reaching right edge of matrix
					tile_width = INCOLS - tile_x;     // becomes 0 when the row is finished, which ends the inner loop
					vbx_set_2D( tile_width, tile_height*sizeof(vbx_sp_t), sizeof(vbx_sp_t), sizeof(vbx_sp_t) );
					vbx_set_3D( tile_height, sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t), tile_width*sizeof(vbx_sp_t) );
				}
			}
			tile_y += tile_height;                    // Set up width and height for next row of tiles
			tile_width = prev_tile_width;             // Restore original tile width for next row of tiles

			/* *** Permanently reduce tile height when reaching bottom of matrix *** */
			tile_height = ( tile_y + tile_height > INROWS ) ? INROWS - tile_y : tile_height;		}
	}
	// Release the scratchpad allocations, then wait for outstanding transfers
	// before returning to the caller.
	vbx_sp_pop();
	vbx_sync();
	return VBW_SUCCESS;
}
예제 #12
0
int vbw_bifilt_argb32_3x3(unsigned *output, unsigned *input, short image_width, const short image_height, const short image_pitch, const short renorm)
{

//return vbw_sobel_argb32_3x3( output, input, image_width, image_height, image_pitch, renorm);

	int y;
	int xx, yy, sharp;

	vbx_uword_t *v_row_in;
	vbx_ubyte_t *v_luma_top, *v_luma_mid, *v_luma_bot;
	vbx_ubyte_t *v_luma_hii,              *v_luma_low;
	vbx_ubyte_t *v_src[W][W];

	vbx_uword_t *v_row_out;

	vbx_ubyte_t *v00, *v01, *v02, *v10, *v11, *v12, *v20, *v21, *v22;
#if W==5
	vbx_ubyte_t *v03, *v04,       *v13, *v14,       *v23, *v24;
	vbx_ubyte_t *v30, *v31, *v32, *v40, *v41, *v42;
	vbx_ubyte_t *v33, *v34,       *v43, *v44;
#endif
	vbx_ubyte_t *v[W][W];

	vbx_uhalf_t *vI, *vW, *vT;  // vT== temporary


	vbx_sp_push();

	// Allocate space in scratchpad for vectors
	struct rotating_prefetcher_t v_row_db=rotating_prefetcher(1,image_width*sizeof(vbx_uword_t),
	                                                          input,input+image_height*image_pitch,
	                                                          image_pitch*sizeof(vbx_uword_t));

	v_row_out  = (vbx_uword_t*)vbx_sp_malloc(image_width*sizeof(vbx_uword_t));
	vT         = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
#if 1
	// save some space by overlapping with v_row_out
	vW         = (vbx_uhalf_t*)v_row_out;
	vI         = (vbx_uhalf_t*)v_row_out + image_width;
#else
	vW         = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
	vI         = (vbx_uhalf_t*)vbx_sp_malloc(image_width*sizeof(vbx_uhalf_t));
#endif

#if W==3
	v_luma_top      = (vbx_ubyte_t*)vbx_sp_malloc( 3 * image_width*sizeof(vbx_ubyte_t));
	v_luma_mid      = v_luma_top + 1 * image_width*sizeof(vbx_ubyte_t) ;
	v_luma_bot      = v_luma_top + 2 * image_width*sizeof(vbx_ubyte_t) ;
#else
	v_luma_top      = (vbx_ubyte_t*)vbx_sp_malloc( 5 * image_width*sizeof(vbx_ubyte_t));
	v_luma_hii      = v_luma_top + 1 * image_width*sizeof(vbx_ubyte_t) ;
	v_luma_mid      = v_luma_top + 2 * image_width*sizeof(vbx_ubyte_t) ;
	v_luma_low      = v_luma_top + 3 * image_width*sizeof(vbx_ubyte_t) ;
	v_luma_bot      = v_luma_top + 4 * image_width*sizeof(vbx_ubyte_t) ;
#endif


	if(v_luma_bot==NULL){
		vbx_sp_pop();
		return VBW_ERROR_SP_ALLOC_FAILED;
	}

	// Transfer the first 3 input rows and interleave first 2 rgb2luma and first 2 sobel row calculations
#if W==3
	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db,0);
	vbw_rgb2luma(vW, v_row_in, vT, image_width);                                // 1st luma row
	vbx( SVHBU, VSHR, v_luma_top, 8, vW );                                     // convert to byte

	v_row_in = rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db,0);
	vbw_rgb2luma( vW, v_row_in, vT, image_width);                               // 2nd luma row
	vbx( SVHBU, VSHR, v_luma_mid, 8,  vW );                                    // convert to byte

#else
	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db,0);
	vbw_rgb2luma(vW, v_row_in, vT, image_width);                                // 1st luma row
	vbx( SVHBU, VSHR, v_luma_top, 8, vW );                                     // convert to byte

	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db,0);
	vbw_rgb2luma( vW, v_row_in, vT, image_width);                               // 2nd luma row
	vbx( SVHBU, VSHR, v_luma_hii, 8,  vW );                                    // convert to byte

	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db,0);
	vbw_rgb2luma( vW, v_row_in, vT, image_width);                               // 2nd luma row
	vbx( SVHBU, VSHR, v_luma_mid, 8,  vW );                                    // convert to byte

	rp_fetch(&v_row_db);
	v_row_in = rp_get_buffer(&v_row_db,0);
	vbw_rgb2luma( vW, v_row_in, vT, image_width);                               // 2nd luma row
	vbx( SVHBU, VSHR, v_luma_low, 8,  vW );                                    // convert to byte
#endif


	// blank out the top and bottom rows
	unsigned *out;
	vbx_set_vl(image_width);
	unsigned COLOUR = ( 200 | (128<<8) | (244<<16) );
	vbx(SVWU, VMOV, v_row_out, COLOUR, 0);
	for( y=0; y<W/2; y++ ) {
		// Set top output rows to 0
		out = output + image_width*y;
		vbx_dma_to_host( out, v_row_out, image_width*sizeof(vbx_uword_t) );
		// Set bottom rows to 0
		out = output + image_width*(image_height-1-y);
		vbx_dma_to_host( out, v_row_out, image_width*sizeof(vbx_uword_t) );
	}



	// Calculate edges
	for (y = 0; y < image_height-(W-1); y++) {

		vbx_set_vl(image_width);
		// Transfer the next input row while processing
		rp_fetch(&v_row_db);
		v_row_in = rp_get_buffer(&v_row_db,0);
		// Convert aRGB input to luma
		vbw_rgb2luma( vW, v_row_in, vT, image_width);
		vbx( SVHBU, VSHR, v_luma_bot, 8,  vW );                                     // convert to byte

vbx_sp_push();
		image_width=image_width/2;
		vbx_set_vl(image_width);

		v[0][0] = v00   = (vbx_ubyte_t*)vbx_sp_malloc( 25 * image_width*sizeof(vbx_ubyte_t));
		v[0][1] = v01   = v00 +  1 * image_width*sizeof(vbx_ubyte_t) ;
		v[0][2] = v02   = v00 +  2 * image_width*sizeof(vbx_ubyte_t) ;
		v[1][0] = v10   = v00 +  3 * image_width*sizeof(vbx_ubyte_t) ;
		v[1][1] = v11   = v00 +  4 * image_width*sizeof(vbx_ubyte_t) ;
		v[1][2] = v12   = v00 +  5 * image_width*sizeof(vbx_ubyte_t) ;
		v[2][0] = v20   = v00 +  6 * image_width*sizeof(vbx_ubyte_t) ;
		v[2][1] = v21   = v00 +  7 * image_width*sizeof(vbx_ubyte_t) ;
		v[2][2] = v22   = v00 +  8 * image_width*sizeof(vbx_ubyte_t) ;

	#if W==5
		v[0][3] = v03   = v00 +  9 * image_width*sizeof(vbx_ubyte_t) ;
		v[0][4] = v04   = v00 + 10 * image_width*sizeof(vbx_ubyte_t) ;
		v[1][3] = v13   = v00 + 11 * image_width*sizeof(vbx_ubyte_t) ;
		v[1][4] = v14   = v00 + 12 * image_width*sizeof(vbx_ubyte_t) ;
		v[2][3] = v23   = v00 + 13 * image_width*sizeof(vbx_ubyte_t) ;
		v[2][4] = v24   = v00 + 14 * image_width*sizeof(vbx_ubyte_t) ;

		v[3][0] = v30   = v00 + 15 * image_width*sizeof(vbx_ubyte_t) ;
		v[3][1] = v31   = v00 + 16 * image_width*sizeof(vbx_ubyte_t) ;
		v[3][2] = v32   = v00 + 17 * image_width*sizeof(vbx_ubyte_t) ;
		v[3][3] = v33   = v00 + 18 * image_width*sizeof(vbx_ubyte_t) ;
		v[3][4] = v34   = v00 + 19 * image_width*sizeof(vbx_ubyte_t) ;

		v[4][0] = v40   = v00 + 20 * image_width*sizeof(vbx_ubyte_t) ;
		v[4][1] = v41   = v00 + 22 * image_width*sizeof(vbx_ubyte_t) ;
		v[4][2] = v42   = v00 + 22 * image_width*sizeof(vbx_ubyte_t) ;
		v[4][3] = v43   = v00 + 23 * image_width*sizeof(vbx_ubyte_t) ;
		v[4][4] = v44   = v00 + 24 * image_width*sizeof(vbx_ubyte_t) ;
	#endif

		if(v00==NULL){
printf("mem alloc failed\n"); fflush(stdout);
			vbx_sp_pop();
			vbx_sp_pop();
			return VBW_ERROR_SP_ALLOC_FAILED;
		}


//FIXME -- how to manage row buffers with 5 rows?  3 rows are shown below:
#if W==3
		for( xx=0; xx<W; xx++ ) v_src[0][xx] = v_luma_top+xx;
		for( xx=0; xx<W; xx++ ) v_src[1][xx] = v_luma_mid+xx;
		for( xx=0; xx<W; xx++ ) v_src[2][xx] = v_luma_bot+xx;
#else
		for( xx=0; xx<W; xx++ ) v_src[0][xx] = v_luma_top+xx;
		for( xx=0; xx<W; xx++ ) v_src[1][xx] = v_luma_hii+xx;
		for( xx=0; xx<W; xx++ ) v_src[2][xx] = v_luma_mid+xx;
		for( xx=0; xx<W; xx++ ) v_src[3][xx] = v_luma_low+xx;
		for( xx=0; xx<W; xx++ ) v_src[4][xx] = v_luma_bot+xx;
#endif

		vbx_set_vl( image_width - W + 1 );

		// compute error (absdiff) in pixel colour with neighbours
		for( yy=0; yy<W; yy++ ) {
			for( xx=0; xx<W; xx++ ) {
				vbx( VVBU, VABSDIFF, v[yy][xx], v_luma_mid+(W/2), v_src[yy][xx] );
			}
		}


		// v[][] holds the errors (differences) between pixels
		// efficiently compute a function that looks approximately something like exp(-x):
		//     large value for small errors, small value for big errors
		for( yy=0; yy<W; yy++ ) {
			for( xx=0; xx<W; xx++ ) {
				vbx( SVBU, VABSDIFF, v[yy][xx], 255, v[yy][xx] );  // 255 - img_err
				// 11 or more iterations is mathematically equivalent to a pure gaussian blur // FIXME is this true?
#define NUM_SHARPEN_ITERATIONS  3   // 0 to 10 iterations, practical max is 7 or 8
				for( sharp=0; sharp < NUM_SHARPEN_ITERATIONS; sharp++ ) {
					vbx( VVBU, VMULHI, v[yy][xx], v[yy][xx], v[yy][xx] ); // v*v;
				}
			}
		}

		// with right decimal place, could do the next two instructions using MULFXP and do as BYTES
		// convolve errors with gaussian blur kernel
		for( yy=0; yy<W; yy++ ) {
			for( xx=0; xx<W; xx++ ) {
				vbx( SVBU, VMULHI, v[yy][xx], gauss[yy][xx], v[yy][xx] );
			}
		}

		// sum up the weights for normalization later
		vbx( VVBHU, VADD, vW, v[0][0], v[0][1] );
		vbx( VVBHU, VADD, vT, v[0][2], v[1][0] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VADD, vT, v[1][1], v[1][2] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VADD, vT, v[2][0], v[2][1] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VMOV, vT, v[2][2], 0 );
		vbx( VVHU,  VADD, vW, vW, vT );
	#if (W==5)
		vbx( VVBHU, VADD, vT, v[3][0], v[3][1] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VADD, vT, v[3][2], v[4][0] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VADD, vT, v[4][1], v[4][2] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VMOV, vT, v[0][3], v[0][4] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VMOV, vT, v[1][3], v[1][4] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VMOV, vT, v[2][3], v[2][4] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VMOV, vT, v[3][3], v[3][4] );
		vbx( VVHU,  VADD, vW, vW, vT );
		vbx( VVBHU, VMOV, vT, v[4][3], v[4][4] );
		vbx( VVHU,  VADD, vW, vW, vT );
	#endif


		// convolve image with new weights
		for( yy=0; yy<W; yy++ ) {
			for( xx=0; xx<W; xx++ ) {
				vbx( VVBU, VMULHI, v[yy][xx], v_src[yy][xx], v[yy][xx] );
				//vbx( SVBU, VMULHI, v[yy][xx], gauss[yy][xx], v_src[yy][xx] );
				//vbx( SVBU, VMUL  , v[yy][xx],         1      , v_src[yy][xx] );
			}
		}



		// sum up the weighted pixels
		vbx( VVBHU, VADD, vI, v[0][0], v[0][1] );
		vbx( VVBHU, VADD, vT, v[0][2], v[1][0] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VADD, vT, v[1][1], v[1][2] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VADD, vT, v[2][0], v[2][1] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VMOV, vT, v[2][2], 0 );
		vbx( VVHU,  VADD, vI, vI, vT );

	#if (W==5)
		vbx( VVBHU, VADD, vT, v[3][0], v[3][1] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VADD, vT, v[3][2], v[4][0] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VADD, vT, v[4][1], v[4][2] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VMOV, vT, v[0][3], v[0][4] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VMOV, vT, v[1][3], v[1][4] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VMOV, vT, v[2][3], v[2][4] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VMOV, vT, v[3][3], v[3][4] );
		vbx( VVHU,  VADD, vI, vI, vT );
		vbx( VVBHU, VMOV, vT, v[4][3], v[4][4] );
		vbx( VVHU,  VADD, vI, vI, vT );
	#endif


// keep RHS of image as original grayscale
image_width=image_width*2;
vbx_set_vl( image_width/2 );
//vbx( VVWHU, VMOV, vT+image_width/2, (v_row_in       ) + image_width/2+1, 0 );
vbx( VVBHU, VMOV, vT+image_width/2, (v_src[ 0 ][ 0 ]) + image_width/2+1, 0 );
vbx_sp_pop(); // don't need v[][] data any more

// compute LHS of image
#if 0
		vbx( VVBHU, VMOV, vT, v_src[2][2], 0 );
		//vbx( SVHU, VSHR, vI,  3, vI );
		//vbx( SVHU, VSHR, vW,  3, vW );
		//vbx( VVHU, VMUL, vT, vI, vW );
		//vbx( SVHU, VSHR, vT,  8, vT );
#else
		uint32_t h = image_width/2;
		vbx( SVHU, VADD, vW, 0x80, vW ); // round
		vbx( SVHU, VSHR, vW,    8, vW );
		vbw_vec_divide_uhalf( vT  , vI  , vW  , h                 );
		//vbw_vec_divide_uhalf( vT+h, vI+h, vW+h, image_width-W+1-h );
#endif
		// ensure LHS doesn't overflow
		vbx( SVHU, VAND, vT, 0xff, vT );

		// Copy the result to the low byte of the output row
		// Trick to copy the low byte (b) to the middle two bytes as well
		// Note that first and last columns are 0
		vbx_set_vl(image_width-W+1);
		vbx(SVHWU, VMULLO, v_row_out+W/2, 0x00010101, vT);

		// blank out left and right edges
		// then DMA the result to the output
		vbx_set_vl(W/2);
		vbx(SVWU, VMOV, v_row_out, COLOUR, 0 );
		vbx(SVWU, VMOV, v_row_out + image_width - (W/2), COLOUR, 0 );
		vbx_dma_to_host( output+(y+1)*image_pitch, v_row_out, image_width*sizeof(vbx_uword_t) );

		// Rotate luma buffers
		vbx_ubyte_t *tmp_ptr;
		tmp_ptr      = v_luma_top;
#if W==3
		v_luma_top   = v_luma_mid;
		v_luma_mid   = v_luma_bot;
		v_luma_bot   = tmp_ptr;
#else
		v_luma_top   = v_luma_hii;
		v_luma_hii   = v_luma_mid;
		v_luma_mid   = v_luma_low;
		v_luma_low   = v_luma_bot;
		v_luma_bot   = tmp_ptr;
#endif

	}

	vbx_sync();
	vbx_sp_pop();

	return VBW_SUCCESS;
}
예제 #13
0
/**
 * Per-channel median filter of a 32-bit aRGB image on the MXP vector engine.
 *
 * For every processed position the filter_height x filter_width window whose
 * top-left corner is that position is gathered into filter_size scratchpad
 * vectors (one per window element). A partial selection sort, run on bytes so
 * that each colour channel is ordered independently, moves the filter_size/2
 * smallest values to the front; the next smallest value is the median and is
 * written to the output at the window's top-left position.
 *
 * As many whole image rows as fit in the scratchpad are processed per pass;
 * if not even one row fits, rows are processed in horizontal pieces.
 *
 * @param output        destination image, image_pitch words per row
 * @param input         source image, image_pitch words per row
 * @param filter_height window height in pixels
 * @param filter_width  window width in pixels
 * @param image_height  number of image rows
 * @param image_width   pixels per image row
 * @param image_pitch   distance between the starts of consecutive rows, in pixels
 * @return VBW_SUCCESS, or VBW_ERROR_SP_ALLOC_FAILED if the scratchpad is too small.
 */
int vbw_mtx_median_ext_argb32( unsigned *output, unsigned *input, const int filter_height, const int filter_width,
                           const int image_height, const int image_width, const int image_pitch )
{

	const int FREE_BYTES = vbx_sp_getfree();
	int l,k;
	int filter_mid, filter_size;
	int rows_per_l,vl,temp_vl, temp_vl_byte;
	int j,i;
	int partial_row = 0;

	filter_size = filter_height*filter_width;
	filter_mid = filter_size/2;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const int VBX_WIDTH_BYTES = this_mxp->scratchpad_alignment_bytes;

	// Could possibly check for low SP here (less than 6*VBX_WIDTH_BYTES) and assign vl differently

	// During allocation, max additional SP bytes needed due to alignment is one VBX_WIDTH_BYTES per vector
	// Taking that off the top simplifies calculation and will always be correct, but sacrifices a little SP space

	// filter_size window vectors + v_sub + v_temp, each vl words long.
	// The divisor is cast to int so the division stays signed: with the
	// unsigned sizeof() result a negative numerator wrapped to a huge value
	// and slipped past the vl < 1 test below.
	vl = (FREE_BYTES-3*VBX_WIDTH_BYTES)/((filter_size+2)*(int)sizeof(vbx_uword_t));

	if( vl < 1 ) {
		return VBW_ERROR_SP_ALLOC_FAILED;
	}

	if(vl < image_width){
		// Not even one full row fits: process each row in pieces of vl pixels.
		rows_per_l = 1;
		partial_row = 1;
	} else {
		// Process as many whole rows per pass as fit.
		rows_per_l = vl/image_width;
		vl = image_width*rows_per_l;
	}

	vbx_sp_push();

	vbx_uword_t *v_input = (vbx_uword_t *)vbx_sp_malloc(filter_size*vl*sizeof(vbx_uword_t));
	vbx_ubyte_t *v_sub   = (vbx_ubyte_t *)vbx_sp_malloc(vl*sizeof(vbx_uword_t));
	vbx_ubyte_t *v_temp  = (vbx_ubyte_t *)vbx_sp_malloc(vl*sizeof(vbx_uword_t));
	vbx_ubyte_t *v_min, *v_max;
	vbx_ubyte_t *v_input_byte = (vbx_ubyte_t *)v_input;   // byte view of v_input for per-channel sorting
	// Check every allocation, not only the last one.
	if( v_input == NULL || v_sub == NULL || v_temp == NULL ){
		vbx_sp_pop();
		return VBW_ERROR_SP_ALLOC_FAILED;
	}
	for(l = 0; l < image_height-filter_height; l+= rows_per_l){
		// detect last pass
		if(l+rows_per_l > image_height-filter_height){
			rows_per_l = (image_height-filter_height)-l;
			vl = image_width*rows_per_l;
		}
		temp_vl = vl;
		for(k = 0; k < image_width; k += temp_vl){
			if(partial_row){
				// shorten the last piece of a row
				if(k + temp_vl > image_width){
					temp_vl = image_width - k;
				}
			}

			// Load window row j for all output rows of this pass into slot j
			for(j = 0; j < filter_height; j++){
				vbx_dma_to_vector_2D(v_input+temp_vl*j,
									 input+(l+j)*image_pitch+k,
									 temp_vl/rows_per_l*sizeof(vbx_uword_t),
									 rows_per_l,
									 image_width*sizeof(vbx_uword_t),
									 image_pitch*sizeof(vbx_uword_t));
			}

			// arrange all pixels within a filter window into single columns, separated by temp_vl
			//
			// ex. vl = 5, filter = 3
			// vinput before         vinput after
			//
			// a00 a01 a02 a03 a04 | a00 a01 a02 a03 a04 |
			// a10 a11 a12 a13 a14 | a10 a11 a12 a13 a14 |
			// a20 a21 a22 a23 a24 | a20 a21 a22 a23 a24 |
			// ??? ??? ??? ??? ??? | a01 a02 a03 a04 a10 |
			// ??? ??? ??? ??? ??? | a11 a12 a13 a14 a20 |
			// ??? ??? ??? ??? ??? | a21 a22 a23 a24 a30 |
			// ??? ??? ??? ??? ??? | a02 a03 a04 a10 a11 |
			// ??? ??? ??? ??? ??? | a12 a13 a14 a20 a21 |
			// ??? ??? ??? ??? ??? | a22 a23 a24 a30 a31 |
			//
			// Slot (j*filter_height + i) receives window row i shifted left by j
			// pixels, so j is the horizontal shift (1..filter_width-1) and i the
			// window row (0..filter_height-1). The bounds used to be swapped,
			// which only produced this layout for square filters.
			vbx_set_vl(temp_vl);
			for(j = 1; j < filter_width; j++){
				for(i = 0; i < filter_height; i++){
					vbx(VVWU, VMOV, v_input+(j*filter_height+i)*temp_vl,
									v_input+i*temp_vl+j,
									0);
				}
			}

			// Partial selection sort, only up to the filter_size/2-th element

			// work on individual color channels
			temp_vl_byte = temp_vl*sizeof(vbx_uword_t)/sizeof(vbx_ubyte_t);
			vbx_set_vl(temp_vl_byte);

			// sort lower half of the values in the window:
			// after pass j, slot j holds the smallest value of slots j..filter_size-1
			for(j = 0; j < filter_mid; j++){
				v_min = v_input_byte+j*temp_vl_byte;

				for(i = j+1; i < filter_size; i++){
					v_max = v_input_byte+i*temp_vl_byte;

					// Conditional exchange of v_min and v_max, keyed on the sign
					// of their difference (v_temp keeps the old v_min).
					vbx(VVBU, VMOV,     v_temp, v_min,  0);
					vbx(VVBU, VSUB,     v_sub,  v_max,  v_min);
					vbx(VVBU, VCMV_LTZ, v_min,  v_max,  v_sub);
					vbx(VVBU, VCMV_LTZ, v_max,  v_temp, v_sub);
				}
			}

			// grab next smallest value, the median, don't sort the rest
			// (no exchange needed: the remaining slots are not used afterwards)
			v_min = v_input_byte+filter_mid*temp_vl_byte;
			for(i = filter_mid+1; i < filter_size; i++){
				v_max = v_input_byte+i*temp_vl_byte;

				vbx(VVBU, VSUB,     v_sub, v_max, v_min);
				vbx(VVBU, VCMV_LTZ, v_min, v_max, v_sub);
			}

			// dma out median value (slot filter_mid)
			// back to pixels
			vbx_dma_to_host_2D(output+(l*image_pitch)+k,
							   v_input+temp_vl*filter_mid,
							   temp_vl/rows_per_l*sizeof(vbx_uword_t),
							   rows_per_l,
							   image_pitch*sizeof(vbx_uword_t),
							   image_width*sizeof(vbx_uword_t));
		}
	}

	vbx_sp_pop();
	vbx_sync();
	return VBW_SUCCESS;
}