int compare_vbx_lbp_ci_to_scalar_patterns(unsigned short* img, int width, int height, int max_print_errors) { int j, errors = 0; unsigned char** scalar_patterns = test_scalar_patterns(img, 0, width, height); vbx_ubyte_t* v_in = (vbx_ubyte_t*)vbx_sp_malloc(3*width*sizeof(vbx_word_t)); vbx_ubyte_t* v_top = (vbx_byte_t*)vbx_sp_malloc(width*sizeof(vbx_byte_t)); vbx_ubyte_t* v_bot = (vbx_byte_t*)vbx_sp_malloc(width*sizeof(vbx_byte_t)); vbx_ubyte_t* v_lbp = v_bot; unsigned char* lbp = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char)); vbx_set_vl(width); for(j=0; j < height - 2; j++){ vbx_dma_to_vector(v_in, img+j*width, 3*width*sizeof(unsigned char)); vbx(VVHU, VCUSTOM1, v_top, v_in, v_in+width); vbx(VVHU, VCUSTOM1, v_bot, v_in+width, v_in+2*width); vbx(SVHBU, VAND, v_top, 0xf0, v_top); vbx(SVHBU, VAND, v_bot, 0x0f, v_bot); vbx(VVBU, VADD, v_lbp, v_bot, v_top); vbx_dma_to_host(lbp, v_lbp, width*sizeof(unsigned char)); vbx_sync(); errors = match_array_byte(lbp, scalar_patterns[0]+j*width, "custom_lbp", width-2, 1, max_print_errors, 1, j); } vbx_sp_free(); vbx_shared_free(lbp); return errors; }
int dma_bandwidth_test() { const int num_iter = 64; vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); int scratchpad_size = this_mxp->scratchpad_size; uint8_t *buf = vbx_shared_malloc(scratchpad_size); vbx_ubyte_t *v_buf = vbx_sp_malloc(scratchpad_size); vbx_timestamp_t time_start, time_stop; int i; int len; int to_host; int errors = 0; vbx_mxp_print_params(); // dma_alignment_bytes gives DMA master data bus width in bytes. double bytes_per_sec = \ (((double) this_mxp->core_freq) * this_mxp->dma_alignment_bytes); double max_megabytes_per_sec = bytes_per_sec/(1024*1024); printf("\nMax available bandwidth = %s Megabytes/s\n", vbx_eng(max_megabytes_per_sec, 4)); printf("\n"); for (to_host = 0; to_host < 2; to_host++) { for (len = 32; len <= scratchpad_size ; len *= 2) { printf("DMA %s, %d bytes\n", to_host ? "write" : "read", len); vbx_timestamp_start(); if (to_host) { time_start = vbx_timestamp(); for (i = 0; i < num_iter; i++) { vbx_dma_to_host(buf, v_buf, len); } vbx_sync(); time_stop = vbx_timestamp(); } else { time_start = vbx_timestamp(); for (i = 0; i < num_iter; i++) { vbx_dma_to_vector(v_buf, buf, len); } vbx_sync(); time_stop = vbx_timestamp(); } print_dma_bandwidth(time_start, time_stop, len, num_iter, max_megabytes_per_sec); printf("\n"); } printf("\n"); } vbx_shared_free(buf); vbx_sp_free(); return errors; }
int compare_vbx_lut_to_vbx_lut_ci(int sz, int max_print_errors) { int f, n, errors; vbx_byte_t* v_pass = (vbx_byte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t)); vbx_ubyte_t* v_pattern = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t)); vbx_ubyte_t* v_lutc = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t)); vbx_ubyte_t* v_group = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t)); vbx_ubyte_t* v_sel = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t)); vbx_ubyte_t* v_lut = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t)); vbx_ubyte_t* v_idx = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t)); unsigned char* lut = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char)); unsigned char* lut_c = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char)); for (n = 0; n < sz; n++) { v_pattern[n] = n & 0xff; } int s, stage = 11; for (f = 0; f < face_lbp[stage].count; f++) { lbp_feat_t feat = face_lbp[stage].feats[f]; vbx_set_vl(sz); int total = f; s = 0; while(s < stage){ total += face_lbp[s].count; s++; } vbx(SVBU, VCUSTOM0, v_lutc, total, v_pattern); vbx(SVB, VMOV, v_pass, feat.fail, 0); /* check if pattern is in lut */ vbx(SVBU, VSHR, v_group, 5, v_pattern); for (n = 0; n < 8; n++) { vbx(SVB, VADD, v_sel, -n, v_group); vbx(SVBW, VCMV_Z, v_lut, feat.lut[n], v_sel); } vbx(SVBWU, VAND, v_idx, 0x1f, v_pattern); vbx(VVWB, VSHR, v_lut, v_idx, v_lut); vbx(SVB, VAND, v_lut, 1, v_lut); vbx(SVB, VCMV_LEZ, v_pass, feat.pass, v_lut); vbx_dma_to_host(lut_c, v_lutc, sz*sizeof(unsigned char)); vbx_dma_to_host(lut, v_pass, sz*sizeof(unsigned char)); vbx_sync(); errors = match_array_byte(lut_c, lut, "custom_lut", sz, 1, max_print_errors, 0, 0); } vbx_sp_free(); vbx_shared_free(lut); vbx_shared_free(lut_c); return errors; }
/*
 * Captures the reference luma block for motion estimation.
 *
 * Sets *motest_x / *motest_y to the top-left corner of a
 * MOTEST_BLOCK_WIDTH x MOTEST_BLOCK_HEIGHT block centred on (x, y), converts
 * that rectangle of input_buffer with vector_rectangle_to_luma(), and DMAs
 * the result into *last_luma.
 *
 * *last_luma is allocated with malloc() on first use. If that allocation
 * fails, *last_luma is left NULL and no luma is captured.
 */
void init_vector_motest(pixel *input_buffer, luma_type **last_luma, int *motest_x, int *motest_y, int x, int y, const int image_pitch)
{
	vbx_uhalf_t *v_last_luma;
	vbx_uhalf_t *v_row_temp;
	vbx_uword_t *v_row;

	*motest_x = x-(MOTEST_BLOCK_WIDTH/2);
	*motest_y = y-(MOTEST_BLOCK_HEIGHT/2);

	if(*last_luma == NULL){
		*last_luma = malloc(MOTEST_BLOCK_SIZE*sizeof(luma_type));
		if(*last_luma == NULL){
			/* Nothing to DMA into; never hand a NULL host address to the DMA. */
			return;
		}
	}

	v_last_luma = vbx_sp_malloc(MOTEST_BLOCK_SIZE*sizeof(vbx_uhalf_t));
	v_row_temp  = vbx_sp_malloc(MOTEST_BLOCK_WIDTH*sizeof(vbx_uhalf_t));
	v_row       = vbx_sp_malloc(MOTEST_BLOCK_WIDTH*sizeof(vbx_uword_t));

	vector_rectangle_to_luma(input_buffer, v_last_luma, v_row_temp, v_row,
	                         *motest_x, *motest_y,
	                         MOTEST_BLOCK_WIDTH, MOTEST_BLOCK_HEIGHT, image_pitch);

	vbx_dma_to_host(*last_luma, v_last_luma, MOTEST_BLOCK_SIZE*sizeof(luma_type));

	vbx_sp_free();
}
int compare_vbx_lbp_ci_to_scalar_patterns(unsigned short* img, int log, int width, int height, int max_print_errors) { int j, l, cell, max_cell, errors = 0; unsigned char** scalar_patterns = test_scalar_patterns(img, log, width, height); max_cell = 1<<log; vbx_uhalf_t* v_in = (vbx_uhalf_t*)vbx_sp_malloc((1+2*max_cell)*width*sizeof(vbx_half_t)); vbx_uhalf_t* v_top = (vbx_half_t*)vbx_sp_malloc(width*sizeof(vbx_half_t)); vbx_uhalf_t* v_bot = (vbx_half_t*)vbx_sp_malloc(width*sizeof(vbx_half_t)); vbx_ubyte_t* v_lbp = (vbx_ubyte_t*)v_bot; unsigned char* lbp = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char)); vbx_set_vl(width); for(l = 0; l < 1; l++){ cell = 1<<l; for(j=0; j < height - 2*cell; j++){ vbx_dma_to_vector(v_in, img+j*width, (1+2*cell)*width*sizeof(unsigned short)); vbx(VVHU, VCUSTOM1, v_top, v_in, v_in+(1*cell)*width); vbx(VVHU, VCUSTOM1, v_bot, v_in+(1*cell)*width, v_in+(2*cell)*width); vbx(SVHBU, VAND, (vbx_ubyte_t*)v_top, 0xf0, v_top); vbx(SVHBU, VAND, (vbx_ubyte_t*)v_bot, 0x0f, v_bot); vbx(VVBU, VADD, v_lbp, v_bot, v_top); vbx_dma_to_host(lbp, v_lbp, width*sizeof(unsigned char)); vbx_sync(); errors += match_array_byte(lbp, scalar_patterns[l]+j*width, "custom_lbp", width-2*cell, 1, 0, max_print_errors, 1, j); if (errors > max_print_errors){ max_print_errors = 0; } } } vbx_sp_free(); vbx_shared_free(lbp); return errors; }
int compare_vbx_lut_to_vbx_lut_ci(int stage, int max_print_errors) { vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); int vci_lanes = this_mxp->vcustom0_lanes; int sz = this_mxp->scratchpad_size/(16*sizeof(vbx_ubyte_t)); vbx_byte_t* v_pass = (vbx_byte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t)); vbx_ubyte_t* v_pattern = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t)); vbx_ubyte_t* v_lutc = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t)); vbx_ubyte_t* v_group = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t)); vbx_ubyte_t* v_sel = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t)); vbx_ubyte_t* v_lut = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t)); vbx_ubyte_t* v_idx = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t)); if(v_idx == NULL) { printf("failed to allocate in compare_vbx_lut_to_vbx_lut_ci\n"); } unsigned char* lut = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char)); unsigned char* lut_c = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char)); int f, n, s, errors = 0; for (n = 0; n < sz; n++) { v_pattern[n] = (n & 0xff); } for (f = 0; f < face_lbp[stage].count; f++) { lbp_feat_t feat = face_lbp[stage].feats[f]; vbx_set_vl(sz); int total = f; s = 0; while(s < stage){ total += face_lbp[s].count; s++; } if(total < 256) { vbx(SVBU, VLBPLUT, v_lutc, total, v_pattern); } else { vbx(SVBS, VLBPLUT, v_lutc, total-256, v_pattern); } vbx(SVB, VMOV, v_pass, feat.fail, 0); /* check if pattern is in lut */ vbx(SVBU, VSHR, v_group, 5, v_pattern); for (n = 0; n < 8; n++) { vbx(SVB, VADD, v_sel, -n, v_group); vbx(SVBW, VCMV_Z, v_lut, feat.lut[n], v_sel); } vbx(SVBWU, VAND, v_idx, 0x1f, v_pattern); vbx(VVWB, VSHR, v_lut, v_idx, v_lut); vbx(SVB, VAND, v_lut, 1, v_lut); vbx(SVB, VCMV_LEZ, v_pass, feat.pass, v_lut); vbx_dma_to_host(lut_c, v_lutc, sz*sizeof(unsigned char)); vbx_dma_to_host(lut, v_pass, sz*sizeof(unsigned char)); vbx_sync(); errors += match_array_byte(lut, lut_c, "custom_lut", sz, 1, 0, max_print_errors, 0, 0); } vbx_sp_free(); 
vbx_shared_free(lut); vbx_shared_free(lut_c); return errors; }
int test_lbp_ci(unsigned short* img, int width, int height) { vbx_uhalf_t* v_a1 = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t)); vbx_uhalf_t* v_b1 = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t)); vbx_uhalf_t* v_1h = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t)); vbx_uhalf_t* v_a2 = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t)); vbx_uhalf_t* v_b2 = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t)); vbx_uhalf_t* v_2h = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t)); vbx_uhalf_t* v_a4 = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t)); vbx_uhalf_t* v_b4 = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t)); vbx_uhalf_t* v_4h = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t)); vbx_ubyte_t* v_1b = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t)); vbx_ubyte_t* v_2b = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t)); vbx_ubyte_t* v_4b = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t)); unsigned short* lbp1h = (unsigned short*)vbx_shared_malloc(width*sizeof(unsigned short)); unsigned short* lbp2h = (unsigned short*)vbx_shared_malloc(width*sizeof(unsigned short)); unsigned short* lbp4h = (unsigned short*)vbx_shared_malloc(width*sizeof(unsigned short)); unsigned char* lbp1b = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char)); unsigned char* lbp2b = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char)); unsigned char* lbp4b = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char)); img = img + width; vbx_dma_to_vector(v_a1, img, width*sizeof(unsigned short)); vbx_dma_to_vector(v_b1, img + width, width*sizeof(unsigned short)); vbx_dma_to_vector(v_a2, img, width*sizeof(unsigned short)); vbx_dma_to_vector(v_b2, img + width, width*sizeof(unsigned short)); vbx_dma_to_vector(v_a4, img, width*sizeof(unsigned short)); vbx_dma_to_vector(v_b4, img + width, width*sizeof(unsigned short)); vbx_sync(); int i; int m = 48; for(i=0; i<m; i++){ v_a1[i] = 0; v_b1[i] = 0; v_a2[i] = 0; 
v_b2[i] = 0; v_a4[i] = 0; v_b4[i] = 0; } int n = 12; int src_a1[] = {0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int src_b1[] = {0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int src_a2[] = {0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int src_b2[] = {0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; int src_a4[] = {0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0}; int src_b4[] = {0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0}; for(i=0; i<16; i++){ v_a1[i] = src_a1[i]; v_b1[i] = src_b1[i]; v_a2[i] = src_a2[i]; v_b2[i] = src_b2[i]; v_a4[i] = src_a4[i]; v_b4[i] = src_b4[i]; } vbx_set_vl(width); vbx(VVHU, VCUSTOM1, v_1h, v_a1, v_b1); vbx(VVHU, VCUSTOM2, v_2h, v_a2, v_b2); vbx(VVHU, VCUSTOM3, v_4h, v_a4, v_b4); vbx(VVHB, VADD, v_1b, v_1h, ((vbx_byte_t*)v_1h) + 1); vbx(VVHB, VADD, v_2b, v_2h, ((vbx_byte_t*)v_2h) + 1); vbx(VVHB, VADD, v_4b, v_4h, ((vbx_byte_t*)v_4h) + 1); vbx_dma_to_host(lbp1h, v_1h, width*sizeof(unsigned short)); vbx_dma_to_host(lbp2h, v_2h, width*sizeof(unsigned short)); vbx_dma_to_host(lbp4h, v_4h, width*sizeof(unsigned short)); vbx_dma_to_host(lbp1b, v_1b, width*sizeof(unsigned char)); vbx_dma_to_host(lbp2b, v_2b, width*sizeof(unsigned char)); vbx_dma_to_host(lbp4b, v_4b, width*sizeof(unsigned char)); vbx_sync(); test_print_array_half(v_a1, n); test_print_array_half(v_b1, n); test_print_hex_array_half(lbp1h, n); test_print_hex_array_byte(lbp1b, n); test_print_array_half(v_a2, n); test_print_array_half(v_b2, n); test_print_hex_array_half(lbp2h, n); test_print_hex_array_byte(lbp2b, n); test_print_array_half(v_a4, n); test_print_array_half(v_b4, n); test_print_hex_array_half(lbp4h, n); test_print_hex_array_byte(lbp4b, n); vbx_sp_free(); vbx_shared_free(lbp1h); vbx_shared_free(lbp2h); vbx_shared_free(lbp4h); vbx_shared_free(lbp1b); vbx_shared_free(lbp2b); vbx_shared_free(lbp4b); return 0; }
int VBX_T(vbw_vec_reverse_test)() { unsigned int aN[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 20, 25, 31, 32, 33, 35, 40, 48, 60, 61, 62, 63, 64, 64, 65, 66, 67, 68, 70, 80, 90, 99, 100, 101, 110, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 400, 450, 512, 550, 600, 650, 700, 768, 768, 900, 900, 1023, 1024, 1200, 1400, 1600, 1800, 2048, 2048, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4096, 4096, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 6000, 7000, 8000, 8192, 8192, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 16384, 16384, 20000, 25000, 30000, 32767, 32768, 32768, 35000, 40000, 45000, 50000, 55000, 60000, 65000, 65535, 65536, 65536 }; int retval; unsigned int N; unsigned int NBYTES; unsigned int NREPS = 100; unsigned int i,k; vbx_timestamp_t start=0,finish=0; vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); const unsigned int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size; for( i=0; i<sizeof(aN)/4; i++ ) { N = aN[i]; //printf( "testing with vector size %d\n", N ); NBYTES = sizeof(vbx_sp_t)*N; if( 2*NBYTES > VBX_SCRATCHPAD_SIZE ) continue; vbx_sp_t *vsrc = vbx_sp_malloc( NBYTES ); vbx_sp_t *vdst = vbx_sp_malloc( NBYTES ); //printf("bytes alloc: %d\n", NBYTES ); if( !vsrc ) VBX_EXIT(-1); if( !vdst ) VBX_EXIT(-1); #if ( VBX_TEMPLATE_T == BYTESIZE_DEF | VBX_TEMPLATE_T == UBYTESIZE_DEF ) unsigned int mask = 0x007F; #elif ( VBX_TEMPLATE_T == HALFSIZE_DEF | VBX_TEMPLATE_T == UHALFSIZE_DEF ) unsigned int mask = 0x7FFF; #else unsigned int mask = 0xFFFF; #endif vbx_set_vl( N ); vbx( SV(T), VMOV, vdst, -1, 0 ); // Fill the destination vector with -1 vbx( SE(T), VAND, vsrc, mask, 0 ); // Fill the source vector with enumerated values //VBX_T(print_vector)( "vsrcInit", vsrc, N ); //VBX_T(print_vector)( "vdstInit", vdst, N ); /** measure performance of function call **/ vbx_sync(); start = vbx_timestamp(); 
for(k=0; k<NREPS; k++ ) { retval = VBX_T(vbw_vec_reverse)( vdst, vsrc, N ); vbx_sync(); } finish = vbx_timestamp(); printf( "length %d (%s):\tvbware sp f():\t%llu", N, VBX_EXPAND_AND_QUOTE(BYTEHALFWORD), (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) ); //VBX_T(print_vector)( "vsrcPost", vsrc, N ); //VBX_T(print_vector)( "vdstPost", vdst, N ); #if VERIFY_VBWARE_ALGORITHM VBX_T(verify_vector)( vsrc, vdst, N ); #else printf(" [VERIFY OFF]"); #endif printf("\treturn value: %X", retval); vbx_set_vl( N ); vbx( SE(T), VAND, vsrc, mask, 0 ); // Reset the source vector /** measure performance of simple algorithm **/ vbx_sync(); vbx_set_vl( 1 ); vbx_set_2D( N, -sizeof(vbx_sp_t), sizeof(vbx_sp_t), 0 ); start = vbx_timestamp(); for(k=0; k<NREPS; k++ ) { vbx_2D( VV(T), VMOV, vdst+N-1, vsrc, 0 ); vbx_sync(); } finish = vbx_timestamp(); printf( "\tsimple (vl=1):\t%llu", (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) ); #if VERIFY_SIMPLE_ALGORITHM VBX_T(verify_vector)( vsrc, vdst, N ); #else printf(" [VERIFY OFF]"); #endif printf("\tcycles\n"); vbx_sp_free(); } vbx_sp_free(); printf("All tests passed successfully.\n"); return 0; }
//vector version of rgb converter void vector_blend( output_pointer img_out, input_pointer img_in1, input_pointer img_in2, unsigned int num_row, unsigned int num_column, intermediate_type blending_const ) { intermediate_type *v_img1[2]; input_type *v_img2[2]; intermediate_type *v_temp; intermediate_type blending_const_bar = 256-blending_const; int j; vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size; const int VBX_WIDTH_BYTES = this_mxp->vector_lanes * sizeof(int); const int VBX_DMA_ALIGNMENT = this_mxp->dma_alignment_bytes; unsigned int chunk_size = VBX_SCRATCHPAD_SIZE/((3*sizeof(intermediate_type))+(2*sizeof(input_type))); chunk_size = VBX_PAD_UP( chunk_size-(VBX_WIDTH_BYTES-1), VBX_DMA_ALIGNMENT ); unsigned int chunk_size_old = chunk_size; unsigned int vector_length = chunk_size; unsigned int vector_length_old = vector_length; v_img1[0] = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) ); v_img1[1] = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) ); v_img2[0] = (input_type *)vbx_sp_malloc( chunk_size*sizeof(input_type) ); v_img2[1] = (input_type *)vbx_sp_malloc( chunk_size*sizeof(input_type) ); v_temp = (intermediate_type *)vbx_sp_malloc( chunk_size*sizeof(intermediate_type) ); if( v_temp == NULL ) { VBX_EXIT(0xBADDEAD); } int bufselect = 0; vbx_dma_to_vector( v_img1[bufselect], img_in1, chunk_size*sizeof(input_type) ); vbx_dma_to_vector( v_img2[bufselect], img_in2, chunk_size*sizeof(input_type) ); for( j=0; j<num_row*num_column; j+=vector_length_old ) { vbx_set_vl(vector_length); if( j > 0 ) { vbx_dma_to_host( img_out+j-vector_length_old, v_img1[1-bufselect], chunk_size_old*sizeof(output_type) ); } if( (j+vector_length_old) < (num_row*num_column-1) ) { if( (j+vector_length_old*2) >= num_row*num_column ) { vector_length = num_row*num_column - j - vector_length_old; chunk_size = vector_length; } vbx_dma_to_vector( v_img1[1-bufselect], img_in1+j+vector_length_old, 
chunk_size*sizeof(input_type) ); vbx_dma_to_vector( v_img2[1-bufselect], img_in2+j+vector_length_old, chunk_size*sizeof(input_type) ); } vbx( SVBHU, VMULLO, v_temp, blending_const, v_img1[bufselect] ); vbx( SVBHU, VMULLO, v_img1[bufselect], blending_const_bar, v_img2[bufselect] ); vbx( VVHU, VADD, v_img1[bufselect], v_img1[bufselect], v_temp ); vbx( SVHBU, VSHR, v_img1[bufselect], 8, v_img1[bufselect] ); bufselect = 1-bufselect; } vbx_dma_to_host( img_out+j-vector_length_old, v_img1[1-bufselect], chunk_size*sizeof(output_type) ); vbx_sp_free(); vbx_sync(); }
/*
 * One step of block-matching motion estimation.
 *
 * Converts a MOTEST_BUFFER_WIDTH x MOTEST_BUFFER_HEIGHT window of
 * input_buffer around the current block position to luma, computes the sum
 * of absolute differences against *last_luma for every offset in the
 * MOTEST_SEARCH_WIDTH x MOTEST_SEARCH_HEIGHT search area, and moves
 * (*motest_x, *motest_y) to the best match. Equal SADs are resolved in favour
 * of the offset closest to the centre of the search area.
 *
 * The function also draws a line from the old block centre to the new one,
 * stores the matched block in *last_luma for the next call, and draws the
 * block marker twice (offset by one pixel).
 *
 * When *last_luma is NULL or reset is non-zero, init_vector_motest() is
 * called first with (start_x, start_y).
 *
 * Returns 0, or -1 if the touchscreen pen check (compiled in only with
 * TOUCHSCREEN and TOUCH_INTERRUPTS_VBX) aborts the search. Spins forever if
 * the scratchpad allocation fails.
 */
int vector_motest(pixel *input_buffer, luma_type **last_luma, int *motest_x, int *motest_y, int start_x, int start_y, int reset, const int image_width, const int image_height, const int image_pitch)
{
	int row, col, win_y, win_x;
	unsigned int cur_sad, best_sad, best_y, best_x;

	vbx_uhalf_t *v_search_luma, *v_last_luma;
	vbx_uhalf_t *v_row_temp;
	vbx_uword_t *v_row;
	vbx_uword_t *v_sad;

	pixel color;

	if(reset || *last_luma == NULL){
		init_vector_motest(input_buffer, last_luma, motest_x, motest_y, start_x, start_y, image_pitch);
	}

	v_search_luma = vbx_sp_malloc( MOTEST_BUFFER_SIZE   * sizeof(vbx_uhalf_t) );
	v_last_luma   = vbx_sp_malloc( MOTEST_BLOCK_SIZE    * sizeof(vbx_uhalf_t) );
	v_row_temp    = vbx_sp_malloc( MOTEST_BUFFER_WIDTH  * sizeof(vbx_uhalf_t) );
	v_row         = vbx_sp_malloc( MOTEST_BUFFER_WIDTH  * sizeof(vbx_uword_t) );
	v_sad         = vbx_sp_malloc( MOTEST_SEARCH_SIZE   * sizeof(vbx_uword_t) );
	if(v_sad == NULL){
		printf("Not enough scratchpad for motest\n");
		for(;;){
		}
	}

	/* Top-left corner of the search window, clamped to the image:
	   lower bound first, then upper bound. */
	win_x = *motest_x-(MOTEST_SEARCH_WIDTH/2);
	win_y = *motest_y-(MOTEST_SEARCH_HEIGHT/2);

	if(win_x < 0){
		win_x = 0;
	}
	if(win_x > image_width-MOTEST_BUFFER_WIDTH){
		win_x = image_width-MOTEST_BUFFER_WIDTH;
	}
	if(win_y < 0){
		win_y = 0;
	}
	if(win_y > image_height-MOTEST_BUFFER_HEIGHT){
		win_y = image_height-MOTEST_BUFFER_HEIGHT;
	}

	vector_rectangle_to_luma(input_buffer, v_search_luma, v_row_temp, v_row,
	                         win_x, win_y,
	                         MOTEST_BUFFER_WIDTH, MOTEST_BUFFER_HEIGHT, image_pitch);

	vbx_dma_to_vector(v_last_luma, *last_luma, MOTEST_BLOCK_SIZE*sizeof(vbx_uhalf_t));

	//Vector compute sad here
	vbx_set_2D(MOTEST_BLOCK_HEIGHT,
	           sizeof(vbx_uword_t),
	           MOTEST_BUFFER_WIDTH*sizeof(vbx_uhalf_t),
	           MOTEST_BLOCK_WIDTH*sizeof(vbx_uhalf_t));

	for(row = 0; row < MOTEST_SEARCH_HEIGHT; row++){
		for(col = 0; col < MOTEST_SEARCH_WIDTH; col++){
			/* Accumulate absolute differences into v_row, then add the
			   two halves of v_row into the SAD entry for this offset. */
			vbx_set_vl(MOTEST_BLOCK_WIDTH);
			vbx_acc_2D(VVHWU, VABSDIFF, v_row, v_search_luma+(row*MOTEST_BUFFER_WIDTH)+col, v_last_luma);
			vbx_set_vl(MOTEST_BLOCK_HEIGHT/2);
			vbx_acc(VVWU, VADD, v_sad+(row*MOTEST_SEARCH_WIDTH)+col, v_row, v_row+MOTEST_BLOCK_HEIGHT/2);
		}
#if TOUCHSCREEN
#ifdef TOUCH_INTERRUPTS_VBX
		if (touchscreen_get_pen(pTouch)) {
			vbx_sp_free();
			return -1;
		}
#endif
#endif
	}

	vbx_sync();

	/* Scan the SAD table for the minimum. */
	best_sad = INT_MAX;
	best_y   = *motest_y;
	best_x   = *motest_x;

	for(row = 0; row < MOTEST_SEARCH_HEIGHT; row++){
		for(col = 0; col < MOTEST_SEARCH_WIDTH; col++){
			cur_sad = v_sad[row*MOTEST_SEARCH_WIDTH+col];

			if(cur_sad < best_sad){
				best_sad = cur_sad;
				best_x = col+win_x;
				best_y = row+win_y;
			} else if(cur_sad == best_sad) {
				/* Tie: prefer the candidate nearer the search centre. */
				int cand_dist = abs( col - MOTEST_SEARCH_WIDTH/2) + abs( row - MOTEST_SEARCH_HEIGHT/2);
				int held_dist = abs((best_x-win_x) - MOTEST_SEARCH_WIDTH/2) + abs((best_y-win_y) - MOTEST_SEARCH_HEIGHT/2);

				if(cand_dist < held_dist){
					best_x = col+win_x;
					best_y = row+win_y;
				}
			}
		}
	}

	color.r = 0;
	color.g = 255;
	color.b = 0;
	color.a = 0;

	/* Line from the old block centre to the new one. */
	scalar_draw_line(*motest_x+(MOTEST_BLOCK_WIDTH/2), *motest_y+(MOTEST_BLOCK_HEIGHT/2),
	                 best_x+(MOTEST_BLOCK_WIDTH/2), best_y+(MOTEST_BLOCK_HEIGHT/2),
	                 color, input_buffer, image_pitch);

	*motest_y = best_y;
	*motest_x = best_x;

	/* Copy the matched block out of the search window, one row at a time. */
	vbx_set_vl(MOTEST_BLOCK_WIDTH);
	for(row = 0; row < MOTEST_BLOCK_HEIGHT; row++){
		vbx(VVHU, VMOV,
		    v_last_luma+(row*MOTEST_BLOCK_WIDTH),
		    v_search_luma+((row+best_y-win_y)*MOTEST_BUFFER_WIDTH)+(best_x-win_x),
		    0);
	}
	vbx_dma_to_host(*last_luma, v_last_luma, MOTEST_BLOCK_SIZE*sizeof(luma_type));

	draw_motest(input_buffer, *motest_x, *motest_y, image_pitch);
	//simple hack to draw thicker
	draw_motest(input_buffer, *motest_x+1, *motest_y+1, image_pitch);

	vbx_sp_free();

	return 0;
}
int main(void) { vbx_test_init(); vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP(); const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size; const int required_vectors = 4; int N = VBX_PAD_DN(VBX_SCRATCHPAD_SIZE / sizeof(vbx_mm_t) / required_vectors, this_mxp->scratchpad_alignment_bytes); int PRINT_LENGTH = min( N, MAX_PRINT_LENGTH ); double scalar_time, vector_time; int errors=0; vbx_mxp_print_params(); printf( "\nVector copy test...\n" ); printf( "Vector length: %d\n", N ); vbx_mm_t *scalar_in = malloc( N*sizeof(vbx_mm_t) ); vbx_mm_t *scalar_out = malloc( N*sizeof(vbx_mm_t) ); vbx_mm_t *vector_in = vbx_shared_malloc( N*sizeof(vbx_mm_t) ); vbx_mm_t *vector_out = vbx_shared_malloc( N*sizeof(vbx_mm_t) ); vbx_sp_t *v_out = vbx_sp_malloc( N*sizeof(vbx_sp_t) ); vbx_sp_t *v_in = vbx_sp_malloc( N*sizeof(vbx_sp_t) ); VBX_T(test_zero_array)( scalar_in, N ); VBX_T(test_zero_array)( vector_in, N ); VBX_T(test_init_array)( scalar_in, N, 1 ); VBX_T(test_copy_array)( vector_in, scalar_in, N ); scalar_time = test_scalar( scalar_out, scalar_in, N ); VBX_T(test_print_array)( scalar_out, PRINT_LENGTH ); vbx_dma_to_vector( v_in, vector_in, N*sizeof(vbx_sp_t) ); vector_time = test_vector( v_out, v_in, N, scalar_time ); vbx_dma_to_host(vector_out, v_out, N*sizeof(vbx_sp_t) ); vbx_sync(); VBX_T(test_print_array)( vector_out, PRINT_LENGTH ); errors += VBX_T(test_verify_array)( scalar_out, vector_out, N ); vbx_sp_free(); #if TEST_DEEP_SP errors += deep_vector_copy_test(); #endif #if DEBUG_MAKE_SP_FULL vbx_sp_malloc(vbx_sp_getfree()); #endif #if TEST_DEEP_MM errors += deep_vector_copy_ext_test(); #endif VBX_TEST_END(errors); return 0; }