Esempio n. 1
0
File: test.c Progetto: cirqueit/mxp
int compare_vbx_lut_to_vbx_lut_ci(int sz, int max_print_errors)
{
    int f, n, errors;

    vbx_byte_t* v_pass = (vbx_byte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_pattern = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_lutc = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_group = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_sel = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_lut = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t));
    vbx_ubyte_t* v_idx = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t));

    unsigned char* lut = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char));
    unsigned char* lut_c = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char));

    for (n = 0; n < sz; n++) {
        v_pattern[n] = n & 0xff;
    }

    int s, stage = 11;
    for (f = 0; f < face_lbp[stage].count; f++) {
        lbp_feat_t feat = face_lbp[stage].feats[f];

        vbx_set_vl(sz);
        int total = f;
        s = 0;
        while(s < stage){
            total += face_lbp[s].count;
            s++;
        }
        vbx(SVBU, VCUSTOM0, v_lutc, total, v_pattern);

        vbx(SVB, VMOV, v_pass, feat.fail, 0);
        /* check if pattern is in lut */
        vbx(SVBU, VSHR, v_group, 5, v_pattern);
        for (n = 0; n < 8; n++) {
            vbx(SVB, VADD, v_sel, -n, v_group);
            vbx(SVBW, VCMV_Z, v_lut, feat.lut[n], v_sel);
        }

        vbx(SVBWU, VAND, v_idx, 0x1f, v_pattern);
        vbx(VVWB, VSHR, v_lut, v_idx, v_lut);
        vbx(SVB, VAND, v_lut, 1, v_lut);
        vbx(SVB, VCMV_LEZ, v_pass, feat.pass, v_lut);

        vbx_dma_to_host(lut_c, v_lutc, sz*sizeof(unsigned char));
        vbx_dma_to_host(lut, v_pass, sz*sizeof(unsigned char));
        vbx_sync();

        errors = match_array_byte(lut_c, lut, "custom_lut", sz, 1, max_print_errors, 0, 0);

    }
    vbx_sp_free();
    vbx_shared_free(lut);
    vbx_shared_free(lut_c);
    return errors;
}
Esempio n. 2
0
File: test.c Progetto: cirqueit/mxp
int compare_vbx_lbp_ci_to_scalar_patterns(unsigned short* img, int width, int height, int max_print_errors)
{
    int j, errors = 0;
    unsigned char** scalar_patterns = test_scalar_patterns(img, 0, width, height);

    vbx_ubyte_t* v_in = (vbx_ubyte_t*)vbx_sp_malloc(3*width*sizeof(vbx_word_t));
    vbx_ubyte_t* v_top = (vbx_byte_t*)vbx_sp_malloc(width*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_bot = (vbx_byte_t*)vbx_sp_malloc(width*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_lbp = v_bot;

    unsigned char* lbp = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));

    vbx_set_vl(width);
    for(j=0; j < height - 2; j++){
        vbx_dma_to_vector(v_in, img+j*width, 3*width*sizeof(unsigned char));
        vbx(VVHU, VCUSTOM1, v_top, v_in, v_in+width); 
        vbx(VVHU, VCUSTOM1, v_bot, v_in+width, v_in+2*width); 
        vbx(SVHBU, VAND, v_top, 0xf0, v_top);
        vbx(SVHBU, VAND, v_bot, 0x0f, v_bot);
        vbx(VVBU, VADD, v_lbp, v_bot, v_top); 
        vbx_dma_to_host(lbp, v_lbp, width*sizeof(unsigned char));
        vbx_sync();

        errors = match_array_byte(lbp, scalar_patterns[0]+j*width, "custom_lbp", width-2, 1, max_print_errors, 1, j);

    }
    vbx_sp_free();
    vbx_shared_free(lbp);
    return errors;
}
Esempio n. 3
0
void vbx_mtx_fdct_free( vbx_mtx_fdct_free *v )
{
	vbx_shared_free( v );
	vbx_sp_pop();
	//vbx_sync();  // don't wait for result to be written; let it run in the background
	vbx_sync();  // wait for all results?
}
Esempio n. 4
0
int dma_bandwidth_test()
{
	const int num_iter = 64;

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	int scratchpad_size = this_mxp->scratchpad_size;

	uint8_t *buf = vbx_shared_malloc(scratchpad_size);
	vbx_ubyte_t *v_buf = vbx_sp_malloc(scratchpad_size);

	vbx_timestamp_t time_start, time_stop;

	int i;
	int len;
	int to_host;
	int errors = 0;

	vbx_mxp_print_params();

	// dma_alignment_bytes gives DMA master data bus width in bytes.
	double bytes_per_sec = \
		(((double) this_mxp->core_freq) * this_mxp->dma_alignment_bytes);
	double max_megabytes_per_sec = bytes_per_sec/(1024*1024);
	printf("\nMax available bandwidth = %s Megabytes/s\n",
	       vbx_eng(max_megabytes_per_sec, 4));

	printf("\n");

	for (to_host = 0; to_host < 2; to_host++) {
		for (len = 32; len <= scratchpad_size ; len *= 2) {
			printf("DMA %s, %d bytes\n", to_host ? "write" : "read", len);
			vbx_timestamp_start();
			if (to_host) {
				time_start = vbx_timestamp();
				for (i = 0; i < num_iter; i++) {
					vbx_dma_to_host(buf, v_buf, len);
				}
				vbx_sync();
				time_stop = vbx_timestamp();
			} else {
				time_start = vbx_timestamp();
				for (i = 0; i < num_iter; i++) {
					vbx_dma_to_vector(v_buf, buf, len);
				}
				vbx_sync();
				time_stop = vbx_timestamp();
			}
			print_dma_bandwidth(time_start, time_stop, len, num_iter,
			                    max_megabytes_per_sec);
			printf("\n");
		}
		printf("\n");
	}

	vbx_shared_free(buf);
	vbx_sp_free();

	return errors;
}
Esempio n. 5
0
File: test.c Progetto: 8l/mxp
int compare_vbx_lbp_ci_to_scalar_patterns(unsigned short* img, int log, int width, int height, int max_print_errors)
{
    int j, l, cell, max_cell, errors = 0;
    unsigned char** scalar_patterns = test_scalar_patterns(img, log, width, height);

    max_cell = 1<<log;
    vbx_uhalf_t* v_in = (vbx_uhalf_t*)vbx_sp_malloc((1+2*max_cell)*width*sizeof(vbx_half_t));
    vbx_uhalf_t* v_top = (vbx_half_t*)vbx_sp_malloc(width*sizeof(vbx_half_t));
    vbx_uhalf_t* v_bot = (vbx_half_t*)vbx_sp_malloc(width*sizeof(vbx_half_t));
    vbx_ubyte_t* v_lbp = (vbx_ubyte_t*)v_bot;

    unsigned char* lbp = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));

    vbx_set_vl(width);
    for(l = 0; l < 1; l++){
        cell = 1<<l;
        for(j=0; j < height - 2*cell; j++){
            vbx_dma_to_vector(v_in, img+j*width, (1+2*cell)*width*sizeof(unsigned short));
            vbx(VVHU, VCUSTOM1, v_top, v_in, v_in+(1*cell)*width); 
            vbx(VVHU, VCUSTOM1, v_bot, v_in+(1*cell)*width, v_in+(2*cell)*width); 
            vbx(SVHBU, VAND, (vbx_ubyte_t*)v_top, 0xf0, v_top);
            vbx(SVHBU, VAND, (vbx_ubyte_t*)v_bot, 0x0f, v_bot);
            vbx(VVBU, VADD, v_lbp, v_bot, v_top); 
            vbx_dma_to_host(lbp, v_lbp, width*sizeof(unsigned char));
            vbx_sync();

            errors += match_array_byte(lbp, scalar_patterns[l]+j*width, "custom_lbp", width-2*cell, 1, 0, max_print_errors, 1, j);
            if (errors > max_print_errors){
                max_print_errors = 0;
            }

        }
    }
    vbx_sp_free();
    vbx_shared_free(lbp);
    return errors;
}
Esempio n. 6
0
File: test.c Progetto: 8l/mxp
int compare_vbx_lut_to_vbx_lut_ci(int stage, int max_print_errors)
{
	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	int vci_lanes = this_mxp->vcustom0_lanes;
    int sz = this_mxp->scratchpad_size/(16*sizeof(vbx_ubyte_t));

    vbx_byte_t* v_pass = (vbx_byte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_pattern = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_lutc = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_group = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_sel = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_byte_t));
    vbx_ubyte_t* v_lut = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t));
    vbx_ubyte_t* v_idx = (vbx_ubyte_t*)vbx_sp_malloc(sz*sizeof(vbx_word_t));
    if(v_idx == NULL) {
        printf("failed to allocate in compare_vbx_lut_to_vbx_lut_ci\n");
    }

    unsigned char* lut = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char));
    unsigned char* lut_c = (unsigned char*)vbx_shared_malloc(sz*sizeof(unsigned char));

    int f, n, s, errors = 0;
    for (n = 0; n < sz; n++) {
        v_pattern[n] = (n & 0xff);
    }

    for (f = 0; f < face_lbp[stage].count; f++) {
        lbp_feat_t feat = face_lbp[stage].feats[f];

        vbx_set_vl(sz);
        int total = f;
        s = 0;
        while(s < stage){
            total += face_lbp[s].count;
            s++;
        }

        if(total < 256) {
            vbx(SVBU, VLBPLUT, v_lutc, total, v_pattern);
        } else {
            vbx(SVBS, VLBPLUT, v_lutc, total-256, v_pattern);
        }

        vbx(SVB, VMOV, v_pass, feat.fail, 0);
        /* check if pattern is in lut */
        vbx(SVBU, VSHR, v_group, 5, v_pattern);
        for (n = 0; n < 8; n++) {
            vbx(SVB, VADD, v_sel, -n, v_group);
            vbx(SVBW, VCMV_Z, v_lut, feat.lut[n], v_sel);
        }

        vbx(SVBWU, VAND, v_idx, 0x1f, v_pattern);
        vbx(VVWB, VSHR, v_lut, v_idx, v_lut);
        vbx(SVB, VAND, v_lut, 1, v_lut);
        vbx(SVB, VCMV_LEZ, v_pass, feat.pass, v_lut);

        vbx_dma_to_host(lut_c, v_lutc, sz*sizeof(unsigned char));
        vbx_dma_to_host(lut, v_pass, sz*sizeof(unsigned char));
        vbx_sync();

        errors += match_array_byte(lut, lut_c, "custom_lut", sz, 1, 0, max_print_errors, 0, 0);

    }
    vbx_sp_free();
    vbx_shared_free(lut);
    vbx_shared_free(lut_c);
    return errors;
}
Esempio n. 7
0
File: test.c Progetto: 8l/mxp
int test_lbp_ci(unsigned short* img, int width, int height)
{

    vbx_uhalf_t* v_a1  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_b1  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_1h = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));

    vbx_uhalf_t* v_a2  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_b2  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_2h  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));

    vbx_uhalf_t* v_a4  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_b4  = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));
    vbx_uhalf_t* v_4h = (vbx_uhalf_t*)vbx_sp_malloc(width*sizeof(vbx_uhalf_t));

    vbx_ubyte_t* v_1b  = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t));
    vbx_ubyte_t* v_2b  = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t));
    vbx_ubyte_t* v_4b  = (vbx_ubyte_t*)vbx_sp_malloc(width*sizeof(vbx_ubyte_t));

    unsigned short* lbp1h = (unsigned short*)vbx_shared_malloc(width*sizeof(unsigned short));
    unsigned short* lbp2h = (unsigned short*)vbx_shared_malloc(width*sizeof(unsigned short));
    unsigned short* lbp4h = (unsigned short*)vbx_shared_malloc(width*sizeof(unsigned short));

    unsigned char* lbp1b = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));
    unsigned char* lbp2b = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));
    unsigned char* lbp4b = (unsigned char*)vbx_shared_malloc(width*sizeof(unsigned char));

    img = img + width;

    vbx_dma_to_vector(v_a1, img,         width*sizeof(unsigned short));
    vbx_dma_to_vector(v_b1, img + width, width*sizeof(unsigned short));
    vbx_dma_to_vector(v_a2, img,         width*sizeof(unsigned short));
    vbx_dma_to_vector(v_b2, img + width, width*sizeof(unsigned short));
    vbx_dma_to_vector(v_a4, img,         width*sizeof(unsigned short));
    vbx_dma_to_vector(v_b4, img + width, width*sizeof(unsigned short));
    vbx_sync();

    int i;
    int m = 48;
    for(i=0; i<m; i++){
        v_a1[i] = 0;
        v_b1[i] = 0;
        v_a2[i] = 0;
        v_b2[i] = 0;
        v_a4[i] = 0;
        v_b4[i] = 0;
    }
    int n = 12;
    int src_a1[] = {0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    int src_b1[] = {0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    int src_a2[] = {0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    int src_b2[] = {0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    int src_a4[] = {0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0};
    int src_b4[] = {0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0};
    
    for(i=0; i<16; i++){
        v_a1[i] = src_a1[i];
        v_b1[i] = src_b1[i];
        v_a2[i] = src_a2[i];
        v_b2[i] = src_b2[i];
        v_a4[i] = src_a4[i];
        v_b4[i] = src_b4[i];
    }

    vbx_set_vl(width);
    vbx(VVHU, VCUSTOM1, v_1h, v_a1, v_b1); 
    vbx(VVHU, VCUSTOM2, v_2h, v_a2, v_b2); 
    vbx(VVHU, VCUSTOM3, v_4h, v_a4, v_b4); 
    vbx(VVHB, VADD, v_1b, v_1h, ((vbx_byte_t*)v_1h) + 1);
    vbx(VVHB, VADD, v_2b, v_2h, ((vbx_byte_t*)v_2h) + 1);
    vbx(VVHB, VADD, v_4b, v_4h, ((vbx_byte_t*)v_4h) + 1);
    vbx_dma_to_host(lbp1h, v_1h, width*sizeof(unsigned short));
    vbx_dma_to_host(lbp2h, v_2h, width*sizeof(unsigned short));
    vbx_dma_to_host(lbp4h, v_4h, width*sizeof(unsigned short));
    vbx_dma_to_host(lbp1b, v_1b, width*sizeof(unsigned char));
    vbx_dma_to_host(lbp2b, v_2b, width*sizeof(unsigned char));
    vbx_dma_to_host(lbp4b, v_4b, width*sizeof(unsigned char));
    vbx_sync();

    test_print_array_half(v_a1, n);
    test_print_array_half(v_b1, n);
    test_print_hex_array_half(lbp1h, n);
    test_print_hex_array_byte(lbp1b, n);

    test_print_array_half(v_a2, n);
    test_print_array_half(v_b2, n);
    test_print_hex_array_half(lbp2h, n);
    test_print_hex_array_byte(lbp2b, n);

    test_print_array_half(v_a4, n);
    test_print_array_half(v_b4, n);
    test_print_hex_array_half(lbp4h, n);
    test_print_hex_array_byte(lbp4b, n);

    vbx_sp_free();
    vbx_shared_free(lbp1h);
    vbx_shared_free(lbp2h);
    vbx_shared_free(lbp4h);
    vbx_shared_free(lbp1b);
    vbx_shared_free(lbp2b);
    vbx_shared_free(lbp4b);
    return 0;
}
Esempio n. 8
0
File: test.c Progetto: 8l/mxp
int VBX_T(vbw_vec_reverse_test_mm)()
{
	unsigned int aN[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 20, 25, 31, 32, 33, 35, 40, 48, 60, 61, 62, 63, 64, 64, 65,
	                      66, 67, 68, 70, 80, 90, 99, 100, 101, 110, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 224, 224,
	                      256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 400, 450, 512, 550, 600, 650, 700, 768, 768, 900,
	                      900, 1023, 1024, 1200, 1400, 1600, 1800, 2048, 2048, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800,
	                      2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4096, 4096, 4100, 4200, 4300,
	                      4400, 4500, 4600, 4700, 4800, 4900, 5000, 6000, 7000, 8000, 8192, 8192, 9000, 10000, 11000, 12000,
	                      13000, 14000, 15000, 16000, 16384, 16384, 20000, 25000, 30000, 32767, 32768, 32768, 35000, 40000,
	                      45000, 50000, 55000, 60000, 65000, 65535, 65536, 65536, 65537, 100000, 128000, 256000, 333333, 528374,
	                      528374 };

	int retval;
	unsigned int N;
	unsigned int NBYTES;
	unsigned int NREPS = 100;
	unsigned int NREPSFORLARGE = 10;
	unsigned int i,j,k;
	vbx_timestamp_t start=0,finish=0;

	for( i=0; i<sizeof(aN)/4; i++ ) {
		N = aN[i];
		//printf( "testing with vector size %d\n", N );

		if(N > 10000) NREPS = NREPSFORLARGE;

		NBYTES = N*sizeof(vbx_mm_t);

		vbx_mm_t *src = (vbx_mm_t *) vbx_shared_malloc( NBYTES );
		vbx_mm_t *dst = (vbx_mm_t *) vbx_shared_malloc( NBYTES );
		//printf("bytes alloc: %d\n", NBYTES );

		if( !src ) VBX_EXIT(-1);
		if( !dst ) VBX_EXIT(-1);

		for ( j=0; j<N; j++ ) {
			dst[j] = -1;                 // Fill the destination with -1
			src[j] = j;                  // Fill the source with enumerated values
		}

//			VBX_T(vbw_vec_reverse_ext)( dst, src, N );

		/** measure performance of function call **/
		start = vbx_timestamp();
		for(k=0; __builtin_expect(k<NREPS,1); k++ ) {
			retval = VBX_T(vbw_vec_reverse_ext)( dst, src, N );
		}
		finish = vbx_timestamp();
		printf( "length %d (%s):\tvbware mm f():\t%llu", N, VBX_EXPAND_AND_QUOTE(BYTEHALFWORD), (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) );

		#if VERIFY_VBWARE_ALGORITHM
			VBX_T(verify_vector)( src, dst, N );
		#else
			printf(" [VERIFY OFF]");
		#endif

		printf("\treturn value: %X", retval);

		/** measure performance of scalar **/
		vbx_mm_t *A = vbx_remap_cached( src, N*sizeof(vbx_mm_t) );   // Use cached pointers for better performance
		vbx_mm_t *B = vbx_remap_cached( dst, N*sizeof(vbx_mm_t) );
		start = vbx_timestamp();
		for(k=0; k<NREPS; k++ ) {
			unsigned int m;
			for(m=0; m<N; m++) {
				B[N-1-m]=A[m];
			}
		vbx_dcache_flush( A, N*sizeof(vbx_mm_t) );               // Make sure to read from main memory
		vbx_dcache_flush( B, N*sizeof(vbx_mm_t) );               // Make sure writes are committed to memory
		}
		finish = vbx_timestamp();

		printf( "\tscalar (cache friendly):\t%llu", (unsigned long long) vbx_mxp_cycles((finish-start)/NREPS) );

		#if VERIFY_SIMPLE_ALGORITHM
			VBX_T(verify_vector)( src, dst, N );
		#else
			printf(" [VERIFY OFF]");
		#endif
			printf("\tcycles\n");

			vbx_shared_free(src);
			vbx_shared_free(dst);
	}

	printf("All tests passed successfully.\n");

	return 0;
}
Esempio n. 9
0
int main_tile()
{
	int i, j, k, l, base, block_num;
	int x, y;

	int time_start, time_stop;
	unsigned int cycles;
	double vbx_time, scalar_time;
	int wrong;

	int total_errors = 0;

	//all of the initialization can be hard coded without any computation
	vbx_mtx_fdct_t *v = vbx_mtx_fdct_init( coeff_v, image );
	vbx_timestamp_start();

	printf("\nGenerating initial data...\n");

	dt *image  = (dt *) malloc( IMAGE_WIDTH * IMAGE_HEIGHT * sizeof(dt) );
	GenerateRandomImage( image, IMAGE_WIDTH, IMAGE_HEIGHT, 0/*seed*/ );

	// Allocate memory to store results.
	// Results are computed BIGTILE_SIZE halfwords at a time.
	const int BIGTILE_SIZE = NUM_TILE_X * NUM_TILE_Y * DCT_SIZE;
	dt *block_s =                   malloc( BIGTILE_SIZE * sizeof(dt) );
	dt *block_v = (dt *) vbx_shared_malloc( BIGTILE_SIZE * sizeof(dt) );
	dt *coeff_v = (dt *) vbx_shared_malloc( BIGTILE_SIZE * sizeof(dt) );

	//Make an uncached 1D version of the coeff matrix
	for (i = 0; i < NUM_TILE_Y; i++) {             // row
		for (j = 0; j < BLOCK_SIZE; j++) {         // row
			for (k = 0; k < NUM_TILE_X; k++) {     // col
				for (l = 0; l < BLOCK_SIZE; l++) { // col
					coeff_v[i*NUM_TILE_X*DCT_SIZE + j*DCT_SIZE + k*BLOCK_SIZE + l] = cs[j][l];
				}
			}
		}
	}

#ifdef DEBUG
	printf("input matrix is:\n");
	for (i = 0; i < BLOCK_SIZE; i++) {
		base = i * BLOCK_SIZE;
		for (j = 0; j < BLOCK_SIZE; j++) {
			printf("%d ", (int) block_s[base + j]);
		}
		printf("\n");
	}
#endif

	printf("\nRunning DCT...\n");

	time_start = vbx_timestamp();
	for( y = 0; y < IMG_DOWN; y++ ) {
		for( x = 0; x < IMG_ACROSS; x++ ) {
			vbx_mtx_fdct_scalar( block_s, (dt*)cs, image, x/*start_x*/, y/*start_y*/, NUM_TILE_X, NUM_TILE_Y );
		}
	}
	time_stop = vbx_timestamp();

	cycles = time_stop - time_start;
	scalar_time = (double) cycles;
	scalar_time /= (double) vbx_timestamp_freq();
	scalar_time *= 1000.0;		//ms
	vbx_timestamp_t mxp_cycles = vbx_mxp_cycles(cycles);

	printf("%dx%d Block Size\n", BLOCK_SIZE, BLOCK_SIZE);
	printf("Finished, scalar CPU took %0.3f ms \n", scalar_time);
	printf(" CPU Cycles: %d\n", (int) mxp_cycles);
	printf(" CPU Cycles per block: %f\n", mxp_cycles / ((double) (NUM_BLOCKS)));

	vbx_sync(); // wait for image to be prefetched

	time_start = vbx_timestamp();
	for( y = 0; y < IMG_DOWN; y++ ) {
		for( x = 0; x < IMG_ACROSS; x++ ) {
			vbx_mtx_fdct( v, block_v, image, x/*start_x*/, y/*start_y*/, IMG_ACROSS-1,IMG_DOWN-1,NUM_TILE_X, NUM_TILE_Y );
		}
	}
	time_stop = vbx_timestamp();

	cycles = time_stop - time_start;
	vbx_time = (double) cycles;
	vbx_time /= (double) vbx_timestamp_freq();
	vbx_time *= 1000.0;			//ms
	mxp_cycles = vbx_mxp_cycles(cycles);

	printf("Finished, MXP took %0.3f ms \n", vbx_time);
	printf(" CPU Cycles: %d\n", (int) mxp_cycles);
	printf(" CPU Cycles per block: %f\n", mxp_cycles / ((double) (NUM_BLOCKS)));
	printf(" Speedup: %f\n", scalar_time / vbx_time);

	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	double vbx_mbps = (double) (NUM_BLOCKS) * 1000 / vbx_time;	// blocks per second
	printf("V%d@%dMHz: %dx%d tile, %dx%d blocks, %f blocks/s, %f megapixel/s\n",
	       this_mxp->vector_lanes, this_mxp->core_freq / 1000000, 
	       NUM_TILE_Y, NUM_TILE_X, 
	       BLOCK_SIZE, BLOCK_SIZE,
	       vbx_mbps, (vbx_mbps * DCT_SIZE) / 1000000);

	printf("\nChecking results...\n");

	wrong = 0;
	for (block_num = 0; block_num < NUM_BLOCKS; block_num++) {
		for (i = 0; i < BLOCK_SIZE; i++) {
			base = i * BLOCK_SIZE;
			for (j = 0; j < BLOCK_SIZE; j++) {
				if (block_s[block_num * DCT_SIZE + base + j] != block_v[block_num * DCT_SIZE + base + j]) {
					if (wrong < 5) {
						printf("\nError at %d [%d,%d], result is %d, should be %d\n",
							   block_num, i, j, (int) block_v[block_num * DCT_SIZE + base + j],
							   (int) block_s[block_num * DCT_SIZE + base + j]);
					}
					wrong++;
				}
			}
		}
	}

	printf("wrong is %d\n\n", wrong);
	total_errors += wrong;

	free(block_s);
	vbx_shared_free(block_v);
	vbx_shared_free(coeff_v);

	vbx_mtx_fdct_free( v );

	VBX_TEST_END(total_errors);

	return (0);
}
Esempio n. 10
0
int main(void)
{
	vbx_test_init();
	typedef vbx_word_t vbx_mm_t;
	vbx_mxp_t *this_mxp = VBX_GET_THIS_MXP();
	const int VBX_SCRATCHPAD_SIZE = this_mxp->scratchpad_size;
	int N = VBX_SCRATCHPAD_SIZE / sizeof(vbx_mm_t );
	N = 20;
	int M = 20;

	int PRINT_LENGTH =  N<MAX_PRINT_LENGTH ? N : MAX_PRINT_LENGTH ;
	//	int PRINT_ROWS = PRINT_LENGTH;
	int PRINT_ROWS = M<MAX_PRINT_LENGTH ? N : MAX_PRINT_LENGTH;
	int PRINT_COLS = PRINT_LENGTH;

	double scalar_time, vector_time,vector2_time;
	int errors=0;

	vbx_mxp_print_params();
	printf( "\nMatrix multiply test...\n" );
	printf( "Matrix dimensions: %d,%d\n", N, M );


	vbx_mm_t  *scalar_in1 = (vbx_mm_t*)malloc( M*N*sizeof(vbx_mm_t ) );
	vbx_mm_t  *scalar_in2 = (vbx_mm_t*)malloc( M*N*sizeof(vbx_mm_t ) );
	vbx_mm_t  *scalar_out = (vbx_mm_t*)malloc( N*N*sizeof(vbx_mm_t ) );
	vbx_mm_t  *vector_in1 = (vbx_mm_t*)vbx_shared_malloc( M*N*sizeof(vbx_mm_t ) );
	vbx_mm_t  *vector_in2 = (vbx_mm_t*)vbx_shared_malloc( M*N*sizeof(vbx_mm_t ) );
	vbx_mm_t  *vector_out = (vbx_mm_t*)vbx_shared_malloc( N*N*sizeof(vbx_mm_t ) );
	if ( scalar_in1 == NULL ||
	     scalar_in2 == NULL ||
	     scalar_out == NULL ||
	     vector_in1 == NULL ||
	     vector_in2 == NULL ||
	     vector_out == NULL ){
		printf("Malloc failed\n");
		VBX_TEST_END(1);
		return 0;
	}



	test_zero_array_word(scalar_out, N*N );
	test_zero_array_word(vector_out, N*N );

	test_init_array_word( scalar_in1, M*N, 1 );
	test_copy_array_word( vector_in1, scalar_in1, M*N );
	test_init_array_word( scalar_in2, M*N, 999 );
	//scalar_mtx_xp_MN_word( vector_in2, scalar_in2, N, N );
	test_copy_array_word( vector_in2, scalar_in2, M*N );

	test_print_matrix_word( scalar_in1, PRINT_COLS, PRINT_ROWS, M );
	test_print_matrix_word( scalar_in2, PRINT_ROWS, PRINT_COLS, N );

	//change print sizes for outputs
	PRINT_ROWS=PRINT_COLS=N<PRINT_LENGTH?N:PRINT_LENGTH;

	scalar_time = test_scalar( scalar_out, scalar_in1, N, M, scalar_in2, M, N);
	test_print_matrix_word( scalar_out, PRINT_COLS, PRINT_ROWS, N );


	vector_time = test_vector( vector_out, vector_in1, N, M, vector_in2, M, N, scalar_time );
	test_print_matrix_word( vector_out, PRINT_COLS, PRINT_ROWS, N );
	errors += test_verify_array_word( scalar_out, vector_out, N*N);

	vector2_time = test_vector_trans( vector_out, vector_in1, N, M, vector_in2, M, N, scalar_time );
	test_print_matrix_word( vector_out, PRINT_COLS, PRINT_ROWS, N );
	errors += test_verify_array_word( scalar_out, vector_out, N*N);

	vector2_time = test_vector_sp( vector_out, vector_in1, N, M, vector_in2, M, N, scalar_time );
	test_print_matrix_word( vector_out, PRINT_COLS, PRINT_ROWS, N );
	errors += test_verify_array_word( scalar_out, vector_out, N*N);

	vbx_shared_free(vector_out);
	vbx_shared_free(vector_in2);
	vbx_shared_free(vector_in1);
	free(scalar_out);
	free(scalar_in2);
	free(scalar_in1);

	//errors += orig_test();

	VBX_TEST_END(errors);
	return 0;
}