Ejemplo n.º 1
0
/*
 * bmm_top - HLS top-level kernel operating on three bus-attached memories.
 *
 * Each memory is viewed as `blockSize` logical rows; a row occupies
 * blockSize/ELEMS_PER_BUS consecutive BRAM_DT words, and every word packs
 * ELEMS_PER_BUS elements of ELEM_WIDTH_BITS bits each.
 *
 * For every row r:
 *   1. row r of b1 is unpacked into arow, row r of b3 into crow;
 *   2. for every row of b2 (unpacked into brow):
 *          crow[t] += arow[t] * brow[t]   for t in [0, blockSize)
 *      i.e. an element-wise multiply-accumulate, as written;
 *   3. crow is re-packed and stored back into row r of b3.
 *
 * Parameters:
 *   b1        - read-only input memory (RAM_DEPTH words).
 *   b2        - read-only input memory (RAM_DEPTH words).
 *   b3        - accumulator memory; read as the initial value of crow and
 *               overwritten with the result (RAM_DEPTH words).
 *   blockSize - number of rows and number of elements per row.
 *
 * NOTE(review): nothing here checks blockSize against BDIM (the size of
 * arow/brow/crow) or RAM_DEPTH; presumably the caller guarantees
 * blockSize <= BDIM and blockSize*blockSize/ELEMS_PER_BUS <= RAM_DEPTH - confirm.
 * NOTE(review): blockSize/ELEMS_PER_BUS is an integer division; if blockSize
 * is not a multiple of ELEMS_PER_BUS the trailing elements of each row are
 * never loaded, yet the multiply-accumulate loop still visits them - confirm
 * callers only pass multiples.
 */
void bmm_top(volatile BRAM_DT b1[RAM_DEPTH], volatile BRAM_DT b2[RAM_DEPTH],  volatile BRAM_DT b3[RAM_DEPTH], int blockSize)
{
// Interface binding: b1/b2/b3 are ap_bus ports mapped to AXI4M cores;
// blockSize (ap_hs) and the return port are mapped to AXI4LiteS in the
// "CONTROL" bus bundle.
#pragma HLS INTERFACE ap_bus port=b1
#pragma HLS RESOURCE core=AXI4M variable=b1
#pragma HLS INTERFACE ap_bus port=b2
#pragma HLS RESOURCE core=AXI4M variable=b2
#pragma HLS INTERFACE ap_bus port=b3
#pragma HLS RESOURCE core=AXI4M variable=b3
#pragma HLS RESOURCE core=AXI4LiteS variable=return metadata="-bus_bundle CONTROL"
#pragma HLS INTERFACE ap_hs port=blockSize
#pragma HLS RESOURCE core=AXI4LiteS variable=blockSize metadata="-bus_bundle CONTROL"

	// j indexes words within a row, k indexes elements within a row buffer.
	// i is declared but not used anywhere in this function.
	int i = 0,j = 0,k = 0;
	// Row buffers holding one unpacked row each, fully partitioned along dim 1.
	int arow[BDIM], brow[BDIM], crow[BDIM];
#pragma HLS ARRAY_PARTITION variable=arow complete dim=1
#pragma HLS ARRAY_PARTITION variable=brow complete dim=1
#pragma HLS ARRAY_PARTITION variable=crow complete dim=1

	// Local copy of the blockSize port value.
	int bsize = blockSize;

    int rowSize = bsize/ELEMS_PER_BUS;              // number of bus words per logical row (integer division)
    int numRows = bsize;                            // number of logical rows processed
    int rowIdx = 0;
	for (rowIdx = 0; rowIdx < numRows; rowIdx++) {   // rowIdx is the current row in the logical view
        int rowBaseIdx = rowIdx*rowSize;             // word index of the first word of row rowIdx
        k = 0;
        // Load row rowIdx of b1 and b3, one bus word at a time.
        for (j = 0; j < rowSize; j++) {  // j iterates over the words of the row
            int curIdx = rowBaseIdx+j;
    		BRAM_DT curElemA = b1[curIdx];
		    BRAM_DT curElemC = b3[curIdx];

            // Split each word into ELEMS_PER_BUS elements: element t2 occupies bits
            // [t2*ELEM_WIDTH_BITS + ELEM_WIDTH_BITS-1 : t2*ELEM_WIDTH_BITS].
            // k keeps running across words so arow/crow are filled contiguously.
    		for (int t2=0; t2<ELEMS_PER_BUS; t2++, k++) {
// #pragma HLS UNROLL factor=2
    				arow[k] =  apint_get_range(curElemA, t2*ELEM_WIDTH_BITS + ELEM_WIDTH_BITS-1, t2*ELEM_WIDTH_BITS);
		    		crow[k] =  apint_get_range(curElemC, t2*ELEM_WIDTH_BITS + ELEM_WIDTH_BITS-1, t2*ELEM_WIDTH_BITS);
            }
            // With ELEMS_PER_BUS == 8 and ELEM_WIDTH_BITS == 32 the loop above extracts
            // bit ranges [31:0], [63:32], ..., [255:224] into arow/crow[k..k+7].
	    }

        // Now iterate through all rows in b2, store each of them in brow, and
        // do a multiply-accumulate of arow and brow into crow.
        //
        // 1. Iterate through all rows in B
        for (int rowIdxB = 0; rowIdxB < numRows; rowIdxB++) {
            int rowBaseIdxB = rowIdxB * rowSize;     // word index of the first word of row rowIdxB in b2
            k = 0;
            // Fetch one row of b2 into brow
            for (j=0; j<rowSize; j++) {
                int curIdx = rowBaseIdxB+j;
                BRAM_DT curElemB = b2[curIdx];

                // Same unpacking scheme as used for arow/crow.
                for (int t2=0; t2<ELEMS_PER_BUS; t2++, k++) {
// #pragma HLS UNROLL factor=2
    				brow[k] =  apint_get_range(curElemB, t2*ELEM_WIDTH_BITS + ELEM_WIDTH_BITS-1, t2*ELEM_WIDTH_BITS);
                }

                // With ELEMS_PER_BUS == 8 and ELEM_WIDTH_BITS == 32 this extracts
                // bit ranges [31:0] ... [255:224] into brow[k..k+7].
            }

            // Multiply-accumulate arow and brow into crow, element by element.
            // NOTE(review): skip_exit_check with a variable bound and factor=2
            // presumably requires bsize to be even - confirm against the HLS docs.
	        for (int t1=0; t1<bsize; t1++) {
#pragma HLS UNROLL factor=2 skip_exit_check
#pragma HLS PIPELINE
    		    crow[t1] += arow[t1] * brow[t1];   // original author's note: lets them verify that rowIdx is correct
    	    }

        }
        
        // Store crow back into row rowIdx of b3
        k=0;
        for (j=0; j<rowSize; j++) {
            int curIdx = rowBaseIdx+j;
    		BRAM_DT curElemC = 0;                    // start from an all-zero word

            // Pack ELEMS_PER_BUS consecutive crow elements into one bus word.
    		for (int t2=0; t2<ELEMS_PER_BUS; t2++, k++) {
// #pragma HLS UNROLL factor=2
			    curElemC = apint_set_range(curElemC, t2*ELEM_WIDTH_BITS + ELEM_WIDTH_BITS-1, t2*ELEM_WIDTH_BITS, crow[k]);
    		}

            // With ELEMS_PER_BUS == 8 and ELEM_WIDTH_BITS == 32 this writes
            // crow[k..k+7] into bit ranges [31:0] ... [255:224] of the word.
    		b3[curIdx] = curElemC;
        }
    }
}
Ejemplo n.º 2
0
/*
 * Tester for the deflate259 compressor.
 *
 * Reads up to CHUNK bytes from `source`, loads the Huffman tree from
 * "sam_tree.bin", runs the compressor (deflate259_opencl when SDACCEL_HOST is
 * defined, otherwise deflate259 on 512-bit packed buffers), and writes the
 * compressed bytes to "ex1.sam.def0".  When DO_VERIFY == 1 the result is
 * compared byte-for-byte against "ex1.sam.gold.def0".  Timing and size
 * statistics are printed to stderr.
 *
 * Parameters:
 *   source - open input stream to compress; must not be NULL.
 *
 * Returns 0 on success, 1 on any I/O failure, malformed tree file,
 * out-of-range output length, or verification mismatch.
 */
int def259(FILE *source) {
    unsigned char in[CHUNK];
    unsigned char out[CHUNK*2]; // In case we have an overflow.
    unsigned char tree[512];
    unsigned in_len, tree_len, tree_bytes, i;
    unsigned out_len = 0;
    size_t n_read;
    FILE *fp;
    FILE *dest;
    int write_ok;

    int64_t start, end;

    if (source == NULL) {
        fprintf(stderr, "Failed to read input file.\n");
        return 1;
    }

    start = get_time_us();

    // fread returns size_t and never a negative value; errors are reported
    // through the stream's error indicator.
    n_read = fread(in, 1, CHUNK, source);
    if (ferror(source)) {
        fprintf(stderr, "Failed to read input file.\n");
        return 1;
    }
    in_len = (unsigned)n_read;

    // The tree is binary data, so open in binary mode.
    fp = fopen("sam_tree.bin", "rb");
    if (fp == NULL) {
        fprintf(stderr, "Failed to read tree file.\n");
        return 1;
    }
    tree_bytes = (unsigned)fread(tree, 1, sizeof tree, fp);
    fclose(fp);
    fprintf(stderr, "Tree bytes: %u\n", tree_bytes);
    // The length computation below needs at least two bytes; fewer would
    // underflow tree_bytes-2 and index before the start of tree.
    if (tree_bytes < 2) {
        fprintf(stderr, "Tree file is too short.\n");
        return 1;
    }
    // The last item of tree is how many bits are valid within the last non-empty byte.
    // Value is from 0 to 8.
    tree_len = 8*(tree_bytes-2)+tree[tree_bytes-1];
    fprintf(stderr, "Huffman tree loaded, length = %u bits.\n", tree_len);
#ifdef SDACCEL_HOST
    deflate259_opencl(in, in_len, tree, tree_len, out, &out_len);
#else
{
    int j;
    uint512 in_hw[CHUNK/64];
    uint512 out_hw[CHUNK*2/64];
    uint512 tree_hw[512/64];
    uint512 tmp512 = 0;

    // Zero the unread tails so the packing loops below never consume
    // uninitialized bytes.
    for (i = in_len; i < CHUNK; i++)
        in[i] = 0;
    for (i = tree_bytes; i < 512; i++)
        tree[i] = 0;

    // Pack 64 input bytes into each 512-bit word, byte j at bits [8j+7:8j].
    for (i = 0; i < CHUNK/64; i++) {
        for (j = 0; j < 64; j++)
            tmp512 = apint_set_range(tmp512, (j+1)*8-1, j*8, in[i*64+j]);
        in_hw[i] = tmp512;
    }

    // Same packing for the Huffman tree.
    for (i = 0; i < 512/64; i++) {
        for (j = 0; j < 64; j++)
            tmp512 = apint_set_range(tmp512, (j+1)*8-1, j*8, tree[i*64+j]);
        tree_hw[i] = tmp512;
    }

    deflate259(in_hw, &in_len, tree_hw, &tree_len, out_hw, &out_len);

    // Unpack the 512-bit output words back into bytes.
    for (i = 0; i < CHUNK*2/64; i++) {
        tmp512 = out_hw[i];
        for (j = 0; j < 64; j++)
            out[i*64+j] = apint_get_range(tmp512, (j+1)*8-1, j*8);
    }
}
#endif
    end = get_time_us();

    // A reported length beyond the buffer would make every later access to
    // out[] run out of bounds.
    if (out_len > sizeof out) {
        fprintf(stderr, "Compressed size %u exceeds output buffer.\n", out_len);
        return 1;
    }

    fprintf(stderr, "Deflate complete, size after compression: %u. Verifying output...\n", out_len);
    dest = fopen("ex1.sam.def0", "wb");
    if (dest == NULL) {
        fprintf(stderr, "Failed to open output file.\n");
        return 1;
    }
    write_ok = (fwrite(out, 1, out_len, dest) == out_len);
    // fclose flushes buffered data, so its result is part of the write status.
    if (fclose(dest) != 0)
        write_ok = 0;
    if (write_ok) {
        fprintf(stderr, "Dumped compressed output successfully.\n");
    } else {
        // Best effort, as before: keep going so verification and statistics
        // are still reported, but do not fail silently.
        fprintf(stderr, "Failed to dump compressed output.\n");
    }

#if DO_VERIFY==1
    unsigned char out_res[CHUNK*2]; // In case we have an overflow.
    unsigned out_res_len;
    unsigned common_len;

    fp = fopen("ex1.sam.gold.def0", "rb");
    if (fp == NULL) {
        fprintf(stderr, "Failed to read golden file.\n");
        return 1;
    }
    out_res_len = (unsigned)fread(out_res, 1, CHUNK*2, fp);
    fclose(fp);

    fprintf(stderr, "Output length: expected %u, got %u.\n", out_res_len, out_len);

    // Compare only the bytes both buffers actually contain, then treat any
    // length difference as a failure of its own.
    common_len = (out_res_len < out_len) ? out_res_len : out_len;
    for (i = 0; i < common_len; i++) {
        if (out[i] != out_res[i]) {
            fprintf(stderr, "Byte mismatch at position %u: expected %02x, got %02x.\n", i, out_res[i], out[i]);
            return 1;
        }
    }
    if (out_res_len != out_len) {
        fprintf(stderr, "Length mismatch: expected %u bytes, got %u bytes.\n", out_res_len, out_len);
        return 1;
    }
#endif

    // Stop the timer and calculate throughput
#define PERFORMANCE_DUMP
#ifdef PERFORMANCE_DUMP
    float elapsed = (float)(end-start)/1000000.0;
    fprintf(stderr, "======================\n");
    fprintf(stderr, "Input size: %u bytes (%f MB)\n", in_len, (float)1.0*in_len/1024/1024);
    fprintf(stderr, "Output size: %u bytes (%f MB)\n", out_len, (float)1.0*out_len/1024/1024);
    fprintf(stderr, "Compression ratio: %f\n", (float)1.0*in_len/out_len);
    fprintf(stderr, "Elapsed time: %f seconds\n", elapsed);
    fprintf(stderr, "Throughput: %f MB/s\n", (float)1.0*in_len/1024/1024/elapsed);
#endif
    return 0;
}