void bmm_top(volatile BRAM_DT b1[RAM_DEPTH], volatile BRAM_DT b2[RAM_DEPTH], volatile BRAM_DT b3[RAM_DEPTH], int blockSize) { #pragma HLS INTERFACE ap_bus port=b1 #pragma HLS RESOURCE core=AXI4M variable=b1 #pragma HLS INTERFACE ap_bus port=b2 #pragma HLS RESOURCE core=AXI4M variable=b2 #pragma HLS INTERFACE ap_bus port=b3 #pragma HLS RESOURCE core=AXI4M variable=b3 #pragma HLS RESOURCE core=AXI4LiteS variable=return metadata="-bus_bundle CONTROL" #pragma HLS INTERFACE ap_hs port=blockSize #pragma HLS RESOURCE core=AXI4LiteS variable=blockSize metadata="-bus_bundle CONTROL" int i = 0,j = 0,k = 0; int arow[BDIM], brow[BDIM], crow[BDIM]; #pragma HLS ARRAY_PARTITION variable=arow complete dim=1 #pragma HLS ARRAY_PARTITION variable=brow complete dim=1 #pragma HLS ARRAY_PARTITION variable=crow complete dim=1 int bsize = blockSize; int rowSize = bsize/ELEMS_PER_BUS; // number of entries per bus int numRows = bsize; int rowIdx = 0; for (rowIdx = 0; rowIdx < numRows; rowIdx++) { // rowIdx refers to the current bram row in the logical view int rowBaseIdx = rowIdx*rowSize; // rowBaseIdx is the actual index that points to the first element of the row number rowIdx in bram k = 0; for (j = 0; j < rowSize; j++) { // j iterates through all the elements in that row, starting from rowIdx int curIdx = rowBaseIdx+j; BRAM_DT curElemA = b1[curIdx]; BRAM_DT curElemC = b3[curIdx]; for (int t2=0; t2<ELEMS_PER_BUS; t2++, k++) { // Each entry has ELEMS_PER_BUS number of entries, split them and add them to arow and crow // #pragma HLS UNROLL factor=2 arow[k] = apint_get_range(curElemA, t2*ELEM_WIDTH_BITS + ELEM_WIDTH_BITS-1, t2*ELEM_WIDTH_BITS); // curElemA & mask; crow[k] = apint_get_range(curElemC, t2*ELEM_WIDTH_BITS + ELEM_WIDTH_BITS-1, t2*ELEM_WIDTH_BITS); // curElemC & mask; } /* arow[k] = apint_get_range(curElemA, 31, 0); // curElemA & mask; crow[k] = apint_get_range(curElemC, 31, 0); // curElemC & mask; arow[k+1] = apint_get_range(curElemA, 63, 32); // curElemA & mask; crow[k+1] = apint_get_range(curElemC, 63, 32); // curElemC & mask; arow[k+2] = apint_get_range(curElemA, 95, 64); // curElemA & mask; crow[k+2] = apint_get_range(curElemC, 95, 64); // curElemC & mask; arow[k+3] = apint_get_range(curElemA, 127, 96); // curElemA & mask; crow[k+3] = apint_get_range(curElemC, 127, 96); // curElemC & mask; arow[k+4] = apint_get_range(curElemA, 159, 128); // curElemA & mask; crow[k+4] = apint_get_range(curElemC, 159, 128); // curElemC & mask; arow[k+5] = apint_get_range(curElemA, 191, 160); // curElemA & mask; crow[k+5] = apint_get_range(curElemC, 191, 160); // curElemC & mask; arow[k+6] = apint_get_range(curElemA, 223, 192); // curElemA & mask; crow[k+6] = apint_get_range(curElemC, 223, 192); // curElemC & mask; arow[k+7] = apint_get_range(curElemA, 255, 224); // curElemA & mask; crow[k+7] = apint_get_range(curElemC, 255, 224); // curElemC & mask; k = k + 8; */ } // Now, iterate through all rows in b2, store them in brow, // to a SIMD multiply-accumulate of arow and brow into crow // // 1. Iterate through all rows in B for (int rowIdxB = 0; rowIdxB < numRows; rowIdxB++) { int rowBaseIdxB = rowIdxB * rowSize; k = 0; // Fetch one row of b2 into brow for (j=0; j<rowSize; j++) { int curIdx = rowBaseIdxB+j; BRAM_DT curElemB = b2[curIdx]; for (int t2=0; t2<ELEMS_PER_BUS; t2++, k++) { // #pragma HLS UNROLL factor=2 brow[k] = apint_get_range(curElemB, t2*ELEM_WIDTH_BITS + ELEM_WIDTH_BITS-1, t2*ELEM_WIDTH_BITS); } /* brow[k] = apint_get_range(curElemB, 31, 0); brow[k+1] = apint_get_range(curElemB, 63, 32); brow[k+2] = apint_get_range(curElemB, 95, 64); brow[k+3] = apint_get_range(curElemB, 127, 96); brow[k+4] = apint_get_range(curElemB, 159, 128); brow[k+5] = apint_get_range(curElemB, 191, 160); brow[k+6] = apint_get_range(curElemB, 223, 192); brow[k+7] = apint_get_range(curElemB, 255, 224); k = k + 8; */ } // Multiply-accumulate arow and brow into crow for (int t1=0; t1<bsize; t1++) { #pragma HLS UNROLL factor=2 skip_exit_check #pragma HLS PIPELINE crow[t1] += arow[t1] * brow[t1]; // So that i can verify if rowIdx is correct } } // Store crow back k=0; for (j=0; j<rowSize; j++) { int curIdx = rowBaseIdx+j; BRAM_DT curElemC = 0; for (int t2=0; t2<ELEMS_PER_BUS; t2++, k++) { // #pragma HLS UNROLL factor=2 curElemC = apint_set_range(curElemC, t2*ELEM_WIDTH_BITS + ELEM_WIDTH_BITS-1, t2*ELEM_WIDTH_BITS, crow[k]); } /* curElemC = apint_set_range(curElemC, 31, 0, crow[k]); curElemC = apint_set_range(curElemC, 63, 32, crow[k+1]); curElemC = apint_set_range(curElemC, 95, 64, crow[k+2]); curElemC = apint_set_range(curElemC, 127, 96, crow[k+3]); curElemC = apint_set_range(curElemC, 159, 128, crow[k+4]); curElemC = apint_set_range(curElemC, 191, 160, crow[k+5]); curElemC = apint_set_range(curElemC, 223, 192, crow[k+6]); curElemC = apint_set_range(curElemC, 255, 224, crow[k+7]); k = k + 8; */ b3[curIdx] = curElemC; } } }
// Tester int def259(FILE *source) { unsigned char in[CHUNK]; unsigned char in_res[CHUNK]; unsigned char out[CHUNK*2]; // In case we have an overflow. unsigned char tree[512]; unsigned in_len, out_len, tree_len, tree_bytes, in_res_len, i; int tmp; int64_t start, end; start = get_time_us(); tmp = fread(in, 1, CHUNK, source); if (tmp < 0) { fprintf(stderr, "Failed to read input file.\n"); return 1; } in_len = tmp; FILE *fp = fopen("sam_tree.bin", "r"); if (fp == NULL) { fprintf(stderr, "Failed to read tree file.\n"); return 1; } tree_bytes = fread(tree, 1, 512, fp); fprintf(stderr, "Tree bytes: %d\n", tree_bytes); // The last item of tree is how many bits are valid within the last non-empty byte. // Value is from 0 to 8. tree_len = 8*(tree_bytes-2)+tree[tree_bytes-1]; fprintf(stderr, "Huffman tree loaded, length = %d bits.\n", tree_len); #ifdef SDACCEL_HOST deflate259_opencl(in, in_len, tree, tree_len, out, &out_len); #else { int j, k; uint512 in_hw[CHUNK/64]; uint512 out_hw[CHUNK*2/64]; uint512 tree_hw[512/64]; uint512 tmp512; for (i = 0; i < CHUNK/64; i++) { for (j = 0; j < 64; j++) tmp512 = apint_set_range(tmp512, (j+1)*8-1, j*8, in[i*64+j]); in_hw[i] = tmp512; } for (i = 0; i < 512/64; i++) { for (j = 0; j < 64; j++) tmp512 = apint_set_range(tmp512, (j+1)*8-1, j*8, tree[i*64+j]); tree_hw[i] = tmp512; } deflate259(in_hw, &in_len, tree_hw, &tree_len, out_hw, &out_len); for (i = 0; i < CHUNK*2/64; i++) { tmp512 = out_hw[i]; for (j = 0; j < 64; j++) out[i*64+j] = apint_get_range(tmp512, (j+1)*8-1, j*8); } } #endif end = get_time_us(); fprintf(stderr, "Deflate complete, size after compression: %d. Verifying output...\n", out_len); FILE *dest = fopen("ex1.sam.def0", "w"); if ( fwrite(out, 1, out_len, dest) == out_len) { fprintf(stderr, "Dumped compressed output successfully.\n"); fclose(dest); } #if DO_VERIFY==1 unsigned char out_res[CHUNK*2]; // In case we have an overflow. unsigned out_res_len; fp = fopen("ex1.sam.gold.def0", "r"); if (fp == NULL) { fprintf(stderr, "Failed to read golden file.\n"); return 1; } out_res_len = fread(out_res, 1, CHUNK*2, fp); //out_res_len = 21461; fprintf(stderr, "Output length: expected %d, got %d.\n", out_res_len, out_len); for (i=0; i<((out_res_len>out_len)?out_res_len:out_len); i++) { if (out[i] != out_res[i]) { fprintf(stderr, "Byte mismatch at position %d: expected %02x, got %02x.", i, out_res[i], out[i]); return 1; } } /* { int st = system("diff --brief ex1.sam.gold.def0 ex1.sam.def0"); if (st != 0) return Z_DATA_ERROR; } */ #endif // Stop the timer and calculate throughput #define PERFORMANCE_DUMP #ifdef PERFORMANCE_DUMP float elapsed = (float)(end-start)/1000000.0; fprintf(stderr, "======================\n"); fprintf(stderr, "Input size: %d bytes (%f MB)\n", in_len, (float)1.0*in_len/1024/1024); fprintf(stderr, "Output size: %d bytes (%f MB)\n", out_len, (float)1.0*out_len/1024/1024); fprintf(stderr, "Compression ratio: %f\n", (float)1.0*in_len/out_len); fprintf(stderr, "Elapsed time: %f seconds\n", elapsed); fprintf(stderr, "Throughput: %f MB/s\n", (float)1.0*in_len/1024/1024/elapsed); #endif return 0; }