int main(int argc, char **argv) { uint32_t rows, cols; uint32_t *a1, *a3, *e3; int const radius = STENCIL_RADIUS; // check command line args if (argc == 1) { rows = 100; // default rows cols = 100; // default cols } else if (argc == 3) { rows = atoi(argv[1]); if (rows <= 0 || rows > 1023) { usage(argv[1]); return 0; } cols = atoi(argv[2]); if (cols <= 0 || cols > 1023) { usage(argv[2]); return 0; } } else { usage(argv[0]); return 0; } printf("Running with rows = %u, cols = %u\n", rows, cols); printf("Initializing arrays\n"); fflush(stdout); a1 = (uint32_t *)malloc((rows + radius * 2) * (cols + radius * 2) * sizeof(uint32_t)); a3 = (uint32_t *)malloc((rows + radius * 2) * (cols + radius * 2) * sizeof(uint32_t)); memset(a1, 0, (rows + radius * 2) * (cols + radius * 2) * sizeof(uint32_t)); memset(a3, 0, (rows + radius * 2) * (cols + radius * 2) * sizeof(uint32_t)); for (uint32_t r = 0; r < rows; r++) for (uint32_t c = 0; c < cols; c++) { a1[(r+radius)*(cols + radius * 2)+(c+radius)] = ((r+radius) << 8) | (c+radius); } CHtHif *pHtHif = new CHtHif(); int unitCnt = pHtHif->GetUnitCnt(); printf("#AUs = %d\n", unitCnt); CHtAuUnit ** pAuUnits = new CHtAuUnit * [unitCnt]; for (int unit = 0; unit < unitCnt; unit++) pAuUnits[unit] = new CHtAuUnit(pHtHif); // Coprocessor memory arrays uint32_t *cp_a1 = (uint32_t*)pHtHif->MemAlloc((rows+radius*2) * (cols+radius*2) * sizeof(uint32_t)); uint32_t *cp_a3 = (uint32_t*)pHtHif->MemAlloc((rows+radius*2) * (cols+radius*2) * sizeof(uint32_t)); if (!cp_a1 || !cp_a3) { fprintf(stderr, "ht_cp_malloc() failed.\n"); exit(-1); } pHtHif->MemCpy(cp_a1, a1, (rows+radius*2) * (cols+radius*2) * sizeof(uint32_t)); pHtHif->MemSet(cp_a3, 0, (rows+radius*2) * (cols+radius*2) * sizeof(uint32_t)); // avoid bank aliasing for performance if (unitCnt > 16 && !(unitCnt & 1)) unitCnt -= 1; printf("stride = %d\n", unitCnt); fflush(stdout); // Send calls to units uint32_t rowsPerUnit = (cols + unitCnt - 1) / unitCnt; for (int unit = 0; unit < unitCnt; unit++) { uint32_t * pSrcAddr = cp_a1 + unit * (cols+radius*2) * rowsPerUnit; uint32_t * pDstAddr = cp_a3 + unit * (cols+radius*2) * rowsPerUnit; uint32_t unitRows = (unit+1)*rowsPerUnit > rows ? (rows - unit*rowsPerUnit) : rowsPerUnit; pAuUnits[unit]->SendCall_htmain((uint64_t)pSrcAddr, (uint64_t)pDstAddr, unitRows, cols); } // generate expected results while waiting for returns e3 = (uint32_t *)malloc((rows+radius*2) * (cols+radius*2) * sizeof(uint32_t)); memset(e3, 0, (rows + radius * 2) * (cols + radius * 2) * sizeof(uint32_t)); uint32_t coef[5] = { STENCIL_COEF2, STENCIL_COEF1, STENCIL_COEF0/2, STENCIL_COEF1, STENCIL_COEF2 }; for (uint32_t row = radius; row < rows+radius; row++) { for (uint32_t col = radius; col < cols+radius; col++) { uint32_t rslt = 0; for (uint32_t c = col - STENCIL_RADIUS; c <= col + STENCIL_RADIUS; c++) rslt += a1[row*(cols+radius*2) + c] * coef[c - col + STENCIL_RADIUS]; for (uint32_t r = row - STENCIL_RADIUS; r <= row + STENCIL_RADIUS; r++) rslt += a1[r*(cols+radius*2) + col] * coef[r - row + STENCIL_RADIUS]; e3[row*(cols+radius*2) + col] = rslt >> 8; } } // Wait for returns for (int unit = 0; unit < unitCnt; unit++) { while (!pAuUnits[unit]->RecvReturn_htmain()) usleep(1000); } pHtHif->MemCpy(a3, cp_a3, (rows+radius*2) * (cols+radius*2) * sizeof(uint32_t)); // check results int err_cnt = 0; for (uint32_t col = 0; col < (cols+radius*2); col++) { for (uint32_t row = 0; row < (rows+radius*2); row++) { if (a3[row*(cols+radius*2) + col] != e3[row*(cols+radius*2) + col]) { printf("a3[row=%u, col=%u] is %u, should be %u\n", row, col, a3[row*(cols+radius*2) + col], e3[row*(cols+radius*2) + col]); err_cnt++; } } } if (err_cnt) printf("FAILED: detected %d issues!\n", err_cnt); else printf("PASSED\n"); // free memory free(a1); free(a3); pHtHif->MemFree(cp_a1); pHtHif->MemFree(cp_a3); delete pHtHif; return err_cnt; }
int main(int argc, char **argv) { int debug = 0; int mat_int_len, temp; #ifdef HT_MODEL default_size = 15; #endif #ifdef HT_SYSC default_size = 15; #endif #ifdef HT_VSIM default_size = 3; #endif uint64_t i, j, k; uint64_t aRow, aCol, bRow, bCol; uint64_t *a1, *a2, *a3; // check command line args if (argc <= 2) { // Defaults aRow = default_size; aCol = default_size; bRow = default_size; bCol = default_size; if (argc==2) { if (atoi(argv[1]) == 1 || atoi(argv[1]) == 0) { debug = atoi(argv[1]); } else { usage(argv[0]); } debug = atoi(argv[1])==1 ? 1 : 0; } } else if (argc == 5 || argc == 6) { // Grab Command Line Values aRow = atoi(argv[1]); aCol = atoi(argv[2]); bRow = atoi(argv[3]); bCol = atoi(argv[4]); if (argc==6) debug = atoi(argv[5])==1 ? 1 : 0; if (aRow <= 0 || aCol <= 0 || bRow <= 0 || bCol <= 0) { usage(argv[0]); return 0; } } else { usage(argv[0]); return 0; } if (aCol != bRow) { printf("ERROR: Number of columns in Matrix A (%lld) does not equal the number of rows in Matrix B (%lld)\n\n", (long long)aCol, (long long)bRow); exit(1); } printf("Running with: Matrix A: (%lldx%lld) and Matrix B (%lldx%lld)\n", (long long)aRow, (long long)aCol, (long long)bRow, (long long)bCol); printf("Initializing arrays\n"); fflush(stdout); a1 = (uint64_t *)malloc(aRow * aCol * 8); a2 = (uint64_t *)malloc(bRow * bCol * 8); a3 = (uint64_t *)malloc(aRow * bCol * 8); memset(a3, 0, aRow * bCol * 8); /* * Matrix A and B are stored differently in memory (to make reads more efficient?) * The following are 3x3 examples: (A counts from 8 -> 0, B counts from 0 -> 8) * * Actual Numbers: Memory Locations (by Index) * * A: 8 7 6 A: 0 3 6 * 5 4 5 1 4 7 * 2 1 0 2 5 8 * * B: 0 1 2 B: 0 1 2 * 3 4 5 3 4 5 * 6 7 8 6 7 8 * * I did this to try to make A Rows available on strides, as well as B Columns on strides... * */ // Fill Matrix A k = 0; for (i = 0; i < aCol; i++) { for (j = aRow; j > 0; j--) { a1[k] = (aRow*aCol)-aCol*(aRow-j)-i-1; k++; } } // Fill Matrix B for (i = 0; i < bRow*bCol; i++) { a2[i] = i; } //Print Matrices mat_int_len = 1; temp = 0; for(i = 0; i < aRow*bCol; i++) { temp = num_length(a1[i]); mat_int_len = (temp > mat_int_len) ? temp : mat_int_len; } printf("Matrix A:\n"); for (i = 0; i < aRow; i++) { for (j = 0; j < aCol; j++) { printf("%*lld ", mat_int_len, (long long)a1[i+aRow*j]); } printf("\n"); } printf("\n\n"); mat_int_len = 1; temp = 0; for(i = 0; i < aRow*bCol; i++) { temp = num_length(a1[i]); mat_int_len = (temp > mat_int_len) ? temp : mat_int_len; } printf("Matrix B:\n"); for(i = 0; i < bRow*bCol; i++) { if (i > 0) { if (i%bCol == 0) { printf("\n"); } } printf("%*lld ", mat_int_len, (long long)a2[i]); } printf("\n\n"); // Debug - Print Matrix Values at memory locations if (debug) { printf("A - MEM\n"); for (i = 0; i < aRow*aCol; i++) { printf("%lld - %lld\n", (long long)i, (long long)a1[i]); } printf("B - MEM\n"); for (i = 0; i < bRow*bCol; i++) { printf("%lld - %lld\n", (long long)i, (long long)a2[i]); } } CHtHif *pHtHif = new CHtHif(); // Coprocessor memory arrays uint64_t *cp_a1 = (uint64_t *)pHtHif->MemAllocAlign(4 * 1024, aRow * aCol * sizeof(uint64_t)); uint64_t *cp_a2 = (uint64_t *)pHtHif->MemAllocAlign(4 * 1024, bRow * bCol * sizeof(uint64_t)); uint64_t *cp_a3 = (uint64_t *)pHtHif->MemAllocAlign(4 * 1024, aRow * bCol * sizeof(uint64_t)); pHtHif->MemCpy(cp_a1, a1, aRow * aCol * sizeof(uint64_t)); pHtHif->MemCpy(cp_a2, a2, bRow * bCol * sizeof(uint64_t)); pHtHif->MemSet(cp_a3, 0, aRow * bCol * sizeof(uint64_t)); int unitCnt = pHtHif->GetUnitCnt(); CHtAuUnit ** pAuUnits = new CHtAuUnit * [unitCnt]; for (int unitId = 0; unitId < unitCnt; unitId += 1) pAuUnits[unitId] = new CHtAuUnit(pHtHif); printf("#AUs = %d\n", unitCnt); // avoid bank aliasing for performance if (unitCnt > 16 && !(unitCnt & 1)) unitCnt -= 1; printf("stride = %d\n", unitCnt); fflush(stdout); pHtHif->SendAllHostMsg(MA_BASE, (uint64_t)cp_a1); pHtHif->SendAllHostMsg(MB_BASE, (uint64_t)cp_a2); pHtHif->SendAllHostMsg(MC_BASE, (uint64_t)cp_a3); pHtHif->SendAllHostMsg(MC_ROW, (uint32_t)aRow); pHtHif->SendAllHostMsg(MC_COL, (uint32_t)bCol); pHtHif->SendAllHostMsg(COMMON, (uint32_t)aCol); for (int unit = 0; unit < unitCnt; unit++) pAuUnits[unit]->SendCall_htmain(unit /*rowOffset*/, unitCnt /*stride*/); for (int unit = 0; unit < unitCnt; unit++) { while (!pAuUnits[unit]->RecvReturn_htmain()) usleep(1000); fflush(stdout); } pHtHif->MemCpy(a3, cp_a3, aRow * bCol * sizeof(uint64_t)); // Print Resulting Matrix mat_int_len = 1; temp = 0; for(i = 0; i < aRow*bCol; i++) { temp = num_length(a3[i]); mat_int_len = (temp > mat_int_len) ? temp : mat_int_len; } printf("\nMatrix C:\n"); for(i = 0; i < aRow*bCol; i++) { if (i > 0) { if (i%bCol == 0) { printf("\n"); } } printf("%*lld ", mat_int_len, (long long)a3[i]); } printf("\n\n"); if (debug) { printf("C - MEM\n"); for (i = 0; i < aRow*bCol; i++) { printf("%lld - %lld\n", (long long)i, (long long)a3[i]); } } // Do error checking int err_cnt = 0; uint64_t rowNum = 0, colNum = 0, calcNum = 0, eleNum = 0; uint64_t *val; val = (uint64_t *)malloc(aRow * bCol * 8); memset(val, 0, aRow * bCol * 8); // Calculate the resulting matrix to check against coprocessor results for (rowNum = 0; rowNum < aRow; rowNum++) { for (colNum = 0; colNum < bCol; colNum++) { for (calcNum = 0; calcNum < aCol; calcNum++) { val[eleNum] += a1[rowNum+(calcNum*aRow)] * a2[colNum+(calcNum*bCol)]; } eleNum++; } } // Check results for (eleNum = 0; eleNum < aRow*bCol; eleNum++) { if (val[eleNum] != a3[eleNum]) { err_cnt++; printf("Found element mismatch at matrix position %lld - found value %lld, expected value %lld.\n", (unsigned long long)eleNum, (unsigned long long)a3[eleNum], (unsigned long long)val[eleNum]); } } if (err_cnt == 0) { // Test Passed printf("PASSED\n\n"); } else { // Test Failed printf("FAILED - error count %d\n\n", err_cnt); } // free memory free(a1); free(a2); free(a3); pHtHif->MemFreeAlign(cp_a1); pHtHif->MemFreeAlign(cp_a2); pHtHif->MemFreeAlign(cp_a3); free(val); delete pHtHif; return err_cnt; }
int main(int argc, char **argv) { uint64_t i; uint64_t vecLen; uint64_t *a1, *a2, *a3; // check command line args if (argc == 1) { vecLen = 100; // default vecLen } else if (argc == 2) { vecLen = atoi(argv[1]); if (vecLen <= 0) { usage(argv[0]); return 0; } } else { usage(argv[0]); return 0; } printf("Running with vecLen = %llu\n", (long long)vecLen); printf("Initializing arrays\n"); fflush(stdout); a1 = (uint64_t *)malloc(vecLen * sizeof(uint64_t)); a2 = (uint64_t *)malloc(vecLen * sizeof(uint64_t)); a3 = (uint64_t *)malloc(vecLen * sizeof(uint64_t)); memset(a3, 0, vecLen * 8); for (i = 0; i < vecLen; i++) { a1[i] = i; a2[i] = 2 * i; } CHtHif *pHtHif = new CHtHif(); int unitCnt = pHtHif->GetUnitCnt(); printf("#AUs = %d\n", unitCnt); // Coprocessor memory arrays uint64_t * cp_a1 = (uint64_t*)pHtHif->MemAllocAlign(4 * 1024, vecLen * sizeof(uint64_t)); uint64_t * cp_a2 = (uint64_t*)pHtHif->MemAllocAlign(4 * 1024, vecLen * sizeof(uint64_t)); uint64_t * cp_a3 = (uint64_t*)pHtHif->MemAllocAlign(4 * 1024, vecLen * sizeof(uint64_t)); pHtHif->MemCpy(cp_a1, a1, vecLen * sizeof(uint64_t)); pHtHif->MemCpy(cp_a2, a2, vecLen * sizeof(uint64_t)); pHtHif->MemSet(cp_a3, 0, vecLen * sizeof(uint64_t)); CHtAuUnit ** pAuUnits = new CHtAuUnit * [unitCnt]; for (int unit = 0; unit < unitCnt; unit++) pAuUnits[unit] = new CHtAuUnit(pHtHif); // avoid bank aliasing for performance if (unitCnt > 16 && !(unitCnt & 1)) unitCnt -= 1; printf("stride = %d\n", unitCnt); fflush(stdout); pHtHif->SendAllHostMsg(OP1_ADDR, (uint64_t)cp_a1); pHtHif->SendAllHostMsg(OP2_ADDR, (uint64_t)cp_a2); pHtHif->SendAllHostMsg(RES_ADDR, (uint64_t)cp_a3); pHtHif->SendAllHostMsg(VEC_LEN, (uint64_t)vecLen); for (int unit = 0; unit < unitCnt; unit++) pAuUnits[unit]->SendCall_htmain(unit /*offset*/, unitCnt /*stride*/); uint64_t act_sum = 0; uint64_t au_sum; for (int unit = 0; unit < unitCnt; unit++) { while (!pAuUnits[unit]->RecvReturn_htmain(au_sum)) usleep(1000); printf("unit=%-2d: au_sum %llu \n", unit, (long long)au_sum); fflush(stdout); act_sum += au_sum; } printf("RTN: act_sum = %llu\n", (long long)act_sum); pHtHif->MemCpy(a3, cp_a3, vecLen * sizeof(uint64_t)); // check results int err_cnt = 0; uint64_t exp_sum = 0; for (i = 0; i < vecLen; i++) { if (a3[i] != a1[i] + a2[i]) { printf("a3[%llu] is %llu, should be %llu\n", (long long)i, (long long)a3[i], (long long)(a1[i] + a2[i])); err_cnt++; } exp_sum += a1[i] + a2[i]; //printf("i=%llu: a1=%llu + a2=%llu => a3=%llu\n", // (long long)i, (long long)a1[i], (long long)a2[i], (long long)a3[i]); } if (act_sum != exp_sum) { printf("act_sum %llu != exp_sum %llu\n", (long long)act_sum, (long long)exp_sum); err_cnt++; } if (err_cnt) printf("FAILED: detected %d issues!\n", err_cnt); else printf("PASSED\n"); // free memory free(a1); free(a2); free(a3); pHtHif->MemFreeAlign(cp_a1); pHtHif->MemFreeAlign(cp_a2); pHtHif->MemFreeAlign(cp_a3); delete pHtHif; return err_cnt; }