Beispiel #1
0
/*
 * Host-side driver for the stencil example.
 *
 * Usage: <prog> [rows cols]   (each in 1..1023; defaults are 100 x 100)
 *
 * Builds an input grid surrounded by a zeroed border of STENCIL_RADIUS cells,
 * copies it to coprocessor memory, hands every unit a contiguous band of rows
 * through SendCall_htmain(), computes the expected result on the host while
 * the units run, and finally compares both results cell by cell.
 *
 * Returns the number of mismatching cells (0 when the run passed). Returns 0
 * after calling usage() when the arguments are rejected.
 */
int main(int argc, char **argv)
{
	uint32_t rows, cols;
	uint32_t *a1, *a3, *e3;

	int const radius = STENCIL_RADIUS;

	// check command line args
	if (argc == 1) {
		rows = 100;  // default rows
		cols = 100;  // default cols
	} else if (argc == 3) {
		// rows/cols are unsigned: a negative argument wraps to a large
		// value and is rejected by the upper bound.
		rows = atoi(argv[1]);
		if (rows == 0 || rows > 1023) {
			usage(argv[0]);
			return 0;
		}
		cols = atoi(argv[2]);
		if (cols == 0 || cols > 1023) {
			usage(argv[0]);
			return 0;
		}
	} else {
		usage(argv[0]);
		return 0;
	}

	printf("Running with rows = %u, cols = %u\n", rows, cols);
	printf("Initializing arrays\n");
	fflush(stdout);

	// Grid dimensions including the border on both sides.
	uint32_t const paddedRows = rows + radius * 2;
	uint32_t const paddedCols = cols + radius * 2;
	size_t const gridCells = (size_t)paddedRows * paddedCols;
	size_t const gridBytes = gridCells * sizeof(uint32_t);

	// a1: input, a3: coprocessor result, e3: expected result (all zeroed).
	a1 = (uint32_t *)calloc(gridCells, sizeof(uint32_t));
	a3 = (uint32_t *)calloc(gridCells, sizeof(uint32_t));
	e3 = (uint32_t *)calloc(gridCells, sizeof(uint32_t));
	if (!a1 || !a3 || !e3) {
		fprintf(stderr, "malloc() failed.\n");
		exit(-1);
	}

	// Interior cells encode their padded coordinates as (row << 8) | col.
	for (uint32_t r = 0; r < rows; r++) {
		for (uint32_t c = 0; c < cols; c++)
			a1[(r + radius) * paddedCols + (c + radius)] = ((r + radius) << 8) | (c + radius);
	}

	CHtHif *pHtHif = new CHtHif();
	int unitCnt = pHtHif->GetUnitCnt();
	printf("#AUs = %d\n", unitCnt);
	if (unitCnt <= 0) {
		// rowsPerUnit below divides by the unit count.
		fprintf(stderr, "no units available.\n");
		exit(-1);
	}

	// unitCnt may be reduced below; remember how many objects were created.
	int const createdUnits = unitCnt;
	CHtAuUnit ** pAuUnits = new CHtAuUnit * [createdUnits];
	for (int unit = 0; unit < createdUnits; unit++)
		pAuUnits[unit] = new CHtAuUnit(pHtHif);

	// Coprocessor memory arrays
	uint32_t *cp_a1 = (uint32_t*)pHtHif->MemAlloc(gridBytes);
	uint32_t *cp_a3 = (uint32_t*)pHtHif->MemAlloc(gridBytes);
	if (!cp_a1 || !cp_a3) {
		fprintf(stderr, "ht_cp_malloc() failed.\n");
		exit(-1);
	}
	pHtHif->MemCpy(cp_a1, a1, gridBytes);
	pHtHif->MemSet(cp_a3, 0, gridBytes);

	// avoid bank aliasing for performance
	if (unitCnt > 16 && !(unitCnt & 1)) unitCnt -= 1;
	printf("stride = %d\n", unitCnt);

	fflush(stdout);

	// Send calls to units: unit u gets rows [u*rowsPerUnit, (u+1)*rowsPerUnit),
	// clipped to the grid. The band size is derived from the ROW count.
	uint32_t rowsPerUnit = (rows + unitCnt - 1) / unitCnt;
	for (int unit = 0; unit < unitCnt; unit++) {
		// Clamp the band start so trailing units get an empty band instead
		// of an underflowed row count and an out-of-range address.
		uint32_t firstRow = (uint32_t)unit * rowsPerUnit;
		if (firstRow > rows)
			firstRow = rows;

		uint32_t unitRows = rows - firstRow;
		if (unitRows > rowsPerUnit)
			unitRows = rowsPerUnit;

		uint32_t * pSrcAddr = cp_a1 + (size_t)firstRow * paddedCols;
		uint32_t * pDstAddr = cp_a3 + (size_t)firstRow * paddedCols;

		pAuUnits[unit]->SendCall_htmain((uint64_t)pSrcAddr, (uint64_t)pDstAddr, unitRows, cols);
	}

	// generate expected results while waiting for returns
	uint32_t coef[5] = { STENCIL_COEF2, STENCIL_COEF1, STENCIL_COEF0/2, STENCIL_COEF1, STENCIL_COEF2 };

	for (uint32_t row = radius; row < rows+radius; row++) {
		for (uint32_t col = radius; col < cols+radius; col++) {
			uint32_t rslt = 0;

			// horizontal arm (the centre cell is weighted COEF0/2 here ...)
			for (uint32_t c = col - STENCIL_RADIUS; c <= col + STENCIL_RADIUS; c++)
				rslt += a1[row*paddedCols + c] * coef[c - col + STENCIL_RADIUS];

			// ... and again COEF0/2 in the vertical arm)
			for (uint32_t r = row - STENCIL_RADIUS; r <= row + STENCIL_RADIUS; r++)
				rslt += a1[r*paddedCols + col] * coef[r - row + STENCIL_RADIUS];

			e3[row*paddedCols + col] = rslt >> 8;
		}
	}

	// Wait for returns
	for (int unit = 0; unit < unitCnt; unit++) {
		while (!pAuUnits[unit]->RecvReturn_htmain())
			usleep(1000);
	}

	pHtHif->MemCpy(a3, cp_a3, gridBytes);

	// check results (border cells are expected to stay zero)
	int err_cnt = 0;
	for (uint32_t col = 0; col < paddedCols; col++) {
		for (uint32_t row = 0; row < paddedRows; row++) {
			if (a3[row*paddedCols + col] != e3[row*paddedCols + col]) {
				printf("a3[row=%u, col=%u] is %u, should be %u\n",
					   row, col, a3[row*paddedCols + col], e3[row*paddedCols + col]);
				err_cnt++;
			}
		}
	}

	if (err_cnt)
		printf("FAILED: detected %d issues!\n", err_cnt);
	else
		printf("PASSED\n");

	// free memory
	free(a1);
	free(a3);
	free(e3);
	pHtHif->MemFree(cp_a1);
	pHtHif->MemFree(cp_a3);

	// Units were created against pHtHif, so release them first.
	for (int unit = 0; unit < createdUnits; unit++)
		delete pAuUnits[unit];
	delete [] pAuUnits;

	delete pHtHif;

	return err_cnt;
}
Beispiel #2
0
/*
 * Host-side driver for the matrix multiply example (C = A x B).
 *
 * Usage: <prog> [debug]
 *        <prog> aRow aCol bRow bCol [debug]
 *
 * With no dimensions given, all four default to default_size. "debug" is 0
 * or 1 and enables dumps of the raw memory contents.
 *
 * Fills A and B, prints them, copies them to coprocessor memory, starts every
 * unit with (unit index, unit count) as (rowOffset, stride), then recomputes
 * the product on the host and compares.
 *
 * Returns the number of mismatching elements (0 when the run passed).
 * Returns 0 after calling usage() when the arguments are rejected, and exits
 * with status 1 when aCol != bRow.
 */
int main(int argc, char **argv) {

	int debug = 0;
	int mat_int_len, temp;

#ifdef HT_MODEL
	default_size = 15;
#endif

#ifdef HT_SYSC
	default_size = 15;
#endif

#ifdef HT_VSIM
	default_size = 3;
#endif

	uint64_t i, j, k;
	uint64_t aRow, aCol, bRow, bCol;
	uint64_t *a1, *a2, *a3, *val;

	// check command line args
	if (argc <= 2) {
		// Defaults
		aRow = default_size;
		aCol = default_size;
		bRow = default_size;
		bCol = default_size;
		if (argc == 2) {
			int flag = atoi(argv[1]);
			if (flag != 0 && flag != 1) {
				usage(argv[0]);
				return 0;
			}
			debug = flag;
		}

	} else if (argc == 5 || argc == 6) {
		// Grab Command Line Values. Validate them as signed ints BEFORE
		// storing them in the unsigned dimensions: a negative value would
		// otherwise wrap to a huge size and slip past the check.
		int dims[4];
		for (int d = 0; d < 4; d++) {
			dims[d] = atoi(argv[d + 1]);
			if (dims[d] <= 0) {
				usage(argv[0]);
				return 0;
			}
		}
		aRow = dims[0];
		aCol = dims[1];
		bRow = dims[2];
		bCol = dims[3];

		if (argc == 6)
			debug = atoi(argv[5]) == 1 ? 1 : 0;
	} else {
		usage(argv[0]);
		return 0;
	}

	if (aCol != bRow) {
		printf("ERROR: Number of columns in Matrix A (%lld) does not equal the number of rows in Matrix B (%lld)\n\n",
			(long long)aCol, (long long)bRow);
		exit(1);
	}

	printf("Running with: Matrix A: (%lldx%lld) and Matrix B (%lldx%lld)\n",
		(long long)aRow, (long long)aCol, (long long)bRow, (long long)bCol);
	printf("Initializing arrays\n");
	fflush(stdout);

	// a1 = A, a2 = B, a3 = coprocessor result C, val = host reference C.
	a1 = (uint64_t *)malloc(aRow * aCol * sizeof(uint64_t));
	a2 = (uint64_t *)malloc(bRow * bCol * sizeof(uint64_t));
	a3 = (uint64_t *)calloc(aRow * bCol, sizeof(uint64_t));
	val = (uint64_t *)calloc(aRow * bCol, sizeof(uint64_t));
	if (!a1 || !a2 || !a3 || !val) {
		fprintf(stderr, "malloc() failed.\n");
		exit(-1);
	}

	/*
	* Matrix A and B are stored differently in memory (to make reads more efficient?)
	* The following are 3x3 examples: (A counts from 8 -> 0, B counts from 0 -> 8)
	*
	* Actual Numbers:         Memory Locations (by Index)
	*
	* A: 8  7  6              A: 0  3  6
	*    5  4  3                 1  4  7
	*    2  1  0                 2  5  8
	*
	* B: 0  1  2              B: 0  1  2
	*    3  4  5                 3  4  5
	*    6  7  8                 6  7  8
	*
	* I did this to try to make A Rows available on strides, as well as B Columns on strides...
	*
	*/

	// Fill Matrix A (column by column, values counting down row-major)
	k = 0;
	for (i = 0; i < aCol; i++) {
		for (j = aRow; j > 0; j--) {
			a1[k] = (aRow*aCol)-aCol*(aRow-j)-i-1;
			k++;
		}
	}

	// Fill Matrix B
	for (i = 0; i < bRow*bCol; i++) {
		a2[i] = i;
	}

	//Print Matrices
	// Column width for A: scan exactly the aRow*aCol elements A holds.
	mat_int_len = 1;
	temp = 0;
	for (i = 0; i < aRow*aCol; i++) {
		temp = num_length(a1[i]);
		mat_int_len = (temp > mat_int_len) ? temp : mat_int_len;
	}
	printf("Matrix A:\n");
	for (i = 0; i < aRow; i++) {
		for (j = 0; j < aCol; j++) {
			printf("%*lld ", mat_int_len, (long long)a1[i+aRow*j]);
		}
		printf("\n");
	}
	printf("\n\n");

	// Column width for B: scan B's own elements.
	mat_int_len = 1;
	temp = 0;
	for (i = 0; i < bRow*bCol; i++) {
		temp = num_length(a2[i]);
		mat_int_len = (temp > mat_int_len) ? temp : mat_int_len;
	}
	printf("Matrix B:\n");
	for (i = 0; i < bRow*bCol; i++) {
		if (i > 0) {
			if (i%bCol == 0) {
				printf("\n");
			}
		}
		printf("%*lld ", mat_int_len, (long long)a2[i]);
	}
	printf("\n\n");


	// Debug - Print Matrix Values at memory locations
	if (debug) {
		printf("A - MEM\n");
		for (i = 0; i < aRow*aCol; i++) {
			printf("%lld - %lld\n", (long long)i, (long long)a1[i]);
		}

		printf("B - MEM\n");
		for (i = 0; i < bRow*bCol; i++) {
			printf("%lld - %lld\n", (long long)i, (long long)a2[i]);
		}
	}

	CHtHif *pHtHif = new CHtHif();

	// Coprocessor memory arrays
	uint64_t *cp_a1 = (uint64_t *)pHtHif->MemAllocAlign(4 * 1024, aRow * aCol * sizeof(uint64_t));
	uint64_t *cp_a2 = (uint64_t *)pHtHif->MemAllocAlign(4 * 1024, bRow * bCol * sizeof(uint64_t));
	uint64_t *cp_a3 = (uint64_t *)pHtHif->MemAllocAlign(4 * 1024, aRow * bCol * sizeof(uint64_t));
	if (!cp_a1 || !cp_a2 || !cp_a3) {
		// NOTE(review): assumes MemAllocAlign reports failure with a null
		// pointer, like the MemAlloc check in the stencil example - confirm.
		fprintf(stderr, "ht_cp_malloc() failed.\n");
		exit(-1);
	}

	pHtHif->MemCpy(cp_a1, a1, aRow * aCol * sizeof(uint64_t));
	pHtHif->MemCpy(cp_a2, a2, bRow * bCol * sizeof(uint64_t));
	pHtHif->MemSet(cp_a3, 0, aRow * bCol * sizeof(uint64_t));

	int unitCnt = pHtHif->GetUnitCnt();
	// unitCnt may be reduced below; remember how many objects were created.
	int const createdUnits = unitCnt;
	CHtAuUnit ** pAuUnits = new CHtAuUnit * [createdUnits];

	for (int unitId = 0; unitId < createdUnits; unitId += 1)
		pAuUnits[unitId] = new CHtAuUnit(pHtHif);

	printf("#AUs = %d\n", unitCnt);

	// avoid bank aliasing for performance
	if (unitCnt > 16 && !(unitCnt & 1)) unitCnt -= 1;
	printf("stride = %d\n", unitCnt);

	fflush(stdout);

	// Broadcast the operand/result base addresses and the dimensions.
	pHtHif->SendAllHostMsg(MA_BASE, (uint64_t)cp_a1);
	pHtHif->SendAllHostMsg(MB_BASE, (uint64_t)cp_a2);
	pHtHif->SendAllHostMsg(MC_BASE, (uint64_t)cp_a3);
	pHtHif->SendAllHostMsg(MC_ROW, (uint32_t)aRow);
	pHtHif->SendAllHostMsg(MC_COL, (uint32_t)bCol);
	pHtHif->SendAllHostMsg(COMMON, (uint32_t)aCol);

	for (int unit = 0; unit < unitCnt; unit++)
		pAuUnits[unit]->SendCall_htmain(unit /*rowOffset*/, unitCnt /*stride*/);

	for (int unit = 0; unit < unitCnt; unit++) {
		while (!pAuUnits[unit]->RecvReturn_htmain())
			usleep(1000);
		fflush(stdout);
	}

	pHtHif->MemCpy(a3, cp_a3, aRow * bCol * sizeof(uint64_t));

	// Print Resulting Matrix
	mat_int_len = 1;
	temp = 0;
	for (i = 0; i < aRow*bCol; i++) {
		temp = num_length(a3[i]);
		mat_int_len = (temp > mat_int_len) ? temp : mat_int_len;
	}

	printf("\nMatrix C:\n");
	for (i = 0; i < aRow*bCol; i++) {
		if (i > 0) {
			if (i%bCol == 0) {
				printf("\n");
			}
		}
		printf("%*lld ", mat_int_len, (long long)a3[i]);
	}
	printf("\n\n");

	if (debug) {
		printf("C - MEM\n");
		for (i = 0; i < aRow*bCol; i++) {
			printf("%lld - %lld\n", (long long)i, (long long)a3[i]);
		}
	}

	// Do error checking
	int err_cnt = 0;
	uint64_t rowNum = 0, colNum = 0, calcNum = 0, eleNum = 0;

	// Calculate the resulting matrix to check against coprocessor results.
	// A is read column-major (row + k*aRow), B row-major (col + k*bCol);
	// the reference C is written row-major.
	for (rowNum = 0; rowNum < aRow; rowNum++) {
		for (colNum = 0; colNum < bCol; colNum++) {
			for (calcNum = 0; calcNum < aCol; calcNum++) {
				val[eleNum] += a1[rowNum+(calcNum*aRow)] * a2[colNum+(calcNum*bCol)];
			}
			eleNum++;
		}
	}

	// Check results
	for (eleNum = 0; eleNum < aRow*bCol; eleNum++) {
		if (val[eleNum] != a3[eleNum]) {
			err_cnt++;
			printf("Found element mismatch at matrix position %lld - found value %lld, expected value %lld.\n",
				(long long)eleNum, (long long)a3[eleNum], (long long)val[eleNum]);
		}
	}

	if (err_cnt == 0) {
		// Test Passed
		printf("PASSED\n\n");
	} else {
		// Test Failed
		printf("FAILED - error count %d\n\n", err_cnt);
	}

	// free memory
	free(a1);
	free(a2);
	free(a3);
	pHtHif->MemFreeAlign(cp_a1);
	pHtHif->MemFreeAlign(cp_a2);
	pHtHif->MemFreeAlign(cp_a3);
	free(val);

	// Units were created against pHtHif, so release them first.
	for (int unitId = 0; unitId < createdUnits; unitId += 1)
		delete pAuUnits[unitId];
	delete [] pAuUnits;

	delete pHtHif;

	return err_cnt;
}
Beispiel #3
0
/*
 * Host-side driver for the vector add example (a3 = a1 + a2).
 *
 * Usage: <prog> [vecLen]   (positive; default 100)
 *
 * Fills a1[i] = i and a2[i] = 2*i, copies both to coprocessor memory, starts
 * every unit with (unit index, unit count) as (offset, stride), accumulates
 * the per-unit sums handed back by RecvReturn_htmain(), then verifies both
 * the element-wise result and the total sum on the host.
 *
 * Returns the number of detected issues (0 when the run passed). Returns 0
 * after calling usage() when the arguments are rejected.
 */
int main(int argc, char **argv)
{
	uint64_t i;
	uint64_t vecLen;
	uint64_t *a1, *a2, *a3;

	// check command line args
	if (argc == 1) {
		vecLen = 100;  // default vecLen
	} else if (argc == 2) {
		// Validate as a signed int first: stored directly in the unsigned
		// vecLen, a negative argument would wrap to a huge length and pass.
		int requested = atoi(argv[1]);
		if (requested <= 0) {
			usage(argv[0]);
			return 0;
		}
		vecLen = requested;
	} else {
		usage(argv[0]);
		return 0;
	}

	printf("Running with vecLen = %llu\n", (unsigned long long)vecLen);
	printf("Initializing arrays\n");
	fflush(stdout);

	a1 = (uint64_t *)malloc(vecLen * sizeof(uint64_t));
	a2 = (uint64_t *)malloc(vecLen * sizeof(uint64_t));
	a3 = (uint64_t *)calloc(vecLen, sizeof(uint64_t));
	if (!a1 || !a2 || !a3) {
		fprintf(stderr, "malloc() failed.\n");
		exit(-1);
	}

	for (i = 0; i < vecLen; i++) {
		a1[i] = i;
		a2[i] = 2 * i;
	}

	CHtHif *pHtHif = new CHtHif();
	int unitCnt = pHtHif->GetUnitCnt();
	printf("#AUs = %d\n", unitCnt);

	// Coprocessor memory arrays
	uint64_t * cp_a1 = (uint64_t*)pHtHif->MemAllocAlign(4 * 1024, vecLen * sizeof(uint64_t));
	uint64_t * cp_a2 = (uint64_t*)pHtHif->MemAllocAlign(4 * 1024, vecLen * sizeof(uint64_t));
	uint64_t * cp_a3 = (uint64_t*)pHtHif->MemAllocAlign(4 * 1024, vecLen * sizeof(uint64_t));
	if (!cp_a1 || !cp_a2 || !cp_a3) {
		// NOTE(review): assumes MemAllocAlign reports failure with a null
		// pointer, like the MemAlloc check in the stencil example - confirm.
		fprintf(stderr, "ht_cp_malloc() failed.\n");
		exit(-1);
	}

	pHtHif->MemCpy(cp_a1, a1, vecLen * sizeof(uint64_t));
	pHtHif->MemCpy(cp_a2, a2, vecLen * sizeof(uint64_t));
	pHtHif->MemSet(cp_a3, 0, vecLen * sizeof(uint64_t));

	// unitCnt may be reduced below; remember how many objects were created.
	int const createdUnits = unitCnt;
	CHtAuUnit ** pAuUnits = new CHtAuUnit * [createdUnits];
	for (int unit = 0; unit < createdUnits; unit++)
		pAuUnits[unit] = new CHtAuUnit(pHtHif);

	// avoid bank aliasing for performance
	if (unitCnt > 16 && !(unitCnt & 1)) unitCnt -= 1;
	printf("stride = %d\n", unitCnt);

	fflush(stdout);

	// Broadcast the operand/result addresses and the vector length.
	pHtHif->SendAllHostMsg(OP1_ADDR, (uint64_t)cp_a1);
	pHtHif->SendAllHostMsg(OP2_ADDR, (uint64_t)cp_a2);
	pHtHif->SendAllHostMsg(RES_ADDR, (uint64_t)cp_a3);
	pHtHif->SendAllHostMsg(VEC_LEN, (uint64_t)vecLen);

	for (int unit = 0; unit < unitCnt; unit++)
		pAuUnits[unit]->SendCall_htmain(unit /*offset*/, unitCnt /*stride*/);

	// Collect the partial sum each unit hands back.
	uint64_t act_sum = 0;
	uint64_t au_sum;
	for (int unit = 0; unit < unitCnt; unit++) {
		while (!pAuUnits[unit]->RecvReturn_htmain(au_sum))
			usleep(1000);
		printf("unit=%-2d: au_sum %llu \n", unit, (unsigned long long)au_sum);
		fflush(stdout);
		act_sum += au_sum;
	}

	printf("RTN: act_sum = %llu\n", (unsigned long long)act_sum);

	pHtHif->MemCpy(a3, cp_a3, vecLen * sizeof(uint64_t));

	// check results
	int err_cnt = 0;
	uint64_t exp_sum = 0;
	for (i = 0; i < vecLen; i++) {
		uint64_t const expected = a1[i] + a2[i];
		if (a3[i] != expected) {
			printf("a3[%llu] is %llu, should be %llu\n",
			       (unsigned long long)i, (unsigned long long)a3[i], (unsigned long long)expected);
			err_cnt++;
		}
		exp_sum += expected;
	}
	if (act_sum != exp_sum) {
		printf("act_sum %llu != exp_sum %llu\n", (unsigned long long)act_sum, (unsigned long long)exp_sum);
		err_cnt++;
	}

	if (err_cnt)
		printf("FAILED: detected %d issues!\n", err_cnt);
	else
		printf("PASSED\n");

	// free memory
	free(a1);
	free(a2);
	free(a3);
	pHtHif->MemFreeAlign(cp_a1);
	pHtHif->MemFreeAlign(cp_a2);
	pHtHif->MemFreeAlign(cp_a3);

	// Units were created against pHtHif, so release them first.
	for (int unit = 0; unit < createdUnits; unit++)
		delete pAuUnits[unit];
	delete [] pAuUnits;

	delete pHtHif;

	return err_cnt;
}