/*
 * Front-end multiply that selects a kernel from the product
 * heightA * widthB: below kMinStrassen the call goes to matmul_simple(),
 * otherwise to matmul_strassen().  Every argument is forwarded to the
 * chosen kernel unchanged.
 *
 * NOTE(review): the names suggest A is heightA x widthA, B is
 * widthA x widthB and the rstride* values are row strides -- confirm
 * against the kernels, this function does not touch the data itself.
 *
 * MKL is intentionally not selected here; callers that want it should
 * call matmul_mkl directly.
 */
inline void matmul_matmul(size_t heightA, size_t widthA, size_t widthB,
                          data_t *A, size_t rstrideA,
                          data_t *B, size_t rstrideB,
                          data_t *C, size_t rstrideC)
{
    const size_t dispatch_size = heightA * widthB;

    /* Small problems: the plain kernel is used. */
    if (dispatch_size < kMinStrassen) {
        matmul_simple(heightA, widthA, widthB,
                      A, rstrideA, B, rstrideB, C, rstrideC);
        return;
    }

    /* Everything at or above the threshold goes to Strassen. */
    matmul_strassen(heightA, widthA, widthB,
                    A, rstrideA, B, rstrideB, C, rstrideC);
}
int main(int argc, char *argv[]) { e_epiphany_t Epiphany, *pEpiphany; e_mem_t DRAM, *pDRAM; unsigned int msize; float seed; unsigned int addr; //, clocks; size_t sz; double tdiff[4]; int result, rerval; pEpiphany = &Epiphany; pDRAM = &DRAM; msize = 0x00400000; get_args(argc, argv); fo = stderr; fi = stdin; printf("\nMatrix: C[%d][%d] = A[%d][%d] * B[%d][%d]\n\n", _Smtx, _Smtx, _Smtx, _Smtx, _Smtx, _Smtx); printf("Using %d x %d cores\n\n", _Nside, _Nside); seed = 0.0; printf("Seed = %f\n", seed); // Connect to device for communicating with the Epiphany system // Prepare device e_set_host_verbosity(H_D0); e_init(NULL); e_reset_system(); if (e_alloc(pDRAM, 0x00000000, msize)) { printf("\nERROR: Can't allocate Epiphany DRAM!\n\n"); exit(1); } if (e_open(pEpiphany, 0, 0, e_platform.chip[0].rows, e_platform.chip[0].cols)) { printf("\nERROR: Can't establish connection to Epiphany device!\n\n"); exit(1); } // Initialize Epiphany "Ready" state addr = offsetof(shared_buf_t, core.ready); Mailbox.core.ready = 0; e_write(pDRAM, 0, 0, addr, &Mailbox.core.ready, sizeof(Mailbox.core.ready)); printf("Loading program on Epiphany chip...\n"); e_set_loader_verbosity(ar.verbose); result = e_load_group(ar.srecFile, pEpiphany, 0, 0, pEpiphany->rows, pEpiphany->cols, ar.run_target); if (result == E_ERR) { printf("Error loading Epiphany program.\n"); exit(1); } // Generate operand matrices based on a provided seed matrix_init(seed); #ifdef __WIPE_OUT_RESULT_MATRIX__ // Wipe-out any previous remains in result matrix (for verification) addr = offsetof(shared_buf_t, C[0]); sz = sizeof(Mailbox.C); printf("Writing C[%uB] to address %08x...\n", sz, addr); e_write(pDRAM, 0, 0, addr, (void *) Mailbox.C, sz); #endif clock_gettime(CLOCK_MONOTONIC, &timer[0]); // Copy operand matrices to Epiphany system addr = offsetof(shared_buf_t, A[0]); sz = sizeof(Mailbox.A); printf("Writing A[%uB] to address %08x...\n", sz, addr); e_write(pDRAM, 0, 0, addr, (void *) Mailbox.A, sz); addr = offsetof(shared_buf_t, 
B[0]); sz = sizeof(Mailbox.B); printf("Writing B[%uB] to address %08x...\n", sz, addr); e_write(pDRAM, 0, 0, addr, (void *) Mailbox.B, sz); // Call the Epiphany matmul() function printf("GO Epiphany! ... "); clock_gettime(CLOCK_MONOTONIC, &timer[1]); matmul_go(pDRAM); clock_gettime(CLOCK_MONOTONIC, &timer[2]); printf("Finished calculating Epiphany result.\n"); // Read result matrix and timing addr = offsetof(shared_buf_t, C[0]); sz = sizeof(Mailbox.C); printf("Reading result from address %08x...\n", addr); e_read(pDRAM, 0, 0, addr, (void *) Mailbox.C, sz); clock_gettime(CLOCK_MONOTONIC, &timer[3]); // Calculate a reference result printf("Calculating result on Host ... "); clock_gettime(CLOCK_THREAD_CPUTIME_ID, &timer[4]); #ifndef __DO_STRASSEN__ matmul(Mailbox.A, Mailbox.B, Cref, _Smtx); #else matmul_strassen(Mailbox.A, Mailbox.B, Cref, _Smtx); #endif clock_gettime(CLOCK_THREAD_CPUTIME_ID, &timer[5]); printf("Finished calculating Host result.\n"); addr = offsetof(shared_buf_t, core.clocks); sz = sizeof(Mailbox.core.clocks); printf("Reading time from address %08x...\n", addr); e_read(pDRAM,0, 0, addr, &Mailbox.core.clocks, sizeof(Mailbox.core.clocks)); // clocks = Mailbox.core.clocks; // Calculate the difference between the Epiphany result and the reference result printf("\n*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***\n"); printf("Verifying result correctness ... 
"); matsub(Mailbox.C, Cref, Cdiff, _Smtx); tdiff[0] = (timer[2].tv_sec - timer[1].tv_sec) * 1000 + ((double) (timer[2].tv_nsec - timer[1].tv_nsec) / 1000000.0);//total tdiff[1] = (timer[1].tv_sec - timer[0].tv_sec) * 1000 + ((double) (timer[1].tv_nsec - timer[0].tv_nsec) / 1000000.0);//write tdiff[2] = (timer[3].tv_sec - timer[2].tv_sec) * 1000 + ((double) (timer[3].tv_nsec - timer[2].tv_nsec) / 1000000.0);//read tdiff[3] = (timer[5].tv_sec - timer[4].tv_sec) * 1000 + ((double) (timer[5].tv_nsec - timer[4].tv_nsec) / 1000000.0);//ref // If the difference is 0, then the matrices are identical and the // calculation was correct if (iszero(Cdiff, _Smtx)) { printf("C_epiphany == C_host\n"); rerval = 0; } else { printf("\n\nERROR: C_epiphany is different from C_host !!!\n"); rerval = 1; } printf("*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***\n"); printf("\n"); printf("Epiphany (compute): %9.1f msec (@ %03d MHz)\n" , tdiff[0], eMHz); printf(" (write) : %9.1f msec \n" , tdiff[1]); printf(" (read) : %9.1f msec\n" , tdiff[2]); printf(" (*total*): %9.1f msec\n\n" , tdiff[2]+tdiff[1]+tdiff[0]); printf("Host (*total*): %9.1f msec (@ %03d MHz)\n" , tdiff[3], aMHz); #ifdef __DUMP_MATRICES__ printf("\n\n\n"); printf("A[][] = \n"); matprt(Mailbox.A, _Smtx); printf("B[][] = \n"); matprt(Mailbox.B, _Smtx); printf("C[][] = \n"); matprt(Mailbox.C, _Smtx); printf("Cref[][] = \n"); matprt(Cref, _Smtx); int i, j; for (i=0; i<_Nside; i++) for (j=0; j<_Nside; j++) { e_read(pEpiphany, i, j, 0x2000+0*sizeof(float), &Aepi[(i*_Score+0)*_Smtx + j*_Score], 2*sizeof(float)); e_read(pEpiphany, i, j, 0x2000+2*sizeof(float), &Aepi[(i*_Score+1)*_Smtx + j*_Score], 2*sizeof(float)); e_read(pEpiphany, i, j, 0x4000+0*sizeof(float), &Bepi[(i*_Score+0)*_Smtx + j*_Score], 2*sizeof(float)); e_read(pEpiphany, i, j, 0x4000+2*sizeof(float), &Bepi[(i*_Score+1)*_Smtx + j*_Score], 2*sizeof(float)); } printf("Aepi[][] = \n"); matprt(Aepi, _Smtx); printf("Bepi[][] = \n"); matprt(Bepi, _Smtx); 
#endif printf("\n* * * EPIPHANY FTW !!! * * *\n"); // Close connection to device if (e_close(pEpiphany)) { printf("\nERROR: Can't close connection to Epiphany device!\n\n"); exit(1); } if (e_free(pDRAM)) { printf("\nERROR: Can't release Epiphany DRAM!\n\n"); exit(1); } e_finalize(); return rerval; }
int main(int argc, char *argv[]) { p_mem_t shared_mem, results_mem; uint32_t eram_base; char results[1024] = { '\0' }; int device_cols, device_rows, nside; p_dev_t dev; p_prog_t prog; p_team_t team; p_coords_t size; p_coords_t start = { .row = 0, .col = 0 }; unsigned int msize; float seed; unsigned int addr; //, clocks; size_t sz; int verbose=0; double tdiff[3]; int result, retval = 0; msize = 0x00400000; get_args(argc, argv); fo = stderr; fi = stdin; printf( "------------------------------------------------------------\n"); printf( "Calculating: C[%d][%d] = A[%d][%d] * B[%d][%d]\n", _Smtx, _Smtx, _Smtx, _Smtx, _Smtx, _Smtx); seed = 0.0; if(verbose){ printf( "Seed = %f\n", seed); } dev = p_init(P_DEV_EPIPHANY, 0); if (p_error(dev)) { fprintf(stderr, "Error initializing PAL\n"); return p_error(dev); } device_cols = p_query(dev, P_PROP_COLS); device_rows = p_query(dev, P_PROP_ROWS); // Use min size nside = device_cols > device_rows ? device_cols : device_rows; if (nside < 4) { fprintf(stderr, "Error: Too small device, need at least 4x4\n"); return 1; } // Either 1024, 256, 64, or 16 cores (side must be power of two), nside = nside >= 32 ? 32 : nside >= 16 ? 16 : nside >= 8 ? 
8 : 4; size.row = nside; size.col = nside; team = p_open4(dev, P_TOPOLOGY_2D, &start, &size); printf("Using team of size %d\n", p_team_size(team)); if (p_error(team)) { fprintf(stderr, "Error opening team\n"); return p_error(team); } prog = p_load(dev, ar.elfFile, 0); eram_base = (unsigned) p_query(dev, P_PROP_MEMBASE); shared_mem = p_map(dev, eram_base, msize); // Clear mailbox contents memset(&Mailbox, 0, sizeof(Mailbox)); p_write(&shared_mem, &Mailbox, 0, sizeof(Mailbox), 0); // Generate operand matrices based on a provided seed matrix_init((int)seed); #ifdef __WIPE_OUT_RESULT_MATRIX__ // Wipe-out any previous remains in result matrix (for verification) addr = offsetof(shared_buf_t, C[0]); sz = sizeof(Mailbox.C); if(verbose){ printf( "Writing C[%uB] to address %08x...\n", (unsigned) sz, addr); } p_write(&shared_mem, (void *) Mailbox.C, addr, sz, 0); #endif /* Wallclock time */ clock_gettime(CLOCK_MONOTONIC, &timer[0]); /* Clock CPUTIME too. We don't want to indicate failure just * because the system was under high load. */ clock_gettime(CLOCK_THREAD_CPUTIME_ID, &timer[4]); // Copy operand matrices to Epiphany system addr = offsetof(shared_buf_t, A[0]); sz = sizeof(Mailbox.A); if(verbose){ printf( "Writing A[%uB] to address %08x...\n", (unsigned) sz, addr); } p_write(&shared_mem, (void *) Mailbox.A, addr, sz, 0); addr = offsetof(shared_buf_t, B[0]); sz = sizeof(Mailbox.B); if(verbose){ printf( "Writing B[%uB] to address %08x...\n", (unsigned) sz, addr); } p_write(&shared_mem, (void *) Mailbox.B, addr, sz, 0); // Call the Epiphany matmul() function if(verbose){ printf( "GO Epiphany! ... 
"); } if(verbose){ printf("Loading program on Epiphany chip...\n"); } p_arg_t args[] = { &nside, sizeof(nside), true }; if (p_run(prog, "matmul", team, 0, p_team_size(team), 1, args, 0)) { fprintf(stderr, "Error loading Epiphany program.\n"); exit(1); } // Read result matrix and timing addr = offsetof(shared_buf_t, C[0]); sz = sizeof(Mailbox.C); if(verbose){ printf( "Reading result from address %08x...\n", addr); } p_read(&shared_mem, (void *) Mailbox.C, addr, sz, 0); clock_gettime(CLOCK_MONOTONIC, &timer[1]); clock_gettime(CLOCK_THREAD_CPUTIME_ID, &timer[5]); // Calculate a reference result clock_gettime(CLOCK_THREAD_CPUTIME_ID, &timer[2]); #ifndef __DO_STRASSEN__ matmul(Mailbox.A, Mailbox.B, Cref, _Smtx); #else matmul_strassen(Mailbox.A, Mailbox.B, Cref, _Smtx); #endif clock_gettime(CLOCK_THREAD_CPUTIME_ID, &timer[3]); addr = offsetof(shared_buf_t, core.clocks); sz = sizeof(Mailbox.core.clocks); if(verbose){ printf( "Reading time from address %08x...\n", addr); } p_read(&shared_mem, &Mailbox.core.clocks, addr, sizeof(Mailbox.core.clocks), 0); // clocks = Mailbox.core.clocks; // Calculate the difference between the Epiphany result and the reference result matsub(Mailbox.C, Cref, Cdiff, _Smtx); tdiff[0] = (timer[1].tv_sec - timer[0].tv_sec) * 1000 + ((double) (timer[1].tv_nsec - timer[0].tv_nsec) / 1000000.0); // tdiff[0] = ((double) clocks) / eMHz * 1000; tdiff[1] = (timer[3].tv_sec - timer[2].tv_sec) * 1000 + ((double) (timer[3].tv_nsec - timer[2].tv_nsec) / 1000000.0); tdiff[2] = (timer[5].tv_sec - timer[4].tv_sec) * 1000 + ((double) (timer[5].tv_nsec - timer[4].tv_nsec) / 1000000.0); // If the difference is 0, then the matrices are identical and the // calculation was correct if (iszero(Cdiff, _Smtx)) { printf( "Epiphany(time) %9.1f msec (@ %03d MHz)\n", tdiff[0], eMHz); printf( "Host(time) %9.1f msec (@ %03d MHz)\n", tdiff[1], aMHz); printf( "------------------------------------------------------------\n"); printf( "TEST \"matmul-16\" PASSED\n"); retval = 0; } 
else { printf( "\n\nERROR: C_epiphany is different from C_host !!!\n"); printf( "TEST \"matmul-16\" FAILED\n"); retval = 1; } #if 0 #ifdef __DUMP_MATRICES__ printf( "\n\n\n"); printf( "A[][] = \n"); matprt(Mailbox.A, _Smtx); printf( "B[][] = \n"); matprt(Mailbox.B, _Smtx); printf( "C[][] = \n"); matprt(Mailbox.C, _Smtx); printf( "Cref[][] = \n"); matprt(Cref, _Smtx); int i, j; for (i=0; i<_Nside; i++) for (j=0; j<_Nside; j++) { e_read(pEpiphany, i, j, 0x2000+0*sizeof(float), &Aepi[(i*_Score+0)*_Smtx + j*_Score], 2*sizeof(float)); e_read(pEpiphany, i, j, 0x2000+2*sizeof(float), &Aepi[(i*_Score+1)*_Smtx + j*_Score], 2*sizeof(float)); e_read(pEpiphany, i, j, 0x4000+0*sizeof(float), &Bepi[(i*_Score+0)*_Smtx + j*_Score], 2*sizeof(float)); e_read(pEpiphany, i, j, 0x4000+2*sizeof(float), &Bepi[(i*_Score+1)*_Smtx + j*_Score], 2*sizeof(float)); } printf( "Aepi[][] = \n"); matprt(Aepi, _Smtx); printf( "Bepi[][] = \n"); matprt(Bepi, _Smtx); #endif #endif // p_unmap ... p_close(team); p_finalize(dev); return retval; } // Initialize operand matrices void matrix_init(int seed) { int i, j, p; p = 0; for (i=0; i<_Smtx; i++) for (j=0; j<_Smtx; j++) Mailbox.A[p++] = (i + j + seed) % _MAX_MEMBER_; p = 0; for (i=0; i<_Smtx; i++) for (j=0; j<_Smtx; j++) Mailbox.B[p++] = ((i + j) * 2 + seed) % _MAX_MEMBER_; p = 0; for (i=0; i<_Smtx; i++) for (j=0; j<_Smtx; j++) Mailbox.C[p++] = 0x8dead; return; }