int main(int argc, char *argv[]) { //%% EXERCISE: Take note of this return argument name double avg = -0.1, diff; // Variable to hold the average of the scalars. HSTR_EVENT out_Event; // Tracks completion of sink-side computation //-------------------------------------------------------------- // Set up initialization parameters // Number of streams to create in each domain: 1 int streams_per_domain = 1; // no over-subscription, i.e., one logical stream per physical stream. int oversubscription = 1; CHECK_HSTR_RESULT(hStreams_app_init(streams_per_domain, oversubscription)); //-------------------------------------------------------------- //-------------------------------------------------------------- //!!a Set up 2 scalar arguments instead of one, no heap arguments const int num_scalar_values_to_send = 2; const int num_heap_values_to_send = 0; // 1st scalar value to pass to sink side. int a = 1000; // 2nd scalar value to pass to sink side. int b = 500; //%% Optional: Add additional variables as a practice. // You can pass these values from command line. if (argc > 1) { a = atoi(argv[1]); } if (argc > 2) { b = atoi(argv[2]); //%% Optional: add more, if you like } // Arguments to pass to sink side. You need to pass two arguments. uint64_t args[num_scalar_values_to_send + num_heap_values_to_send]; // Pack the first argument. args[0] = (uint64_t)(a); //Pack the second argument. args[1] = (uint64_t)(b); //%% Optional: Pack other arguments you want to send. //-------------------------------------------------------------- //-------------------------------------------------------------- // Invoke the "average_sink_1" app on the sink side on stream 0. // NOTE: sizes should match to the actual allocated size, otherwise you will get segfaults CHECK_HSTR_RESULT(hStreams_app_invoke( 0, // stream_id = 0 "average_sink_1", // sink-side function name num_scalar_values_to_send, //!!a Changed above, as a variable num_heap_values_to_send, // number of heap arguments args, // arguments array &out_Event, // completion event // used to check result availability //%% EXERCISE: replace <return variable> with the name of the return arg &avg, //!!b Point to the blob to return sizeof(double))); //!!b Give the blob size to return //-------------------------------------------------------------- //-------------------------------------------------------------- //!!d Wait for the result to be back. //%% EXERCISE: Use hStreams_app_event_wait in stream 1, with &out_Event CHECK_HSTR_RESULT(hStreams_app_event_wait(1, &out_Event)); //-------------------------------------------------------------- // Check the value on the host side. printf("Average received on host-side, from the sink-side: %f\n", avg); diff = avg - (a + b) / 2; if (diff < 0) { diff = -diff; } if (diff > 0.001) { printf("Error. You need to sync on completion of remote function.\n"); } else { printf("Passed.\n"); } //-------------------------------------------------------------- // Cleanup before exiting. CHECK_HSTR_RESULT(hStreams_app_fini()); //-------------------------------------------------------------- }
int main(int argc, char **argv) { int dimX = 4; int dimY = 64; int dimZ = 64; int numIters = 64; // Process args from command line*/ int argi = 1; int errFlag = 0; while (argi < argc) { char *one = argv[argi]; if (!strcmp(one, "-d") && argc > argi + 3) { dimX = atoi(argv[argi + 1]); dimY = atoi(argv[argi + 2]); dimZ = atoi(argv[argi + 3]); argi += 4; } else if (!strcmp(one, "-n") && argc > argi + 1) { numIters = atoi(argv[argi + 1]); /* Make numIters a multiple of 16 */ numIters = numIters & (~0xf); argi += 2; } // Also take tile_size from command line*/ else if (!strcmp(one, "-t") && argc > argi + 1) { tile_size = atoi(argv[argi + 1]); if (dimY % tile_size != 0) { printf("The given tile_size does not divide dimY evenly!\n"); return -1; } argi += 2; } else { errFlag = 1; break; } } if (errFlag) { printf("Usage: %s [-d [the size of each dimension]] " "[-n [the number of iterations in the kernel]] " "[-t tile_size]\n", argv[0]); return -1; } if (dimY != dimZ) { printf("DimY is not the same as DimZ, " "changing dimZ to make them alike!\n"); dimZ = dimY; } if (dimX * dimY * dimZ * numIters % 64 != 0) { printf("The production of DimX, DimY, DimZ and " "#iters must be multiples of 64!\n"); return -1; } printf("DimX=%d, DimY=%d, DimZ=%d, #Loop_Iterations=%d, tile_size=%d\n", dimX, dimY, dimZ, numIters, tile_size); REAL *out = NULL; // 64 bytes aligned allocation. posix_memalign((void **) &out, 64, dimX * dimY * dimZ * numIters * sizeof(REAL) * 2); if (out == NULL) { printf("Memory allocation failed!\n"); return -1; } double iterTimes[NUM_TESTS_ITERS]; double mintime = 1e6; // Notice how the out1 and out2 pointers are calculated. REAL *out1 = out; REAL *out2 = out + dimX * dimY * dimZ * numIters; //-------------------------------------------------------------- // Initialize hStreams with the given StreamsPerDomain and // over-subscription level. CHECK_HSTR_RESULT(hStreams_app_init(streams_per_domain, oversubscription)); //-------------------------------------------------------------- //-------------------------------------------------------------- // Saving buffer length in a temporary variable to save computation. long long buffer_len = dimX * dimY * dimZ * numIters * sizeof(REAL) * 2; // Set up buffers. // Wrap out with a buffer. hStreams_app_create_buf((void *) out, buffer_len); register long int buff_length = dimY * dimZ * numIters * sizeof(REAL); //!!a Creating two dimX buffer addresses pointing inside out buffer. // out1_addr[i] points to the position of out1 buffer where // iteration i of the outer loop of the compute kernel writes. // out2_addr[i] points to the position of out2 buffer where // iteration i of the outer loop of the compute kernel writes. REAL *out1_addr[dimX]; REAL *out2_addr[dimX]; // Separation between out1 and out2. long int out_length = dimX * dimY * dimZ * numIters; for (int i = 0; i < dimX; i++) { //%% EXERCISE: go back to compute kernel and find out what is the value for // out1_addr[i] and out2_addr[i]. out1_addr[i] = out + i * dimY * dimZ * numIters; out2_addr[i] = out1_addr[i] + out_length; // You can also create buffer in this way instead of creating // out buffer as a whole // hStreams_app_create_buf((void *)out1_addr[i], buff_length); // hStreams_app_create_buf((void *)out2_addr[i], buff_length); } // Event pointers to support asynchronous function calls. HSTR_EVENT eout1[numIters][dimX], eout2[numIters][dimX], eout3[numIters][dimX], eout4[numIters][dimX], eout5[numIters][dimX]; //-------------------------------------------------------------- for (int i = 0; i < NUM_TESTS_ITERS; i++) { iterTimes[i] = GetTime(); //-------------------------------------------------------------- // Call device side API. // Prepare to perform computation on the sink-side. // Outer loop. // The body of the original i loop is enqueued on the streams in a round // robin fashion. for (int ii = 0; ii < dimX; ii++) { int stream = ii % streams_per_domain; //-------------------------------------------------------------- // Initialize data at the sink-side //!!b Initializes intermediate data at sink, only // the portion being worked on hStreams_app_memset(stream, out1_addr[ii], // source proxy address to write 0.5, buff_length, // number of bytes to send &eout4[i][ii]); // completion event //%% EXERCISE: Initialize out2_addr[ii] buffer similarly on the sink-side. hStreams_app_memset(stream, out2_addr[ii], // source proxy address to write 0.5, buff_length, // number of bytes to send &eout5[i][ii]); // completion event uint64_t args[8]; // Pack scalar arguments first, then heap args. //%% EXERCISE: Setup the heap args properly. args[0] = (uint64_t)(ii); args[1] = (uint64_t)(dimX); args[2] = (uint64_t)(dimY); args[3] = (uint64_t)(dimZ); args[4] = (uint64_t)(numIters); args[5] = (uint64_t)(tile_size); args[6] = (uint64_t)(out1_addr[ii]); args[7] = (uint64_t)(out2_addr[ii]); //-------------------------------------------------------------- hStreams_app_invoke(stream, // same idea "compute", // remote function name 6, // scalar arg 2, // heap args args, // array of args &eout1[i][ii], NULL, // return variable 0); //----------------------------------------------------------------- // Collect result. hStreams_app_xfer_memory(stream, out1_addr[ii], // source proxy address to write out1_addr[ii], // source proxy address to read buff_length, // number of bytes to send HSTR_SINK_TO_SRC, // transfer direction &eout2[i][ii]); // completion event //-------------------------------------------------------------- //!!c Transfer output data from sink to source //%% EXERCISE: Transfer data back for out2_addr[ii]. hStreams_app_xfer_memory(stream, out2_addr[ii], // source proxy address to write out2_addr[ii], // source proxy address to read buff_length, // number of bytes to send HSTR_SINK_TO_SRC, // transfer direction &eout3[i][ii]); // completion event } //-------------------------------------------------------------- // Synchronize. CHECK_HSTR_RESULT(hStreams_app_thread_sync()); //-------------------------------------------------------------- iterTimes[i] = GetTime() - iterTimes[i]; double result = out1[numIters / 2] + out2[numIters / 2]; printf("Test %d takes %.3lf ms with result %.3lf\n", i, iterTimes[i], result); if (iterTimes[i] < mintime) { mintime = iterTimes[i]; } } printf("Test's min time is %.3lf ms\n", mintime); //-------------------------------------------------------------- // Cleanup before exiting. CHECK_HSTR_RESULT(hStreams_app_fini()); //-------------------------------------------------------------- free(out); return 0; }
int main(int argc, char **argv) { HSTR_OPTIONS hstreams_options; hStreams_GetCurrentOptions(&hstreams_options, sizeof(HSTR_OPTIONS)); hstreams_options.verbose = 0; char *libNames[200] = {NULL, NULL}; //Library to be loaded for sink-side code libNames[0] = "cholesky_sink_1.so"; hstreams_options.libNameCnt = 1; hstreams_options.libNames = libNames; hstreams_options.libFlags = NULL; int mat_size_m, num_tiles, niter, tile_size; niter = 5; num_tiles = 1; mat_size_m = 0; //must be an input bool layRow = true; //max_log_str defines the no. of physical partitions on the card int max_log_str = 5; int verify = 1; hStreams_SetOptions(&hstreams_options); for (int i = 1; i < argc; i++) { if (*argv[i] == SWITCH_CHAR) { switch (*(argv[i] + 1)) { case 'm': mat_size_m = (int)atol(argv[i] + 3); break; case 't': num_tiles = (int)atol(argv[i] + 3); break; case 's': max_log_str = (int)atol(argv[i] + 3); break; case 'l': if ((strcmp("row", argv[i] + 3) == 0) || (strcmp("ROW", argv[i] + 3) == 0)) { layRow = true; printf("matrix is in Row major format\n"); } else { layRow = false; printf("matrix is in Col major format\n"); } break; case 'i': niter = (int)atol(argv[i] + 3); if (niter < 3) { niter = 3; } break; case 'v': verify = (int)atol(argv[i] + 3); break; default: break; } } } dtimeInit(); printf("no. of streams (partitions) = %d, mat_size = %d, num_tiles = %d," " niter = %d\n\n", max_log_str, mat_size_m, num_tiles, niter); //Check that mat_size is divisible by num_tiles if (mat_size_m % num_tiles != 0) { printf("matrix size MUST be divisible by num_tiles.. aborting\n"); exit(0); } if (mat_size_m == 0) { printf("mat_size_m is not defined\n"); exit(0); } tile_size = mat_size_m / num_tiles; //This allocates memory for the full input matrix double *A = (double *)malloc(mat_size_m * mat_size_m * sizeof(double)); //Generate a symmetric positve-definite matrix A = dpo_generate(mat_size_m); //No. of PlacesPerDomain is same as no. of logical streams since LogStreamsPerPlace is 1. uint32_t PlacesPerDomain = max_log_str; uint32_t LogStreamsPerPlace = 1; hStreams_app_init(PlacesPerDomain, LogStreamsPerPlace); //Calling the tiled Cholesky function. This does the factorization of the full matrix using a tiled implementation. cholesky_tiled(A, tile_size, num_tiles, mat_size_m, niter, max_log_str, layRow, verify); hStreams_app_fini(); free(A); return 0; }