int main(int argc, char **argv) { HSTR_OPTIONS hstreams_options; CHECK_HSTR_RESULT(hStreams_GetCurrentOptions(&hstreams_options, sizeof(HSTR_OPTIONS))); char *libNames[200] = {NULL, NULL}; //Library to be loaded for sink-side code libNames[0] = "cholesky_sink_1.so"; hstreams_options.libNameCnt = 1; hstreams_options.libNames = libNames; hstreams_options.libFlags = NULL; hstreams_options.libNameCntHost = 0; hstreams_options.libNamesHost = NULL; int mat_size_m, num_tiles, niter, tile_size; niter = 5; num_tiles = 1; mat_size_m = 0; //must be an input bool layRow = true; //max_log_str defines the no. of physical partitions on the card int use_host = 1, num_mics = 1; int nstreams_host = 4, nstreams_mic = 4; int verify = 1; CHECK_HSTR_RESULT(hStreams_SetOptions(&hstreams_options)); for (int i = 1; i < argc; i++) { if (*argv[i] == SWITCH_CHAR) { switch (*(argv[i] + 1)) { case 'm': mat_size_m = (int)atol(argv[i] + 3); break; case 't': num_tiles = (int)atol(argv[i] + 3); break; case 's': nstreams_mic = (int)atol(argv[i] + 3); break; case 'l': if ((strcmp("row", argv[i] + 3) == 0) || (strcmp("ROW", argv[i] + 3) == 0)) { layRow = true; printf("matrix is in Row major format\n"); } else { layRow = false; printf("matrix is in Col major format\n"); } break; case 'i': niter = (int)atol(argv[i] + 3); //if( niter < 3 ) niter=3; break; case 'h': use_host = (int)atol(argv[i] + 3); break; case 'c': num_mics = (int)atol(argv[i] + 3); break; case 'v': verify = (int)atol(argv[i] + 3); break; default: break; } } } dtimeInit(); //Check that mat_size is divisible by num_tiles if (mat_size_m % num_tiles != 0) { printf("matrix size MUST be divisible by num_tiles.. aborting\n"); exit(0); } if (mat_size_m == 0) { printf("mat_size_m is not defined\n"); exit(0); } tile_size = mat_size_m / num_tiles; //This allocates memory for the full input matrix double *A = (double *)malloc(mat_size_m * mat_size_m * sizeof(double)); //Generate a symmetric positve-definite matrix A = dpo_generate(mat_size_m); int num_doms = use_host + num_mics; int max_log_str; if (use_host == 0 && num_mics == 0) { printf("Cannot run if not using either host or MIC cards\n"); exit(-1); } if (use_host == 1) { printf("Using the host CPU for compute.. and\n"); } printf("Using %d MIC cards for compute..\n", num_mics); if (use_host == 1 && num_mics >= 1) { #ifdef HOST_HT_ON nstreams_host = 2 * nstreams_mic; #else nstreams_host = nstreams_mic; #endif max_log_str = nstreams_mic; } else if (num_mics == 0) { nstreams_host = nstreams_mic; max_log_str = nstreams_host; #ifdef HOST_HT_ON nstreams_host = 2 * nstreams_host; #endif } else if (use_host == 0) { max_log_str = nstreams_mic; } int host_ht_offset = 0; #ifdef HOST_HT_ON host_ht_offset = nstreams_host - max_log_str; #endif if (use_host) { printf("number of streams used on host = %d\n", nstreams_host); if (loc_verbose) { printf("if HT is enabled on host, only top half streams will be used\n"); printf("if number of streams on host do not evenly divide with number of cores, performance can suffer\n"); } } if (num_mics >= 1) { printf("number of streams used on mic = %d\n", nstreams_mic); } if (use_host == 1) { resv_cpu_master = 1; mach_wide_league = 1; } else { resv_cpu_master = 0; mach_wide_league = 0; } HSTR_PHYS_DOM *physDomID = new HSTR_PHYS_DOM[num_doms]; HSTR_LOG_DOM *logDomID = new HSTR_LOG_DOM[num_doms + 1]; //+1 for creating a machine wide stream HSTR_CPU_MASK out_CPUmask, src_hstr_cpu_mask; HSTR_PHYS_DOM *out_pPhysDomainID = new HSTR_PHYS_DOM; HSTR_OVERLAP_TYPE *out_pOverlap = new HSTR_OVERLAP_TYPE; uint32_t *places = new uint32_t[num_doms]; for (int i = 0; i < num_doms; ++i) { if (i == 0) { if (use_host == 1) { places[i] = nstreams_host; physDomID[i] = -1; } else { places[i] = nstreams_mic; physDomID[i] = i; } } else { places[i] = nstreams_mic; if (use_host == 1) { physDomID[i] = i - 1; } else { physDomID[i] = i; } } } if (resv_cpu_master) { HostCPUMask host_cpu_mask; host_cpu_mask.cpu_zero(); for (int i = 0; i < num_resv_cpus; ++i) { host_cpu_mask.cpu_set(resv_cpus[i]); } int ret; HSTR_CPU_MASK_ZERO(src_hstr_cpu_mask); setCurrentProcessAffinityMask(host_cpu_mask); getCurrentProcessAffinityMask(host_cpu_mask); int first, last, num_set; last = 0; first = HSTR_MAX_THREADS; num_set = 0; for (int i = 0; i < HSTR_MAX_THREADS; i++) { if (host_cpu_mask.cpu_isset(i)) { if (i < first) { first = i; } last = i; num_set++; HSTR_CPU_MASK_SET(i, src_hstr_cpu_mask); } } if (loc_verbose) { printf("Reserving the following cpu_set for master on CPU\n"); ShowLimitCPUmask(src_hstr_cpu_mask); } } uint32_t str_offset = 0; uint32_t places_mach_wide = 1; int iret; //create a machine wide stream on host for potrf if (mach_wide_league) { if (resv_cpu_master) { iret = hStreams_custom_init_selected_domains( 1, physDomID, 1, &places_mach_wide, 1, str_offset, src_hstr_cpu_mask); } else { iret = hStreams_app_init_selected_domains( 1, physDomID, 1, &places_mach_wide, 1, str_offset); } str_offset = 1; } //create rest of the streams if (resv_cpu_master) { iret = hStreams_custom_init_selected_domains( num_doms, physDomID, num_doms, places, 1, str_offset, src_hstr_cpu_mask); } else { iret = hStreams_app_init_selected_domains( num_doms, physDomID, num_doms, places, 1, str_offset); } if (iret != 0) { printf("hstreams_app_init failed!\r\n"); exit(-1); } mkl_mic_disable(); //10 max streams for printout HSTR_LOG_STR *out_pLogStreamIDs = new HSTR_LOG_STR[10]; if (loc_verbose) { if (mach_wide_league) { //host CHECK_HSTR_RESULT(hStreams_GetLogDomainIDList(physDomID[0], 2, &logDomID[0])); for (int idom = 0; idom < 2; ++idom) { CHECK_HSTR_RESULT(hStreams_GetLogDomainDetails(logDomID[idom], out_pPhysDomainID, out_CPUmask)); //ShowLimitCPUmask(out_CPUmask); if (idom == 0) { CHECK_HSTR_RESULT(hStreams_GetLogStreamIDList(logDomID[idom], 1, out_pLogStreamIDs)); } else { CHECK_HSTR_RESULT(hStreams_GetLogStreamIDList(logDomID[idom], places[0], out_pLogStreamIDs)); } if (idom > 0) { for (int i = 0; i < places[0]; ++i) { CHECK_HSTR_RESULT(hStreams_GetLogStreamDetails(out_pLogStreamIDs[i], logDomID[idom], out_CPUmask)); printf("streamId = %d\n", (int)out_pLogStreamIDs[i]); ShowLimitCPUmask(out_CPUmask); } } else { CHECK_HSTR_RESULT(hStreams_GetLogStreamDetails(out_pLogStreamIDs[0], logDomID[idom], out_CPUmask)); printf("streamId = %d\n", (int)out_pLogStreamIDs[0]); ShowLimitCPUmask(out_CPUmask); } } for (int idom = 1; idom < num_doms; ++idom) { CHECK_HSTR_RESULT(hStreams_GetLogDomainIDList(physDomID[idom], 1, &logDomID[idom])); CHECK_HSTR_RESULT(hStreams_GetLogDomainDetails(logDomID[idom], out_pPhysDomainID, out_CPUmask)); //ShowLimitCPUmask(out_CPUmask); CHECK_HSTR_RESULT(hStreams_GetLogStreamIDList(logDomID[idom], places[idom], out_pLogStreamIDs)); for (int i = 0; i < places[idom]; ++i) { CHECK_HSTR_RESULT(hStreams_GetLogStreamDetails(out_pLogStreamIDs[i], logDomID[idom], out_CPUmask)); printf("streamId = %d\n", (int)out_pLogStreamIDs[i]); ShowLimitCPUmask(out_CPUmask); } } } else { for (int idom = 0; idom < num_doms; ++idom) { CHECK_HSTR_RESULT(hStreams_GetLogDomainIDList(physDomID[idom], 1, &logDomID[idom])); CHECK_HSTR_RESULT(hStreams_GetLogDomainDetails(logDomID[idom], out_pPhysDomainID, out_CPUmask)); //ShowLimitCPUmask(out_CPUmask); CHECK_HSTR_RESULT(hStreams_GetLogStreamIDList(logDomID[idom], places[idom], out_pLogStreamIDs)); for (int i = 0; i < places[idom]; ++i) { CHECK_HSTR_RESULT(hStreams_GetLogStreamDetails(out_pLogStreamIDs[i], logDomID[idom], out_CPUmask)); printf("streamId = %d\n", (int)out_pLogStreamIDs[i]); ShowLimitCPUmask(out_CPUmask); } } } } //Calling the tiled Cholesky function. This does the factorization of the full matrix using a tiled implementation. int cholesky_code = cholesky_tiled(A, tile_size, num_tiles, mat_size_m, niter, max_log_str, layRow, verify, num_doms, use_host, num_mics, host_ht_offset); CHECK_HSTR_RESULT(hStreams_app_fini()); free(A); return cholesky_code; }
int main(int argc, char **argv) { HSTR_OPTIONS hstreams_options; hStreams_GetCurrentOptions(&hstreams_options, sizeof(HSTR_OPTIONS)); hstreams_options.verbose = 0; char *libNames[200] = {NULL, NULL}; //Library to be loaded for sink-side code libNames[0] = "cholesky_sink_1.so"; hstreams_options.libNameCnt = 1; hstreams_options.libNames = libNames; hstreams_options.libFlags = NULL; int mat_size_m, num_tiles, niter, tile_size; niter = 5; num_tiles = 1; mat_size_m = 0; //must be an input bool layRow = true; //max_log_str defines the no. of physical partitions on the card int max_log_str = 5; int verify = 1; hStreams_SetOptions(&hstreams_options); for (int i = 1; i < argc; i++) { if (*argv[i] == SWITCH_CHAR) { switch (*(argv[i] + 1)) { case 'm': mat_size_m = (int)atol(argv[i] + 3); break; case 't': num_tiles = (int)atol(argv[i] + 3); break; case 's': max_log_str = (int)atol(argv[i] + 3); break; case 'l': if ((strcmp("row", argv[i] + 3) == 0) || (strcmp("ROW", argv[i] + 3) == 0)) { layRow = true; printf("matrix is in Row major format\n"); } else { layRow = false; printf("matrix is in Col major format\n"); } break; case 'i': niter = (int)atol(argv[i] + 3); if (niter < 3) { niter = 3; } break; case 'v': verify = (int)atol(argv[i] + 3); break; default: break; } } } dtimeInit(); printf("no. of streams (partitions) = %d, mat_size = %d, num_tiles = %d," " niter = %d\n\n", max_log_str, mat_size_m, num_tiles, niter); //Check that mat_size is divisible by num_tiles if (mat_size_m % num_tiles != 0) { printf("matrix size MUST be divisible by num_tiles.. aborting\n"); exit(0); } if (mat_size_m == 0) { printf("mat_size_m is not defined\n"); exit(0); } tile_size = mat_size_m / num_tiles; //This allocates memory for the full input matrix double *A = (double *)malloc(mat_size_m * mat_size_m * sizeof(double)); //Generate a symmetric positve-definite matrix A = dpo_generate(mat_size_m); //No. of PlacesPerDomain is same as no. of logical streams since LogStreamsPerPlace is 1. uint32_t PlacesPerDomain = max_log_str; uint32_t LogStreamsPerPlace = 1; hStreams_app_init(PlacesPerDomain, LogStreamsPerPlace); //Calling the tiled Cholesky function. This does the factorization of the full matrix using a tiled implementation. cholesky_tiled(A, tile_size, num_tiles, mat_size_m, niter, max_log_str, layRow, verify); hStreams_app_fini(); free(A); return 0; }