int main(int argc, char **argv)
{
    HSTR_OPTIONS hstreams_options;
    CHECK_HSTR_RESULT(hStreams_GetCurrentOptions(&hstreams_options, sizeof(HSTR_OPTIONS)));

    char *libNames[200] = {NULL, NULL};

    //Library to be loaded for sink-side code
    libNames[0] = "cholesky_sink_1.so";
    hstreams_options.libNameCnt = 1;
    hstreams_options.libNames = libNames;
    hstreams_options.libFlags = NULL;

    hstreams_options.libNameCntHost = 0;
    hstreams_options.libNamesHost = NULL;

    int mat_size_m, num_tiles, niter, tile_size;
    niter = 5;
    num_tiles = 1;
    mat_size_m = 0; //must be an input
    bool layRow = true;

    //max_log_str defines the no. of physical partitions on the card
    int use_host = 1, num_mics = 1;
    int nstreams_host = 4, nstreams_mic = 4;

    int verify = 1;

    CHECK_HSTR_RESULT(hStreams_SetOptions(&hstreams_options));
    for (int i = 1; i < argc; i++) {
        if (*argv[i] == SWITCH_CHAR) {
            switch (*(argv[i] + 1)) {
            case 'm':
                mat_size_m = (int)atol(argv[i] + 3);
                break;

            case 't':
                num_tiles = (int)atol(argv[i] + 3);
                break;

            case 's':
                nstreams_mic = (int)atol(argv[i] + 3);
                break;

            case 'l':
                if ((strcmp("row", argv[i] + 3) == 0) ||
                        (strcmp("ROW", argv[i] + 3) == 0)) {
                    layRow = true;
                    printf("matrix is in Row major format\n");
                } else {
                    layRow = false;
                    printf("matrix is in Col major format\n");
                }
                break;

            case 'i':
                niter = (int)atol(argv[i] + 3);
                //if( niter < 3 ) niter=3;
                break;

            case 'h':
                use_host = (int)atol(argv[i] + 3);
                break;

            case 'c':
                num_mics = (int)atol(argv[i] + 3);
                break;

            case 'v':
                verify = (int)atol(argv[i] + 3);
                break;

            default:
                break;
            }
        }
    }
    dtimeInit();

    //Check that mat_size is divisible by num_tiles
    if (mat_size_m % num_tiles != 0) {
        printf("matrix size MUST be divisible by num_tiles.. aborting\n");
        exit(0);
    }

    if (mat_size_m == 0) {
        printf("mat_size_m is not defined\n");
        exit(0);
    }

    tile_size = mat_size_m / num_tiles;

    //This allocates memory for the full input matrix
    double *A = (double *)malloc(mat_size_m * mat_size_m * sizeof(double));

    //Generate a symmetric positve-definite matrix
    A = dpo_generate(mat_size_m);

    int num_doms = use_host + num_mics;
    int max_log_str;

    if (use_host == 0 && num_mics == 0) {
        printf("Cannot run if not using either host or MIC cards\n");
        exit(-1);
    }

    if (use_host == 1) {
        printf("Using the host CPU for compute.. and\n");
    }
    printf("Using %d MIC cards for compute..\n", num_mics);

    if (use_host == 1 && num_mics >= 1) {
#ifdef HOST_HT_ON
        nstreams_host = 2 * nstreams_mic;
#else
        nstreams_host = nstreams_mic;
#endif
        max_log_str = nstreams_mic;
    } else if (num_mics == 0) {
        nstreams_host = nstreams_mic;
        max_log_str = nstreams_host;
#ifdef HOST_HT_ON
        nstreams_host = 2 * nstreams_host;
#endif
    } else if (use_host == 0) {
        max_log_str = nstreams_mic;
    }

    int host_ht_offset = 0;

#ifdef HOST_HT_ON
    host_ht_offset = nstreams_host - max_log_str;
#endif

    if (use_host) {
        printf("number of streams used on host = %d\n", nstreams_host);
        if (loc_verbose) {
            printf("if HT is enabled on host, only top half streams will be used\n");
            printf("if number of streams on host do not evenly divide with number of cores, performance can suffer\n");
        }
    }
    if (num_mics >= 1) {
        printf("number of streams used on mic = %d\n", nstreams_mic);
    }

    if (use_host == 1) {
        resv_cpu_master = 1;
        mach_wide_league = 1;
    } else {
        resv_cpu_master = 0;
        mach_wide_league = 0;
    }

    HSTR_PHYS_DOM *physDomID = new HSTR_PHYS_DOM[num_doms];
    HSTR_LOG_DOM *logDomID = new HSTR_LOG_DOM[num_doms + 1]; //+1 for creating a machine wide stream

    HSTR_CPU_MASK out_CPUmask, src_hstr_cpu_mask;
    HSTR_PHYS_DOM *out_pPhysDomainID = new HSTR_PHYS_DOM;
    HSTR_OVERLAP_TYPE *out_pOverlap = new HSTR_OVERLAP_TYPE;

    uint32_t *places = new uint32_t[num_doms];

    for (int i = 0; i < num_doms; ++i) {
        if (i == 0) {
            if (use_host == 1) {
                places[i] = nstreams_host;
                physDomID[i] = -1;
            } else {
                places[i] = nstreams_mic;
                physDomID[i] = i;
            }
        } else {
            places[i] = nstreams_mic;
            if (use_host == 1) {
                physDomID[i] = i - 1;
            } else {
                physDomID[i] = i;
            }
        }
    }

    if (resv_cpu_master) {
        HostCPUMask host_cpu_mask;
        host_cpu_mask.cpu_zero();

        for (int i = 0; i < num_resv_cpus; ++i) {
            host_cpu_mask.cpu_set(resv_cpus[i]);
        }

        int ret;
        HSTR_CPU_MASK_ZERO(src_hstr_cpu_mask);
        setCurrentProcessAffinityMask(host_cpu_mask);
        getCurrentProcessAffinityMask(host_cpu_mask);
        int first, last, num_set;
        last = 0;
        first = HSTR_MAX_THREADS;
        num_set = 0;
        for (int i = 0; i < HSTR_MAX_THREADS; i++) {
            if (host_cpu_mask.cpu_isset(i)) {
                if (i < first) {
                    first = i;
                }
                last = i;
                num_set++;
                HSTR_CPU_MASK_SET(i, src_hstr_cpu_mask);
            }
        }
        if (loc_verbose) {
            printf("Reserving the following cpu_set for master on CPU\n");
            ShowLimitCPUmask(src_hstr_cpu_mask);
        }
    }

    uint32_t str_offset = 0;
    uint32_t places_mach_wide = 1;
    int iret;

    //create a machine wide stream on host for potrf
    if (mach_wide_league) {
        if (resv_cpu_master) {
            iret = hStreams_custom_init_selected_domains(
                       1,
                       physDomID,
                       1,
                       &places_mach_wide,
                       1,
                       str_offset,
                       src_hstr_cpu_mask);
        } else {
            iret = hStreams_app_init_selected_domains(
                       1,
                       physDomID,
                       1,
                       &places_mach_wide,
                       1,
                       str_offset);
        }
        str_offset = 1;
    }

    //create rest of the streams
    if (resv_cpu_master) {
        iret = hStreams_custom_init_selected_domains(
                   num_doms,
                   physDomID,
                   num_doms,
                   places,
                   1,
                   str_offset,
                   src_hstr_cpu_mask);
    } else {
        iret = hStreams_app_init_selected_domains(
                   num_doms,
                   physDomID,
                   num_doms,
                   places,
                   1,
                   str_offset);
    }
    if (iret != 0) {
        printf("hstreams_app_init failed!\r\n");
        exit(-1);
    }

    mkl_mic_disable();
    //10 max streams for printout
    HSTR_LOG_STR *out_pLogStreamIDs = new HSTR_LOG_STR[10];

    if (loc_verbose) {
        if (mach_wide_league) {
            //host
            CHECK_HSTR_RESULT(hStreams_GetLogDomainIDList(physDomID[0], 2, &logDomID[0]));
            for (int idom = 0; idom < 2; ++idom) {
                CHECK_HSTR_RESULT(hStreams_GetLogDomainDetails(logDomID[idom], out_pPhysDomainID, out_CPUmask));
                //ShowLimitCPUmask(out_CPUmask);
                if (idom == 0) {
                    CHECK_HSTR_RESULT(hStreams_GetLogStreamIDList(logDomID[idom], 1, out_pLogStreamIDs));
                } else {
                    CHECK_HSTR_RESULT(hStreams_GetLogStreamIDList(logDomID[idom], places[0], out_pLogStreamIDs));
                }
                if (idom > 0) {
                    for (int i = 0; i < places[0]; ++i) {
                        CHECK_HSTR_RESULT(hStreams_GetLogStreamDetails(out_pLogStreamIDs[i], logDomID[idom], out_CPUmask));
                        printf("streamId = %d\n", (int)out_pLogStreamIDs[i]);
                        ShowLimitCPUmask(out_CPUmask);
                    }
                } else {
                    CHECK_HSTR_RESULT(hStreams_GetLogStreamDetails(out_pLogStreamIDs[0], logDomID[idom], out_CPUmask));
                    printf("streamId = %d\n", (int)out_pLogStreamIDs[0]);
                    ShowLimitCPUmask(out_CPUmask);
                }
            }

            for (int idom = 1; idom < num_doms; ++idom) {
                CHECK_HSTR_RESULT(hStreams_GetLogDomainIDList(physDomID[idom], 1, &logDomID[idom]));
                CHECK_HSTR_RESULT(hStreams_GetLogDomainDetails(logDomID[idom], out_pPhysDomainID, out_CPUmask));
                //ShowLimitCPUmask(out_CPUmask);
                CHECK_HSTR_RESULT(hStreams_GetLogStreamIDList(logDomID[idom], places[idom], out_pLogStreamIDs));
                for (int i = 0; i < places[idom]; ++i) {
                    CHECK_HSTR_RESULT(hStreams_GetLogStreamDetails(out_pLogStreamIDs[i], logDomID[idom], out_CPUmask));
                    printf("streamId = %d\n", (int)out_pLogStreamIDs[i]);
                    ShowLimitCPUmask(out_CPUmask);
                }
            }
        } else {
            for (int idom = 0; idom < num_doms; ++idom) {
                CHECK_HSTR_RESULT(hStreams_GetLogDomainIDList(physDomID[idom], 1, &logDomID[idom]));
                CHECK_HSTR_RESULT(hStreams_GetLogDomainDetails(logDomID[idom], out_pPhysDomainID, out_CPUmask));
                //ShowLimitCPUmask(out_CPUmask);
                CHECK_HSTR_RESULT(hStreams_GetLogStreamIDList(logDomID[idom], places[idom], out_pLogStreamIDs));
                for (int i = 0; i < places[idom]; ++i) {
                    CHECK_HSTR_RESULT(hStreams_GetLogStreamDetails(out_pLogStreamIDs[i], logDomID[idom], out_CPUmask));
                    printf("streamId = %d\n", (int)out_pLogStreamIDs[i]);
                    ShowLimitCPUmask(out_CPUmask);
                }
            }
        }
    }

    //Calling the tiled Cholesky function. This does the factorization of the full matrix using a tiled implementation.
    int cholesky_code = cholesky_tiled(A, tile_size, num_tiles, mat_size_m, niter,
                                       max_log_str, layRow, verify, num_doms, use_host, num_mics, host_ht_offset);

    CHECK_HSTR_RESULT(hStreams_app_fini());

    free(A);

    return cholesky_code;
}
/*
 * Driver for the tiled Cholesky factorization using a single hStreams
 * app-level initialization.
 *
 * Command-line switches (each must start with SWITCH_CHAR; the value is read
 * starting at the fourth character of the argument, i.e. after the switch
 * character, the option letter and one separator character):
 *   m  matrix size (required, must be positive and divisible by num_tiles)
 *   t  number of tiles per dimension (default 1)
 *   s  number of logical streams / partitions (default 5)
 *   l  layout: "row"/"ROW" selects row major, anything else column major
 *   i  number of iterations (default 5, raised to 3 if a smaller value is given)
 *   v  verification flag passed to cholesky_tiled (default 1)
 *
 * Returns 0 on success and -1 if hStreams finalization fails. Invalid input
 * or an hStreams setup failure terminates the process via exit().
 */
int main(int argc, char **argv)
{
    //hStreams results are compared against 0, the success value.
    HSTR_OPTIONS hstreams_options;
    int hret = hStreams_GetCurrentOptions(&hstreams_options, sizeof(HSTR_OPTIONS));
    if (hret != 0) {
        printf("hStreams_GetCurrentOptions failed!\n");
        exit(-1);
    }

    hstreams_options.verbose = 0;
    char *libNames[200] = {NULL, NULL};

    //Library to be loaded for sink-side code
    libNames[0] = "cholesky_sink_1.so";
    hstreams_options.libNameCnt = 1;
    hstreams_options.libNames = libNames;
    hstreams_options.libFlags = NULL;

    int mat_size_m, num_tiles, niter, tile_size;
    niter = 5;
    num_tiles = 1;
    mat_size_m = 0; //must be an input
    bool layRow = true;

    //max_log_str defines the no. of physical partitions on the card
    int max_log_str = 5;

    int verify = 1;

    hret = hStreams_SetOptions(&hstreams_options);
    if (hret != 0) {
        printf("hStreams_SetOptions failed!\n");
        exit(-1);
    }

    for (int i = 1; i < argc; i++) {
        if (*argv[i] != SWITCH_CHAR) {
            continue;
        }

        //The value starts at offset 3. Arguments shorter than that get an
        //empty value instead of reading past the end of the string.
        const char *opt_val = (strlen(argv[i]) >= 3) ? argv[i] + 3 : "";

        switch (*(argv[i] + 1)) {
        case 'm':
            mat_size_m = (int)atol(opt_val);
            break;

        case 't':
            num_tiles = (int)atol(opt_val);
            break;

        case 's':
            max_log_str = (int)atol(opt_val);
            break;

        case 'l':
            if ((strcmp("row", opt_val) == 0) ||
                    (strcmp("ROW", opt_val) == 0)) {
                layRow = true;
                printf("matrix is in Row major format\n");
            } else {
                layRow = false;
                printf("matrix is in Col major format\n");
            }
            break;

        case 'i':
            niter = (int)atol(opt_val);
            //At least three iterations are always run.
            if (niter < 3) {
                niter = 3;
            }
            break;

        case 'v':
            verify = (int)atol(opt_val);
            break;

        default:
            break;
        }
    }
    dtimeInit();

    printf("no. of streams (partitions) = %d, mat_size = %d, num_tiles = %d,"
           " niter = %d\n\n", max_log_str, mat_size_m, num_tiles, niter);

    //The matrix size is mandatory; zero or negative sizes are treated as missing.
    //(Exit status 0 is kept here to match the historical behaviour of this check.)
    if (mat_size_m <= 0) {
        printf("mat_size_m is not defined\n");
        exit(0);
    }

    //Reject a non-positive tile count before it is used as a divisor.
    if (num_tiles <= 0) {
        printf("num_tiles must be a positive integer.. aborting\n");
        exit(-1);
    }

    //Check that mat_size is divisible by num_tiles
    if (mat_size_m % num_tiles != 0) {
        printf("matrix size MUST be divisible by num_tiles.. aborting\n");
        exit(0);
    }

    //The stream count is converted to an unsigned value below.
    if (max_log_str < 1) {
        printf("number of streams must be at least 1\n");
        exit(-1);
    }

    tile_size = mat_size_m / num_tiles;

    //Generate a symmetric positive-definite matrix. dpo_generate() returns the
    //buffer that is used as the input matrix, so no separate allocation is made
    //here (a previous malloc was immediately overwritten and leaked).
    double *A = dpo_generate(mat_size_m);
    if (A == NULL) {
        printf("failed to generate the input matrix\n");
        exit(-1);
    }

    //No. of PlacesPerDomain is same as no. of logical streams since LogStreamsPerPlace is 1.
    uint32_t PlacesPerDomain = max_log_str;
    uint32_t LogStreamsPerPlace = 1;
    hret = hStreams_app_init(PlacesPerDomain, LogStreamsPerPlace);
    if (hret != 0) {
        printf("hstreams_app_init failed!\r\n");
        exit(-1);
    }

    //Calling the tiled Cholesky function. This does the factorization of the full matrix using a tiled implementation.
    cholesky_tiled(A, tile_size, num_tiles, mat_size_m, niter,
                   max_log_str, layRow, verify);

    int exit_code = 0;
    hret = hStreams_app_fini();
    if (hret != 0) {
        printf("hStreams_app_fini failed!\n");
        exit_code = -1;
    }

    //NOTE(review): assumes dpo_generate() hands back malloc-compatible memory,
    //as the original code released it with free() - confirm.
    free(A);

    return exit_code;
}