/*
 * Host-side driver for the "average" example.
 *
 * Packs two integer scalars into the argument array, invokes the sink-side
 * function "average_sink_1" on stream 0, waits for its completion event,
 * and compares the double it returned against the average computed locally.
 *
 * argv[1] and argv[2], when present, override the two scalars
 * (defaults: 1000 and 500).
 *
 * Returns 0 when execution reaches the end of main.
 */
int main(int argc, char *argv[])
{
    //%% EXERCISE: Take note of this return argument name
    double avg = -0.1;    // Receives the average sent back by the sink side.
    double diff;          // Absolute difference between received and expected.
    HSTR_EVENT out_Event; // Tracks completion of sink-side computation

    //--------------------------------------------------------------
    // Set up initialization parameters
    // Number of streams to create in each domain: 1
    int streams_per_domain = 1;
    // no over-subscription, i.e., one logical stream per physical stream.
    int oversubscription = 1;
    CHECK_HSTR_RESULT(hStreams_app_init(streams_per_domain, oversubscription));
    //--------------------------------------------------------------

    //--------------------------------------------------------------
    //!!a Set up 2 scalar arguments instead of one, no heap arguments
    const int num_scalar_values_to_send = 2;
    const int num_heap_values_to_send   = 0;

    // 1st scalar value to pass to sink side.
    int a = 1000;
    // 2nd scalar value to pass to sink side.
    int b = 500;
    //%% Optional: Add additional variables as a practice.

    // Both values can be overridden from the command line.
    if (argc > 1) {
        a = atoi(argv[1]);
    }
    if (argc > 2) {
        b = atoi(argv[2]);
        //%% Optional: add more, if you like
    }

    // Arguments to pass to sink side: scalars first, then heap arguments.
    uint64_t args[num_scalar_values_to_send + num_heap_values_to_send];

    // Pack the first argument.
    args[0] = (uint64_t)(a);
    // Pack the second argument.
    args[1] = (uint64_t)(b);
    //%% Optional: Pack other arguments you want to send.
    //--------------------------------------------------------------

    //--------------------------------------------------------------
    // Invoke the "average_sink_1" app on the sink side on stream 0.
    // NOTE: sizes should match to the actual allocated size, otherwise you will get segfaults
    CHECK_HSTR_RESULT(hStreams_app_invoke(
                          0,                             // stream_id = 0
                          "average_sink_1",              // sink-side function name
                          num_scalar_values_to_send,     //!!a Changed above, as a variable
                          num_heap_values_to_send,       // number of heap arguments
                          args,                          // arguments array
                          &out_Event,                    // completion event
                          //  used to check result availability
                          //%% EXERCISE: replace <return variable> with the name of the return arg
                          &avg,                          //!!b Point to the blob to return
                          sizeof(double)));              //!!b Give the blob size to return
    //--------------------------------------------------------------

    //--------------------------------------------------------------
    //!!d Wait for the result to be back.
    //%% EXERCISE: Use hStreams_app_event_wait with &out_Event
    // NOTE(review): the leading 1 is understood to be the number of events
    // in the array that follows, not a stream id -- confirm against the
    // hStreams_app_event_wait prototype.
    CHECK_HSTR_RESULT(hStreams_app_event_wait(1, &out_Event));
    //--------------------------------------------------------------

    // Check the value on the host side.
    printf("Average received on host-side, from the sink-side: %f\n", avg);
    // Compute the expected average in floating point: integer division would
    // drop the .5 for odd sums, and widening first avoids overflow in a + b.
    // NOTE(review): assumes average_sink_1 also averages in floating point --
    // confirm against the sink-side source.
    diff = avg - ((double)a + (double)b) / 2.0;
    if (diff < 0) {
        diff = -diff;
    }
    if (diff > 0.001) {
        printf("Error.  You need to sync on completion of remote function.\n");
    } else {
        printf("Passed.\n");
    }

    //--------------------------------------------------------------
    // Cleanup before exiting.
    CHECK_HSTR_RESULT(hStreams_app_fini());
    //--------------------------------------------------------------

    return 0;
}
int main(int argc, char **argv)
{
    int dimX = 4;
    int dimY = 64;
    int dimZ = 64;
    int numIters = 64;

    // Process args from command line*/
    int argi = 1;
    int errFlag = 0;
    while (argi < argc) {
        char *one = argv[argi];
        if (!strcmp(one, "-d") && argc > argi + 3) {
            dimX = atoi(argv[argi + 1]);
            dimY = atoi(argv[argi + 2]);
            dimZ = atoi(argv[argi + 3]);
            argi += 4;
        } else if (!strcmp(one, "-n") && argc > argi + 1) {
            numIters = atoi(argv[argi + 1]);
            /* Make numIters a multiple of 16 */
            numIters = numIters & (~0xf);
            argi += 2;
        }
        // Also take tile_size from command line*/
        else if (!strcmp(one, "-t") && argc > argi + 1) {
            tile_size = atoi(argv[argi + 1]);
            if (dimY % tile_size != 0) {
                printf("The given tile_size does not divide dimY evenly!\n");
                return -1;
            }
            argi += 2;
        } else {
            errFlag = 1;
            break;
        }
    }
    if (errFlag) {
        printf("Usage: %s [-d [the size of each dimension]] "
               "[-n [the number of iterations in the kernel]] "
               "[-t tile_size]\n", argv[0]);
        return -1;
    }

    if (dimY != dimZ) {
        printf("DimY is not the same as DimZ, "
               "changing dimZ to make them alike!\n");
        dimZ = dimY;
    }

    if (dimX * dimY * dimZ * numIters % 64 != 0) {
        printf("The production of DimX, DimY, DimZ and "
               "#iters must be multiples of 64!\n");
        return -1;
    }

    printf("DimX=%d, DimY=%d, DimZ=%d, #Loop_Iterations=%d, tile_size=%d\n",
           dimX, dimY, dimZ, numIters, tile_size);

    REAL *out = NULL;

    // 64 bytes aligned allocation.
    posix_memalign((void **) &out, 64,
                   dimX * dimY * dimZ * numIters * sizeof(REAL) * 2);

    if (out == NULL) {
        printf("Memory allocation failed!\n");
        return -1;
    }

    double iterTimes[NUM_TESTS_ITERS];
    double mintime = 1e6;

    // Notice how the out1 and out2 pointers are calculated.
    REAL *out1 = out;
    REAL *out2 = out + dimX * dimY * dimZ * numIters;

    //--------------------------------------------------------------

    // Initialize hStreams with the given StreamsPerDomain and
    // over-subscription level.
    CHECK_HSTR_RESULT(hStreams_app_init(streams_per_domain, oversubscription));

    //--------------------------------------------------------------

    //--------------------------------------------------------------
    // Saving buffer length in a temporary variable to save computation.
    long long buffer_len = dimX * dimY * dimZ * numIters
                           * sizeof(REAL) * 2;

    // Set up buffers.
    // Wrap out with a buffer.
    hStreams_app_create_buf((void *) out, buffer_len);

    register long int buff_length = dimY * dimZ * numIters * sizeof(REAL);

    //!!a Creating two dimX buffer addresses pointing inside out buffer.
    // out1_addr[i] points to the position of out1 buffer where
    // iteration i of the outer loop of the compute kernel writes.
    // out2_addr[i] points to the position of out2 buffer where
    // iteration i of the outer loop of the compute kernel writes.
    REAL *out1_addr[dimX];
    REAL *out2_addr[dimX];

    // Separation between out1 and out2.
    long int out_length = dimX * dimY * dimZ * numIters;

    for (int i = 0; i < dimX; i++) {
        //%% EXERCISE: go back to compute kernel and find out what is the value for
        //   out1_addr[i] and out2_addr[i].
        out1_addr[i] = out + i * dimY * dimZ * numIters;
        out2_addr[i] = out1_addr[i] + out_length;
        //  You can also create buffer in this way instead of creating
        //   out buffer as a whole
        //  hStreams_app_create_buf((void *)out1_addr[i], buff_length);
        //  hStreams_app_create_buf((void *)out2_addr[i], buff_length);
    }

    // Event pointers to support asynchronous function calls.
    HSTR_EVENT eout1[numIters][dimX], eout2[numIters][dimX],
               eout3[numIters][dimX], eout4[numIters][dimX], eout5[numIters][dimX];

    //--------------------------------------------------------------

    for (int i = 0; i < NUM_TESTS_ITERS; i++) {
        iterTimes[i] = GetTime();
        //--------------------------------------------------------------
        // Call device side API.
        // Prepare to perform computation on the sink-side.
        // Outer loop.
        // The body of the original i loop is enqueued on the streams in a round
        // robin fashion.
        for (int ii = 0; ii < dimX; ii++) {


            int stream = ii % streams_per_domain;
            //--------------------------------------------------------------
            // Initialize data at the sink-side

            //!!b  Initializes intermediate data at sink, only
            // the portion  being worked on
            hStreams_app_memset(stream, out1_addr[ii], // source proxy address to write
                                0.5, buff_length, // number of bytes to send
                                &eout4[i][ii]); // completion event

            //%% EXERCISE: Initialize out2_addr[ii] buffer similarly on the sink-side.
            hStreams_app_memset(stream, out2_addr[ii], // source proxy address to write
                                0.5, buff_length, // number of bytes to send
                                &eout5[i][ii]); // completion event


            uint64_t args[8];

            // Pack scalar arguments first, then heap args.
            //%% EXERCISE: Setup the heap args properly.
            args[0] = (uint64_t)(ii);
            args[1] = (uint64_t)(dimX);
            args[2] = (uint64_t)(dimY);
            args[3] = (uint64_t)(dimZ);
            args[4] = (uint64_t)(numIters);
            args[5] = (uint64_t)(tile_size);
            args[6] = (uint64_t)(out1_addr[ii]);
            args[7] = (uint64_t)(out2_addr[ii]);
            //--------------------------------------------------------------

            hStreams_app_invoke(stream, // same idea
                                "compute", // remote function name
                                6, // scalar arg
                                2, // heap args
                                args, // array of args
                                &eout1[i][ii], NULL, // return variable
                                0);

            //-----------------------------------------------------------------
            // Collect result.
            hStreams_app_xfer_memory(stream, out1_addr[ii], // source proxy address to write
                                     out1_addr[ii], // source proxy address to read
                                     buff_length, // number of bytes to send
                                     HSTR_SINK_TO_SRC, // transfer direction
                                     &eout2[i][ii]); // completion event

            //--------------------------------------------------------------
            //!!c Transfer output data from sink to source
            //%% EXERCISE: Transfer data back for out2_addr[ii].
            hStreams_app_xfer_memory(stream, out2_addr[ii], // source proxy address to write
                                     out2_addr[ii], // source proxy address to read
                                     buff_length, // number of bytes to send
                                     HSTR_SINK_TO_SRC, // transfer direction
                                     &eout3[i][ii]); // completion event
        }
        //--------------------------------------------------------------

        // Synchronize.
        CHECK_HSTR_RESULT(hStreams_app_thread_sync());
        //--------------------------------------------------------------

        iterTimes[i] = GetTime() - iterTimes[i];

        double result = out1[numIters / 2] + out2[numIters / 2];
        printf("Test %d takes %.3lf ms with result %.3lf\n", i, iterTimes[i],
               result);
        if (iterTimes[i] < mintime) {
            mintime = iterTimes[i];
        }
    }
    printf("Test's min time is %.3lf ms\n", mintime);

    //--------------------------------------------------------------
    // Cleanup before exiting.
    CHECK_HSTR_RESULT(hStreams_app_fini());
    //--------------------------------------------------------------
    free(out);
    return 0;
}
// Code example #3
// 0
/*
 * Returns the value portion of a command-line option of the form
 * "<switch><letter><separator><value>", i.e. the text starting at offset 3.
 * Arguments too short to carry a value yield an empty string instead of a
 * pointer past the end of the argument.
 */
static const char *option_value(const char *arg)
{
    return strlen(arg) > 2 ? arg + 3 : "";
}

/*
 * Host-side driver for the tiled Cholesky example.
 *
 * Configures hStreams to load the sink-side library "cholesky_sink_1.so",
 * parses the command-line switches, generates the input matrix with
 * dpo_generate(), initializes hStreams and runs cholesky_tiled() on it.
 *
 * Switches (each introduced by SWITCH_CHAR, value starting at offset 3):
 *   m  matrix size (required, must be positive)
 *   t  number of tiles (must be positive and divide the matrix size)
 *   s  number of logical streams / partitions
 *   l  layout: "row" or "ROW" selects row-major, anything else column-major
 *   i  number of iterations (raised to 3 if smaller)
 *   v  verification flag
 *
 * Returns 0 after a completed run.
 */
int main(int argc, char **argv)
{
    HSTR_OPTIONS hstreams_options;
    hStreams_GetCurrentOptions(&hstreams_options, sizeof(HSTR_OPTIONS));

    hstreams_options.verbose = 0;
    char *libNames[200] = {NULL, NULL};

    //Library to be loaded for sink-side code
    libNames[0] = "cholesky_sink_1.so";
    hstreams_options.libNameCnt = 1;
    hstreams_options.libNames = libNames;
    hstreams_options.libFlags = NULL;

    int mat_size_m, num_tiles, niter, tile_size;
    niter = 5;
    num_tiles = 1;
    mat_size_m = 0; //must be an input
    bool layRow = true;

    //max_log_str defines the no. of physical partitions on the card
    int max_log_str = 5;

    int verify = 1;

    hStreams_SetOptions(&hstreams_options);
    for (int i = 1; i < argc; i++) {
        if (*argv[i] != SWITCH_CHAR) {
            continue;
        }
        // Bounds-checked view of the option's value text.
        const char *value = option_value(argv[i]);

        switch (*(argv[i] + 1)) {
        case 'm':
            mat_size_m = (int)atol(value);
            break;

        case 't':
            num_tiles = (int)atol(value);
            break;

        case 's':
            max_log_str = (int)atol(value);
            break;

        case 'l':
            if ((strcmp("row", value) == 0) ||
                    (strcmp("ROW", value) == 0)) {
                layRow = true;
                printf("matrix is in Row major format\n");
            } else {
                layRow = false;
                printf("matrix is in Col major format\n");
            }
            break;

        case 'i':
            niter = (int)atol(value);
            if (niter < 3) {
                niter = 3;
            }
            break;

        case 'v':
            verify = (int)atol(value);
            break;

        default:
            break;
        }
    }
    dtimeInit();

    printf("no. of streams (partitions) = %d, mat_size = %d, num_tiles = %d,"
           " niter = %d\n\n", max_log_str, mat_size_m, num_tiles, niter);

    // Reject a non-positive tile count before it is used as a divisor.
    if (num_tiles <= 0) {
        printf("num_tiles MUST be positive.. aborting\n");
        exit(EXIT_FAILURE);
    }

    //Check that mat_size is divisible by num_tiles
    // NOTE(review): the two pre-existing error paths below exit with status 0;
    // kept as-is for compatibility, but a non-zero status may be intended.
    if (mat_size_m % num_tiles != 0) {
        printf("matrix size MUST be divisible by num_tiles.. aborting\n");
        exit(0);
    }

    // A missing (0) or negative matrix size cannot be factorized.
    if (mat_size_m <= 0) {
        printf("mat_size_m is not defined\n");
        exit(0);
    }

    tile_size = mat_size_m / num_tiles;

    //Generate a symmetric positive-definite matrix.
    // The pointer returned by dpo_generate is the one used and later freed;
    // no separate allocation is made here, since it would be overwritten
    // (and leaked) by this assignment.
    double *A = dpo_generate(mat_size_m);
    if (A == NULL) {
        printf("matrix generation failed.. aborting\n");
        exit(EXIT_FAILURE);
    }

    //No. of PlacesPerDomain is same as no. of logical streams since LogStreamsPerPlace is 1.
    uint32_t PlacesPerDomain = max_log_str;
    uint32_t LogStreamsPerPlace = 1;
    // NOTE(review): the results of hStreams_app_init / hStreams_app_fini are
    // not checked here -- consider verifying them.
    hStreams_app_init(PlacesPerDomain, LogStreamsPerPlace);

    //Calling the tiled Cholesky function. This does the factorization of the full matrix using a tiled implementation.
    cholesky_tiled(A, tile_size, num_tiles, mat_size_m, niter,
                   max_log_str, layRow, verify);

    hStreams_app_fini();

    free(A);

    return 0;
}