Example #1
SEXP R_auto_cuMemsetD32Async(SEXP r_dstDevice, SEXP r_ui, SEXP r_N, SEXP r_hStream)
{
    SEXP r_ans = R_NilValue;
    // unmarshal the R arguments into CUDA driver API types
    CUdeviceptr dstDevice = (CUdeviceptr) REAL(r_dstDevice)[0];
    unsigned int ui = (unsigned int) REAL(r_ui)[0];
    size_t N = (size_t) REAL(r_N)[0];
    CUstream hStream = (CUstream) getRReference(r_hStream);

    CUresult ans;
    ans = cuMemsetD32Async(dstDevice, ui, N, hStream);

    // convert the CUresult status code back into an R object
    r_ans = Renum_convert_CUresult(ans);

    return(r_ans);
}
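
For comparison, the wrapped driver call can be exercised directly from C. The following stand-alone sketch (buffer size, fill pattern and stream flags are illustrative assumptions, not taken from the binding above) fills a device buffer asynchronously and waits on the stream before freeing it:

#include <stdio.h>
#include <cuda.h>

/* Minimal sketch: fill a device buffer with a 32-bit pattern asynchronously,
   then wait on the stream before releasing the buffer. */
#define CHECK(call, msg)                                             \
    do {                                                             \
        CUresult res_ = (call);                                      \
        if (res_ != CUDA_SUCCESS) {                                  \
            fprintf(stderr, "%s failed: %d\n", (msg), (int)res_);    \
            return 1;                                                \
        }                                                            \
    } while (0)

int main(void)
{
    const size_t N = 1 << 20;   /* number of 32-bit elements (illustrative) */
    CUdevice dev;
    CUcontext ctx;
    CUstream stream;
    CUdeviceptr d_buf;

    CHECK(cuInit(0), "cuInit");
    CHECK(cuDeviceGet(&dev, 0), "cuDeviceGet");
    CHECK(cuCtxCreate(&ctx, 0, dev), "cuCtxCreate");
    CHECK(cuStreamCreate(&stream, CU_STREAM_DEFAULT), "cuStreamCreate");

    CHECK(cuMemAlloc(&d_buf, N * sizeof(unsigned int)), "cuMemAlloc");

    /* enqueue the fill on the stream; the call returns immediately */
    CHECK(cuMemsetD32Async(d_buf, 0xDEADBEEFu, N, stream), "cuMemsetD32Async");

    /* wait for the fill to complete before freeing the buffer */
    CHECK(cuStreamSynchronize(stream), "cuStreamSynchronize");

    CHECK(cuMemFree(d_buf), "cuMemFree");
    CHECK(cuStreamDestroy(stream), "cuStreamDestroy");
    CHECK(cuCtxDestroy(ctx), "cuCtxDestroy");
    return 0;
}
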
/*
// Feature map reduction on the GPU
// In each cell the dimensionality of the feature vector is reduced
// according to the procedure described in the original paper
//
// API
// int PCAFeatureMapsGPUStream(const int numStep, const int bx, const int by,
//         CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMap **feature_maps,
//         CUstream *streams)
// INPUT
// numStep       - number of feature maps (one per CUDA stream)
// bx            - border size along x
// by            - border size along y
// devs_map_in   - input feature maps in device memory
// streams       - CUDA streams used for the asynchronous operations
// OUTPUT
// feature_maps  - reduced feature maps copied back to host memory
// RESULT
// Error status
*/
int PCAFeatureMapsGPUStream(const int numStep, const int bx, const int by,
        CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMap **feature_maps,
        CUstream *streams)
{

    int sizeX, sizeY, pp;
    int size_map_pca;
    int i;
    CUresult res;
    CvLSVMFeatureMapGPU **devs_map_pca;

    pp = NUM_SECTOR * 3 + 4;

    devs_map_pca = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));

    // allocate memory
    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_in[i]->sizeX + 2 * bx;
        sizeY = devs_map_in[i]->sizeY + 2 * by;

        allocFeatureMapObject(&feature_maps[i], sizeX, sizeY, pp);
        allocFeatureMapObjectGPU<float>(&devs_map_pca[i], sizeX, sizeY, pp);
    }

    // execute the per-stream work asynchronously
    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_pca[i]->sizeX;
        sizeY = devs_map_pca[i]->sizeY;
        size_map_pca = sizeX * sizeY * pp;

        // initialize the device memory to 0
        res = cuMemsetD32Async(devs_map_pca[i]->map, 0, size_map_pca,
                streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_map_pca)");

        // launch kernel
        PCAFeatureMapsAddNullableBorderGPULaunch(devs_map_in[i],
                devs_map_pca[i], bx, by, streams[i]);
    }

    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_pca[i]->sizeX;
        sizeY = devs_map_pca[i]->sizeY;
        size_map_pca = sizeX * sizeY * pp;

        // copy memory from device to host
        res = cuMemcpyDtoHAsync(feature_maps[i]->map, devs_map_pca[i]->map,
                sizeof(float) * size_map_pca, streams[i]);
        CUDA_CHECK(res, "cuMemcpyDtoH(dev_map_pca)");
    }

    // free device memory
    for (i = 0; i < numStep; i++)
    {
        freeFeatureMapObjectGPU(&devs_map_pca[i]);
    }

    free(devs_map_pca);

    return LATENT_SVM_OK;
}
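
CUDA_CHECK above is a project-side helper whose definition does not appear in this listing. A plausible sketch of such a macro (the name CUDA_CHECK_SKETCH and the return value are assumptions, not the project's code) is:

#include <stdio.h>
#include <cuda.h>

/* Hypothetical sketch of an error-check macro in the spirit of CUDA_CHECK;
   the project's real macro may differ (for instance it could return
   LATENT_SVM_FAILED or jump to a cleanup label instead). */
#define CUDA_CHECK_SKETCH(res, msg)                                      \
    do {                                                                 \
        if ((res) != CUDA_SUCCESS) {                                     \
            fprintf(stderr, "CUDA error %d in %s\n", (int)(res), (msg)); \
            return -1;                                                   \
        }                                                                \
    } while (0)

/* example use: abort the calling function as soon as a driver call fails */
static int clear_buffer(CUdeviceptr buf, size_t n, CUstream stream)
{
    CUresult res = cuMemsetD32Async(buf, 0, n, stream);
    CUDA_CHECK_SKETCH(res, "cuMemsetD32Async");
    return 0;
}
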
/*
// Feature map normalization and truncation on the GPU
//
// API
// int normalizeAndTruncateGPUStream(const int numStep, const float alfa,
//         CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMapGPU **devs_map_out,
//         CUstream *streams)
// INPUT
// numStep       - number of feature maps (one per CUDA stream)
// alfa          - truncation threshold
// devs_map_in   - input feature maps in device memory
// streams       - CUDA streams used for the asynchronous operations
// OUTPUT
// devs_map_out  - normalized and truncated feature maps in device memory
// RESULT
// Error status
*/
int normalizeAndTruncateGPUStream(const int numStep, const float alfa,
        CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMapGPU **devs_map_out,
        CUstream *streams)
{

    int sizeX, sizeY, newSizeX, newSizeY, pp;
    int size_norm, size_map_out;
    int i;
    CUresult res;
    CvLSVMFeatureMapGPU **devs_norm;

    pp = NUM_SECTOR * 12;

    devs_norm = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));

    // allocate device memory
    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_in[i]->sizeX;
        sizeY = devs_map_in[i]->sizeY;

        allocFeatureMapObjectGPU<float>(&devs_norm[i], sizeX, sizeY, 1);
    }

    // execute the per-stream work asynchronously
    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_in[i]->sizeX;
        sizeY = devs_map_in[i]->sizeY;
        newSizeX = sizeX - 2;
        newSizeY = sizeY - 2;
        size_norm = sizeX * sizeY;
        size_map_out = newSizeX * newSizeY * pp;

        // initialize the device memory to 0
        res = cuMemsetD32Async(devs_norm[i]->map, 0, size_norm, streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_norm)");
        res = cuMemsetD32Async(devs_map_out[i]->map, 0, size_map_out,
                streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_map_out)");

        // launch kernel
        calculateNormGPULaunch(devs_map_in[i], devs_norm[i], streams[i]);

    }

    for (i = 0; i < numStep; i++)
    {
        // launch kernel
        normalizeGPULaunch(alfa, devs_map_in[i], devs_norm[i], devs_map_out[i],
                streams[i]);
    }

    // synchronize cuda stream
    for (i = 0; i < numStep; i++)
    {
        cuStreamSynchronize(streams[i]);
    }

    // free device memory
    for (i = 0; i < numStep; i++)
    {
        freeFeatureMapObjectGPU(&devs_norm[i]);
    }

    free(devs_norm);

    return LATENT_SVM_OK;
}
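
The structure of normalizeAndTruncateGPUStream (enqueue memsets and kernels on independent streams, then cuStreamSynchronize each one) is the usual driver-API pattern for overlapping per-level work. A self-contained sketch of just that pattern, with an assumed stream count and buffer size and the kernel launches elided, might look like this:

#include <cuda.h>

/* Sketch of the per-stream enqueue / synchronize pattern used above.
   The stream count and buffer size are illustrative assumptions. */
int main(void)
{
    enum { NUM_STREAMS = 4 };
    const size_t N = 1 << 16;               /* 32-bit elements per buffer */
    CUdevice dev;
    CUcontext ctx;
    CUstream streams[NUM_STREAMS];
    CUdeviceptr bufs[NUM_STREAMS];
    int i;

    if (cuInit(0) != CUDA_SUCCESS) return 1;
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);

    for (i = 0; i < NUM_STREAMS; i++)
    {
        cuStreamCreate(&streams[i], CU_STREAM_DEFAULT);
        cuMemAlloc(&bufs[i], N * sizeof(float));
    }

    /* enqueue independent work on each stream; the calls return immediately */
    for (i = 0; i < NUM_STREAMS; i++)
    {
        cuMemsetD32Async(bufs[i], 0, N, streams[i]);
        /* kernel launches and a cuMemcpyDtoHAsync would go here */
    }

    /* wait for every stream to drain before touching the results */
    for (i = 0; i < NUM_STREAMS; i++)
    {
        cuStreamSynchronize(streams[i]);
    }

    for (i = 0; i < NUM_STREAMS; i++)
    {
        cuMemFree(bufs[i]);
        cuStreamDestroy(streams[i]);
    }
    cuCtxDestroy(ctx);
    return 0;
}
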
/*
// Getting feature map for the selected subimage on the GPU
//
// API
// int getFeatureMapsGPUStream(const int numStep, const int k,
//         CvLSVMFeatureMapGPU **devs_img, CvLSVMFeatureMapGPU **devs_map,
//         CUstream *streams)
// INPUT
// numStep       - number of images (one per CUDA stream)
// k             - cell size in pixels
// devs_img      - input images in device memory
// streams       - CUDA streams used for the asynchronous operations
// OUTPUT
// devs_map      - computed feature maps in device memory
// RESULT
// Error status
*/
int getFeatureMapsGPUStream(const int numStep, const int k,
        CvLSVMFeatureMapGPU **devs_img, CvLSVMFeatureMapGPU **devs_map,
        CUstream *streams)
{
    int sizeX, sizeY;
    int p, px;
    int height, width;
    int i, j;

    int *nearest;
    float *w, a_x, b_x;

    int size_r, size_alfa, size_nearest, size_w, size_map;

    CUresult res;
    CvLSVMFeatureMapGPU **devs_r, **devs_alfa;
    CUdeviceptr dev_nearest, dev_w;

    px = 3 * NUM_SECTOR;
    p = px;

    size_nearest = k;
    size_w = k * 2;

    devs_r = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * numStep);
    devs_alfa = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * numStep);
    nearest = (int *) malloc(sizeof(int) * size_nearest);
    w = (float *) malloc(sizeof(float) * size_w);

    // initialize "nearest" and "w"
    for (i = 0; i < k / 2; i++)
    {
        nearest[i] = -1;
    }/*for(i = 0; i < k / 2; i++)*/
    for (i = k / 2; i < k; i++)
    {
        nearest[i] = 1;
    }/*for(i = k / 2; i < k; i++)*/

    for (j = 0; j < k / 2; j++)
    {
        b_x = k / 2 + j + 0.5f;
        a_x = k / 2 - j - 0.5f;
        w[j * 2] = 1.0f / a_x * ((a_x * b_x) / (a_x + b_x));
        w[j * 2 + 1] = 1.0f / b_x * ((a_x * b_x) / (a_x + b_x));
    }/*for(j = 0; j < k / 2; j++)*/
    for (j = k / 2; j < k; j++)
    {
        a_x = j - k / 2 + 0.5f;
        b_x = -j + k / 2 - 0.5f + k;
        w[j * 2] = 1.0f / a_x * ((a_x * b_x) / (a_x + b_x));
        w[j * 2 + 1] = 1.0f / b_x * ((a_x * b_x) / (a_x + b_x));
    }/*for(j = k / 2; j < k; j++)*/
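    /*
    // The nearest[] and w[] tables drive cell interpolation: nearest[i] is the
    // offset (-1 or +1) of the neighbouring cell that a pixel at in-cell
    // position i also contributes to, and w[2*j] / w[2*j + 1] are the
    // interpolation weights for the pixel's own cell and that neighbour
    // (each pair sums to 1). As a worked example, assuming k = 4:
    //   nearest = { -1, -1,  1,  1 }
    //   w       = { 0.625, 0.375,  0.875, 0.125,  0.875, 0.125,  0.625, 0.375 }
    */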

    res = cuMemAlloc(&dev_nearest, sizeof(int) * size_nearest);
    CUDA_CHECK(res, "cuMemAlloc(dev_nearest)");
    res = cuMemAlloc(&dev_w, sizeof(float) * size_w);
    CUDA_CHECK(res, "cuMemAlloc(dev_w)");

    res = cuMemcpyHtoDAsync(dev_nearest, nearest, sizeof(int) * size_nearest,
            streams[numStep - 1]);
    CUDA_CHECK(res, "cuMemcpyHtoD(dev_nearest)");
    res = cuMemcpyHtoDAsync(dev_w, w, sizeof(float) * size_w,
            streams[numStep - 1]);
    CUDA_CHECK(res, "cuMemcpyHtoD(dev_w)");
    // ensure the tables are resident before kernels on other streams read them
    res = cuStreamSynchronize(streams[numStep - 1]);
    CUDA_CHECK(res, "cuStreamSynchronize");

    // allocate device memory
    for (i = 0; i < numStep; i++)
    {
        width = devs_img[i]->sizeX;
        height = devs_img[i]->sizeY;

        allocFeatureMapObjectGPU<float>(&devs_r[i], width, height, 1);
        allocFeatureMapObjectGPU<int>(&devs_alfa[i], width, height, 2);
    }

    // execute the per-stream work asynchronously
    for (i = 0; i < numStep; i++)
    {
        // initialize "map", "r" and "alfa"
        width = devs_img[i]->sizeX;
        height = devs_img[i]->sizeY;
        sizeX = width / k;
        sizeY = height / k;
        size_map = sizeX * sizeY * p;
        size_r = width * height;
        size_alfa = width * height * 2;

        // initialize the device memory to 0
        res = cuMemsetD32Async(devs_map[i]->map, 0, size_map, streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_map)");
        res = cuMemsetD32Async(devs_r[i]->map, 0, size_r, streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_r)");
        res = cuMemsetD32Async(devs_alfa[i]->map, 0, size_alfa, streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_alfa)");

        // launch kernel
        calculateHistogramGPULaunch(k, devs_img[i], devs_r[i], devs_alfa[i],
                streams[i]);
    }

    for (i = 0; i < numStep; i++)
    {
        getFeatureMapsGPULaunch(k, devs_r[i], devs_alfa[i], &dev_nearest,
                &dev_w, devs_map[i], streams[i]);
    }

    // free device memory
    res = cuMemFree(dev_nearest);
    CUDA_CHECK(res, "cuMemFree(dev_nearest)");
    res = cuMemFree(dev_w);
    CUDA_CHECK(res, "cuMemFree(dev_w)");

    for (i = 0; i < numStep; i++)
    {
        freeFeatureMapObjectGPU(&devs_r[i]);
        freeFeatureMapObjectGPU(&devs_alfa[i]);
    }

    free(nearest);
    free(w);
    free(devs_r);
    free(devs_alfa);

    return LATENT_SVM_OK;
}