/* R wrapper for the CUDA driver call cuMemsetD32Async.
 *
 * Fills N 32-bit words starting at the device address carried by
 * r_dstDevice with the value carried by r_ui, queued asynchronously on the
 * stream wrapped by r_hStream.  The CUresult status is converted to the
 * corresponding R enum value and returned.
 */
SEXP R_auto_cuMemsetD32Async(SEXP r_dstDevice, SEXP r_ui, SEXP r_N, SEXP r_hStream)
{
    /* Unpack the R arguments: numeric scalars plus an external reference
     * holding the stream handle. */
    CUdeviceptr devPtr = REAL(r_dstDevice)[0];
    unsigned int value = REAL(r_ui)[0];
    size_t numWords = REAL(r_N)[0];
    CUstream stream = (CUstream) getRReference(r_hStream);

    /* Issue the asynchronous memset and hand the status code back to R. */
    CUresult status = cuMemsetD32Async(devPtr, value, numWords, stream);
    return Renum_convert_CUresult(status);
}
/*
// Feature map reduction in GPU
// In each cell we reduce dimension of the feature vector
// according to original paper special procedure
//
// API
//int PCAFeatureMapsGPUStream(const int numStep, const int bx, const int by,
//        CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMap **feature_maps,
//        CUstream *streams)
// INPUT
// numStep       - number of scale steps (one CUDA stream per step)
// bx, by        - border size added on each side of the map
// devs_map_in   - device-side input feature maps, one per step
// streams       - CUDA streams, one per step
// OUTPUT
// feature_maps  - host-side reduced feature maps, allocated here
// RESULT
// Error status
*/
int PCAFeatureMapsGPUStream(const int numStep, const int bx, const int by,
        CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMap **feature_maps,
        CUstream *streams)
{
    int sizeX, sizeY, pp;
    int size_map_pca;
    int i;
    CUresult res;
    CvLSVMFeatureMapGPU **devs_map_pca;

    pp = NUM_SECTOR * 3 + 4;

    devs_map_pca = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));

    // allocate host output maps and device scratch maps (input size + border)
    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_in[i]->sizeX + 2 * bx;
        sizeY = devs_map_in[i]->sizeY + 2 * by;
        allocFeatureMapObject(&feature_maps[i], sizeX, sizeY, pp);
        allocFeatureMapObjectGPU<float>(&devs_map_pca[i], sizeX, sizeY, pp);
    }

    // execute async: zero the output buffer, then run the PCA reduction
    // kernel, each step on its own stream
    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_pca[i]->sizeX;
        sizeY = devs_map_pca[i]->sizeY;
        size_map_pca = sizeX * sizeY * pp;

        // initialize device memory value of 0
        res = cuMemsetD32Async(devs_map_pca[i]->map, 0, size_map_pca,
                streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_map_pca)");

        // launch kernel
        PCAFeatureMapsAddNullableBorderGPULaunch(devs_map_in[i],
                devs_map_pca[i], bx, by, streams[i]);
    }

    // copy results from device to host, still asynchronously
    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_pca[i]->sizeX;
        sizeY = devs_map_pca[i]->sizeY;
        size_map_pca = sizeX * sizeY * pp;

        res = cuMemcpyDtoHAsync(feature_maps[i]->map, devs_map_pca[i]->map,
                sizeof(float) * size_map_pca, streams[i]);
        CUDA_CHECK(res, "cuMemcpyDtoH(dev_map_pca)");
    }

    // BUG FIX: wait for the queued memsets/kernels/copies to complete
    // before freeing the device buffers they use and before the caller
    // reads feature_maps (mirrors normalizeAndTruncateGPUStream)
    for (i = 0; i < numStep; i++)
    {
        cuStreamSynchronize(streams[i]);
    }

    // free device memory
    for (i = 0; i < numStep; i++)
    {
        freeFeatureMapObjectGPU(&devs_map_pca[i]);
    }
    free(devs_map_pca);

    return LATENT_SVM_OK;
}
/*
// Feature map Normalization and Truncation in GPU
//
// API
//int normalizeAndTruncateGPUStream(const int numStep, const float alfa,
//        CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMapGPU **devs_map_out,
//        CUstream *streams)
// INPUT
// numStep      - number of scale steps (one CUDA stream per step)
// alfa         - truncation threshold
// devs_map_in  - device-side input feature maps, one per step
// streams      - CUDA streams, one per step
// OUTPUT
// devs_map_out - device-side normalized/truncated feature maps
// RESULT
// Error status
*/
int normalizeAndTruncateGPUStream(const int numStep, const float alfa,
        CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMapGPU **devs_map_out,
        CUstream *streams)
{
    const int featLen = NUM_SECTOR * 12;
    CvLSVMFeatureMapGPU **devs_norm;
    CUresult res;
    int step;

    devs_norm = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));

    // allocate one device buffer per step to hold the per-cell norms
    for (step = 0; step < numStep; step++)
    {
        int width = devs_map_in[step]->sizeX;
        int height = devs_map_in[step]->sizeY;
        allocFeatureMapObjectGPU<float>(&devs_norm[step], width, height, 1);
    }

    // asynchronously zero both work buffers and compute the norms,
    // each step on its own stream
    for (step = 0; step < numStep; step++)
    {
        int width = devs_map_in[step]->sizeX;
        int height = devs_map_in[step]->sizeY;
        int normCount = width * height;
        int outCount = (width - 2) * (height - 2) * featLen;

        res = cuMemsetD32Async(devs_norm[step]->map, 0, normCount,
                streams[step]);
        CUDA_CHECK(res, "cuMemset(dev_norm)");
        res = cuMemsetD32Async(devs_map_out[step]->map, 0, outCount,
                streams[step]);
        CUDA_CHECK(res, "cuMemset(dev_map_out)");

        calculateNormGPULaunch(devs_map_in[step], devs_norm[step],
                streams[step]);
    }

    // normalize and truncate using the norms computed above
    // (stream ordering makes each launch wait for its own norm kernel)
    for (step = 0; step < numStep; step++)
    {
        normalizeGPULaunch(alfa, devs_map_in[step], devs_norm[step],
                devs_map_out[step], streams[step]);
    }

    // drain every stream before releasing the norm buffers they read
    for (step = 0; step < numStep; step++)
    {
        cuStreamSynchronize(streams[step]);
    }

    // free device memory
    for (step = 0; step < numStep; step++)
    {
        freeFeatureMapObjectGPU(&devs_norm[step]);
    }
    free(devs_norm);

    return LATENT_SVM_OK;
}
/*
// Getting feature map for the selected subimage in GPU
//
// API
//int getFeatureMapsGPUStream(const int numStep, const int k,
//        CvLSVMFeatureMapGPU **devs_img, CvLSVMFeatureMapGPU **devs_map,
//        CUstream *streams)
// INPUT
// numStep   - number of scale steps (one CUDA stream per step)
// k         - cell size in pixels
// devs_img  - device-side input images, one per step
// streams   - CUDA streams, one per step
// OUTPUT
// devs_map  - device-side output feature maps
// RESULT
// Error status
*/
int getFeatureMapsGPUStream(const int numStep, const int k,
        CvLSVMFeatureMapGPU **devs_img, CvLSVMFeatureMapGPU **devs_map,
        CUstream *streams)
{
    int sizeX, sizeY;
    int p, px;
    int height, width;
    int i, j;
    int *nearest;
    float *w, a_x, b_x;
    int size_r, size_alfa, size_nearest, size_w, size_map;
    CUresult res;
    CvLSVMFeatureMapGPU **devs_r, **devs_alfa;
    CUdeviceptr dev_nearest, dev_w;

    px = 3 * NUM_SECTOR;
    p = px;
    size_nearest = k;
    size_w = k * 2;

    devs_r = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * numStep);
    devs_alfa = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * numStep);

    nearest = (int *) malloc(sizeof(int) * size_nearest);
    w = (float *) malloc(sizeof(float) * size_w);

    // initialize "nearest" and "w": per-pixel cell-interpolation tables
    // (note: k / 2 is integer division, intentional for even cell sizes)
    for (i = 0; i < k / 2; i++)
    {
        nearest[i] = -1;
    }/*for(i = 0; i < k / 2; i++)*/
    for (i = k / 2; i < k; i++)
    {
        nearest[i] = 1;
    }/*for(i = k / 2; i < k; i++)*/

    for (j = 0; j < k / 2; j++)
    {
        b_x = k / 2 + j + 0.5f;
        a_x = k / 2 - j - 0.5f;
        w[j * 2] = 1.0f / a_x * ((a_x * b_x) / (a_x + b_x));
        w[j * 2 + 1] = 1.0f / b_x * ((a_x * b_x) / (a_x + b_x));
    }/*for(j = 0; j < k / 2; j++)*/
    for (j = k / 2; j < k; j++)
    {
        a_x = j - k / 2 + 0.5f;
        b_x = -j + k / 2 - 0.5f + k;
        w[j * 2] = 1.0f / a_x * ((a_x * b_x) / (a_x + b_x));
        w[j * 2 + 1] = 1.0f / b_x * ((a_x * b_x) / (a_x + b_x));
    }/*for(j = k / 2; j < k; j++)*/

    res = cuMemAlloc(&dev_nearest, sizeof(int) * size_nearest);
    CUDA_CHECK(res, "cuMemAlloc(dev_nearest)");
    res = cuMemAlloc(&dev_w, sizeof(float) * size_w);
    CUDA_CHECK(res, "cuMemAlloc(dev_w)");

    // BUG FIX: the results of these async copies were previously ignored
    res = cuMemcpyHtoDAsync(dev_nearest, nearest, sizeof(int) * size_nearest,
            streams[numStep - 1]);
    CUDA_CHECK(res, "cuMemcpyHtoDAsync(dev_nearest)");
    res = cuMemcpyHtoDAsync(dev_w, w, sizeof(float) * size_w,
            streams[numStep - 1]);
    CUDA_CHECK(res, "cuMemcpyHtoDAsync(dev_w)");

    // allocate device memory
    for (i = 0; i < numStep; i++)
    {
        width = devs_img[i]->sizeX;
        height = devs_img[i]->sizeY;
        allocFeatureMapObjectGPU<float>(&devs_r[i], width, height, 1);
        allocFeatureMapObjectGPU<int>(&devs_alfa[i], width, height, 2);
    }

    // execute async: zero the work buffers, then compute gradient
    // histograms, each step on its own stream
    for (i = 0; i < numStep; i++)
    {
        width = devs_img[i]->sizeX;
        height = devs_img[i]->sizeY;
        sizeX = width / k;
        sizeY = height / k;
        size_map = sizeX * sizeY * p;
        size_r = width * height;
        size_alfa = width * height * 2;

        // initialize device memory value of 0
        res = cuMemsetD32Async(devs_map[i]->map, 0, size_map, streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_map)");
        res = cuMemsetD32Async(devs_r[i]->map, 0, size_r, streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_r)");
        res = cuMemsetD32Async(devs_alfa[i]->map, 0, size_alfa, streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_alfa)");

        // launch kernel
        calculateHistogramGPULaunch(k, devs_img[i], devs_r[i], devs_alfa[i],
                streams[i]);
    }

    // BUG FIX: the interpolation tables were uploaded on
    // streams[numStep - 1] only, but the kernels below consume them on
    // every stream - wait for the upload before launching them
    cuStreamSynchronize(streams[numStep - 1]);

    for (i = 0; i < numStep; i++)
    {
        getFeatureMapsGPULaunch(k, devs_r[i], devs_alfa[i], &dev_nearest,
                &dev_w, devs_map[i], streams[i]);
    }

    // BUG FIX: drain every stream before freeing buffers that the queued
    // kernels still read/write (matches normalizeAndTruncateGPUStream)
    for (i = 0; i < numStep; i++)
    {
        cuStreamSynchronize(streams[i]);
    }

    // free device memory
    res = cuMemFree(dev_nearest);
    CUDA_CHECK(res, "cuMemFree(dev_nearest)");
    res = cuMemFree(dev_w);
    CUDA_CHECK(res, "cuMemFree(dev_w)");
    for (i = 0; i < numStep; i++)
    {
        freeFeatureMapObjectGPU(&devs_r[i]);
        freeFeatureMapObjectGPU(&devs_alfa[i]);
    }
    free(nearest);
    free(w);
    free(devs_r);
    free(devs_alfa);

    return LATENT_SVM_OK;
}