// Aborts the process with exit(-1) when a CUDA runtime call made on behalf of
// sobel1 did not return cudaSuccess. `what` names the failing step.
static void sobel1Check(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "sobel1: %s failed: %s\n", what, cudaGetErrorString(status));
        exit(-1);
    }
}

/*
 * Host-side driver for the d_sobel1 kernel.
 *
 * Copies h_result and h_pic to the device, launches d_sobel1 on a 2D grid of
 * BLOCKSIZE x BLOCKSIZE thread blocks, then copies both buffers back into
 * h_result and h_pic.
 *
 * Parameters:
 *   h_result - host buffer of xsize * ysize * 3 ints (copied in and out)
 *   h_pic    - host buffer of xsize * ysize unsigned ints (copied in and out)
 *   xsize    - image extent passed through to the kernel
 *   ysize    - image extent passed through to the kernel
 *   thresh   - threshold value passed through to the kernel
 *
 * Any CUDA failure terminates the process with exit(-1), matching the
 * allocation-failure behaviour of the original code.
 */
void sobel1(int *h_result, unsigned int *h_pic, int xsize, int ysize, int thresh)
{
    int *d_result = NULL;
    unsigned int *d_pic = NULL;

    // Compute byte counts in size_t so large images do not overflow int.
    const size_t pixelCount = (size_t)xsize * (size_t)ysize;
    const size_t resultSize = pixelCount * 3 * sizeof(int);
    const size_t picSize = pixelCount * sizeof(unsigned int);

    // Check the return code rather than the (previously uninitialised) pointer.
    sobel1Check(cudaMalloc((void **)&d_result, resultSize), "cudaMalloc(d_result)");
    sobel1Check(cudaMalloc((void **)&d_pic, picSize), "cudaMalloc(d_pic)");

    sobel1Check(cudaMemcpy(d_result, h_result, resultSize, cudaMemcpyHostToDevice),
                "cudaMemcpy(d_result <- h_result)");
    sobel1Check(cudaMemcpy(d_pic, h_pic, picSize, cudaMemcpyHostToDevice),
                "cudaMemcpy(d_pic <- h_pic)");

    // Integer ceil-division: exact for every size, unlike float ceil().
    // NOTE(review): grid.x is derived from ysize and grid.y from xsize, exactly
    // as in the original; confirm this matches the indexing inside d_sobel1.
    dim3 threadsPerBlock(BLOCKSIZE, BLOCKSIZE);
    dim3 numBlocks((ysize + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (xsize + threadsPerBlock.y - 1) / threadsPerBlock.y);

    cudaEvent_t start, stop;
    sobel1Check(cudaEventCreate(&start), "cudaEventCreate(start)");
    sobel1Check(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // The events must actually be recorded around the launch; otherwise the
    // synchronize/elapsed-time calls below have nothing to measure.
    sobel1Check(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    { __set_CUDAConfig(numBlocks, threadsPerBlock ); d_sobel1 (d_result, d_pic, xsize, ysize, thresh);}
    sobel1Check(cudaGetLastError(), "d_sobel1 launch");
    sobel1Check(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    sobel1Check(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    // The elapsed time is measured but, as in the original, not reported.
    float elapsedTime = 0.0f;
    sobel1Check(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    (void)elapsedTime;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    sobel1Check(cudaMemcpy(h_result, d_result, resultSize, cudaMemcpyDeviceToHost),
                "cudaMemcpy(h_result <- d_result)");
    sobel1Check(cudaMemcpy(h_pic, d_pic, picSize, cudaMemcpyDeviceToHost),
                "cudaMemcpy(h_pic <- d_pic)");

    cudaFree(d_result);
    cudaFree(d_pic);
}
/*
 * Fills two device arrays of SIZE unsigned ints from host-generated patterns,
 * runs device_global on them with BLOCKS blocks of SIZE/BLOCKS threads, then
 * copies the first array back and prints it, SIZE/BLOCKS values per row.
 *
 * Pattern A: element i is i when i is even, otherwise 0.
 * Pattern B: element i is i when i is divisible by both 2 and 3, otherwise 0.
 *
 * Exit codes: 0 success; 1 host allocation failed; 2 / 3 allocation of the
 * first / second device array failed; 4 any later CUDA call or the launch failed.
 */
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    // cudaMemcpy takes a size in BYTES; passing the bare element count (as the
    // original did) transfers only a fraction of each array.
    const size_t bytes = SIZE * sizeof(unsigned int);

    unsigned int *host_array = (unsigned int*) malloc(bytes);
    if (host_array == 0) {
        return 1;
    }

    unsigned int *device_array_a = 0;
    if (cudaMalloc((void **) &device_array_a, bytes) != cudaSuccess || device_array_a == 0) {
        free(host_array);
        return 2;
    }

    unsigned int *device_array_b = 0;
    if (cudaMalloc((void **) &device_array_b, bytes) != cudaSuccess || device_array_b == 0) {
        cudaFree(device_array_a);
        free(host_array);
        return 3;
    }

    // Pattern A: keep even indices, zero the rest.
    for (int i = 0; i < SIZE; i++) {
        host_array[i] = (i % 2 == 0) ? i : 0;
    }
    cudaError_t err = cudaMemcpy(device_array_a, host_array, bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        // Pattern B: keep indices divisible by both 2 and 3, zero the rest.
        for (int i = 0; i < SIZE; i++) {
            host_array[i] = (i % 2 == 0 && i % 3 == 0) ? i : 0;
        }
        err = cudaMemcpy(device_array_b, host_array, bytes, cudaMemcpyHostToDevice);
    }

    if (err == cudaSuccess) {
        { __set_CUDAConfig(BLOCKS, (SIZE/BLOCKS)); device_global(device_array_a, device_array_b, SIZE);}
        // Launch-configuration errors are only visible through cudaGetLastError().
        err = cudaGetLastError();
    }

    if (err == cudaSuccess) {
        err = cudaMemcpy(host_array, device_array_a, bytes, cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess) {
        const int perRow = SIZE / BLOCKS;
        // perRow > 0 prevents a non-terminating loop; the i + j bound prevents
        // reading past the array when SIZE is not a multiple of BLOCKS.
        for (int i = 0; perRow > 0 && i < SIZE; i += perRow) {
            for (int j = 0; j < perRow && i + j < SIZE; j++) {
                printf("%u, ", host_array[i + j]);
            }
            printf("\n");
        }
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    free(host_array);
    cudaFree(device_array_a);
    cudaFree(device_array_b);

    return (err == cudaSuccess) ? 0 : 4;
}
/*
 * Uploads N zero-initialised ints to the device, runs iwarp on them with a
 * single block of N threads, copies the buffer back and prints the N values
 * on one line separated by spaces.
 *
 * Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
 */
int main()
{
    const size_t bytes = N * sizeof(int);

    int *din = NULL;
    cudaError_t err = cudaMalloc((void**)&din, bytes);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    int in[N];
    for (int i = 0; i < N; i++) {
        in[i] = 0;
    }

    // Pass the array itself (decays to int*) rather than its address.
    err = cudaMemcpy(din, in, bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        { __set_CUDAConfig(1, N); iwarp(din);}
        // Launch-configuration errors are only visible through cudaGetLastError().
        err = cudaGetLastError();
    }

    if (err == cudaSuccess) {
        int output[N];
        // This blocking copy also surfaces any error raised while the kernel ran.
        err = cudaMemcpy(output, din, bytes, cudaMemcpyDeviceToHost);
        if (err == cudaSuccess) {
            for (int i = 0; i < N; i++) {
                printf("%d ", output[i]);
            }
            printf("\n");
        }
    }

    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    // The original never released the device buffer.
    cudaFree(din);

    return (err == cudaSuccess) ? 0 : 1;
}
/*
 * Integer matrix-multiply benchmark driver.
 *
 * Builds A (32 x 32) and B (64 wide, 32 high) filled with a constant, runs
 * matrixMulCUDA once as a warm-up, times 300 further launches with CUDA
 * events, prints the throughput, then copies C back and checks every element
 * against the expected constant.
 *
 * Returns EXIT_SUCCESS when the check passes, EXIT_FAILURE when it does not;
 * any allocation or CUDA runtime failure terminates with exit(EXIT_FAILURE).
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    int block_size = 32;

    dim3 dimsA(block_size, block_size, 1);
    dim3 dimsB(2 * block_size, block_size, 1);
    dim3 dimsC(dimsB.x, dimsA.y, 1);

    // Fill values for the inputs; the reference result below is derived from
    // these, so the check stays consistent with the initialisation.
    const int valA = 1;
    const int valB = 1;

    // ---- Host buffers -------------------------------------------------------
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(int) * size_A;
    int *h_A = (int *)malloc(mem_size_A);

    if (h_A == NULL)
    {
        fprintf(stderr, "Failed to allocate host matrix A!\n");
        exit(EXIT_FAILURE);
    }

    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(int) * size_B;
    int *h_B = (int *)malloc(mem_size_B);

    if (h_B == NULL)
    {
        fprintf(stderr, "Failed to allocate host matrix B!\n");
        exit(EXIT_FAILURE);
    }

    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(int);
    int *h_C = (int *) malloc(mem_size_C);

    if (h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }

    constantInit(h_A, size_A, valA);
    constantInit(h_B, size_B, valB);

    // ---- Device buffers -----------------------------------------------------
    int *d_A, *d_B, *d_C;
    cudaError_t error;

    error = cudaMalloc((void **) &d_A, mem_size_A);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_B, mem_size_B);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_C, mem_size_C);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_A,h_A) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_B,h_B) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // ---- Warm-up launch -----------------------------------------------------
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

    printf("Computing result using CUDA Kernel...\n");

    if (block_size == 16)
    {
        { __set_CUDAConfig(grid, threads ); matrixMulCUDA<16>(d_C, d_A, d_B, dimsA.x, dimsB.x);}
    }
    else
    {
        { __set_CUDAConfig(grid, threads ); matrixMulCUDA<32>(d_C, d_A, d_B, dimsA.x, dimsB.x);}
    }

    // A bad launch configuration is only reported through cudaGetLastError().
    error = cudaGetLastError();

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch matrixMulCUDA (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Wait for the warm-up to finish before announcing completion, so that
    // errors raised while the kernel ran are not silently ignored.
    error = cudaDeviceSynchronize();

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to synchronize after warm-up (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    printf("done\n");

    // ---- Timed launches -----------------------------------------------------
    cudaEvent_t start;
    error = cudaEventCreate(&start);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    cudaEvent_t stop;
    error = cudaEventCreate(&stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    error = cudaEventRecord(start, NULL);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    int nIter = 300;

    for (int j = 0; j < nIter; j++)
    {
        if (block_size == 16)
        {
            { __set_CUDAConfig(grid, threads ); matrixMulCUDA<16>(d_C, d_A, d_B, dimsA.x, dimsB.x);}
        }
        else
        {
            { __set_CUDAConfig(grid, threads ); matrixMulCUDA<32>(d_C, d_A, d_B, dimsA.x, dimsB.x);}
        }
    }

    error = cudaGetLastError();

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch matrixMulCUDA (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    error = cudaEventRecord(stop, NULL);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    error = cudaEventSynchronize(stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    float msecTotal = 0.0f;
    error = cudaEventElapsedTime(&msecTotal, start, stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // ---- Throughput report --------------------------------------------------
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf(
        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
        gigaFlops,
        msecPerMatrixMul,
        flopsPerMatrixMul,
        threads.x * threads.y);

    // ---- Verification -------------------------------------------------------
    error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (h_C,d_C) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    printf("Checking computed result for correctness: ");
    bool correct = true;

    // NOTE(review): assumes matrixMulCUDA computes C = A x B with an inner
    // dimension of dimsA.x, so every element equals dimsA.x * valA * valB.
    // The previous reference (dimsA.x * 0.01) did not match the inputs, which
    // are filled with 1 -- confirm against the kernel.
    const double ref = (double)dimsA.x * (double)valA * (double)valB;
    double eps = 1.e-6 ;

    for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++)
    {
        // h_C holds ints: convert explicitly before floating-point math and
        // before handing the value to a %f conversion.
        const double value = (double)h_C[i];
        double abs_err = fabs(value - ref);
        double dot_length = dimsA.x;
        double abs_val = fabs(value);
        double rel_err = abs_err/abs_val/dot_length ;

        if (rel_err > eps)
        {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, value, ref, eps);
            correct = false;
        }
    }

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // ---- Cleanup ------------------------------------------------------------
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    printf("\nNote: For peak performance, please refer to the matrixMulCUBLAS example.\n");

    cudaDeviceReset();

    if (correct)
    {
        return EXIT_SUCCESS;
    }
    else
    {
        return EXIT_FAILURE;
    }
}