int cuda_test_fmmul(unsigned int n, char *path) { int i, j, idx; CUresult res; CUdevice dev; CUcontext ctx; CUfunction function; CUmodule module; CUdeviceptr a_dev, b_dev, c_dev; float *a = (float *) malloc (n*n * sizeof(float)); float *b = (float *) malloc (n*n * sizeof(float)); float *c = (float *) malloc (n*n * sizeof(float)); int block_x, block_y, grid_x, grid_y; int offset; char fname[256]; struct timeval tv; struct timeval tv_total_start, tv_total_end; float total; struct timeval tv_h2d_start, tv_h2d_end; float h2d; struct timeval tv_d2h_start, tv_d2h_end; float d2h; struct timeval tv_exec_start, tv_exec_end; float exec; /* initialize A[] & B[] */ for (i = 0; i < n; i++) { for(j = 0; j < n; j++) { idx = i * n + j; a[idx] = i + 0.1; b[idx] = i + 0.1; } } /* block_x * block_y should not exceed 512. */ block_x = n < 16 ? n : 16; block_y = n < 16 ? n : 16; grid_x = n / block_x; if (n % block_x != 0) grid_x++; grid_y = n / block_y; if (n % block_y != 0) grid_y++; printf("block = (%d, %d)\n", block_x, block_y); printf("grid = (%d, %d)\n", grid_x, grid_y); gettimeofday(&tv_total_start, NULL); res = cuInit(0); if (res != CUDA_SUCCESS) { printf("cuInit failed: res = %lu\n", (unsigned long)res); return -1; } res = cuDeviceGet(&dev, 3); if (res != CUDA_SUCCESS) { printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxCreate(&ctx, 0, dev); if (res != CUDA_SUCCESS) { printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res); return -1; } sprintf(fname, "%s/fmmul_gpu.cubin", path); res = cuModuleLoad(&module, fname); if (res != CUDA_SUCCESS) { printf("cuModuleLoad() failed\n"); return -1; } res = cuModuleGetFunction(&function, module, "_Z3mulPfS_S_i"); if (res != CUDA_SUCCESS) { printf("cuModuleGetFunction() failed\n"); return -1; } res = cuFuncSetSharedSize(function, 0x40); /* just random */ if (res != CUDA_SUCCESS) { printf("cuFuncSetSharedSize() failed\n"); return -1; } res = cuFuncSetBlockShape(function, block_x, block_y, 1); if (res 
!= CUDA_SUCCESS) { printf("cuFuncSetBlockShape() failed\n"); return -1; } /* a[] */ res = cuMemAlloc(&a_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (a) failed\n"); return -1; } /* b[] */ res = cuMemAlloc(&b_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (b) failed\n"); return -1; } /* c[] */ res = cuMemAlloc(&c_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (c) failed\n"); return -1; } gettimeofday(&tv_h2d_start, NULL); /* upload a[] and b[] */ res = cuMemcpyHtoD(a_dev, a, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemcpyHtoD(b_dev, b, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD (b) failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_h2d_end, NULL); /* set kernel parameters */ offset = 0; res = cuParamSetv(function, offset, &a_dev, sizeof(a_dev)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(a_dev); res = cuParamSetv(function, offset, &b_dev, sizeof(b_dev)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(b_dev); res = cuParamSetv(function, offset, &c_dev, sizeof(c_dev)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(c_dev); res = cuParamSetv(function, offset, &n, sizeof(n)); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } offset += sizeof(n); res = cuParamSetSize(function, offset); if (res != CUDA_SUCCESS) { printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_exec_start, NULL); /* launch the kernel */ res = cuLaunchGrid(function, grid_x, grid_y); if (res != CUDA_SUCCESS) { printf("cuLaunchGrid failed: res = %lu\n", 
(unsigned long)res); return -1; } cuCtxSynchronize(); gettimeofday(&tv_exec_end, NULL); gettimeofday(&tv_d2h_start, NULL); /* download c[] */ res = cuMemcpyDtoH(c, c_dev, n*n * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoH (c) failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_d2h_end, NULL); res = cuMemFree(a_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(b_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(c_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuModuleUnload(module); if (res != CUDA_SUCCESS) { printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxDestroy(ctx); if (res != CUDA_SUCCESS) { printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_total_end, NULL); /* check the results */ i = j = idx = 0; while (i < n) { while (j < n) { idx = i * n + j; if (c[idx] != a[idx] * b[idx]) { printf("c[%d] = %f\n", idx, c[idx]); printf("a[%d]+b[%d] = %f\n", idx, idx, a[idx]*b[idx]); return -1; } j++; } i++; } free(a); free(b); free(c); tvsub(&tv_h2d_end, &tv_h2d_start, &tv); h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_d2h_end, &tv_d2h_start, &tv); d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_exec_end, &tv_exec_start, &tv); exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_total_start, &tv); total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; printf("HtoD: %f\n", h2d); printf("DtoH: %f\n", d2h); printf("Exec: %f\n", exec); printf("Time (Memcpy + Launch): %f\n", h2d + d2h + exec); printf("Total: %f\n", total); return 0; }
// Frees the device allocation whose address is carried in `ptr`.
//
// The pointer value is never dereferenced on the host: it is reinterpreted as
// an integer, converted to a CUdeviceptr and handed to cuMemFree. The
// resulting CUresult is passed to cuda_check.
// NOTE(review): cuda_check is defined outside this block — presumably it
// reports or aborts on a non-success status; confirm before relying on it.
static void dispose(char *ptr) { cuda_check( cuMemFree(static_cast<CUdeviceptr>(reinterpret_cast<size_t>(ptr))) ); }
/*
 * JNI entry point for CUDAContext.cudaRun: loads the supplied fat binary,
 * uploads the four FixedMemory regions, launches the
 * "_Z5entryPcS_PiPxS1_S0_S0_i" kernel on a 1-D grid and copies the object
 * and exception regions back to the host.
 *
 * Parameters:
 *   device_index   - ordinal passed to cuDeviceGet.
 *   cubin_file     - Java byte[] holding the fat binary image.
 *   cubin_length   - number of bytes to read from cubin_file.
 *   block_shape_x  - threads per block (x); y and z are 1.
 *   grid_shape_x   - blocks in the grid (x); y is 1.
 *   num_threads    - passed to the kernel as its last (int) parameter.
 *   object_mem, handles_mem, exceptions_mem, class_mem
 *                  - org.trifort.rootbeer.runtime.FixedMemory instances;
 *                    their getAddress()/getSize() results are used as raw
 *                    host address and byte count.
 *
 * Kernel parameter order, as set below: info_space, object_mem, handles_mem,
 * heap_end, buffer_size, exceptions_mem, class_mem, num_threads.
 *
 * NOTE(review): CHECK_STATUS is defined outside this block; it is assumed to
 * report the failure to Java and return from this function. If so, any
 * buffer or device allocation still live at that point is not released —
 * confirm.
 */
JNIEXPORT void JNICALL Java_org_trifort_rootbeer_runtime_CUDAContext_cudaRun
  (JNIEnv *env, jobject this_ref, jint device_index, jbyteArray cubin_file,
   jint cubin_length, jint block_shape_x, jint grid_shape_x, jint num_threads,
   jobject object_mem, jobject handles_mem, jobject exceptions_mem,
   jobject class_mem)
{
  CUresult status;
  CUdevice device;
  CUcontext context;
  CUmodule module;
  CUfunction function;
  void * fatcubin;
  int offset;
  int info_space_size;

  CUdeviceptr gpu_info_space;
  CUdeviceptr gpu_object_mem;
  CUdeviceptr gpu_handles_mem;
  CUdeviceptr gpu_exceptions_mem;
  CUdeviceptr gpu_class_mem;
  CUdeviceptr gpu_heap_end;
  CUdeviceptr gpu_buffer_size;

  void * cpu_object_mem;
  void * cpu_handles_mem;
  void * cpu_exceptions_mem;
  void * cpu_class_mem;
  jlong cpu_object_mem_size;
  jlong cpu_handles_mem_size;
  jlong cpu_exceptions_mem_size;
  jlong cpu_class_mem_size;
  jlong cpu_heap_end;

  jclass cuda_memory_class;
  jmethodID get_address_method;
  jmethodID get_size_method;
  jmethodID get_heap_end_method;

  jlong * info_space;

  //----------------------------------------------------------------------------
  //init device and function
  //----------------------------------------------------------------------------
  status = cuDeviceGet(&device, device_index);
  CHECK_STATUS(env, "Error in cuDeviceGet", status, device)

  status = cuCtxCreate(&context, CU_CTX_MAP_HOST, device);
  CHECK_STATUS(env,"Error in cuCtxCreate", status, device)

  fatcubin = malloc(cubin_length);
  (*env)->GetByteArrayRegion(env, cubin_file, 0, cubin_length, fatcubin);

  status = cuModuleLoadFatBinary(&module, fatcubin);
  // release the staging copy before the status check so that a failed load
  // does not leave it allocated
  free(fatcubin);
  CHECK_STATUS(env, "Error in cuModuleLoad", status, device)

  status = cuModuleGetFunction(&function, module, "_Z5entryPcS_PiPxS1_S0_S0_i");
  CHECK_STATUS(env, "Error in cuModuleGetFunction", status, device)

  //----------------------------------------------------------------------------
  //get handles from java
  //----------------------------------------------------------------------------
  cuda_memory_class = (*env)->FindClass(env, "org/trifort/rootbeer/runtime/FixedMemory");
  get_address_method = (*env)->GetMethodID(env, cuda_memory_class, "getAddress", "()J");
  get_size_method = (*env)->GetMethodID(env, cuda_memory_class, "getSize", "()J");
  get_heap_end_method = (*env)->GetMethodID(env, cuda_memory_class, "getHeapEndPtr", "()J");

  cpu_object_mem = (void *) (*env)->CallLongMethod(env, object_mem, get_address_method);
  cpu_object_mem_size = (*env)->CallLongMethod(env, object_mem, get_size_method);
  cpu_heap_end = (*env)->CallLongMethod(env, object_mem, get_heap_end_method);

  cpu_handles_mem = (void *) (*env)->CallLongMethod(env, handles_mem, get_address_method);
  cpu_handles_mem_size = (*env)->CallLongMethod(env, handles_mem, get_size_method);

  cpu_exceptions_mem = (void *) (*env)->CallLongMethod(env, exceptions_mem, get_address_method);
  cpu_exceptions_mem_size = (*env)->CallLongMethod(env, exceptions_mem, get_size_method);

  cpu_class_mem = (void *) (*env)->CallLongMethod(env, class_mem, get_address_method);
  cpu_class_mem_size = (*env)->CallLongMethod(env, class_mem, get_size_method);

  // The whole info block is uploaded below, so it is zero-filled: only
  // slot 1 (heap end) is given a value on the host.
  info_space_size = 1024;
  info_space = (jlong *) calloc(1, info_space_size);
  info_space[1] = (*env)->CallLongMethod(env, object_mem, get_heap_end_method);

  //----------------------------------------------------------------------------
  //allocate mem
  //----------------------------------------------------------------------------
  status = cuMemAlloc(&gpu_info_space, info_space_size);
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_info_mem", status, device)

  status = cuMemAlloc(&gpu_object_mem, cpu_object_mem_size);
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_object_mem", status, device)

  status = cuMemAlloc(&gpu_handles_mem, cpu_handles_mem_size);
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_handles_mem", status, device)

  status = cuMemAlloc(&gpu_exceptions_mem, cpu_exceptions_mem_size);
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_exceptions_mem", status, device)

  status = cuMemAlloc(&gpu_class_mem, cpu_class_mem_size);
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_class_mem", status, device)

  // two 8-byte device cells holding the heap end and the object buffer size
  status = cuMemAlloc(&gpu_heap_end, 8);
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_heap_end", status, device)

  status = cuMemAlloc(&gpu_buffer_size, 8);
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_buffer_size", status, device)

  //----------------------------------------------------------------------------
  //set function parameters
  //----------------------------------------------------------------------------
  // seven device pointers followed by one int
  status = cuParamSetSize(function, (7 * sizeof(CUdeviceptr) + sizeof(int)));
  CHECK_STATUS(env, "Error in cuParamSetSize", status, device)

  offset = 0;
  status = cuParamSetv(function, offset, (void *) &gpu_info_space, sizeof(CUdeviceptr));
  CHECK_STATUS(env, "Error in cuParamSetv gpu_info_space", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(function, offset, (void *) &gpu_object_mem, sizeof(CUdeviceptr));
  CHECK_STATUS(env, "Error in cuParamSetv: gpu_object_mem", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(function, offset, (void *) &gpu_handles_mem, sizeof(CUdeviceptr));
  CHECK_STATUS(env, "Error in cuParamSetv: gpu_handles_mem", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(function, offset, (void *) &gpu_heap_end, sizeof(CUdeviceptr));
  CHECK_STATUS(env, "Error in cuParamSetv: gpu_heap_end", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(function, offset, (void *) &gpu_buffer_size, sizeof(CUdeviceptr));
  CHECK_STATUS(env, "Error in cuParamSetv: gpu_buffer_size", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(function, offset, (void *) &gpu_exceptions_mem, sizeof(CUdeviceptr));
  CHECK_STATUS(env, "Error in cuParamSetv: gpu_exceptions_mem", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(function, offset, (void *) &gpu_class_mem, sizeof(CUdeviceptr));
  CHECK_STATUS(env, "Error in cuParamSetv: gpu_class_mem", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSeti(function, offset, num_threads);
  CHECK_STATUS(env, "Error in cuParamSetv: num_threads", status, device)
  offset += sizeof(int);

  //----------------------------------------------------------------------------
  //copy data
  //----------------------------------------------------------------------------
  status = cuMemcpyHtoD(gpu_info_space, info_space, info_space_size);
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: info_space", status, device)

  status = cuMemcpyHtoD(gpu_object_mem, cpu_object_mem, cpu_object_mem_size);
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_object_mem", status, device)

  status = cuMemcpyHtoD(gpu_handles_mem, cpu_handles_mem, cpu_handles_mem_size);
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_handles_mem", status, device)

  status = cuMemcpyHtoD(gpu_class_mem, cpu_class_mem, cpu_class_mem_size);
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_class_mem", status, device)

  status = cuMemcpyHtoD(gpu_heap_end, &cpu_heap_end, sizeof(jlong));
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_heap_end", status, device)

  status = cuMemcpyHtoD(gpu_buffer_size, &cpu_object_mem_size, sizeof(jlong));
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_buffer_size", status, device)

  status = cuMemcpyHtoD(gpu_exceptions_mem, cpu_exceptions_mem, cpu_exceptions_mem_size);
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_exceptions_mem", status, device)

  //----------------------------------------------------------------------------
  //launch
  //----------------------------------------------------------------------------
  status = cuFuncSetBlockShape(function, block_shape_x, 1, 1);
  CHECK_STATUS(env, "Error in cuFuncSetBlockShape", status, device);

  status = cuLaunchGrid(function, grid_shape_x, 1);
  CHECK_STATUS(env, "Error in cuLaunchGrid", status, device)

  status = cuCtxSynchronize();
  CHECK_STATUS(env, "Error in cuCtxSynchronize", status, device)

  //----------------------------------------------------------------------------
  //copy data back
  //----------------------------------------------------------------------------
  status = cuMemcpyDtoH(info_space, gpu_info_space, info_space_size);
  CHECK_STATUS(env, "Error in cuMemcpyDtoH: gpu_info_space", status, device)

  // slot 1 of the info block, read back after the run, gives the number of
  // object-memory bytes to download
  cpu_heap_end = info_space[1];

  status = cuMemcpyDtoH(cpu_object_mem, gpu_object_mem, cpu_heap_end);
  CHECK_STATUS(env, "Error in cuMemcpyDtoH: gpu_object_mem", status, device)

  status = cuMemcpyDtoH(cpu_exceptions_mem, gpu_exceptions_mem, cpu_exceptions_mem_size);
  CHECK_STATUS(env, "Error in cuMemcpyDtoH: gpu_exceptions_mem", status, device)

  //----------------------------------------------------------------------------
  //free resources
  //----------------------------------------------------------------------------
  free(info_space);

  cuMemFree(gpu_info_space);
  cuMemFree(gpu_object_mem);
  cuMemFree(gpu_handles_mem);
  cuMemFree(gpu_exceptions_mem);
  cuMemFree(gpu_class_mem);
  cuMemFree(gpu_heap_end);
  cuMemFree(gpu_buffer_size);

  cuCtxDestroy(context);
}
int cuda_test_mmul_vmmap_hybrid(unsigned int n, char *path) { int i, j, idx; CUresult res; CUdevice dev; CUcontext ctx; CUfunction function; CUmodule module; CUdeviceptr a_dev, b_dev, c_dev; unsigned int *a_buf, *b_buf, *c_buf; unsigned long long int a_phys, b_phys, c_phys; unsigned int *c = (unsigned int *) malloc (n*n * sizeof(unsigned int)); int block_x, block_y, grid_x, grid_y; char fname[256]; int ret = 0; struct timeval tv; struct timeval tv_total_start, tv_total_end; float total; struct timeval tv_h2d_start, tv_h2d_end; float h2d; struct timeval tv_d2h_start, tv_d2h_end; float d2h; struct timeval tv_exec_start, tv_exec_end; struct timeval tv_mem_alloc_start; struct timeval tv_data_init_start; float data_init; struct timeval tv_conf_kern_start; struct timeval tv_close_start; float mem_alloc; float exec; float init_gpu; float configure_kernel; float close_gpu; float data_read; unsigned int dummy_b, dummy_c; /* block_x * block_y should not exceed 512. */ block_x = n < 16 ? n : 16; block_y = n < 16 ? 
n : 16; grid_x = n / block_x; if (n % block_x != 0) grid_x++; grid_y = n / block_y; if (n % block_y != 0) grid_y++; gettimeofday(&tv_total_start, NULL); res = cuInit(0); if (res != CUDA_SUCCESS) { printf("cuInit failed: res = %lu\n", (unsigned long)res); return -1; } res = cuDeviceGet(&dev, 0); if (res != CUDA_SUCCESS) { printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxCreate(&ctx, 0, dev); if (res != CUDA_SUCCESS) { printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res); return -1; } sprintf(fname, "%s/mmul_gpu.cubin", path); res = cuModuleLoad(&module, fname); if (res != CUDA_SUCCESS) { printf("cuModuleLoad() failed\n"); return -1; } res = cuModuleGetFunction(&function, module, "multiply"); if (res != CUDA_SUCCESS) { printf("cuModuleGetFunction() failed\n"); return -1; } res = cuFuncSetBlockShape(function, block_x, block_y, 1); if (res != CUDA_SUCCESS) { printf("cuFuncSetBlockShape() failed\n"); return -1; } gettimeofday(&tv_mem_alloc_start, NULL); /* a[] */ res = cuMemAlloc(&a_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (a) failed\n"); return -1; } res = cuMemMap((void**)&a_buf, a_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemMap (a) failed\n"); return -1; } res = cuMemGetPhysAddr(&a_phys, (void*)a_buf); if (res != CUDA_SUCCESS) { printf("cuMemGetPhysAddress (a) failed\n"); return -1; } /*printf("a[]: Physical Address 0x%llx\n", a_phys);*/ /* b[] */ res = cuMemAlloc(&b_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (b) failed\n"); return -1; } res = cuMemMap((void**)&b_buf, b_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemMap (b) failed\n"); return -1; } res = cuMemGetPhysAddr(&b_phys, (void*)b_buf); if (res != CUDA_SUCCESS) { printf("cuMemGetPhysAddress (b) failed\n"); return -1; } /*printf("b[]: Physical Address 0x%llx\n", b_phys);*/ /* c[] */ res = cuMemAlloc(&c_dev, n*n * sizeof(unsigned 
int)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc (c) failed\n"); return -1; } res = cuMemMap((void**)&c_buf, c_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemMap (c) failed\n"); return -1; } res = cuMemGetPhysAddr(&c_phys, (void*)c_buf); if (res != CUDA_SUCCESS) { printf("cuMemGetPhysAddress (c) failed\n"); return -1; } /*printf("c[]: Physical Address 0x%llx\n", c_phys);*/ gettimeofday(&tv_data_init_start, NULL); /* initialize A[] & B[] */ for (i = 0; i < n; i++) { idx = i*n; for(j = 0; j < n; j++) { a_buf[idx++] = i; } } for (i = 0; i < n; i++) { idx = i*n; for(j = 0; j < n; j++) { b_buf[idx++] = i; } } gettimeofday(&tv_h2d_start, NULL); gettimeofday(&tv_h2d_end, NULL); gettimeofday(&tv_conf_kern_start, NULL); /* set kernel parameters */ res = cuParamSeti(function, 0, a_dev); if (res != CUDA_SUCCESS) { printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 4, a_dev >> 32); if (res != CUDA_SUCCESS) { printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 8, b_dev); if (res != CUDA_SUCCESS) { printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 12, b_dev >> 32); if (res != CUDA_SUCCESS) { printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 16, c_dev); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 20, c_dev >> 32); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSeti(function, 24, n); if (res != CUDA_SUCCESS) { printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuParamSetSize(function, 28); if (res != CUDA_SUCCESS) { printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_exec_start, NULL); 
/* launch the kernel */ res = cuLaunchGrid(function, grid_x, grid_y); if (res != CUDA_SUCCESS) { printf("cuLaunchGrid failed: res = %lu\n", (unsigned long)res); return -1; } cuCtxSynchronize(); gettimeofday(&tv_exec_end, NULL); gettimeofday(&tv_d2h_start, NULL); /* download c[] */ res = cuMemcpyDtoH(c, c_dev, n*n * sizeof(unsigned int)); if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoH (c) failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_d2h_end, NULL); /* Read back */ for (i = 0; i < n; i++) { idx = i*n; for(j = 0; j < n; j++) { dummy_c = c[idx++]; } } gettimeofday(&tv_close_start, NULL); res = cuMemUnmap((void*)a_buf); if (res != CUDA_SUCCESS) { printf("cuMemUnmap (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(a_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemUnmap((void*)b_buf); if (res != CUDA_SUCCESS) { printf("cuMemUnmap (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(b_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemUnmap((void*)c_buf); if (res != CUDA_SUCCESS) { printf("cuMemUnmap (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuMemFree(c_dev); if (res != CUDA_SUCCESS) { printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res); return -1; } res = cuModuleUnload(module); if (res != CUDA_SUCCESS) { printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res); return -1; } res = cuCtxDestroy(ctx); if (res != CUDA_SUCCESS) { printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res); return -1; } gettimeofday(&tv_total_end, NULL); tvsub(&tv_mem_alloc_start, &tv_total_start, &tv); init_gpu = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_data_init_start, &tv_mem_alloc_start, &tv); mem_alloc = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_h2d_start, &tv_data_init_start, &tv); 
data_init = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_h2d_end, &tv_h2d_start, &tv); h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_exec_start, &tv_conf_kern_start, &tv); configure_kernel = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_exec_end, &tv_exec_start, &tv); exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_d2h_end, &tv_d2h_start, &tv); d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_close_start, &tv_d2h_end, &tv); data_read = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_close_start, &tv); close_gpu = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_total_start, &tv); total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; printf("Init: %f\n", init_gpu); printf("MemAlloc: %f\n", mem_alloc); printf("DataInit: %f\n", data_init); printf("HtoD: %f\n", h2d); printf("KernConf: %f\n", configure_kernel); printf("Exec: %f\n", exec); printf("DtoH: %f\n", d2h); printf("DataRead: %f\n", data_read); printf("Close: %f\n", close_gpu); printf("Total: %f\n", total); return ret; }
int main (int argc, char *argv[]) { int matrix_dim = 32; /* default matrix_dim */ int opt, option_index = 0; func_ret_t ret; const char *input_file = NULL; const char *cubin_file = NULL; float *m, *mm; struct timeval tv; CUdeviceptr d_m; CUcontext ctx; CUmodule mod; CUresult res; while ((opt = getopt_long(argc, argv, "::vs:i:c:", long_options, &option_index)) != -1 ) { switch(opt) { case 'c': cubin_file = optarg; break; case 'i': input_file = optarg; break; case 'v': do_verify = 1; break; case 's': matrix_dim = atoi(optarg); fprintf(stderr, "Currently not supported, use -i instead\n"); fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file|-c cubin]\n", argv[0]); exit(EXIT_FAILURE); case '?': fprintf(stderr, "invalid option\n"); break; case ':': fprintf(stderr, "missing argument\n"); break; default: fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file|-c cubin]\n", argv[0]); exit(EXIT_FAILURE); } } if ( (optind < argc) || (optind == 1)) { fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file|-c cubin]\n", argv[0]); exit(EXIT_FAILURE); } if (!cubin_file) { printf("No cubin file specified!\n"); exit(EXIT_FAILURE); } if (input_file) { printf("Reading matrix from file %s\n", input_file); ret = create_matrix_from_file(&m, input_file, &matrix_dim); if (ret != RET_SUCCESS) { m = NULL; fprintf(stderr, "error create matrix from file %s\n", input_file); exit(EXIT_FAILURE); } } else { printf("No input file specified!\n"); exit(EXIT_FAILURE); } if (do_verify){ print_matrix(m, matrix_dim); matrix_duplicate(m, &mm, matrix_dim); } /* * call our common CUDA initialization utility function. */ res = cuda_driver_api_init(&ctx, &mod, cubin_file); if (res != CUDA_SUCCESS) { printf("cuda_driver_api_init failed: res = %u\n", res); return -1; } res = cuMemAlloc(&d_m, matrix_dim * matrix_dim * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc failed\n"); return -1; } /* * measurement start! 
*/ time_measure_start(&tv); res = cuMemcpyHtoD(d_m, m, matrix_dim * matrix_dim * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD (a) failed: res = %u\n", res); return -1; } lud_launch(mod, d_m, matrix_dim); res = cuMemcpyDtoH(m, d_m, matrix_dim * matrix_dim * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoH failed: res = %u\n", res); return -1; } /* * measurement end! will print out the time. */ time_measure_end(&tv); res = cuMemFree(d_m); if (res != CUDA_SUCCESS) { printf("cuMemFree failed: res = %u\n", res); return -1; } res = cuda_driver_api_exit(ctx, mod); if (res != CUDA_SUCCESS) { printf("cuda_driver_api_exit faild: res = %u\n", res); return -1; } if (do_verify){ print_matrix(m, matrix_dim); printf(">>>Verify<<<<\n"); lud_verify(mm, m, matrix_dim); free(mm); } free(m); return EXIT_SUCCESS; } /* ---------- end of function main ---------- */
/*
 * Runs one training pass of the network `net`.
 *
 * The input->hidden forward step and the input-weight adjustment are done
 * either on the host (when CPU is defined) or through CUDA driver-API
 * launches (when GPU is defined); the hidden->output forward step, both
 * error computations and the hidden-weight adjustment always run on the
 * host through the bpnn_* helpers.
 *
 * Parameters:
 *   net    - network to train; its unit, weight and delta arrays are read
 *            and updated.
 *   eo, eh - not referenced anywhere in this body.
 *
 * In the GPU build every failing CUDA call prints a message and returns
 * immediately, without releasing host or device buffers acquired earlier.
 */
void bpnn_train_cuda(BPNN *net, float *eo, float *eh)
{
  int j, k;
  int in, hid, out;
  float out_err, hid_err;
  struct timeval tv;

  in = net->input_n;
  hid = net->hidden_n;
  out = net->output_n;

#ifdef GPU
  int m = 0;
  float *partial_sum;
  float sum;
  float *input_weights_one_dim;
  float *input_weights_prev_one_dim;
  /* num_blocks is declared outside this function; one block per 16 inputs */
  num_blocks = in / 16;
  CUdeviceptr input_cuda;
  CUdeviceptr input_hidden_cuda;
  CUdeviceptr output_hidden_cuda;
  CUdeviceptr hidden_partial_sum;
  CUdeviceptr hidden_delta_cuda;
  CUdeviceptr input_prev_weights_cuda;
  CUcontext ctx;
  CUmodule mod;
  CUresult res;

  /* flat host copies of the (in + 1) x (hid + 1) weight tables */
  input_weights_one_dim = (float *) malloc((in + 1) * (hid + 1) * sizeof(float));
  input_weights_prev_one_dim = (float *) malloc((in + 1) * (hid + 1) * sizeof(float));
  partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));

  /* this preprocessing stage is added to correct the bugs of wrong memcopy
     using two-dimensional net->inputweights: the row-pointer tables are
     flattened in row-major order so they can be copied in one transfer */
  for (k = 0; k <= in; k++) {
    for (j = 0; j <= hid; j++) {
      input_weights_one_dim[m] = net->input_weights[k][j];
      input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
      m++;
    }
  }

  /*
   * call our common CUDA initialization utility function.
   */
  res = cuda_driver_api_init(&ctx, &mod, "./backprop.cubin");
  if (res != CUDA_SUCCESS) {
    printf("cuda_driver_api_init failed: res = %u\n", res);
    return ;
  }

  /*
   * allocate device memory space
   */
  res = cuMemAlloc(&input_cuda, (in + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) {
    printf("cuMemAlloc failed: res = %u\n", res);
    return ;
  }
  res = cuMemAlloc(&output_hidden_cuda, (hid + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) {
    printf("cuMemAlloc failed: res = %u\n", res);
    return ;
  }
  res = cuMemAlloc(&input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) {
    printf("cuMemAlloc failed: res = %u\n", res);
    return ;
  }
  res = cuMemAlloc(&hidden_partial_sum, num_blocks * WIDTH * sizeof(float));
  if (res != CUDA_SUCCESS) {
    printf("cuMemAlloc failed: res = %u\n", res);
    return ;
  }
  res = cuMemAlloc(&hidden_delta_cuda, (hid + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) {
    printf("cuMemAlloc failed: res = %u\n", res);
    return ;
  }
  res = cuMemAlloc(&input_prev_weights_cuda, (in + 1) * (hid + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) {
    printf("cuMemAlloc failed: res = %u\n", res);
    return ;
  }
#endif

#ifdef CPU
  printf("Performing CPU computation\n");
  bpnn_layerforward(net->input_units, net->hidden_units,net->input_weights, in, hid);
#endif

#ifdef GPU
  printf("Performing GPU computation\n");

  //printf("in= %d, hid = %d, numblocks = %d\n", in, hid, num_blocks);

  /*
   * measurement start!
   */
  time_measure_start(&tv);

  /* upload the input units and the flattened input->hidden weights */
  res = cuMemcpyHtoD(input_cuda, net->input_units, (in + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) {
    printf("cuMemcpyHtoD failed: res = %u\n", res);
    return ;
  }
  res = cuMemcpyHtoD(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) {
    printf("cuMemcpyHtoD failed: res = %u\n", res);
    return ;
  }

  res = bpnn_layerforward_launch(mod, input_cuda, output_hidden_cuda, input_hidden_cuda, hidden_partial_sum, in, hid);
  if (res != CUDA_SUCCESS) {
    printf("bpnn_layerforward failed: res = %u\n", res);
    return ;
  }

  /* the result of this synchronize call is not checked */
  cuCtxSynchronize();

#if 0
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    printf("bpnn kernel error: %s\n", cudaGetErrorString(error));
    exit(EXIT_FAILURE);
  }
#endif

  res = cuMemcpyDtoH(partial_sum, hidden_partial_sum, num_blocks * WIDTH * sizeof(float));
  if (res != CUDA_SUCCESS) {
    printf("cuMemcpyDtoH(layerforward) failed: res = %u\n", res);
    return ;
  }

  /*
   * Finish the forward step on the host: add up the per-block partial sums
   * for each hidden unit, add the weight from input row 0, and apply the
   * logistic function.
   * NOTE(review): partial_sum is indexed with a row stride of hid but was
   * allocated as num_blocks * WIDTH floats — presumably WIDTH == hid; confirm.
   */
  for (j = 1; j <= hid; j++) {
    sum = 0.0;
    for (k = 0; k < num_blocks; k++) {
      sum += partial_sum[k * hid + j-1] ;
    }
    sum += net->input_weights[0][j];
    net-> hidden_units[j] = (float) (1.0 / (1.0 + exp(-sum)));
  }
#endif

  /* host-side steps shared by both builds */
  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
  bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
  bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);
  bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);

#ifdef CPU
  bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in, net->input_weights, net->input_prev_weights);
#endif

#ifdef GPU
  /* upload the hidden deltas and both weight tables for the adjustment */
  res = cuMemcpyHtoD(hidden_delta_cuda, net->hidden_delta, (hid + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) {
    printf("cuMemcpyHtoD failed: res = %u\n", res);
    return ;
  }
  res = cuMemcpyHtoD(input_prev_weights_cuda, input_weights_prev_one_dim, (in + 1) * (hid + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) {
    printf("cuMemcpyHtoD failed: res = %u\n", res);
    return ;
  }
  res = cuMemcpyHtoD(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) {
    printf("cuMemcpyHtoD failed: res = %u\n", res);
    return ;
  }

  res = bpnn_adjust_weights_launch(mod, hidden_delta_cuda, hid, input_cuda, in, input_hidden_cuda, input_prev_weights_cuda);
  if (res != CUDA_SUCCESS) {
    printf("bpnn_adjust_weights failed: res = %u\n", res);
    return ;
  }

  res = cuMemcpyDtoH(net->input_units, input_cuda, (in + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) {
    printf("cuMemcpyDtoH(adjust_weights) failed: res = %u\n", res);
    return ;
  }
  /* NOTE(review): the downloaded weights land in the flat scratch buffer,
     which is freed below without being copied into net->input_weights
     inside this function — confirm that this is intended. */
  res = cuMemcpyDtoH(input_weights_one_dim, input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) {
    printf("cuMemcpyDtoH(adjust_weights) failed: res = %u\n", res);
    return ;
  }

  /* the results of these frees are not checked */
  cuMemFree(input_cuda);
  cuMemFree(output_hidden_cuda);
  cuMemFree(input_hidden_cuda);
  cuMemFree(hidden_partial_sum);
  cuMemFree(input_prev_weights_cuda);
  cuMemFree(hidden_delta_cuda);

  /*
   * measurement end! will print out the time.
   */
  time_measure_end(&tv);

  res = cuda_driver_api_exit(ctx, mod);
  if (res != CUDA_SUCCESS) {
    printf("cuda_driver_api_exit faild: res = %u\n", res);
    return ;
  }

  free(partial_sum);
  free(input_weights_one_dim);
  free(input_weights_prev_one_dim);
#endif
}
// Loads the embedded PTX module, launches the entry point called `name` with
// a single thread (grid 1x1x1, block 1x1x1, no dynamic shared memory,
// stream 0) and returns the value that the kernel stored.
//
// The PTX defines four entry points: _Z10kernel_s32Piii, _Z10kernel_s64Pxxi,
// _Z10kernel_u32Pjji and _Z10kernel_u64Pyyi. Each one loads an output
// pointer, a value and a 32-bit shift amount, executes `shr` in the matching
// flavour (s32 / s64 / u32 / u64) and stores the result through the pointer.
//
// Parameters:
//   name       - entry point to look up in the module.
//   input      - value to shift, passed as the kernel's second parameter.
//   shiftValue - shift amount, passed as the kernel's third parameter.
//
// Returns the sizeof(T) bytes copied back from the device output cell.
//
// NOTE(review): nothing here checks that T matches the operand width of the
// selected entry point — the caller presumably has to pair them; confirm.
// NOTE(review): cu_assert is defined outside this block; what happens on a
// failing status (and whether devOutput/modId are released then) cannot be
// told from here.
T run_function(const std::string& name, const T input, const int shiftValue) {
    const std::string test_source =
        "//\n"
        "// Generated by NVIDIA NVVM Compiler\n"
        "//\n"
        "// Compiler Build ID: CL-19856038\n"
        "// Cuda compilation tools, release 7.5, V7.5.17\n"
        "// Based on LLVM 3.4svn\n"
        "//\n"
        "\n"
        ".version 4.3\n"
        ".target sm_20\n"
        ".address_size 64\n"
        "\n"
        " // .globl _Z10kernel_s32Piii\n"
        "\n"
        ".visible .entry _Z10kernel_s32Piii(\n"
        " .param .u64 _Z10kernel_s32Piii_param_0,\n"
        " .param .u32 _Z10kernel_s32Piii_param_1,\n"
        " .param .u32 _Z10kernel_s32Piii_param_2\n"
        ")\n"
        "{\n"
        " .reg .b32 %r<4>;\n"
        " .reg .b64 %rd<3>;\n"
        "\n"
        "\n"
        " ld.param.u64 %rd1, [_Z10kernel_s32Piii_param_0];\n"
        " ld.param.u32 %r1, [_Z10kernel_s32Piii_param_1];\n"
        " ld.param.u32 %r2, [_Z10kernel_s32Piii_param_2];\n"
        " cvta.to.global.u64 %rd2, %rd1;\n"
        " shr.s32 %r3, %r1, %r2;\n"
        " st.global.u32 [%rd2], %r3;\n"
        " ret;\n"
        "}\n"
        "\n"
        " // .globl _Z10kernel_s64Pxxi\n"
        ".visible .entry _Z10kernel_s64Pxxi(\n"
        " .param .u64 _Z10kernel_s64Pxxi_param_0,\n"
        " .param .u64 _Z10kernel_s64Pxxi_param_1,\n"
        " .param .u32 _Z10kernel_s64Pxxi_param_2\n"
        ")\n"
        "{\n"
        " .reg .b32 %r<2>;\n"
        " .reg .b64 %rd<5>;\n"
        "\n"
        "\n"
        " ld.param.u64 %rd1, [_Z10kernel_s64Pxxi_param_0];\n"
        " ld.param.u64 %rd2, [_Z10kernel_s64Pxxi_param_1];\n"
        " ld.param.u32 %r1, [_Z10kernel_s64Pxxi_param_2];\n"
        " cvta.to.global.u64 %rd3, %rd1;\n"
        " shr.s64 %rd4, %rd2, %r1;\n"
        " st.global.u64 [%rd3], %rd4;\n"
        " ret;\n"
        "}\n"
        "\n"
        " // .globl _Z10kernel_u32Pjji\n"
        ".visible .entry _Z10kernel_u32Pjji(\n"
        " .param .u64 _Z10kernel_u32Pjji_param_0,\n"
        " .param .u32 _Z10kernel_u32Pjji_param_1,\n"
        " .param .u32 _Z10kernel_u32Pjji_param_2\n"
        ")\n"
        "{\n"
        " .reg .b32 %r<4>;\n"
        " .reg .b64 %rd<3>;\n"
        "\n"
        "\n"
        " ld.param.u64 %rd1, [_Z10kernel_u32Pjji_param_0];\n"
        " ld.param.u32 %r1, [_Z10kernel_u32Pjji_param_1];\n"
        " ld.param.u32 %r2, [_Z10kernel_u32Pjji_param_2];\n"
        " cvta.to.global.u64 %rd2, %rd1;\n"
        " shr.u32 %r3, %r1, %r2;\n"
        " st.global.u32 [%rd2], %r3;\n"
        " ret;\n"
        "}\n"
        "\n"
        " // .globl _Z10kernel_u64Pyyi\n"
        ".visible .entry _Z10kernel_u64Pyyi(\n"
        " .param .u64 _Z10kernel_u64Pyyi_param_0,\n"
        " .param .u64 _Z10kernel_u64Pyyi_param_1,\n"
        " .param .u32 _Z10kernel_u64Pyyi_param_2\n"
        ")\n"
        "{\n"
        " .reg .b32 %r<2>;\n"
        " .reg .b64 %rd<5>;\n"
        "\n"
        "\n"
        " ld.param.u64 %rd1, [_Z10kernel_u64Pyyi_param_0];\n"
        " ld.param.u64 %rd2, [_Z10kernel_u64Pyyi_param_1];\n"
        " ld.param.u32 %r1, [_Z10kernel_u64Pyyi_param_2];\n"
        " cvta.to.global.u64 %rd3, %rd1;\n"
        " shr.u64 %rd4, %rd2, %r1;\n"
        " st.global.u64 [%rd3], %rd4;\n"
        " ret;\n"
        "}\n"
        "\n"
        "\n"
        ;

    CUmodule modId = 0;
    CUfunction funcHandle = 0;
    // the PTX text is handed to the driver as a NUL-terminated image
    cu_assert(cuModuleLoadData(&modId, test_source.c_str()));
    cu_assert(cuModuleGetFunction(&funcHandle, modId, name.c_str()));

    T output;
    CUdeviceptr devOutput;
    // one device cell, the size of T, receives the kernel's result
    cu_assert(cuMemAlloc(&devOutput, sizeof(output)));

    // argument pointers in the order of the kernel's parameter list:
    // output pointer, value, shift amount
    void * params[] = {&devOutput, (void*)&input, (void*)&shiftValue};

    auto result = cuLaunchKernel(funcHandle, 1,1,1, 1,1,1, 0,0, params, nullptr);
    cu_assert(result);

    // No explicit synchronization call appears between the launch and this
    // copy. NOTE(review): presumably the copy-back is relied on to wait for
    // the kernel — confirm.
    cu_assert(cuMemcpyDtoH(&output, devOutput, sizeof(output)));
    cu_assert(cuMemFree(devOutput));
    cu_assert(cuModuleUnload(modId));

    return output;
}
int main(int argc, char * argv[]) { CBlasTranspose transA, transB; size_t m, n, k; int d = 0; if (argc < 6 || argc > 7) { fprintf(stderr, "Usage: %s <transA> <transB> <m> <n> <k> [device]\n" "where:\n" " transA and transB are 'n' or 'N' for CBlasNoTrans, 't' or 'T' for CBlasTrans or 'c' or 'C' for CBlasConjTrans\n" " m, n and k are the sizes of the matrices\n" " device is the GPU to use (default 0)\n", argv[0]); return 1; } char t; if (sscanf(argv[1], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[1]); return 1; } switch (t) { case 'N': case 'n': transA = CBlasNoTrans; break; case 'T': case 't': transA = CBlasTrans; break; case 'C': case 'c': transA = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1; } if (sscanf(argv[2], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[2]); return 2; } switch (t) { case 'N': case 'n': transB = CBlasNoTrans; break; case 'T': case 't': transB = CBlasTrans; break; case 'C': case 'c': transB = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1; } if (sscanf(argv[3], "%zu", &m) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]); return 3; } if (sscanf(argv[4], "%zu", &n) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[4]); return 4; } if (sscanf(argv[5], "%zu", &k) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]); return 5; } if (argc > 6) { if (sscanf(argv[6], "%d", &d) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[6]); return 6; } } srand(0); float complex alpha, beta, * A, * B, * C, * refC; CUdeviceptr dA, dB, dC, dD; size_t lda, ldb, ldc, dlda, dldb, dldc, dldd; CU_ERROR_CHECK(cuInit(0)); CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device)); CUBLAShandle handle; CU_ERROR_CHECK(cuBLASCreate(&handle)); alpha = 
((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; beta = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; if (transA == CBlasNoTrans) { lda = (m + 1u) & ~1u; if ((A = malloc(lda * k * sizeof(float complex))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, m * sizeof(float complex), k, sizeof(float complex))); dlda /= sizeof(float complex); for (size_t j = 0; j < k; j++) { for (size_t i = 0; i < m; i++) A[j * lda + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(float complex), m * sizeof(float complex), k }; CU_ERROR_CHECK(cuMemcpy2D(©)); } else { lda = (k + 1u) & ~1u; if ((A = malloc(lda * m * sizeof(float complex))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, k * sizeof(float complex), m, sizeof(float complex))); dlda /= sizeof(float complex); for (size_t j = 0; j < m; j++) { for (size_t i = 0; i < k; i++) A[j * lda + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(float complex), k * sizeof(float complex), m }; CU_ERROR_CHECK(cuMemcpy2D(©)); } if (transB == CBlasNoTrans) { ldb = (k + 1u) & ~1u; if ((B = malloc(ldb * n * sizeof(float complex))) == NULL) { fputs("Unable to allocate B\n", stderr); return -2; } CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, k * sizeof(float complex), n, sizeof(float complex))); dldb /= sizeof(float complex); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < k; i++) B[j * ldb + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, 
CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(float complex), k * sizeof(float complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); } else { ldb = (n + 1u) & ~1u; if ((B = malloc(ldb * k * sizeof(float complex))) == NULL) { fputs("Unable to allocate B\n", stderr); return -2; } CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, n * sizeof(float complex), k, sizeof(float complex))); dldb /= sizeof(float complex); for (size_t j = 0; j < k; j++) { for (size_t i = 0; i < n; i++) B[j * ldb + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(float complex), n * sizeof(float complex), k }; CU_ERROR_CHECK(cuMemcpy2D(©)); } ldc = (m + 1u) & ~1u; if ((C = malloc(ldc * n * sizeof(float complex))) == NULL) { fputs("Unable to allocate C\n", stderr); return -3; } if ((refC = malloc(ldc * n * sizeof(float complex))) == NULL) { fputs("Unable to allocate refC\n", stderr); return -4; } CU_ERROR_CHECK(cuMemAllocPitch(&dC, &dldc, m * sizeof(float complex), n, sizeof(float complex))); dldc /= sizeof(float complex); CU_ERROR_CHECK(cuMemAllocPitch(&dD, &dldd, m * sizeof(float complex), n, sizeof(float complex))); dldd /= sizeof(float complex); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) refC[j * ldc + i] = C[j * ldc + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(float complex), m * sizeof(float complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); cgemm_ref(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, refC, ldc); CU_ERROR_CHECK(cuCgemm2(handle, transA, transB, m, n, k, alpha, dA, dlda, dB, dldb, beta, dC, dldc, dD, dldd, NULL)); copy = (CUDA_MEMCPY2D){ 
0, 0, CU_MEMORYTYPE_DEVICE, NULL, dD, NULL, dldd * sizeof(float complex), 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(float complex), m * sizeof(float complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); float rdiff = 0.0f, idiff = 0.0f; for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) { float d = fabsf(crealf(C[j * ldc + i]) - crealf(refC[j * ldc + i])); if (d > rdiff) rdiff = d; d = fabsf(cimagf(C[j * ldc + i]) - cimagf(refC[j * ldc + i])); if (d > idiff) idiff = d; } } CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventRecord(start, NULL)); for (size_t i = 0; i < 20; i++) CU_ERROR_CHECK(cuCgemm2(handle, transA, transB, m, n, k, alpha, dA, dlda, dB, dldb, beta, dC, dldc, dD, dldd, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, NULL)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= 20; CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); size_t flops = k * 6 + (k - 1) * 2; // k multiplies and k - 1 adds per element if (alpha != 1.0f + 0.0f * I) flops += 6; // additional multiply by alpha if (beta != 0.0f + 0.0f * I) flops += 8; // additional multiply and add by beta float error = (float)flops * 2.0f * FLT_EPSILON; // maximum per element error flops *= m * n; // m * n elements bool passed = (rdiff <= error) && (idiff <= error); fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e + %.3ei\n%sED!\n", time * 1.e-3f, ((float)flops * 1.e-6f) / time, rdiff, idiff, (passed) ? "PASS" : "FAIL"); free(A); free(B); free(C); free(refC); CU_ERROR_CHECK(cuMemFree(dA)); CU_ERROR_CHECK(cuMemFree(dB)); CU_ERROR_CHECK(cuMemFree(dC)); CU_ERROR_CHECK(cuMemFree(dD)); CU_ERROR_CHECK(cuBLASDestroy(handle)); CU_ERROR_CHECK(cuCtxDestroy(context)); return (int)!passed; }