Exemple #1
0
int cuda_test_fmmul(unsigned int n, char *path)
{
	int i, j, idx;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUfunction function;
	CUmodule module;
	CUdeviceptr a_dev, b_dev, c_dev;
	float *a = (float *) malloc (n*n * sizeof(float));
	float *b = (float *) malloc (n*n * sizeof(float));
	float *c = (float *) malloc (n*n * sizeof(float));
	int block_x, block_y, grid_x, grid_y;
	int offset;
	char fname[256];
	struct timeval tv;
	struct timeval tv_total_start, tv_total_end;
	float total;
	struct timeval tv_h2d_start, tv_h2d_end;
	float h2d;
	struct timeval tv_d2h_start, tv_d2h_end;
	float d2h;
	struct timeval tv_exec_start, tv_exec_end;
	float exec;

	/* initialize A[] & B[] */
	for (i = 0; i < n; i++) {
		for(j = 0; j < n; j++) {
			idx = i * n + j;
			a[idx] = i + 0.1;
			b[idx] = i + 0.1;
		}
	}

	/* block_x * block_y should not exceed 512. */
	block_x = n < 16 ? n : 16;
	block_y = n < 16 ? n : 16;
	grid_x = n / block_x;
	if (n % block_x != 0)
		grid_x++;
	grid_y = n / block_y;
	if (n % block_y != 0)
		grid_y++;
	printf("block = (%d, %d)\n", block_x, block_y);
	printf("grid = (%d, %d)\n", grid_x, grid_y);

	gettimeofday(&tv_total_start, NULL);

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuDeviceGet(&dev, 3);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	sprintf(fname, "%s/fmmul_gpu.cubin", path);
	res = cuModuleLoad(&module, fname);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		return -1;
	}
	res = cuModuleGetFunction(&function, module, "_Z3mulPfS_S_i");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction() failed\n");
		return -1;
	}
	res = cuFuncSetSharedSize(function, 0x40); /* just random */
	if (res != CUDA_SUCCESS) {
		printf("cuFuncSetSharedSize() failed\n");
		return -1;
	}
	res = cuFuncSetBlockShape(function, block_x, block_y, 1);
	if (res != CUDA_SUCCESS) {
		printf("cuFuncSetBlockShape() failed\n");
		return -1;
	}

	/* a[] */
	res = cuMemAlloc(&a_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (a) failed\n");
		return -1;
	}
	/* b[] */
	res = cuMemAlloc(&b_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (b) failed\n");
		return -1;
	}
	/* c[] */
	res = cuMemAlloc(&c_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (c) failed\n");
		return -1;
	}

	gettimeofday(&tv_h2d_start, NULL);
	/* upload a[] and b[] */
	res = cuMemcpyHtoD(a_dev, a, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemcpyHtoD(b_dev, b, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	gettimeofday(&tv_h2d_end, NULL);

	/* set kernel parameters */
	offset = 0;
	res = cuParamSetv(function, offset, &a_dev, sizeof(a_dev));	
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	offset += sizeof(a_dev);
	res = cuParamSetv(function, offset, &b_dev, sizeof(b_dev));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	offset += sizeof(b_dev);
	res = cuParamSetv(function, offset, &c_dev, sizeof(c_dev));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	offset += sizeof(c_dev);
	res = cuParamSetv(function, offset, &n, sizeof(n));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	offset += sizeof(n);
	res = cuParamSetSize(function, offset);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	gettimeofday(&tv_exec_start, NULL);
	/* launch the kernel */
	res = cuLaunchGrid(function, grid_x, grid_y);
	if (res != CUDA_SUCCESS) {
		printf("cuLaunchGrid failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	cuCtxSynchronize();
	gettimeofday(&tv_exec_end, NULL);

	gettimeofday(&tv_d2h_start, NULL);
	/* download c[] */
	res = cuMemcpyDtoH(c, c_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	gettimeofday(&tv_d2h_end, NULL);

	res = cuMemFree(a_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(b_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(c_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuModuleUnload(module);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	gettimeofday(&tv_total_end, NULL);

	/* check the results */
	i = j = idx = 0;
	while (i < n) {
		while (j < n) {
			idx = i * n + j;
			if (c[idx] != a[idx] * b[idx]) {
				printf("c[%d] = %f\n", idx, c[idx]);
				printf("a[%d]+b[%d] = %f\n", idx, idx, a[idx]*b[idx]);
				return -1;
			}
			j++;
		}
		i++;
	}

	free(a);
	free(b);
	free(c);

	tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
	h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
	d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_exec_end, &tv_exec_start, &tv);
	exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_total_end, &tv_total_start, &tv);
	total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	printf("HtoD: %f\n", h2d);
	printf("DtoH: %f\n", d2h);
	printf("Exec: %f\n", exec);
	printf("Time (Memcpy + Launch): %f\n", h2d + d2h + exec);
	printf("Total: %f\n", total);

	return 0;
}
Exemple #2
0
 static void dispose(char *ptr) {
     cuda_check( cuMemFree(static_cast<CUdeviceptr>(reinterpret_cast<size_t>(ptr))) );
 }
Exemple #3
0
JNIEXPORT void JNICALL Java_org_trifort_rootbeer_runtime_CUDAContext_cudaRun
  (JNIEnv *env, jobject this_ref, jint device_index, jbyteArray cubin_file, 
   jint cubin_length, jint block_shape_x, jint grid_shape_x, jint num_threads, 
   jobject object_mem, jobject handles_mem, jobject exceptions_mem, 
   jobject class_mem)
{
  CUresult status;
  CUdevice device;
  CUcontext context;
  CUmodule module;
  CUfunction function;
  void * fatcubin;
  int offset;
  int info_space_size;

  CUdeviceptr gpu_info_space;
  CUdeviceptr gpu_object_mem;
  CUdeviceptr gpu_handles_mem;
  CUdeviceptr gpu_exceptions_mem;
  CUdeviceptr gpu_class_mem;
  CUdeviceptr gpu_heap_end;
  CUdeviceptr gpu_buffer_size;

  void * cpu_object_mem;
  void * cpu_handles_mem;
  void * cpu_exceptions_mem;
  void * cpu_class_mem;
  jlong cpu_object_mem_size;
  jlong cpu_handles_mem_size;
  jlong cpu_exceptions_mem_size;
  jlong cpu_class_mem_size;
  jlong cpu_heap_end;

  jclass cuda_memory_class;
  jmethodID get_address_method;
  jmethodID get_size_method;
  jmethodID get_heap_end_method;
  
  jlong * info_space;

  //----------------------------------------------------------------------------
  //init device and function
  //----------------------------------------------------------------------------
  status = cuDeviceGet(&device, device_index);
  CHECK_STATUS(env, "Error in cuDeviceGet", status, device)

  status = cuCtxCreate(&context, CU_CTX_MAP_HOST, device);  
  CHECK_STATUS(env,"Error in cuCtxCreate", status, device)

  fatcubin = malloc(cubin_length);
  (*env)->GetByteArrayRegion(env, cubin_file, 0, cubin_length, fatcubin);

  status = cuModuleLoadFatBinary(&module, fatcubin);
  CHECK_STATUS(env, "Error in cuModuleLoad", status, device)
  free(fatcubin);

  status = cuModuleGetFunction(&function, module, "_Z5entryPcS_PiPxS1_S0_S0_i"); 
  CHECK_STATUS(env, "Error in cuModuleGetFunction", status, device)

  //----------------------------------------------------------------------------
  //get handles from java
  //----------------------------------------------------------------------------
  cuda_memory_class = (*env)->FindClass(env, "org/trifort/rootbeer/runtime/FixedMemory");
  get_address_method = (*env)->GetMethodID(env, cuda_memory_class, "getAddress", "()J");
  get_size_method = (*env)->GetMethodID(env, cuda_memory_class, "getSize", "()J");
  get_heap_end_method = (*env)->GetMethodID(env, cuda_memory_class, "getHeapEndPtr", "()J");

  cpu_object_mem = (void *) (*env)->CallLongMethod(env, object_mem, get_address_method);
  cpu_object_mem_size = (*env)->CallLongMethod(env, object_mem, get_size_method);
  cpu_heap_end = (*env)->CallLongMethod(env, object_mem, get_heap_end_method);

  cpu_handles_mem = (void *) (*env)->CallLongMethod(env, handles_mem, get_address_method);
  cpu_handles_mem_size = (*env)->CallLongMethod(env, handles_mem, get_size_method);

  cpu_exceptions_mem = (void *) (*env)->CallLongMethod(env, exceptions_mem, get_address_method);
  cpu_exceptions_mem_size = (*env)->CallLongMethod(env, exceptions_mem, get_size_method);

  cpu_class_mem = (void *) (*env)->CallLongMethod(env, class_mem, get_address_method);
  cpu_class_mem_size = (*env)->CallLongMethod(env, class_mem, get_size_method);

  info_space_size = 1024;
  info_space = (jlong *) malloc(info_space_size);
  info_space[1] = (*env)->CallLongMethod(env, object_mem, get_heap_end_method);

  //----------------------------------------------------------------------------
  //allocate mem
  //----------------------------------------------------------------------------
  status = cuMemAlloc(&gpu_info_space, info_space_size);  
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_info_mem", status, device)

  status = cuMemAlloc(&gpu_object_mem, cpu_object_mem_size);  
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_object_mem", status, device)

  status = cuMemAlloc(&gpu_handles_mem, cpu_handles_mem_size); 
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_handles_mem", status, device)
    
  status = cuMemAlloc(&gpu_exceptions_mem, cpu_exceptions_mem_size); 
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_exceptions_mem", status, device)

  status = cuMemAlloc(&gpu_class_mem, cpu_class_mem_size);
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_class_mem", status, device)

  status = cuMemAlloc(&gpu_heap_end, 8);
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_heap_end", status, device)

  status = cuMemAlloc(&gpu_buffer_size, 8);
  CHECK_STATUS(env, "Error in cuMemAlloc: gpu_buffer_size", status, device)

  //----------------------------------------------------------------------------
  //set function parameters
  //----------------------------------------------------------------------------
  status = cuParamSetSize(function, (7 * sizeof(CUdeviceptr) + sizeof(int))); 
  CHECK_STATUS(env, "Error in cuParamSetSize", status, device)

  offset = 0;
  status = cuParamSetv(function, offset, (void *) &gpu_info_space, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env, "Error in cuParamSetv gpu_info_space", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(function, offset, (void *) &gpu_object_mem, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env, "Error in cuParamSetv: gpu_object_mem", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(function, offset, (void *) &gpu_handles_mem, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env, "Error in cuParamSetv: gpu_handles_mem %", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(function, offset, (void *) &gpu_heap_end, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env, "Error in cuParamSetv: gpu_heap_end", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(function, offset, (void *) &gpu_buffer_size, sizeof(CUdeviceptr));
  CHECK_STATUS(env, "Error in cuParamSetv: gpu_buffer_size", status, device)
  offset += sizeof(CUdeviceptr); 

  status = cuParamSetv(function, offset, (void *) &gpu_exceptions_mem, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env, "Error in cuParamSetv: gpu_exceptions_mem", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(function, offset, (void *) &gpu_class_mem, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env, "Error in cuParamSetv: gpu_class_mem", status, device)
  offset += sizeof(CUdeviceptr);

  status = cuParamSeti(function, offset, num_threads); 
  CHECK_STATUS(env, "Error in cuParamSetv: num_threads", status, device)
  offset += sizeof(int);

  //----------------------------------------------------------------------------
  //copy data
  //----------------------------------------------------------------------------
  status = cuMemcpyHtoD(gpu_info_space, info_space, info_space_size);
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: info_space", status, device)

  status = cuMemcpyHtoD(gpu_object_mem, cpu_object_mem, cpu_object_mem_size);
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_object_mem", status, device)

  status = cuMemcpyHtoD(gpu_handles_mem, cpu_handles_mem, cpu_handles_mem_size);
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_handles_mem", status, device)

  status = cuMemcpyHtoD(gpu_class_mem, cpu_class_mem, cpu_class_mem_size);
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_class_mem", status, device)

  status = cuMemcpyHtoD(gpu_heap_end, &cpu_heap_end, sizeof(jlong));
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_heap_end", status, device)

  status = cuMemcpyHtoD(gpu_buffer_size, &cpu_object_mem_size, sizeof(jlong));
  CHECK_STATUS(env, "Error in cuMemcpyHtoD: gpu_buffer_size", status, device)

  status = cuMemcpyHtoD(gpu_exceptions_mem, cpu_exceptions_mem, cpu_exceptions_mem_size);
  CHECK_STATUS(env, "Error in cuMemcpyDtoH: gpu_exceptions_mem", status, device)

  //----------------------------------------------------------------------------
  //launch
  //----------------------------------------------------------------------------
  status = cuFuncSetBlockShape(function, block_shape_x, 1, 1);
  CHECK_STATUS(env, "Error in cuFuncSetBlockShape", status, device);

  status = cuLaunchGrid(function, grid_shape_x, 1);
  CHECK_STATUS(env, "Error in cuLaunchGrid", status, device)

  status = cuCtxSynchronize();  
  CHECK_STATUS(env, "Error in cuCtxSynchronize", status, device)

  //----------------------------------------------------------------------------
  //copy data back
  //----------------------------------------------------------------------------
  status = cuMemcpyDtoH(info_space, gpu_info_space, info_space_size);
  CHECK_STATUS(env, "Error in cuMemcpyDtoH: gpu_info_space", status, device)

  cpu_heap_end = info_space[1];

  status = cuMemcpyDtoH(cpu_object_mem, gpu_object_mem, cpu_heap_end);
  CHECK_STATUS(env, "Error in cuMemcpyDtoH: gpu_object_mem", status, device)

  status = cuMemcpyDtoH(cpu_exceptions_mem, gpu_exceptions_mem, cpu_exceptions_mem_size);
  CHECK_STATUS(env, "Error in cuMemcpyDtoH: gpu_exceptions_mem", status, device)

  //----------------------------------------------------------------------------
  //free resources
  //----------------------------------------------------------------------------
  free(info_space);

  cuMemFree(gpu_info_space);
  cuMemFree(gpu_object_mem);
  cuMemFree(gpu_handles_mem);
  cuMemFree(gpu_exceptions_mem);
  cuMemFree(gpu_class_mem);
  cuMemFree(gpu_heap_end);
  cuMemFree(gpu_buffer_size);

  cuCtxDestroy(context);
}
Exemple #4
0
int cuda_test_mmul_vmmap_hybrid(unsigned int n, char *path)
{
	int i, j, idx;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUfunction function;
	CUmodule module;
	CUdeviceptr a_dev, b_dev, c_dev;
	unsigned int *a_buf, *b_buf, *c_buf;
	unsigned long long int a_phys, b_phys, c_phys;
	unsigned int *c = (unsigned int *) malloc (n*n * sizeof(unsigned int));
	int block_x, block_y, grid_x, grid_y;
	char fname[256];
	int ret = 0;
	struct timeval tv;
	struct timeval tv_total_start, tv_total_end;
	float total;
	struct timeval tv_h2d_start, tv_h2d_end;
	float h2d;
	struct timeval tv_d2h_start, tv_d2h_end;
	float d2h;
	struct timeval tv_exec_start, tv_exec_end;
	struct timeval tv_mem_alloc_start;
	struct timeval tv_data_init_start;
	float data_init;
	struct timeval tv_conf_kern_start;
	struct timeval tv_close_start;
	float mem_alloc;
	float exec;
	float init_gpu;
	float configure_kernel;
	float close_gpu;
	float data_read;

	unsigned int dummy_b, dummy_c;
		

	/* block_x * block_y should not exceed 512. */
	block_x = n < 16 ? n : 16;
	block_y = n < 16 ? n : 16;
	grid_x = n / block_x;
	if (n % block_x != 0)
		grid_x++;
	grid_y = n / block_y;
	if (n % block_y != 0)
		grid_y++;

	gettimeofday(&tv_total_start, NULL);

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	sprintf(fname, "%s/mmul_gpu.cubin", path);
	res = cuModuleLoad(&module, fname);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		return -1;
	}
	res = cuModuleGetFunction(&function, module, "multiply");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction() failed\n");
		return -1;
	}
	res = cuFuncSetBlockShape(function, block_x, block_y, 1);
	if (res != CUDA_SUCCESS) {
		printf("cuFuncSetBlockShape() failed\n");
		return -1;
	}

	gettimeofday(&tv_mem_alloc_start, NULL);

	/* a[] */
	res = cuMemAlloc(&a_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (a) failed\n");
		return -1;
	}
	res = cuMemMap((void**)&a_buf, a_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemMap (a) failed\n");
		return -1;
	}
	res = cuMemGetPhysAddr(&a_phys, (void*)a_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemGetPhysAddress (a) failed\n");
		return -1;
	}
	/*printf("a[]: Physical Address 0x%llx\n", a_phys);*/

	/* b[] */
	res = cuMemAlloc(&b_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (b) failed\n");
		return -1;
	}
	res = cuMemMap((void**)&b_buf, b_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemMap (b) failed\n");
		return -1;
	}
	res = cuMemGetPhysAddr(&b_phys, (void*)b_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemGetPhysAddress (b) failed\n");
		return -1;
	}
	/*printf("b[]: Physical Address 0x%llx\n", b_phys);*/

	/* c[] */
	res = cuMemAlloc(&c_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (c) failed\n");
		return -1;
	}
	res = cuMemMap((void**)&c_buf, c_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemMap (c) failed\n");
		return -1;
	}
	res = cuMemGetPhysAddr(&c_phys, (void*)c_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemGetPhysAddress (c) failed\n");
		return -1;
	}
	/*printf("c[]: Physical Address 0x%llx\n", c_phys);*/

	gettimeofday(&tv_data_init_start, NULL);

	/* initialize A[] & B[] */
	for (i = 0; i < n; i++) {
		idx = i*n;
		for(j = 0; j < n; j++) {			
			a_buf[idx++] = i;
		}
	}
	for (i = 0; i < n; i++) {
		idx = i*n;
		for(j = 0; j < n; j++) {
			b_buf[idx++] = i;
		}
	}

	gettimeofday(&tv_h2d_start, NULL);
	gettimeofday(&tv_h2d_end, NULL);


	gettimeofday(&tv_conf_kern_start, NULL);

	/* set kernel parameters */
	res = cuParamSeti(function, 0, a_dev);	
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 4, a_dev >> 32);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 8, b_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 12, b_dev >> 32);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 16, c_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 20, c_dev >> 32);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 24, n);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSetSize(function, 28);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	gettimeofday(&tv_exec_start, NULL);
	/* launch the kernel */
	res = cuLaunchGrid(function, grid_x, grid_y);
	if (res != CUDA_SUCCESS) {
		printf("cuLaunchGrid failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	cuCtxSynchronize();
	gettimeofday(&tv_exec_end, NULL);


	gettimeofday(&tv_d2h_start, NULL);
	/* download c[] */
	res = cuMemcpyDtoH(c, c_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	gettimeofday(&tv_d2h_end, NULL);

	/* Read back */
	for (i = 0; i < n; i++) {
		idx = i*n;
		for(j = 0; j < n; j++) {			
			dummy_c = c[idx++];
		}
	}

	gettimeofday(&tv_close_start, NULL);

	res = cuMemUnmap((void*)a_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemUnmap (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(a_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemUnmap((void*)b_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemUnmap (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(b_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemUnmap((void*)c_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemUnmap (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(c_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuModuleUnload(module);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	gettimeofday(&tv_total_end, NULL);




	tvsub(&tv_mem_alloc_start, &tv_total_start, &tv);
	init_gpu = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_data_init_start, &tv_mem_alloc_start, &tv);
	mem_alloc = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_h2d_start, &tv_data_init_start, &tv);
	data_init = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
	h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_exec_start, &tv_conf_kern_start, &tv);
	configure_kernel = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_exec_end, &tv_exec_start, &tv);
	exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
	d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_close_start, &tv_d2h_end, &tv);
	data_read = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_total_end, &tv_close_start, &tv);
	close_gpu = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_total_end, &tv_total_start, &tv);
	total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	printf("Init: %f\n", init_gpu);
	printf("MemAlloc: %f\n", mem_alloc);
	printf("DataInit: %f\n", data_init);
	printf("HtoD: %f\n", h2d);
	printf("KernConf: %f\n", configure_kernel);
	printf("Exec: %f\n", exec);
	printf("DtoH: %f\n", d2h);
	printf("DataRead: %f\n", data_read);
	printf("Close: %f\n", close_gpu);
	printf("Total: %f\n", total);


	return ret;
}
Exemple #5
0
int main (int argc, char *argv[])
{
	int matrix_dim = 32; /* default matrix_dim */
	int opt, option_index = 0;
	func_ret_t ret;
	const char *input_file = NULL;
	const char *cubin_file = NULL;
	float *m, *mm;
	struct timeval tv;
	CUdeviceptr d_m;
	CUcontext ctx;
	CUmodule mod;
	CUresult res;
	
	while ((opt = getopt_long(argc, argv, "::vs:i:c:", 
							  long_options, &option_index)) != -1 ) {
		switch(opt) {
		case 'c':
			cubin_file = optarg;
			break;
        case 'i':
			input_file = optarg;
			break;
        case 'v':
			do_verify = 1;
			break;
        case 's':
			matrix_dim = atoi(optarg);
			fprintf(stderr, "Currently not supported, use -i instead\n");
			fprintf(stderr, 
					"Usage: %s [-v] [-s matrix_size|-i input_file|-c cubin]\n",
					argv[0]);
			exit(EXIT_FAILURE);
        case '?':
			fprintf(stderr, "invalid option\n");
			break;
        case ':':
			fprintf(stderr, "missing argument\n");
			break;
        default:
			fprintf(stderr, 
					"Usage: %s [-v] [-s matrix_size|-i input_file|-c cubin]\n",
					argv[0]);
			exit(EXIT_FAILURE);
		}
	}
	
	if ( (optind < argc) || (optind == 1)) {
		fprintf(stderr, 
				"Usage: %s [-v] [-s matrix_size|-i input_file|-c cubin]\n",
				argv[0]);
		exit(EXIT_FAILURE);
	}
	
	if (!cubin_file) {
		printf("No cubin file specified!\n");
		exit(EXIT_FAILURE);
	}

	if (input_file) {
		printf("Reading matrix from file %s\n", input_file);
		ret = create_matrix_from_file(&m, input_file, &matrix_dim);
		if (ret != RET_SUCCESS) {
			m = NULL;
			fprintf(stderr, "error create matrix from file %s\n", input_file);
			exit(EXIT_FAILURE);
		}
	} else {
		printf("No input file specified!\n");
		exit(EXIT_FAILURE);
	}
	
	if (do_verify){
		print_matrix(m, matrix_dim);

		matrix_duplicate(m, &mm, matrix_dim);
	}

	/*
	 * call our common CUDA initialization utility function.
	 */
	res = cuda_driver_api_init(&ctx, &mod, cubin_file);
	if (res != CUDA_SUCCESS) {
		printf("cuda_driver_api_init failed: res = %u\n", res);
		return -1;
	}

	res = cuMemAlloc(&d_m, matrix_dim * matrix_dim * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed\n");
		return -1;
	}

	/*
	 * measurement start!
	 */
	time_measure_start(&tv);

	res = cuMemcpyHtoD(d_m, m, matrix_dim * matrix_dim * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD (a) failed: res = %u\n", res);
		return -1;
	}
	
	lud_launch(mod, d_m, matrix_dim);
	
	res = cuMemcpyDtoH(m, d_m, matrix_dim * matrix_dim * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH failed: res = %u\n", res);
		return -1;
	}
	
	/*
	 * measurement end! will print out the time.
	 */
	time_measure_end(&tv);

	res = cuMemFree(d_m);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree failed: res = %u\n", res);
		return -1;
	}

	res = cuda_driver_api_exit(ctx, mod);
	if (res != CUDA_SUCCESS) {
		printf("cuda_driver_api_exit faild: res = %u\n", res);
		return -1;
	}

	if (do_verify){
		print_matrix(m, matrix_dim);
		printf(">>>Verify<<<<\n");
		lud_verify(mm, m, matrix_dim); 
		free(mm);
	}
	
	free(m);
	
	return EXIT_SUCCESS;
}				/* ----------  end of function main  ---------- */
void bpnn_train_cuda(BPNN *net, float *eo, float *eh)
{
	int j, k;
	int in, hid, out;
	float out_err, hid_err;
	struct timeval tv;
	
	in = net->input_n;
	hid = net->hidden_n;
	out = net->output_n;   
	
#ifdef GPU  
	int m = 0;
	float *partial_sum;
	float sum;
	float *input_weights_one_dim;
	float *input_weights_prev_one_dim;
	num_blocks = in / 16;  
	CUdeviceptr input_cuda;
	CUdeviceptr input_hidden_cuda;
	CUdeviceptr output_hidden_cuda;
	CUdeviceptr hidden_partial_sum;
	CUdeviceptr hidden_delta_cuda;
	CUdeviceptr input_prev_weights_cuda;
	CUcontext ctx;
	CUmodule mod;
	CUresult res;
	
	input_weights_one_dim = 
		(float *) malloc((in + 1) * (hid + 1) * sizeof(float));
	input_weights_prev_one_dim = 
		(float *) malloc((in + 1) * (hid + 1) * sizeof(float));
	partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
	
	/* this preprocessing stage is added to correct the bugs of wrong 
	   memcopy using two-dimensional net->inputweights */
	for (k = 0; k <= in; k++) {	
		for (j = 0; j <= hid; j++) {
			input_weights_one_dim[m] = net->input_weights[k][j];
			input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
			m++;
		}
	}
	
	/*
	 * call our common CUDA initialization utility function.
	 */
	res = cuda_driver_api_init(&ctx, &mod, "./backprop.cubin");
	if (res != CUDA_SUCCESS) {
		printf("cuda_driver_api_init failed: res = %u\n", res);
		return ;
	}
	
	/*
	 * allocate device memory space
	 */
	res = cuMemAlloc(&input_cuda, (in + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", res);
		return ;
	}
	res = cuMemAlloc(&output_hidden_cuda, (hid + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", res);
		return ;
	}
	res = cuMemAlloc(&input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", res);
		return ;
	}
	res = cuMemAlloc(&hidden_partial_sum, num_blocks * WIDTH * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", res);
		return ;
	}
	res = cuMemAlloc(&hidden_delta_cuda, (hid + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", res);
		return ;
	}
	res = cuMemAlloc(&input_prev_weights_cuda, (in + 1) * (hid + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", res);
		return ;
	}
#endif

#ifdef CPU
	printf("Performing CPU computation\n");
	bpnn_layerforward(net->input_units, net->hidden_units,net->input_weights, in, hid);
#endif

#ifdef GPU
	printf("Performing GPU computation\n");
    //printf("in= %d, hid = %d, numblocks = %d\n", in, hid, num_blocks);
  
	/*
	 * measurement start!
	 */
	time_measure_start(&tv);

	res = cuMemcpyHtoD(input_cuda, net->input_units, (in + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD failed: res = %u\n", res);
		return ;
	}
	res = cuMemcpyHtoD(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD failed: res = %u\n", res);
		return ;
	}

	res = bpnn_layerforward_launch(mod, input_cuda, output_hidden_cuda,
								   input_hidden_cuda, hidden_partial_sum,
								   in, hid);
	if (res != CUDA_SUCCESS) {
		printf("bpnn_layerforward failed: res = %u\n", res);
		return ;
	}
 
	cuCtxSynchronize();
  
#if 0
	cudaError_t error = cudaGetLastError();
	if (error != cudaSuccess) {
		printf("bpnn kernel error: %s\n", cudaGetErrorString(error));
		exit(EXIT_FAILURE);
	}
#endif
  
	res = cuMemcpyDtoH(partial_sum, hidden_partial_sum, num_blocks * WIDTH * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH(layerforward) failed: res = %u\n", res);
		return ;
	}

	for (j = 1; j <= hid; j++) {
		sum = 0.0;
		for (k = 0; k < num_blocks; k++) {	
			sum += partial_sum[k * hid + j-1] ;
		}
		sum += net->input_weights[0][j];
		net-> hidden_units[j] = (float) (1.0 / (1.0 + exp(-sum)));
	}
  #endif

	bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
	bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
	bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);  
	bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);
	
#ifdef CPU
	bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in, net->input_weights, net->input_prev_weights);
#endif  

#ifdef GPU
	res = cuMemcpyHtoD(hidden_delta_cuda, net->hidden_delta, (hid + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD failed: res = %u\n", res);
		return ;
	}
	res = cuMemcpyHtoD(input_prev_weights_cuda, input_weights_prev_one_dim, (in + 1) * (hid + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD failed: res = %u\n", res);
		return ;
	}
	res = cuMemcpyHtoD(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD failed: res = %u\n", res);
		return ;
	}

	res = bpnn_adjust_weights_launch(mod, hidden_delta_cuda, hid, 
									 input_cuda, in, 
									 input_hidden_cuda, 
									 input_prev_weights_cuda);
	if (res != CUDA_SUCCESS) {
		printf("bpnn_adjust_weights failed: res = %u\n", res);
		return ;
	}

	res = cuMemcpyDtoH(net->input_units, input_cuda, (in + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH(adjust_weights) failed: res = %u\n", res);
		return ;
	}

	res = cuMemcpyDtoH(input_weights_one_dim, input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH(adjust_weights) failed: res = %u\n", res);
		return ;
	}
    
	cuMemFree(input_cuda);
	cuMemFree(output_hidden_cuda);
	cuMemFree(input_hidden_cuda);
	cuMemFree(hidden_partial_sum);
	cuMemFree(input_prev_weights_cuda);
	cuMemFree(hidden_delta_cuda);

	/*
	 * measurement end! will print out the time.
	 */
	time_measure_end(&tv);

	res = cuda_driver_api_exit(ctx, mod);
	if (res != CUDA_SUCCESS) {
		printf("cuda_driver_api_exit faild: res = %u\n", res);
		return ;
	}
	
	free(partial_sum);
	free(input_weights_one_dim);
	free(input_weights_prev_one_dim);
#endif   
}
Exemple #7
0
T run_function(const std::string& name, const T input, const int shiftValue) {
	const std::string test_source =
	"//\n"
	"// Generated by NVIDIA NVVM Compiler\n"
	"//\n"
	"// Compiler Build ID: CL-19856038\n"
	"// Cuda compilation tools, release 7.5, V7.5.17\n"
	"// Based on LLVM 3.4svn\n"
	"//\n"
	"\n"
	".version 4.3\n"
	".target sm_20\n"
	".address_size 64\n"
	"\n"
	"	// .globl	_Z10kernel_s32Piii\n"
	"\n"
	".visible .entry _Z10kernel_s32Piii(\n"
	"	.param .u64 _Z10kernel_s32Piii_param_0,\n"
	"	.param .u32 _Z10kernel_s32Piii_param_1,\n"
	"	.param .u32 _Z10kernel_s32Piii_param_2\n"
	")\n"
	"{\n"
	"	.reg .b32 	%r<4>;\n"
	"	.reg .b64 	%rd<3>;\n"
	"\n"
	"\n"
	"	ld.param.u64 	%rd1, [_Z10kernel_s32Piii_param_0];\n"
	"	ld.param.u32 	%r1, [_Z10kernel_s32Piii_param_1];\n"
	"	ld.param.u32 	%r2, [_Z10kernel_s32Piii_param_2];\n"
	"	cvta.to.global.u64 	%rd2, %rd1;\n"
	"	shr.s32 	%r3, %r1, %r2;\n"
	"	st.global.u32 	[%rd2], %r3;\n"
	"	ret;\n"
	"}\n"
	"\n"
	"	// .globl	_Z10kernel_s64Pxxi\n"
	".visible .entry _Z10kernel_s64Pxxi(\n"
	"	.param .u64 _Z10kernel_s64Pxxi_param_0,\n"
	"	.param .u64 _Z10kernel_s64Pxxi_param_1,\n"
	"	.param .u32 _Z10kernel_s64Pxxi_param_2\n"
	")\n"
	"{\n"
	"	.reg .b32 	%r<2>;\n"
	"	.reg .b64 	%rd<5>;\n"
	"\n"
	"\n"
	"	ld.param.u64 	%rd1, [_Z10kernel_s64Pxxi_param_0];\n"
	"	ld.param.u64 	%rd2, [_Z10kernel_s64Pxxi_param_1];\n"
	"	ld.param.u32 	%r1, [_Z10kernel_s64Pxxi_param_2];\n"
	"	cvta.to.global.u64 	%rd3, %rd1;\n"
	"	shr.s64 	%rd4, %rd2, %r1;\n"
	"	st.global.u64 	[%rd3], %rd4;\n"
	"	ret;\n"
	"}\n"
	"\n"
	"	// .globl	_Z10kernel_u32Pjji\n"
	".visible .entry _Z10kernel_u32Pjji(\n"
	"	.param .u64 _Z10kernel_u32Pjji_param_0,\n"
	"	.param .u32 _Z10kernel_u32Pjji_param_1,\n"
	"	.param .u32 _Z10kernel_u32Pjji_param_2\n"
	")\n"
	"{\n"
	"	.reg .b32 	%r<4>;\n"
	"	.reg .b64 	%rd<3>;\n"
	"\n"
	"\n"
	"	ld.param.u64 	%rd1, [_Z10kernel_u32Pjji_param_0];\n"
	"	ld.param.u32 	%r1, [_Z10kernel_u32Pjji_param_1];\n"
	"	ld.param.u32 	%r2, [_Z10kernel_u32Pjji_param_2];\n"
	"	cvta.to.global.u64 	%rd2, %rd1;\n"
	"	shr.u32 	%r3, %r1, %r2;\n"
	"	st.global.u32 	[%rd2], %r3;\n"
	"	ret;\n"
	"}\n"
	"\n"
	"	// .globl	_Z10kernel_u64Pyyi\n"
	".visible .entry _Z10kernel_u64Pyyi(\n"
	"	.param .u64 _Z10kernel_u64Pyyi_param_0,\n"
	"	.param .u64 _Z10kernel_u64Pyyi_param_1,\n"
	"	.param .u32 _Z10kernel_u64Pyyi_param_2\n"
	")\n"
	"{\n"
	"	.reg .b32 	%r<2>;\n"
	"	.reg .b64 	%rd<5>;\n"
	"\n"
	"\n"
	"	ld.param.u64 	%rd1, [_Z10kernel_u64Pyyi_param_0];\n"
	"	ld.param.u64 	%rd2, [_Z10kernel_u64Pyyi_param_1];\n"
	"	ld.param.u32 	%r1, [_Z10kernel_u64Pyyi_param_2];\n"
	"	cvta.to.global.u64 	%rd3, %rd1;\n"
	"	shr.u64 	%rd4, %rd2, %r1;\n"
	"	st.global.u64 	[%rd3], %rd4;\n"
	"	ret;\n"
	"}\n"
	"\n"
	"\n"
	;
	CUmodule modId = 0;
	CUfunction funcHandle = 0;
	cu_assert(cuModuleLoadData(&modId, test_source.c_str()));
	cu_assert(cuModuleGetFunction(&funcHandle, modId, name.c_str()));
	T output;
	CUdeviceptr devOutput;
	cu_assert(cuMemAlloc(&devOutput, sizeof(output)));
	void * params[] = {&devOutput, (void*)&input, (void*)&shiftValue};
	auto result = cuLaunchKernel(funcHandle, 1,1,1, 1,1,1, 0,0, params, nullptr);
	cu_assert(result);
	cu_assert(cuMemcpyDtoH(&output, devOutput, sizeof(output)));
	cu_assert(cuMemFree(devOutput));
	cu_assert(cuModuleUnload(modId));
	return output;
}
int main(int argc, char * argv[]) {
  CBlasTranspose transA, transB;
  size_t m, n, k;
  int d = 0;

  if (argc < 6 || argc > 7) {
    fprintf(stderr, "Usage: %s <transA> <transB> <m> <n> <k> [device]\n"
                    "where:\n"
                    "  transA and transB  are 'n' or 'N' for CBlasNoTrans, 't' or 'T' for CBlasTrans or 'c' or 'C' for CBlasConjTrans\n"
                    "  m, n and k         are the sizes of the matrices\n"
                    "  device             is the GPU to use (default 0)\n", argv[0]);
    return 1;
  }

  char t;
  if (sscanf(argv[1], "%c", &t) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[1]);
    return 1;
  }
  switch (t) {
    case 'N': case 'n': transA = CBlasNoTrans; break;
    case 'T': case 't': transA = CBlasTrans; break;
    case 'C': case 'c': transA = CBlasConjTrans; break;
    default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1;
  }

  if (sscanf(argv[2], "%c", &t) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[2]);
    return 2;
  }
  switch (t) {
    case 'N': case 'n': transB = CBlasNoTrans; break;
    case 'T': case 't': transB = CBlasTrans; break;
    case 'C': case 'c': transB = CBlasConjTrans; break;
    default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1;
  }

  if (sscanf(argv[3], "%zu", &m) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]);
    return 3;
  }

  if (sscanf(argv[4], "%zu", &n) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[4]);
    return 4;
  }

  if (sscanf(argv[5], "%zu", &k) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]);
    return 5;
  }

  if (argc > 6) {
    if (sscanf(argv[6], "%d", &d) != 1) {
      fprintf(stderr, "Unable to parse number from '%s'\n", argv[6]);
      return 6;
    }
  }

  srand(0);

  float complex alpha, beta, * A, * B, * C, * refC;
  CUdeviceptr dA, dB, dC, dD;
  size_t lda, ldb, ldc, dlda, dldb, dldc, dldd;

  CU_ERROR_CHECK(cuInit(0));

  CUdevice device;
  CU_ERROR_CHECK(cuDeviceGet(&device, d));

  CUcontext context;
  CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device));

  CUBLAShandle handle;
  CU_ERROR_CHECK(cuBLASCreate(&handle));

  alpha = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;
  beta = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;

  if (transA == CBlasNoTrans) {
    lda = (m + 1u) & ~1u;
    if ((A = malloc(lda * k * sizeof(float complex))) == NULL) {
      fputs("Unable to allocate A\n", stderr);
      return -1;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, m * sizeof(float complex), k, sizeof(float complex)));
    dlda /= sizeof(float complex);

    for (size_t j = 0; j < k; j++) {
      for (size_t i = 0; i < m; i++)
        A[j * lda + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(float complex),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(float complex),
                           m * sizeof(float complex), k };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  }
  else {
    lda = (k + 1u) & ~1u;
    if ((A = malloc(lda * m * sizeof(float complex))) == NULL) {
      fputs("Unable to allocate A\n", stderr);
      return -1;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, k * sizeof(float complex), m, sizeof(float complex)));
    dlda /= sizeof(float complex);

    for (size_t j = 0; j < m; j++) {
      for (size_t i = 0; i < k; i++)
        A[j * lda + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(float complex),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(float complex),
                           k * sizeof(float complex), m };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  }

  if (transB == CBlasNoTrans) {
    ldb = (k + 1u) & ~1u;
    if ((B = malloc(ldb * n * sizeof(float complex))) == NULL) {
      fputs("Unable to allocate B\n", stderr);
      return -2;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, k * sizeof(float complex), n, sizeof(float complex)));
    dldb /= sizeof(float complex);

    for (size_t j = 0; j < n; j++) {
      for (size_t i = 0; i < k; i++)
        B[j * ldb + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(float complex),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(float complex),
                           k * sizeof(float complex), n };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  }
  else {
    ldb = (n + 1u) & ~1u;
    if ((B = malloc(ldb * k * sizeof(float complex))) == NULL) {
      fputs("Unable to allocate B\n", stderr);
      return -2;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, n * sizeof(float complex), k, sizeof(float complex)));
    dldb /= sizeof(float complex);

    for (size_t j = 0; j < k; j++) {
      for (size_t i = 0; i < n; i++)
        B[j * ldb + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(float complex),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(float complex),
                           n * sizeof(float complex), k };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  }

  ldc = (m + 1u) & ~1u;
  if ((C = malloc(ldc * n * sizeof(float complex))) == NULL) {
    fputs("Unable to allocate C\n", stderr);
    return -3;
  }
  if ((refC = malloc(ldc * n * sizeof(float complex))) == NULL) {
    fputs("Unable to allocate refC\n", stderr);
    return -4;
  }
  CU_ERROR_CHECK(cuMemAllocPitch(&dC, &dldc, m * sizeof(float complex), n, sizeof(float complex)));
  dldc /= sizeof(float complex);
  CU_ERROR_CHECK(cuMemAllocPitch(&dD, &dldd, m * sizeof(float complex), n, sizeof(float complex)));
  dldd /= sizeof(float complex);

  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < m; i++)
      refC[j * ldc + i] = C[j * ldc + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;
  }

  CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(float complex),
                         0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(float complex),
                         m * sizeof(float complex), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  cgemm_ref(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, refC, ldc);
  CU_ERROR_CHECK(cuCgemm2(handle, transA, transB, m, n, k, alpha, dA, dlda, dB, dldb, beta, dC, dldc, dD, dldd, NULL));

  copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dD, NULL, dldd * sizeof(float complex),
           0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(float complex),
           m * sizeof(float complex), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  float rdiff = 0.0f, idiff = 0.0f;
  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < m; i++) {
      float d = fabsf(crealf(C[j * ldc + i]) - crealf(refC[j * ldc + i]));
      if (d > rdiff)
        rdiff = d;
      d = fabsf(cimagf(C[j * ldc + i]) - cimagf(refC[j * ldc + i]));
      if (d > idiff)
        idiff = d;
    }
  }

  CUevent start, stop;
  CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC));
  CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC));

  CU_ERROR_CHECK(cuEventRecord(start, NULL));
  for (size_t i = 0; i < 20; i++)
    CU_ERROR_CHECK(cuCgemm2(handle, transA, transB, m, n, k, alpha, dA, dlda, dB, dldb, beta, dC, dldc, dD, dldd, NULL));
  CU_ERROR_CHECK(cuEventRecord(stop, NULL));
  CU_ERROR_CHECK(cuEventSynchronize(stop));

  float time;
  CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
  time /= 20;

  CU_ERROR_CHECK(cuEventDestroy(start));
  CU_ERROR_CHECK(cuEventDestroy(stop));

  size_t flops = k * 6 + (k - 1) * 2;   // k multiplies and k - 1 adds per element
  if (alpha != 1.0f + 0.0f * I)
    flops += 6;                 // additional multiply by alpha
  if (beta != 0.0f + 0.0f * I)
    flops += 8;                 // additional multiply and add by beta
  float error = (float)flops * 2.0f * FLT_EPSILON;     // maximum per element error
  flops *= m * n;               // m * n elements

  bool passed = (rdiff <= error) && (idiff <= error);
  fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e + %.3ei\n%sED!\n", time * 1.e-3f,
          ((float)flops * 1.e-6f) / time, rdiff, idiff, (passed) ? "PASS" : "FAIL");

  free(A);
  free(B);
  free(C);
  free(refC);
  CU_ERROR_CHECK(cuMemFree(dA));
  CU_ERROR_CHECK(cuMemFree(dB));
  CU_ERROR_CHECK(cuMemFree(dC));
  CU_ERROR_CHECK(cuMemFree(dD));

  CU_ERROR_CHECK(cuBLASDestroy(handle));

  CU_ERROR_CHECK(cuCtxDestroy(context));

  return (int)!passed;
}