void testCusolver(int rows, int cols, int nnz, int *row_ptr, int *col_index,
                  double *values, double *valuesB)
{
    // --- Initialize cuSOLVER / cuSPARSE
    cusolverSpHandle_t cusolver_handle = NULL;
    checkCudaErrors(cusolverSpCreate(&cusolver_handle));
    cusparseHandle_t cusparse_handle = NULL;
    checkCudaErrors(cusparseCreate(&cusparse_handle));
    cudaStream_t cudaStream = NULL;
    checkCudaErrors(cudaStreamCreate(&cudaStream));
    checkCudaErrors(cusolverSpSetStream(cusolver_handle, cudaStream));
    checkCudaErrors(cusparseSetStream(cusparse_handle, cudaStream));

    cusparseMatDescr_t descrA;
    checkCudaErrors(cusparseCreateMatDescr(&descrA));
    checkCudaErrors(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
    checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));

    double start, stop, time_to_solve;
    start = second();

    // --- Device-side CSR matrix and vectors
    printf("\nAlloc GPU memory...\n");
    double *d_A;         checkCudaErrors(cudaMalloc(&d_A, nnz * sizeof(double)));
    int *d_A_RowIndices; checkCudaErrors(cudaMalloc(&d_A_RowIndices, (rows + 1) * sizeof(int)));
    int *d_A_ColIndices; checkCudaErrors(cudaMalloc(&d_A_ColIndices, nnz * sizeof(int)));
    double *d_x;         checkCudaErrors(cudaMalloc(&d_x, rows * sizeof(double)));
    checkCudaErrors(cudaMemcpy(d_A, values, nnz * sizeof(double), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_A_RowIndices, row_ptr, (rows + 1) * sizeof(int), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_A_ColIndices, col_index, nnz * sizeof(int), cudaMemcpyHostToDevice));
    double *d_b;         checkCudaErrors(cudaMalloc(&d_b, rows * sizeof(double)));
    checkCudaErrors(cudaMemcpy(d_b, valuesB, rows * sizeof(double), cudaMemcpyHostToDevice));

    double *h_x = (double *)malloc(rows * sizeof(double));

    double tol = 1.e-12;
    int reorder = 0;
    int singularity = 0;

    printf("\nProcessing in GPU using cusolver QR...\n");
    //checkCudaErrors(cusolverSpDcsrlsvluHost(cusolver_handle, Nrows, nnz, descrA, sparse.Values(),
    //    sparse.RowPtr(), sparse.ColIdx(), mB.values, tol, reorder, h_x, &singularity));
    checkCudaErrors(cusolverSpDcsrlsvqr(cusolver_handle, rows, nnz, descrA, d_A, d_A_RowIndices,
        d_A_ColIndices, d_b, tol, reorder, d_x, &singularity));
    checkCudaErrors(cudaDeviceSynchronize());
    stop = second();
    time_to_solve = stop - start;

    checkCudaErrors(cudaMemcpy(h_x, d_x, rows * sizeof(double), cudaMemcpyDeviceToHost));

    // --- Residual check: r = b - A*x
    double minusOne = -1.0;
    double one = 1.0;
    double *d_r;
    checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double) * rows));
    checkCudaErrors(cudaMemcpy(d_r, d_b, sizeof(double) * rows, cudaMemcpyDeviceToDevice));
    checkCudaErrors(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, rows, cols, nnz,
        &minusOne, descrA, d_A, d_A_RowIndices, d_A_ColIndices, d_x, &one, d_r));
    double *h_r = (double *)malloc(rows * sizeof(double));
    checkCudaErrors(cudaMemcpy(h_r, d_r, rows * sizeof(double), cudaMemcpyDeviceToHost));
    double r_inf = vec_norminf(rows, h_r);

    printf("(GPU - cuSolver) Time (sec): %f\n", time_to_solve);
    printf("(GPU - cuSolver) |b - A*x| = %E \n", r_inf);

    checkCudaErrors(cusparseDestroy(cusparse_handle));
    checkCudaErrors(cusolverSpDestroy(cusolver_handle));
    checkCudaErrors(cudaStreamDestroy(cudaStream));
    checkCudaErrors(cudaFree(d_b));
    checkCudaErrors(cudaFree(d_x));
    checkCudaErrors(cudaFree(d_r));
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_A_RowIndices));
    checkCudaErrors(cudaFree(d_A_ColIndices));
    free(h_x);
    free(h_r);
}
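/* vec_norminf(), used for the residual check above, is not defined in this
   file. A minimal sketch of what it is assumed to compute (the infinity norm
   max_i |x[i]| of a host vector; requires <math.h>): */
static double vec_norminf(int n, const double *x)
{
    double norm = 0.0;
    for (int i = 0; i < n; i++) {
        double xi = fabs(x[i]);
        if (norm < xi) norm = xi;   // keep the largest absolute entry
    }
    return norm;
}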
/*
 * Function to be called by the host thread that manages the device
 */
void* device_thread(void* passing_ptr) {
    DataArray* data_arr_ptr = (DataArray*) passing_ptr; // cast the passed pointer

    cuDoubleComplex* data_r_dev;
    cuDoubleComplex* data_k_dev;

    // init device, allocate suitable variables in gpu memory ...
    //alloc_data_device(data_arr_ptr);
    cudaMalloc((void**) &data_r_dev, sizeof(cuDoubleComplex)*N); // check cudaMallocHost here for pinned memory (could be faster)
    cudaMalloc((void**) &data_k_dev, sizeof(cuDoubleComplex)*N);
    data_arr_ptr->data_r_dev = &data_r_dev; // this way it is easier to handle pointers to arrays
    data_arr_ptr->data_k_dev = &data_k_dev;
    printf("data allocated by host thread\n");

    // Does each thread create a new stream automatically?
    // http://devblogs.nvidia.com/parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
    cudaStreamCreateWithFlags(streams_arr, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(streams_arr+1, cudaStreamNonBlocking);
    printf("streams created\n");

    // synchronize after allocating memory - data on host should be allocated and ready for copying
    cudaDeviceSynchronize(); // CHECK IF THIS DOES NOT CAUSE ERRORS! - should synchronize host and device irrespective of pthreads
    // cudaStreamSynchronize( <enum stream> ); // to synchronize with only one stream
    pthread_barrier_wait(&barrier);
    printf("1st barrier, device thread - allocating mem on gpu\n");

    // copying data
    cudaMemcpyAsync( *(data_arr_ptr->data_r_dev), *(data_arr_ptr->data_r), N*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice, streams_arr[MEMORY_STREAM] );

    // synchronize after copying data
    cudaDeviceSynchronize();
    pthread_barrier_wait(&barrier);
    printf("2nd barrier, device thread - copying data to gpu\n");

    printf("data visible in device thread:\n");
    /*for (uint64_t ii = 0; ii < ((data_arr_ptr->size <= 32) ? data_arr_ptr->size : 32); ii++) {
        printf("%lu.\t", ii);
        printf("%lf + %lfj\t", creal( (*(data_arr_ptr->data_r))[ii] ), cimag( (*(data_arr_ptr->data_r))[ii] ));
        printf("%lf + %lfj\n", creal( (*(data_arr_ptr->data_k))[ii] ), cimag( (*(data_arr_ptr->data_k))[ii] ));
    }*/

    // synchronize after copying
    pthread_barrier_wait(&barrier);
    printf("3rd barrier, device thread\n");

    // copying data back
    //cudaMemcpyAsync( *(data_arr_ptr->data_r), *(data_arr_ptr->data_r_dev), N*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost, streams_arr[MEMORY_STREAM] );
    cudaMemcpyAsync( *(data_arr_ptr->data_r), data_r_dev, N*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost, streams_arr[MEMORY_STREAM] );

    // synchronize after copying back data
    cudaDeviceSynchronize();
    pthread_barrier_wait(&barrier);
    printf("4th barrier, device thread\n");

    cudaStreamDestroy(streams_arr[KERNEL_STREAM]);
    cudaStreamDestroy(streams_arr[MEMORY_STREAM]);

    cudaFree(data_r_dev);
    printf("device r space freed\n");
    cudaFree(data_k_dev);
    cudaDeviceSynchronize();
    printf("device k space freed\n");

    printf("closing device thread\n");
    pthread_exit(NULL);
}
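/* A hypothetical sketch of the declarations the thread above relies on
   (DataArray, the stream array, the barrier, and N); the real definitions
   live elsewhere in this project and may differ. The double-pointer fields
   match the *(...) dereferences used in device_thread(): */
#include <cuComplex.h>
#include <pthread.h>

#define N 1024                 // assumed array length
#define KERNEL_STREAM 0
#define MEMORY_STREAM 1

typedef struct {
    uint64_t size;
    cuDoubleComplex **data_r;      // host-side real-space buffer
    cuDoubleComplex **data_k;      // host-side k-space buffer
    cuDoubleComplex **data_r_dev;  // device-side real-space buffer
    cuDoubleComplex **data_k_dev;  // device-side k-space buffer
} DataArray;

static cudaStream_t streams_arr[2];
static pthread_barrier_t barrier;  // initialized by the spawning thread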
////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    cudaError_t error;
    printf("%s Starting...\n\n", argv[0]);

    printf("Starting up CUDA context...\n");
    int dev = findCudaDevice(argc, (const char **)argv);

    uint *h_InputKey, *h_InputVal, *h_OutputKeyGPU, *h_OutputValGPU;
    uint *d_InputKey, *d_InputVal, *d_OutputKey, *d_OutputVal;
    StopWatchInterface *hTimer = NULL;

    const uint N = 1048576;
    const uint DIR = 0;
    const uint numValues = 65536;
    const uint numIterations = 1;

    printf("Allocating and initializing host arrays...\n\n");
    sdkCreateTimer(&hTimer);
    h_InputKey = (uint *)malloc(N * sizeof(uint));
    h_InputVal = (uint *)malloc(N * sizeof(uint));
    h_OutputKeyGPU = (uint *)malloc(N * sizeof(uint));
    h_OutputValGPU = (uint *)malloc(N * sizeof(uint));
    srand(2001);

    for (uint i = 0; i < N; i++)
    {
        h_InputKey[i] = rand() % numValues;
        h_InputVal[i] = i;
    }

    printf("Allocating and initializing CUDA arrays...\n\n");
    error = cudaMalloc((void **)&d_InputKey, N * sizeof(uint));
    checkCudaErrors(error);
    error = cudaMalloc((void **)&d_InputVal, N * sizeof(uint));
    checkCudaErrors(error);
    error = cudaMalloc((void **)&d_OutputKey, N * sizeof(uint));
    checkCudaErrors(error);
    error = cudaMalloc((void **)&d_OutputVal, N * sizeof(uint));
    checkCudaErrors(error);
    error = cudaMemcpy(d_InputKey, h_InputKey, N * sizeof(uint), cudaMemcpyHostToDevice);
    checkCudaErrors(error);
    error = cudaMemcpy(d_InputVal, h_InputVal, N * sizeof(uint), cudaMemcpyHostToDevice);
    checkCudaErrors(error);

    int flag = 1;
    printf("Running GPU bitonic sort (%u identical iterations)...\n\n", numIterations);

    for (uint arrayLength = 64; arrayLength <= N; arrayLength *= 2)
    {
        printf("Testing array length %u (%u arrays per batch)...\n", arrayLength, N / arrayLength);
        error = cudaDeviceSynchronize();
        checkCudaErrors(error);

        sdkResetTimer(&hTimer);
        sdkStartTimer(&hTimer);
        uint threadCount = 0;

        for (uint i = 0; i < numIterations; i++)
            threadCount = bitonicSort(
                d_OutputKey,
                d_OutputVal,
                d_InputKey,
                d_InputVal,
                N / arrayLength,
                arrayLength,
                DIR
            );

        error = cudaDeviceSynchronize();
        checkCudaErrors(error);

        sdkStopTimer(&hTimer);
        printf("Average time: %f ms\n\n", sdkGetTimerValue(&hTimer) / numIterations);

        if (arrayLength == N)
        {
            double dTimeSecs = 1.0e-3 * sdkGetTimerValue(&hTimer) / numIterations;
            printf("sortingNetworks-bitonic, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u elements, NumDevsUsed = %u, Workgroup = %u\n",
                   (1.0e-6 * (double)arrayLength / dTimeSecs), dTimeSecs, arrayLength, 1, threadCount);
        }

        printf("\nValidating the results...\n");
        printf("...reading back GPU results\n");
        error = cudaMemcpy(h_OutputKeyGPU, d_OutputKey, N * sizeof(uint), cudaMemcpyDeviceToHost);
        checkCudaErrors(error);
        error = cudaMemcpy(h_OutputValGPU, d_OutputVal, N * sizeof(uint), cudaMemcpyDeviceToHost);
        checkCudaErrors(error);

        int keysFlag = validateSortedKeys(h_OutputKeyGPU, h_InputKey, N / arrayLength, arrayLength, numValues, DIR);
        int valuesFlag = validateValues(h_OutputKeyGPU, h_OutputValGPU, h_InputKey, N / arrayLength, arrayLength);
        flag = flag && keysFlag && valuesFlag;

        printf("\n");
    }

    printf("Shutting down...\n");
    sdkDeleteTimer(&hTimer);
    cudaFree(d_OutputVal);
    cudaFree(d_OutputKey);
    cudaFree(d_InputVal);
    cudaFree(d_InputKey);
    free(h_OutputValGPU);
    free(h_OutputKeyGPU);
    free(h_InputVal);
    free(h_InputKey);
    cudaDeviceReset();
    exit(flag ? EXIT_SUCCESS : EXIT_FAILURE);
}
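/* A minimal host-side check equivalent in spirit to validateSortedKeys above
   (hypothetical; the SDK helper additionally verifies that the output keys
   are a permutation of the input): every batch of `arrayLength` keys must be
   monotone in direction `dir`, assuming the convention that dir != 0 means
   ascending. */
static int checkBatchesSorted(const uint *keys, uint batchSize,
                              uint arrayLength, uint dir)
{
    for (uint b = 0; b < batchSize; b++, keys += arrayLength)
        for (uint i = 1; i < arrayLength; i++)
            if (( dir && keys[i - 1] > keys[i]) ||   // ascending violated
                (!dir && keys[i - 1] < keys[i]))     // descending violated
                return 0;
    return 1;
}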
int main()
{
    int i;
    struct timeval start, stop;
    FILE *fd;
    char *key;

    cudaSetDevice(0);

    /* Allocate memory */
    if ((key = (char *)malloc(40 * sizeof(char))) == NULL) {
        printf("Malloc failed!\n");
        exit(EXIT_FAILURE);
    }

    cudaMallocHost((void **) &batchKeys, ((BATCH_SIZE + 1) * MAX_LEN_ALIGNED) * sizeof(char));
    cudaMallocHost((void **) &nKeys, BATCH_SIZE * sizeof(size_t));
    cudaMallocHost((void **) &batchIndex, (BATCH_SIZE + 1) * sizeof(int));
    cudaMallocHost((void **) &hashedKeys, BATCH_SIZE * sizeof(uint32_t));

    cudaMalloc((void **) &d_keys, ((BATCH_SIZE + 1) * MAX_LEN_ALIGNED) * sizeof(char));
    cudaMalloc((void **) &d_len, BATCH_SIZE * sizeof(size_t));
    cudaMalloc((void **) &d_index, (BATCH_SIZE + 1) * sizeof(int));
    cudaMalloc((void **) &d_res, BATCH_SIZE * sizeof(uint32_t));

    /* Create BATCH_SIZE random keys and add them to the batch table */
    batchNo = 0;
    batchIndex[0] = 0;
    for (i = 0; i < BATCH_SIZE; i++) {
        gen_random(key, 30);
        add_to_batch(key, 30);
    }

    /* Start time (execution + memory) */
#ifdef EXEC_MEM
    gettimeofday(&start, NULL);
#endif // EXEC_MEM

    /* MemCpy Host -> Device */
    cudaMemcpy(d_keys, batchKeys,
               (batchIndex[BATCH_SIZE - 1] + strlen(&batchKeys[batchIndex[BATCH_SIZE - 1]])) * sizeof(char),
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_len, nKeys, BATCH_SIZE * sizeof(size_t), cudaMemcpyHostToDevice);
    cudaMemcpy(d_index, batchIndex, BATCH_SIZE * sizeof(int), cudaMemcpyHostToDevice);

    /* Start time (execution only) */
#ifndef EXEC_MEM
    gettimeofday(&start, NULL);
#endif // EXEC_MEM

    /* Call the kernel */
    CUDAhash(d_keys, d_index, d_len, d_res);

    /* Stop time (execution only) */
#ifndef EXEC_MEM
    cudaDeviceSynchronize();
    gettimeofday(&stop, NULL);
#endif // EXEC_MEM

    /* MemCpy Device -> Host */
    cudaMemcpy(hashedKeys, d_res, BATCH_SIZE * sizeof(uint32_t), cudaMemcpyDeviceToHost);

    /* Stop time (execution + memory) */
#ifdef EXEC_MEM
    gettimeofday(&stop, NULL);
#endif // EXEC_MEM

#ifdef DEBUG
    for (i = 0; i < BATCH_SIZE; i++) {
        printf("%s\n", &batchKeys[batchIndex[i]]);
        printf("%u\n", hashedKeys[i]);
    }
#endif // DEBUG

    /* Print time */
    fd = fopen("log.txt", "a+");
    fprintf(fd, "%lu", ((stop.tv_sec * USECS) + stop.tv_usec) - ((start.tv_sec * USECS) + start.tv_usec));
    fprintf(fd, "\t%1.f\n", ((double)BATCH_SIZE /
            ((double)(((stop.tv_sec * USECS) + stop.tv_usec) - ((start.tv_sec * USECS) + start.tv_usec)) / 1000000)) / 1000);
    fclose(fd);

#ifdef DEBUG
    printf("Time: %lu \n", ((stop.tv_sec * USECS) + stop.tv_usec) - ((start.tv_sec * USECS) + start.tv_usec));
#endif // DEBUG

    /* Free memory: pinned host buffers must be released with cudaFreeHost,
       device buffers with cudaFree */
    free(key);
    cudaFreeHost(batchKeys);
    cudaFreeHost(nKeys);
    cudaFreeHost(hashedKeys);
    cudaFreeHost(batchIndex);
    cudaFree(d_keys);
    cudaFree(d_len);
    cudaFree(d_res);
    cudaFree(d_index);

    return 0;
}
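/* Hypothetical sketches of the two helpers used above, inferred from the
   batchKeys/batchIndex/nKeys/batchNo usage (one key per MAX_LEN_ALIGNED slot
   in the flat pinned buffer); the real implementations may differ. */
void gen_random(char *s, const int len)
{
    static const char alphanum[] =
        "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    for (int i = 0; i < len; i++)
        s[i] = alphanum[rand() % (sizeof(alphanum) - 1)];
    s[len] = '\0';
}

void add_to_batch(const char *key, size_t len)
{
    strcpy(&batchKeys[batchIndex[batchNo]], key);   // append key to flat buffer
    nKeys[batchNo] = len;                           // remember its length
    batchIndex[batchNo + 1] = batchIndex[batchNo] + MAX_LEN_ALIGNED;
    batchNo++;
}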
// Main ------------------------------------------------------------------------------------------
int main(int argc, char **argv)
{
    const Params p(argc, argv);
    CUDASetup setcuda(p.device);
    Timer timer;
    cudaError_t cudaStatus;
    int it_cpu = 0;
    int it_gpu = 0;
    int err = 0;

#ifdef LOGS
    set_iter_interval_print(10);
    char test_info[500];
    snprintf(test_info, 500, "-i %d -g %d -t %d -f %s -l %d\n", p.n_gpu_threads, p.n_gpu_blocks,
             p.n_threads, p.file_name, p.switching_limit);
    start_log_file("cudaSingleSourceShortestPath", test_info);
    //printf("With LOG\n");
#endif

    // Allocate
    int n_nodes, n_edges;
    // int n_nodes_o;
    read_input_size(n_nodes, n_edges, p);
    timer.start("Allocation");
    Node *h_nodes = (Node *) malloc(sizeof(Node) * n_nodes);

    //*************************** Allocate memory for the gold (reference) output ******************
    Gold *gold = (Gold *) malloc(sizeof(Gold) * n_nodes);
    if (p.mode == 1) {
        // ********************** Read the gold output *********************
        read_gold(gold, p);
        // ******************************************************************
    }
    //***********************************************************************************************

    Node *d_nodes;
    cudaStatus = cudaMalloc((void**) &d_nodes, sizeof(Node) * n_nodes);
    Edge *h_edges = (Edge *) malloc(sizeof(Edge) * n_edges);
    Edge *d_edges;
    cudaStatus = cudaMalloc((void**) &d_edges, sizeof(Edge) * n_edges);
    std::atomic_int *h_color = (std::atomic_int *) malloc(sizeof(std::atomic_int) * n_nodes);
    int *d_color;
    cudaStatus = cudaMalloc((void**) &d_color, sizeof(int) * n_nodes);
    std::atomic_int *h_cost = (std::atomic_int *) malloc(sizeof(std::atomic_int) * n_nodes);
    int *d_cost;
    cudaStatus = cudaMalloc((void**) &d_cost, sizeof(int) * n_nodes);
    int *h_q1 = (int *) malloc(n_nodes * sizeof(int));
    int *d_q1;
    cudaStatus = cudaMalloc((void**) &d_q1, sizeof(int) * n_nodes);
    int *h_q2 = (int *) malloc(n_nodes * sizeof(int));
    int *d_q2;
    cudaStatus = cudaMalloc((void**) &d_q2, sizeof(int) * n_nodes);
    std::atomic_int h_head[1];
    int *d_head;
    cudaStatus = cudaMalloc((void**) &d_head, sizeof(int));
    std::atomic_int h_tail[1];
    int *d_tail;
    cudaStatus = cudaMalloc((void**) &d_tail, sizeof(int));
    std::atomic_int h_threads_end[1];
    int *d_threads_end;
    cudaStatus = cudaMalloc((void**) &d_threads_end, sizeof(int));
    std::atomic_int h_threads_run[1];
    int *d_threads_run;
    cudaStatus = cudaMalloc((void**) &d_threads_run, sizeof(int));
    int h_num_t[1];
    int *d_num_t;
    cudaStatus = cudaMalloc((void**) &d_num_t, sizeof(int));
    int h_overflow[1];
    int *d_overflow;
    cudaStatus = cudaMalloc((void**) &d_overflow, sizeof(int));
    std::atomic_int h_gray_shade[1];
    int *d_gray_shade;
    cudaStatus = cudaMalloc((void**) &d_gray_shade, sizeof(int));
    std::atomic_int h_iter[1];
    int *d_iter;
    cudaStatus = cudaMalloc((void**) &d_iter, sizeof(int));
    cudaDeviceSynchronize();
    CUDA_ERR();
    ALLOC_ERR(h_nodes, h_edges, h_color, h_cost, h_q1, h_q2);
    timer.stop("Allocation");

    // Initialize
    timer.start("Initialization");
    const int max_gpu_threads = setcuda.max_gpu_threads();
    int source;
    read_input(source, h_nodes, h_edges, p);
    for (int i = 0; i < n_nodes; i++) {
        h_cost[i].store(INF);
    }
    h_cost[source].store(0);
    for (int i = 0; i < n_nodes; i++) {
        h_color[i].store(WHITE);
    }
    h_tail[0].store(0);
    h_head[0].store(0);
    h_threads_end[0].store(0);
    h_threads_run[0].store(0);
    h_q1[0] = source;
    h_iter[0].store(0);
    h_overflow[0] = 0;
    h_gray_shade[0].store(GRAY0);
    timer.stop("Initialization");
    //timer.print("Initialization", 1);

    // Copy to device
    timer.start("Copy To Device");
    cudaStatus = cudaMemcpy(d_nodes, h_nodes, sizeof(Node) * n_nodes, cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_edges, h_edges, sizeof(Edge) * n_edges, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    CUDA_ERR();
    timer.stop("Copy To Device");

    for (int rep = 0; rep < p.n_reps; rep++) {
        // Reset
        for (int i = 0; i < n_nodes; i++) {
            h_cost[i].store(INF);
        }
        h_cost[source].store(0);
        for (int i = 0; i < n_nodes; i++) {
            h_color[i].store(WHITE);
        }
        it_cpu = 0;
        it_gpu = 0;
        h_tail[0].store(0);
        h_head[0].store(0);
        h_threads_end[0].store(0);
        h_threads_run[0].store(0);
        h_q1[0] = source;
        h_iter[0].store(0);
        h_overflow[0] = 0;
        h_gray_shade[0].store(GRAY0);

        // if(rep >= p.n_warmup)
        timer.start("Kernel");
#ifdef LOGS
        start_iteration();
#endif
        // Run first iteration in master CPU thread
        h_num_t[0] = 1;
        int pid;
        int index_i, index_o;
        for (index_i = 0; index_i < h_num_t[0]; index_i++) {
            pid = h_q1[index_i];
            h_color[pid].store(BLACK);
            int cur_cost = h_cost[pid].load();
            for (int i = h_nodes[pid].x; i < (h_nodes[pid].y + h_nodes[pid].x); i++) {
                int id = h_edges[i].x;
                int cost = h_edges[i].y;
                cost += cur_cost;
                h_cost[id].store(cost);
                h_color[id].store(GRAY0);
                index_o = h_tail[0].fetch_add(1);
                h_q2[index_o] = id;
            }
        }
        h_num_t[0] = h_tail[0].load(); // Number of elements in output queue
        h_tail[0].store(0);
        h_threads_run[0].fetch_add(1);
        h_gray_shade[0].store(GRAY1);
        h_iter[0].fetch_add(1);
        // if(rep >= p.n_warmup)
        timer.stop("Kernel");

        // Pointers to input and output queues
        int *h_qin = h_q2;
        int *h_qout = h_q1;
        int *d_qin = d_q2;
        int *d_qout = d_q1;

        const int CPU_EXEC = (p.n_threads > 0) ? 1 : 0;
        const int GPU_EXEC = (p.n_gpu_blocks > 0 && p.n_gpu_threads > 0) ? 1 : 0;

        // Run subsequent iterations on CPU or GPU until the input queue is empty
        while (*h_num_t != 0) {

            if ((*h_num_t < p.switching_limit || GPU_EXEC == 0) && CPU_EXEC == 1) {
                // If the number of input queue elements is lower than switching_limit
                it_cpu = it_cpu + 1;
                // if(rep >= p.n_warmup)
                timer.start("Kernel");
                // Continue until the switching_limit condition is no longer satisfied
                while ((*h_num_t != 0) && (*h_num_t < p.switching_limit || GPU_EXEC == 0) && CPU_EXEC == 1) {
                    // Swap queues
                    if (h_iter[0] % 2 == 0) {
                        h_qin = h_q1;
                        h_qout = h_q2;
                    } else {
                        h_qin = h_q2;
                        h_qout = h_q1;
                    }

                    std::thread main_thread(run_cpu_threads, h_nodes, h_edges, h_cost, h_color,
                                            h_qin, h_qout, h_num_t, h_head, h_tail, h_threads_end,
                                            h_threads_run, h_gray_shade, h_iter, p.n_threads,
                                            p.switching_limit, GPU_EXEC);
                    main_thread.join();

                    h_num_t[0] = h_tail[0].load(); // Number of elements in output queue
                    h_tail[0].store(0);
                    h_head[0].store(0);
                    if (h_iter[0].load() % 2 == 0)
                        h_gray_shade[0].store(GRAY0);
                    else
                        h_gray_shade[0].store(GRAY1);
                }
                // if(rep >= p.n_warmup)
                timer.stop("Kernel");

            } else if ((*h_num_t >= p.switching_limit || CPU_EXEC == 0) && GPU_EXEC == 1) {
                // If the number of input queue elements is higher than or equal to switching_limit
                it_gpu = it_gpu + 1;
                // if(rep >= p.n_warmup)
                timer.start("Copy To Device");
                cudaStatus = cudaMemcpy(d_cost, h_cost, sizeof(int) * n_nodes, cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_color, h_color, sizeof(int) * n_nodes, cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_threads_run, h_threads_run, sizeof(int), cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_threads_end, h_threads_end, sizeof(int), cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_overflow, h_overflow, sizeof(int), cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_q1, h_q1, sizeof(int) * n_nodes, cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_q2, h_q2, sizeof(int) * n_nodes, cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_iter, h_iter, sizeof(int), cudaMemcpyHostToDevice);
                cudaDeviceSynchronize();
                CUDA_ERR();
                // if(rep >= p.n_warmup)
                timer.stop("Copy To Device");

                // Continue until the switching_limit condition is no longer satisfied
                while ((*h_num_t != 0) && (*h_num_t >= p.switching_limit || CPU_EXEC == 0) && GPU_EXEC == 1) {
                    // Swap queues
                    if (h_iter[0] % 2 == 0) {
                        d_qin = d_q1;
                        d_qout = d_q2;
                    } else {
                        d_qin = d_q2;
                        d_qout = d_q1;
                    }

                    // if(rep >= p.n_warmup)
                    timer.start("Copy To Device");
                    cudaStatus = cudaMemcpy(d_num_t, h_num_t, sizeof(int), cudaMemcpyHostToDevice);
                    cudaStatus = cudaMemcpy(d_tail, h_tail, sizeof(int), cudaMemcpyHostToDevice);
                    cudaStatus = cudaMemcpy(d_head, h_head, sizeof(int), cudaMemcpyHostToDevice);
                    cudaStatus = cudaMemcpy(d_gray_shade, h_gray_shade, sizeof(int), cudaMemcpyHostToDevice);
                    cudaDeviceSynchronize();
                    CUDA_ERR();
                    // if(rep >= p.n_warmup)
                    timer.stop("Copy To Device");

                    // if(rep >= p.n_warmup)
                    timer.start("Kernel");
                    assert(p.n_gpu_threads <= max_gpu_threads &&
                           "The thread block size is greater than the maximum thread block size that can be used on this device");
                    cudaStatus = call_SSSP_gpu(p.n_gpu_blocks, p.n_gpu_threads, d_nodes, d_edges,
                                               d_cost, d_color, d_qin, d_qout, d_num_t, d_head,
                                               d_tail, d_threads_end, d_threads_run, d_overflow,
                                               d_gray_shade, d_iter, p.switching_limit, CPU_EXEC,
                                               sizeof(int) * (W_QUEUE_SIZE + 3));
                    cudaDeviceSynchronize();
                    CUDA_ERR();
                    // if(rep >= p.n_warmup)
                    timer.stop("Kernel");

                    // if(rep >= p.n_warmup)
                    timer.start("Copy Back and Merge");
                    cudaStatus = cudaMemcpy(h_tail, d_tail, sizeof(int), cudaMemcpyDeviceToHost);
                    cudaStatus = cudaMemcpy(h_iter, d_iter, sizeof(int), cudaMemcpyDeviceToHost);
                    cudaDeviceSynchronize();
                    CUDA_ERR();
                    // if(rep >= p.n_warmup)
                    timer.stop("Copy Back and Merge");

                    h_num_t[0] = h_tail[0].load(); // Number of elements in output queue
                    h_tail[0].store(0);
                    h_head[0].store(0);
                    if (h_iter[0].load() % 2 == 0)
                        h_gray_shade[0].store(GRAY0);
                    else
                        h_gray_shade[0].store(GRAY1);
                }

                // if(rep >= p.n_warmup)
                timer.start("Copy Back and Merge");
                cudaStatus = cudaMemcpy(h_cost, d_cost, sizeof(int) * n_nodes, cudaMemcpyDeviceToHost);
                cudaStatus = cudaMemcpy(h_color, d_color, sizeof(int) * n_nodes, cudaMemcpyDeviceToHost);
                cudaStatus = cudaMemcpy(h_threads_run, d_threads_run, sizeof(int), cudaMemcpyDeviceToHost);
                cudaStatus = cudaMemcpy(h_threads_end, d_threads_end, sizeof(int), cudaMemcpyDeviceToHost);
                cudaStatus = cudaMemcpy(h_overflow, d_overflow, sizeof(int), cudaMemcpyDeviceToHost);
                cudaStatus = cudaMemcpy(h_q1, d_q1, sizeof(int) * n_nodes, cudaMemcpyDeviceToHost);
                cudaStatus = cudaMemcpy(h_q2, d_q2, sizeof(int) * n_nodes, cudaMemcpyDeviceToHost);
                cudaDeviceSynchronize();
                CUDA_ERR();
                // if(rep >= p.n_warmup)
                timer.stop("Copy Back and Merge");
            }
        }

#ifdef LOGS
        end_iteration();
#endif
        //printf("IT CPU:%d\t", it_cpu);
        //printf("IT GPU:%d\n", it_gpu);
        if (p.mode == 1) {
            err = newest_verify(h_cost, n_nodes, n_nodes, gold, it_cpu, it_gpu);
        }
        //err = new_verify(h_cost, n_nodes, it_cpu, it_gpu);
        if (err > 0) {
            printf("Errors: %d\n", err);
            read_input(source, h_nodes, h_edges, p);
            read_gold(gold, p);
        } else {
            printf(".ITERATION %d\n", rep);
        }
#ifdef LOGS
        log_error_count(err);
#endif
        // Read the input again
        //read_input(source, h_nodes, h_edges, p);
        //read_gold(gold, p);
    } // end of iteration

#ifdef LOGS
    end_log_file();
#endif
    //timer.print("Allocation", 1);
    //timer.print("Copy To Device", p.n_reps);
    //timer.print("Kernel", p.n_reps);
    //timer.print("Copy Back and Merge", p.n_reps);

    if (p.mode == 0) {
        create_output(h_cost, n_nodes, n_edges, std::string(p.comparison_file));
    }

    // Verify answer
    verify(h_cost, n_nodes, p.comparison_file);

    // Free memory
    timer.start("Deallocation");
    free(h_nodes);
    free(h_edges);
    free(h_color);
    free(h_cost);
    free(h_q1);
    free(h_q2);
    cudaStatus = cudaFree(d_nodes);
    cudaStatus = cudaFree(d_edges);
    cudaStatus = cudaFree(d_cost);
    cudaStatus = cudaFree(d_color);
    cudaStatus = cudaFree(d_q1);
    cudaStatus = cudaFree(d_q2);
    cudaStatus = cudaFree(d_num_t);
    cudaStatus = cudaFree(d_head);
    cudaStatus = cudaFree(d_tail);
    cudaStatus = cudaFree(d_threads_end);
    cudaStatus = cudaFree(d_threads_run);
    cudaStatus = cudaFree(d_overflow);
    cudaStatus = cudaFree(d_iter);
    cudaStatus = cudaFree(d_gray_shade);
    CUDA_ERR();
    cudaDeviceSynchronize();
    timer.stop("Deallocation");
    //timer.print("Deallocation", 1);

    // Release timers
    timer.release("Allocation");
    timer.release("Initialization");
    timer.release("Copy To Device");
    timer.release("Kernel");
    timer.release("Copy Back and Merge");
    timer.release("Deallocation");

    printf("Test Passed\n");
    return 0;
}
int main(int argc, char *argv[])
{
    // needed to work correctly with piped benchmarkrunner
    setlinebuf(stdout);
    setlinebuf(stdin);

    int n_indices = 1;
    int n_dimensions = 1;

    char inBuf[200]; // ridiculously large input buffer.
    bool isFirst = true;

    do {
        // Allocate memory for the arrays
        int *h_indices = 0;
        double *h_outputGPU = 0;

        try {
            h_indices = new int[n_indices * n_dimensions];
            h_outputGPU = new double[n_indices * n_dimensions];
        }
        catch (std::exception &e) {
            std::cerr << "Caught exception: " << e.what() << std::endl;
            std::cerr << "Unable to allocate CPU memory (try running with fewer vectors/dimensions)" << std::endl;
            return -1;
        }

        int *d_indices;
        double *d_output;

        try {
            cudaError_t cudaResult;
            cudaResult = cudaMalloc((void **)&d_indices, n_dimensions * n_indices * sizeof(int));

            if (cudaResult != cudaSuccess) {
                throw std::runtime_error(cudaGetErrorString(cudaResult));
            }
        }
        catch (std::runtime_error &e) {
            std::cerr << "Caught exception: " << e.what() << std::endl;
            std::cerr << "Unable to allocate GPU memory (try running with fewer vectors/dimensions)" << std::endl;
            return -1;
        }

        // Initialize the indices (done on the host)
        for (int i = 0; i < n_indices; i++) {
            h_indices[i] = i;
        }

        // Copy the indices to the device
        cudaMemcpy(d_indices, h_indices, n_dimensions * n_indices * sizeof(int), cudaMemcpyHostToDevice);
        cudaDeviceSynchronize();

        // Execute the QRNG on the device
        int n_vec;
        sobol_nikola_unsimplified(n_indices, d_indices, n_indices, &d_output, &n_vec);
        cudaDeviceSynchronize();

        cudaMemcpy(h_outputGPU, d_output, n_indices * n_dimensions * sizeof(double), cudaMemcpyDeviceToHost);

        // Cleanup and terminate
        delete[] h_indices;
        cudaFree(d_indices);
        cudaFree(d_output);

        if (!isFirst) {
            printf("RESULT ");
            for (int i = 0; i < std::min(n_indices, 10); i++)
                printf("%f ", h_outputGPU[i]);
            printf("\n");
        } else {
            printf("OK\n");
            isFirst = false;
        }

        delete[] h_outputGPU;

        fgets(inBuf, 200, stdin);

        if (sscanf(inBuf, "%d", &n_indices) == 0) {
            // if input is not a number, it has to be "EXIT"
            if (strncmp("EXIT", inBuf, 4) == 0) {
                printf("OK\n");
                break;
            } else {
                printf("ERROR. Bad input: %s\n", inBuf);
                break;
            }
        }
    } while (true);

    cudaDeviceReset();
    return -1;
}
int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
    int dev_id = device_map[thr_id];
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    const uint32_t first_nonce = pdata[19];
    const int swap = 1; // to toggle nonce endian

    uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 256*256*8
    if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

    if (opt_benchmark)
        ((uint32_t*)ptarget)[7] = 0;

    if (!init[thr_id])
    {
        cudaSetDevice(dev_id);
        if (opt_cudaschedule == -1 && gpu_threads == 1) {
            cudaDeviceReset();
            // reduce cpu usage
            cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
            CUDA_LOG_ERROR();
        }

        cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput);

        quark_skein512_cpu_init(thr_id, throughput);
        cuda_check_cpu_init(thr_id, throughput);
        CUDA_SAFE_CALL(cudaDeviceSynchronize());

        init[thr_id] = true;
    }

    uint32_t endiandata[20];
    for (int k = 0; k < 19; k++)
        be32enc(&endiandata[k], pdata[k]);

    skein512_cpu_setBlock_80((void*)endiandata);
    cuda_check_cpu_setTarget(ptarget);

    do {
        int order = 0;

        // Hash with CUDA
        skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], swap);
        quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

        *hashes_done = pdata[19] - first_nonce + throughput;

        uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
        if (foundNonce != UINT32_MAX)
        {
            uint32_t _ALIGN(64) vhash64[8];

            endiandata[19] = swab32_if(foundNonce, swap);
            skein2hash(vhash64, endiandata);

            if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
                int res = 1;
                uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
                work_set_target_ratio(work, vhash64);
                if (secNonce != 0) {
                    if (!opt_quiet)
                        applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", dev_id, swab32(secNonce));

                    endiandata[19] = swab32_if(secNonce, swap);
                    skein2hash(vhash64, endiandata);
                    if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio)
                        work_set_target_ratio(work, vhash64);

                    pdata[21] = swab32_if(secNonce, !swap);
                    res++;
                }
                pdata[19] = swab32_if(foundNonce, !swap);
                return res;
            } else {
                gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
            }
        }

        if ((uint64_t) throughput + pdata[19] >= max_nonce) {
            pdata[19] = max_nonce;
            break;
        }

        pdata[19] += throughput;

    } while (!work_restart[thr_id].restart);

    *hashes_done = pdata[19] - first_nonce;

    return 0;
}
int main(int argc, char **argv)
{
    uchar4 *h_rgbaImage, *d_rgbaImage;
    unsigned char *h_greyImage, *d_greyImage;

    std::string input_file;
    std::string output_file;
    std::string reference_file;
    double perPixelError = 0.0;
    double globalError = 0.0;
    bool useEpsCheck = false;

    switch (argc)
    {
    case 2:
        input_file = std::string(argv[1]);
        output_file = "HW1_output.png";
        reference_file = "HW1_reference.png";
        break;
    case 3:
        input_file = std::string(argv[1]);
        output_file = std::string(argv[2]);
        reference_file = "HW1_reference.png";
        break;
    case 4:
        input_file = std::string(argv[1]);
        output_file = std::string(argv[2]);
        reference_file = std::string(argv[3]);
        break;
    case 6:
        useEpsCheck = true;
        input_file = std::string(argv[1]);
        output_file = std::string(argv[2]);
        reference_file = std::string(argv[3]);
        perPixelError = atof(argv[4]);
        globalError = atof(argv[5]);
        break;
    default:
        std::cerr << "Usage: ./HW1 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
        exit(1);
    }

    //load the image and give us our input and output pointers
    preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file);

    GpuTimer timer;
    timer.Start();
    //call the students' code
    lineDetect(h_rgbaImage, d_rgbaImage, d_greyImage, numRows(), numCols());
    timer.Stop();
    cudaDeviceSynchronize();
    checkCudaErrors(cudaGetLastError());

    int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

    if (err < 0)
    {
        //Couldn't print! Probably the student closed stdout - bad news
        std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
        exit(1);
    }

    size_t numPixels = numRows() * numCols();
    checkCudaErrors(cudaMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char) * numPixels, cudaMemcpyDeviceToHost));

    //check results and output the grey image
    postProcess(output_file, h_greyImage);

    referenceCalculation(h_rgbaImage, h_greyImage, numRows(), numCols());

    postProcess(reference_file, h_greyImage);

    //generateReferenceImage(input_file, reference_file);
    compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);

    cleanup();
    return 0;
}
int main(int argc, char **argv)
{
    uchar4 *h_inputImageRGBA, *d_inputImageRGBA;
    uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
    unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;

    float *h_filter;
    int filterWidth;

    std::string input_file;
    std::string output_file;
    std::string reference_file;
    double perPixelError = 0.0;
    double globalError = 0.0;
    bool useEpsCheck = false;

    switch (argc)
    {
    case 2:
        input_file = std::string(argv[1]);
        output_file = "HW2_output.png";
        reference_file = "HW2_reference.png";
        break;
    case 3:
        input_file = std::string(argv[1]);
        output_file = std::string(argv[2]);
        reference_file = "HW2_reference.png";
        break;
    case 4:
        input_file = std::string(argv[1]);
        output_file = std::string(argv[2]);
        reference_file = std::string(argv[3]);
        break;
    case 6:
        useEpsCheck = true;
        input_file = std::string(argv[1]);
        output_file = std::string(argv[2]);
        reference_file = std::string(argv[3]);
        perPixelError = atof(argv[4]);
        globalError = atof(argv[5]);
        break;
    default:
        std::cerr << "Usage: ./HW2 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
        exit(1);
    }

    //load the image and give us our input and output pointers
    preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA, &d_outputImageRGBA,
               &d_redBlurred, &d_greenBlurred, &d_blueBlurred,
               &h_filter, &filterWidth, input_file);

    allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth);
    GpuTimer timer;
    timer.Start();
    //call the students' code
    your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, numRows(), numCols(),
                       d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth);
    timer.Stop();
    cudaDeviceSynchronize();
    checkCudaErrors(cudaGetLastError());
    int err = printf("Your GPU code ran in: %f msecs.\n", timer.Elapsed());

    if (err < 0)
    {
        //Couldn't print! Probably the student closed stdout - bad news
        std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
        exit(1);
    }

    //check results and output the blurred image
    size_t numPixels = numRows() * numCols();
    //copy the output back to the host
    checkCudaErrors(cudaMemcpy(h_outputImageRGBA, d_outputImageRGBA__, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost));

    postProcess(output_file, h_outputImageRGBA);

    timer.Start();
    referenceCalculation(h_inputImageRGBA, h_outputImageRGBA, numRows(), numCols(), h_filter, filterWidth);
    timer.Stop();
    printf("Your CPU code ran in: %f msecs.\n", timer.Elapsed());

    postProcess(reference_file, h_outputImageRGBA);

    // Cheater easy way with OpenCV
    //generateReferenceImage(input_file, reference_file, filterWidth);

    compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);

    checkCudaErrors(cudaFree(d_redBlurred));
    checkCudaErrors(cudaFree(d_greenBlurred));
    checkCudaErrors(cudaFree(d_blueBlurred));

    cleanUp();
    return 0;
}
int main(int argc, char *argv[])
{
    int i, j, k, n;
    int nx = NX;
    int ny = NY;
    int nz = NZ;
    int nsteps = NSTEPS;

    if (argc >= 4) {
        nx = atoi(argv[1]);
        ny = atoi(argv[2]);
        nz = atoi(argv[3]);
    }
    if (argc >= 5)
        nsteps = atoi(argv[4]);

    StartTimer();

    size_t nbytes = nx * ny * nz * sizeof(float);
    float *restrict x = (float*)malloc(nbytes);
    float *restrict y = (float*)malloc(nbytes);
    float *restrict z = (float*)malloc(nbytes);
    float *restrict f = (float*)malloc(nbytes);
    float *restrict g = (float*)malloc(nbytes);
    float *restrict fp = (float*)malloc(nbytes);
    float *restrict gp = (float*)malloc(nbytes);
    if (0==x || 0==y || 0==z || 0==f || 0==g || 0==fp || 0==gp) {
        printf("couldn't allocate fields on the host\n");
        return (-1);
    }

    float dx = 2.0f/(nx-1);
    float dy = 2.0f/(ny-1);
    float dz = 2.0f/(nz-1);
    float dt = 0.00000005f; // dt must be smaller than dx for the scheme to be numerically stable

    // initialize the grid to run from -1 to 1 in each direction
    for (i=0; i<nx; i++) {
        for (j=0; j<ny; j++) {
            for (k=0; k<nz; k++) {
                int offset = OFFSET(i, j, k, ny, nz);
                x[offset] = -1.0f + (i)*dx;
                y[offset] = -1.0f + (j)*dy;
                z[offset] = -1.0f + (k)*dz;
            }
        }
    }

    // initialize the field to be a gaussian
    for (i=0; i<nx; i++) {
        for (j=0; j<ny; j++) {
            for (k=0; k<nz; k++) {
                int offset = OFFSET(i, j, k, ny, nz);
                f[offset] = 0.2f*exp(-(x[offset]*x[offset] + y[offset]*y[offset] + z[offset]*z[offset]) / 0.05f);
                g[offset] = 0.0f;
            }
        }
    }

    // output the initial data; with an even number of points,
    // pick a line closest to a coordinate axis
    FILE *fPtr = fopen("wave3d.xline", "w");
    for (i=0; i<nx; i++) {
        int offset = OFFSET(i, ny/2, nz/2, ny, nz);
        fprintf(fPtr, "%5.3f %10.6e\n", x[offset], f[offset]);
    }
    fprintf(fPtr, "\n");

    float step = 0.0f;
    int printevery = 20;
    printf("step = %9.6f \n", step);

    cudaProfilerStart();

#pragma acc enter data copyin(x[0:nx*ny*nz], y[0:nx*ny*nz], z[0:nx*ny*nz], f[0:nx*ny*nz], g[0:nx*ny*nz])
#pragma acc enter data create(fp[0:nx*ny*nz], gp[0:nx*ny*nz])
    {
        for (n=0; n<nsteps; n++) {
            step = step + dt;
            if (((n+1)%printevery)==0)
                printf("step = %9.6f \n", step);

#pragma acc kernels
            {
                // predictor
#pragma acc loop independent collapse(2) gang
                for (i=0; i<nx; i++) {
                    for (j=0; j<ny; j++) {
#pragma acc loop independent vector
                        for (k=0; k<nz; k++) {
                            int offset = OFFSET(i, j, k, ny, nz);
                            fp[offset] = f[offset] + dt * g[offset];
                        }
                    }
                }

                // static boundaries
#pragma acc loop independent collapse(2)
                for (j=0; j<ny; j++) {
                    for (k=0; k<nz; k++) {
                        int xbeg = OFFSET(0, j, k, ny, nz);
                        int xend = OFFSET(nx-1, j, k, ny, nz);
                        gp[xbeg] = g[xbeg];
                        gp[xend] = g[xend];
                    }
                }
#pragma acc loop independent collapse(2)
                for (i=0; i<nx; i++) {
                    for (k=0; k<nz; k++) {
                        int ybeg = OFFSET(i, 0, k, ny, nz);
                        int yend = OFFSET(i, ny-1, k, ny, nz);
                        gp[ybeg] = g[ybeg];
                        gp[yend] = g[yend];
                    }
                }
#pragma acc loop independent collapse(2)
                for (i=0; i<nx; i++) {
                    for (j=0; j<ny; j++) {
                        int zbeg = OFFSET(i, j, 0, ny, nz);
                        int zend = OFFSET(i, j, nz-1, ny, nz);
                        gp[zbeg] = g[zbeg];
                        gp[zend] = g[zend];
                    }
                }

                // use the predictor to update gp
#pragma acc loop independent collapse(2) gang
                for (i=1; i<nx-1; i++) {
                    for (j=1; j<ny-1; j++) {
#pragma acc loop independent vector
                        for (k=1; k<nz-1; k++) {
                            int current = OFFSET(i, j, k, ny, nz);
                            int next_x = OFFSET(i+1, j, k, ny, nz);
                            int next_y = OFFSET(i, j+1, k, ny, nz);
                            int next_z = OFFSET(i, j, k+1, ny, nz);
                            int prev_x = OFFSET(i-1, j, k, ny, nz);
                            int prev_y = OFFSET(i, j-1, k, ny, nz);
                            int prev_z = OFFSET(i, j, k-1, ny, nz);
                            gp[current] = g[current] + dt * (
                                  (fp[next_x] - 2.0f * fp[current] + fp[prev_x]) / dx / dx
                                + (fp[next_y] - 2.0f * fp[current] + fp[prev_y]) / dy / dy
                                + (fp[next_z] - 2.0f * fp[current] + fp[prev_z]) / dz / dz );
                        }
                    }
                }

                // use the averaged g's to update f
#pragma acc loop independent collapse(2) gang
                for (i=0; i<nx; i++) {
                    for (j=0; j<ny; j++) {
#pragma acc loop independent vector
                        for (k=0; k<nz; k++) {
                            int offset = OFFSET(i, j, k, ny, nz);
                            fp[offset] = f[offset] + dt * (0.5f * (g[offset] + gp[offset]));
                        }
                    }
                }

                // now update all the variables
#pragma acc loop independent collapse(2) gang
                for (i=0; i<nx; i++) {
                    for (j=0; j<ny; j++) {
#pragma acc loop independent vector
                        for (int k=0; k<nz; k++) {
                            int offset = OFFSET(i, j, k, ny, nz);
                            f[offset] = fp[offset];
                            g[offset] = gp[offset];
                        }
                    }
                }
            } // pragma acc kernels

            if (((n+1)%printevery)==0) {
#pragma acc update host(x[0:nx*(ny*nz)], f[0:nx*(ny*nz)])
                for (i=0; i<nx; i++) {
                    int offset = OFFSET(i, ny/2, nz/2, ny, nz);
                    fprintf(fPtr, "%5.3f %10.6e\n", x[offset], f[offset]);
                }
                fprintf(fPtr, "\n");
            }
        } // for nsteps
    } // pragma acc data

    cudaProfilerStop();
    cudaDeviceSynchronize();

    free(x); free(y); free(z);
    free(f); free(g);
    free(fp); free(gp);

    float totalTime = GetTimer();
    printf("Total time: %f seconds\n", totalTime / 1000.0f);
    exit(0);
}
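/* The OFFSET macro used throughout the solver above is defined elsewhere.
   A hypothetical definition, assuming the usual row-major layout with k
   varying fastest (consistent with the x/y/z grid initialization): */
#define OFFSET(i, j, k, ny, nz) (((i) * (ny) + (j)) * (nz) + (k))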
int main(int argc, char *argv[])
{
    typedef int IndexType;
    typedef double ValueType;
    typedef cusp::device_memory MemorySpace;
    //typedef cusp::row_major Orientation;

    bool success = true;
    bool verbose = false;

    try {
        // Setup command line options
        Teuchos::CommandLineProcessor CLP;
        CLP.setDocString("This tests the performance of block multiply routines.\n");
        IndexType n = 32;
        CLP.setOption("n", &n, "Number of mesh points in each direction");
        IndexType nrhs_begin = 32;
        CLP.setOption("begin", &nrhs_begin, "Starting number of right-hand-sides");
        IndexType nrhs_end = 512;
        CLP.setOption("end", &nrhs_end, "Ending number of right-hand-sides");
        IndexType nrhs_step = 32;
        CLP.setOption("step", &nrhs_step, "Increment in number of right-hand-sides");
        IndexType nits = 10;
        CLP.setOption("nits", &nits, "Number of multiply iterations");
        int device_id = 0;
        CLP.setOption("device", &device_id, "CUDA device ID");
        CLP.parse(argc, argv);

        // Set CUDA device
        cudaSetDevice(device_id);
        cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

        // create 3D Poisson problem
        cusp::csr_matrix<IndexType, ValueType, MemorySpace> A;
        cusp::gallery::poisson27pt(A, n, n, n);

        std::cout << "nrhs , num_rows , num_entries , row_time , row_gflops , "
                  << "col_time , col_gflops" << std::endl;

        for (IndexType nrhs = nrhs_begin; nrhs <= nrhs_end; nrhs += nrhs_step) {
            double flops = 2.0 * static_cast<double>(A.num_entries) * static_cast<double>(nrhs);

            // test row-major storage
            cusp::array2d<ValueType, MemorySpace, cusp::row_major> x_row(A.num_rows, nrhs, 1);
            cusp::array2d<ValueType, MemorySpace, cusp::row_major> y_row(A.num_rows, nrhs, 0);

            cusp::detail::timer row_timer;
            row_timer.start();
            for (IndexType iter = 0; iter < nits; ++iter) {
                cusp::MVmultiply(A, x_row, y_row);
            }
            cudaDeviceSynchronize();
            double row_time = row_timer.seconds_elapsed() / nits;
            double row_gflops = 1.0e-9 * flops / row_time;

            // test column-major storage
            cusp::array2d<ValueType, MemorySpace, cusp::column_major> x_col(A.num_rows, nrhs, 1);
            cusp::array2d<ValueType, MemorySpace, cusp::column_major> y_col(A.num_rows, nrhs, 0);

            cusp::detail::timer col_timer;
            col_timer.start();
            for (IndexType iter = 0; iter < nits; ++iter) {
                cusp::MVmultiply(A, x_col, y_col);
            }
            cudaDeviceSynchronize();
            double col_time = col_timer.seconds_elapsed() / nits;
            double col_gflops = 1.0e-9 * flops / col_time;

            std::cout << nrhs << " , " << A.num_rows << " , " << A.num_entries << " , "
                      << row_time << " , " << row_gflops << " , "
                      << col_time << " , " << col_gflops << std::endl;
        }
    }
    TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);

    if (success)
        return 0;
    return -1;
}
///////////////////////
// Main program entry
///////////////////////
int main(int argc, char** argv)
{
    unsigned int max_iters, Nx, Ny, Nz, blockX, blockY, blockZ;
    int rank, numberOfProcesses;

    if (argc == 8) {
        Nx = atoi(argv[1]);
        Ny = atoi(argv[2]);
        Nz = atoi(argv[3]);
        max_iters = atoi(argv[4]);
        blockX = atoi(argv[5]);
        blockY = atoi(argv[6]);
        blockZ = atoi(argv[7]);
    } else {
        printf("Usage: %s nx ny nz i block_x block_y block_z\n", argv[0]);
        exit(1);
    }

    InitializeMPI(&argc, &argv, &rank, &numberOfProcesses);
    AssignDevices(rank);
    ECCCheck(rank);

    // Define constants
    const _DOUBLE_ L = 1.0;
    const _DOUBLE_ h = L/(Nx+1);
    const _DOUBLE_ dt = h*h/6.0;
    const _DOUBLE_ beta = dt/(h*h);
    const _DOUBLE_ c0 = beta;
    const _DOUBLE_ c1 = (1-6*beta);

    // Copy constants to Constant Memory on the GPUs
    CopyToConstantMemory(c0, c1);

    // Decompose along the z-axis
    const int _Nz = Nz/numberOfProcesses;
    const int dt_size = sizeof(_DOUBLE_);

    // Host memory allocations
    _DOUBLE_ *u_new, *u_old;
    _DOUBLE_ *h_Uold;

    u_new = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
    u_old = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

    if (rank == 0) {
        h_Uold = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
    }

    init(u_old, u_new, h, Nx, Ny, Nz);

    // Allocate and generate host subdomains; these buffers are used for
    // async copies and MPI transfers, so they are pinned with cudaHostAlloc
    // (and released with cudaFreeHost below)
    _DOUBLE_ *h_s_Uolds, *h_s_Unews, *h_s_rbuf[numberOfProcesses];
    _DOUBLE_ *left_send_buffer, *left_receive_buffer;
    _DOUBLE_ *right_send_buffer, *right_receive_buffer;

#if defined(DEBUG) || defined(_DEBUG)
    if (rank == 0) {
        for (int i = 0; i < numberOfProcesses; i++) {
            checkCuda(cudaHostAlloc((void**)&h_s_rbuf[i], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
        }
    }
#endif

    checkCuda(cudaHostAlloc((void**)&h_s_Unews, dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&h_s_Uolds, dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&right_send_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&left_send_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&right_receive_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&left_receive_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));

    init_subdomain(h_s_Uolds, u_old, Nx, Ny, _Nz, rank);

    // GPU stream operations
    cudaStream_t compute_stream;
    cudaStream_t data_stream;
    checkCuda(cudaStreamCreate(&compute_stream));
    checkCuda(cudaStreamCreate(&data_stream));

    // GPU memory operations
    size_t pitch_bytes, pitch_gc_bytes;

    _DOUBLE_ *d_s_Unews, *d_s_Uolds;
    _DOUBLE_ *d_right_send_buffer, *d_left_send_buffer;
    _DOUBLE_ *d_right_receive_buffer, *d_left_receive_buffer;

    checkCuda(cudaMallocPitch((void**)&d_s_Uolds, &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
    checkCuda(cudaMallocPitch((void**)&d_s_Unews, &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
    checkCuda(cudaMallocPitch((void**)&d_left_send_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
    checkCuda(cudaMallocPitch((void**)&d_left_receive_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
    checkCuda(cudaMallocPitch((void**)&d_right_send_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
    checkCuda(cudaMallocPitch((void**)&d_right_receive_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));

    // Copy subdomains from host to device and measure walltime
    double HtD_timer = 0.;

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    HtD_timer -= MPI_Wtime();
    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    checkCuda(cudaMemcpy2D(d_s_Uolds, pitch_bytes, h_s_Uolds, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
    checkCuda(cudaMemcpy2D(d_s_Unews, pitch_bytes, h_s_Unews, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    HtD_timer += MPI_Wtime();
    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    unsigned int ghost_width = 1;

    int pitch = pitch_bytes/dt_size;
    int gc_pitch = pitch_gc_bytes/dt_size;

    // GPU kernel launch parameters
    dim3 threads_per_block(blockX, blockY, blockZ);
    unsigned int blocksInX = getBlock(Nx, blockX);
    unsigned int blocksInY = getBlock(Ny, blockY);
    unsigned int blocksInZ = getBlock(_Nz-2, k_loop);
    dim3 thread_blocks(blocksInX, blocksInY, blocksInZ);
    dim3 thread_blocks_halo(blocksInX, blocksInY);

    //MPI_Status status;
    MPI_Status status[numberOfProcesses];
    MPI_Request gather_send_request[numberOfProcesses];
    MPI_Request right_send_request[numberOfProcesses], left_send_request[numberOfProcesses];
    MPI_Request right_receive_request[numberOfProcesses], left_receive_request[numberOfProcesses];

    double compute_timer = 0.;

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    compute_timer -= MPI_Wtime();
    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    for (unsigned int iterations = 0; iterations < max_iters; iterations++)
    {
        // Compute right boundary data on device 0
        if (rank == 0) {
            int kstart = (_Nz+1)-ghost_width;
            int kstop = _Nz+1;

            ComputeInnerPointsAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
            CopyBoundaryRegionToGhostCellAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_right_send_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 0);

            checkCuda(cudaMemcpy2DAsync(right_send_buffer, dt_size*(Nx+2), d_right_send_buffer, pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault, data_stream));
            checkCuda(cudaStreamSynchronize(data_stream));

            MPI_CHECK(MPI_Isend(right_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 1, 0, MPI_COMM_WORLD, &right_send_request[rank]));
        } else {
            int kstart = 1;
            int kstop = 1+ghost_width;

            ComputeInnerPointsAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
            CopyBoundaryRegionToGhostCellAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_left_send_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 1);

            checkCuda(cudaMemcpy2DAsync(left_send_buffer, dt_size*(Nx+2), d_left_send_buffer, pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault, data_stream));
            checkCuda(cudaStreamSynchronize(data_stream));

            MPI_CHECK(MPI_Isend(left_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 0, 1, MPI_COMM_WORLD, &left_send_request[rank]));
        }

        // Compute inner nodes for device 0
        if (rank == 0) {
            int kstart = 1;
            int kstop = (_Nz+1)-ghost_width;

            ComputeInnerPointsAsync(thread_blocks, threads_per_block, compute_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
        }
        // Compute inner nodes for device 1
        else {
            int kstart = 1+ghost_width;
            int kstop = _Nz+1;

            ComputeInnerPointsAsync(thread_blocks, threads_per_block, compute_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
        }

        // Receive data from device 1
        if (rank == 0) {
            MPI_CHECK(MPI_Irecv(left_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 1, 1, MPI_COMM_WORLD, &right_receive_request[rank]));
        } else {
            MPI_CHECK(MPI_Irecv(right_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &left_receive_request[rank]));
        }

        if (rank == 0) {
            MPI_CHECK(MPI_Wait(&right_receive_request[rank], &status[rank]));

            checkCuda(cudaMemcpy2DAsync(d_right_receive_buffer, pitch_gc_bytes, left_send_buffer, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault, data_stream));
            CopyGhostCellToBoundaryRegionAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_right_receive_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 0);
        } else {
            MPI_CHECK(MPI_Wait(&left_receive_request[rank], &status[rank]));

            checkCuda(cudaMemcpy2DAsync(d_left_receive_buffer, pitch_gc_bytes, right_send_buffer, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault, data_stream));
            CopyGhostCellToBoundaryRegionAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_left_receive_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 1);
        }

        if (rank == 0) {
            MPI_CHECK(MPI_Wait(&right_send_request[rank], MPI_STATUS_IGNORE));
        } else {
            MPI_CHECK(MPI_Wait(&left_send_request[rank], MPI_STATUS_IGNORE));
        }

        // Swap pointers on the host
        checkCuda(cudaDeviceSynchronize());
        swap(_DOUBLE_*, d_s_Unews, d_s_Uolds);
    }

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    compute_timer += MPI_Wtime();
    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    // Copy data from device to host
    double DtH_timer = 0;

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    DtH_timer -= MPI_Wtime();
    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    checkCuda(cudaMemcpy2D(h_s_Uolds, dt_size*(Nx+2), d_s_Uolds, pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2), cudaMemcpyDefault));

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    DtH_timer += MPI_Wtime();
    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    // Gather results from subdomains
    MPI_CHECK(MPI_Isend(h_s_Uolds, (Nx+2)*(Ny+2)*(_Nz+2), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &gather_send_request[rank]));

    if (rank == 0) {
        for (int i = 0; i < numberOfProcesses; i++) {
            MPI_CHECK(MPI_Recv(h_s_rbuf[i], (Nx+2)*(Ny+2)*(_Nz+2), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &status[rank]));
            merge_domains(h_s_rbuf[i], h_Uold, Nx, Ny, _Nz, i);
        }
    }

    // Calculate on host
#if defined(DEBUG) || defined(_DEBUG)
    if (rank == 0) {
        cpu_heat3D(u_new, u_old, c0, c1, max_iters, Nx, Ny, Nz);
    }
#endif

    if (rank == 0) {
        float gflops = CalcGflops(compute_timer, max_iters, Nx, Ny, Nz);
        PrintSummary("3D Heat (7-pt)", "Plane sweeping", compute_timer, HtD_timer, DtH_timer, gflops, max_iters, Nx);

        _DOUBLE_ t = max_iters * dt;
        CalcError(h_Uold, u_old, t, h, Nx, Ny, Nz);
    }

    Finalize();

    // Free device memory
    checkCuda(cudaFree(d_s_Unews));
    checkCuda(cudaFree(d_s_Uolds));
    checkCuda(cudaFree(d_right_send_buffer));
    checkCuda(cudaFree(d_left_send_buffer));
    checkCuda(cudaFree(d_right_receive_buffer));
    checkCuda(cudaFree(d_left_receive_buffer));

    // Free host memory
    checkCuda(cudaFreeHost(h_s_Unews));
    checkCuda(cudaFreeHost(h_s_Uolds));

#if defined(DEBUG) || defined(_DEBUG)
    if (rank == 0) {
        for (int i = 0; i < numberOfProcesses; i++) {
            checkCuda(cudaFreeHost(h_s_rbuf[i]));
        }
        free(h_Uold);
    }
#endif

    checkCuda(cudaFreeHost(left_send_buffer));
    checkCuda(cudaFreeHost(left_receive_buffer));
    checkCuda(cudaFreeHost(right_send_buffer));
    checkCuda(cudaFreeHost(right_receive_buffer));

    checkCuda(cudaDeviceReset());

    free(u_old);
    free(u_new);

    return 0;
}
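/* Hypothetical sketches of two helpers the heat solver above relies on.
   swap() must be a macro, since it is invoked with a type (_DOUBLE_*) as its
   first argument; getBlock() is assumed to be the usual ceiling division of
   the domain size into thread blocks. The real definitions may differ. */
#define swap(type, a, b) do { type tmp_ = (a); (a) = (b); (b) = tmp_; } while (0)

unsigned int getBlock(unsigned int n, unsigned int blockDim)
{
    return (n + blockDim - 1) / blockDim;   // ceil(n / blockDim)
}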
int main(int argc, char **argv)
{
    unsigned int *inputVals;
    unsigned int *inputPos;
    unsigned int *outputVals;
    unsigned int *outputPos;

    size_t numElems;

    std::string input_file;
    std::string template_file;
    std::string output_file;
    std::string reference_file = "red_eye_effect.gold";
    double perPixelError = 0.0;
    double globalError = 0.0;
    bool useEpsCheck = false;

    switch (argc)
    {
    case 3:
        input_file = std::string(argv[1]);
        template_file = std::string(argv[2]);
        output_file = "HW4_output.png";
        break;
    case 4:
        input_file = std::string(argv[1]);
        template_file = std::string(argv[2]);
        output_file = std::string(argv[3]);
        break;
    default:
        std::cerr << "Usage: ./HW4 input_file template_file [output_filename]" << std::endl;
        exit(1);
    }

    //load the image and give us our input and output pointers
    preProcess(&inputVals, &inputPos, &outputVals, &outputPos, numElems, input_file, template_file);

    GpuTimer timer;
    timer.Start();

    //call the students' code
    your_sort(inputVals, inputPos, outputVals, outputPos, numElems);

    timer.Stop();
    cudaDeviceSynchronize();
    checkCudaErrors(cudaGetLastError());
    printf("\n");
    int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

    if (err < 0)
    {
        //Couldn't print! Probably the student closed stdout - bad news
        std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
        exit(1);
    }

    //check results and output the red-eye corrected image
    postProcess(outputVals, outputPos, numElems, output_file);

    // check code moved from HW4.cu
    /****************************************************************************
    * You can use the code below to help with debugging, but make sure to       *
    * comment it out again before submitting your assignment for grading,       *
    * otherwise this code will take too much time and make it seem like your    *
    * GPU implementation isn't fast enough.                                     *
    *                                                                           *
    * This code MUST RUN BEFORE YOUR CODE in case you accidentally change       *
    * the input values when implementing your radix sort.                       *
    *                                                                           *
    * This code performs the reference radix sort on the host and compares your *
    * sorted values to the reference.                                           *
    *                                                                           *
    * Thrust containers are used for copying memory from the GPU                *
    *****************************************************************************/
    thrust::device_ptr<unsigned int> d_inputVals(inputVals);
    thrust::device_ptr<unsigned int> d_inputPos(inputPos);
    thrust::host_vector<unsigned int> h_inputVals(d_inputVals, d_inputVals + numElems);
    thrust::host_vector<unsigned int> h_inputPos(d_inputPos, d_inputPos + numElems);

    thrust::host_vector<unsigned int> h_outputVals(numElems);
    thrust::host_vector<unsigned int> h_outputPos(numElems);

    reference_calculation(&h_inputVals[0], &h_inputPos[0], &h_outputVals[0], &h_outputPos[0], numElems);

    //postProcess(valsPtr, posPtr, numElems, reference_file);

    compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);

    thrust::device_ptr<unsigned int> d_outputVals(outputVals);
    thrust::device_ptr<unsigned int> d_outputPos(outputPos);

    thrust::host_vector<unsigned int> h_yourOutputVals(d_outputVals, d_outputVals + numElems);
    thrust::host_vector<unsigned int> h_yourOutputPos(d_outputPos, d_outputPos + numElems);

    checkResultsExact(&h_outputVals[0], &h_yourOutputVals[0], numElems);
    //checkResultsExact(&h_outputPos[0], &h_yourOutputPos[0], numElems);

    checkCudaErrors(cudaFree(inputVals));
    checkCudaErrors(cudaFree(inputPos));
    checkCudaErrors(cudaFree(outputVals));
    checkCudaErrors(cudaFree(outputPos));

    return 0;
}
void testCuda(int m, int n, int nnz, std::vector<int>& rows, std::vector<int>& cols, std::vector<double>& values, double* matB){ double tol=1e-9; double start, stop, time_to_build, time_to_solve; int cudaDevice = 0; checkCudaErrors(cudaSetDevice(cudaDevice)); cudaDeviceProp prop; cudaGetDeviceProperties(&prop, cudaDevice); printf("Device Number: %d\n", cudaDevice); printf(" Device name: %s\n", prop.name); checkCudaErrors(cudaDeviceReset()); size_t mem_tot = 0; size_t mem_free = 0; cudaMemGetInfo(&mem_free, & mem_tot); printf("\nFree memory: %d", mem_free); MatSparse matA; matA.setSize(m, n); std::vector<int> I, J; std::vector<double> V; for (int k = 0; k < nnz; k++){ double _val = values[k]; int i = rows[k]; int j = cols[k]; if (fabs(_val) > tol){ I.push_back(i-1); J.push_back(j-1); V.push_back(_val); } } start = second(); matA.fromTruples(I, J, V); stop = second(); time_to_build = stop - start; std::cerr << "Time to Build in GPU (second): " << time_to_build << std::endl; // ******************************** GPU SOLVER ******************************** // // --- Initialize cuSPARSE cusolverSpHandle_t cusolver_handle = NULL; checkCudaErrors(cusolverSpCreate(&cusolver_handle)); cusparseHandle_t cusparse_handle = NULL; checkCudaErrors(cusparseCreate(&cusparse_handle)); cudaStream_t cudaStream = NULL; checkCudaErrors(cudaStreamCreate(&cudaStream)); checkCudaErrors(cusolverSpSetStream(cusolver_handle, cudaStream)); checkCudaErrors(cusparseSetStream(cusparse_handle, cudaStream)); cusparseMatDescr_t descrA; checkCudaErrors(cusparseCreateMatDescr(&descrA)); checkCudaErrors(cusparseSetMatType (descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); printf("\nAlloc GPU memory...\n"); double *d_A; checkCudaErrors(cudaMalloc(&d_A, nnz * sizeof(double))); int *d_A_RowIndices; checkCudaErrors(cudaMalloc(&d_A_RowIndices, (m + 1) * sizeof(int))); int *d_A_ColIndices; checkCudaErrors(cudaMalloc(&d_A_ColIndices, nnz * sizeof(int))); double *d_x; checkCudaErrors(cudaMalloc(&d_x, m * sizeof(double))); double *d_b; checkCudaErrors(cudaMalloc(&d_b, m * sizeof(double))); printf("\nError: %s", cudaGetErrorString(cudaGetLastError())); printf("\nCopying data...\n"); checkCudaErrors(cudaMemcpy(d_A, matA.valuesPtr(), nnz * sizeof(double), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_A_RowIndices, matA.RowPtr(), (m + 1) * sizeof(int), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_A_ColIndices, matA.ColIdxPtr(), nnz * sizeof(int), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_b, matB, m * sizeof(double), cudaMemcpyHostToDevice)); double *h_x = (double *)malloc(m * sizeof(double)); printf("\nError: %s", cudaGetErrorString(cudaGetLastError())); cudaMemGetInfo(&mem_free, &mem_tot); printf("\nFree memory: %d", mem_free); int reorder = 0; int singularity = 0; start = second(); //checkCudaErrors(cusolverSpDcsrlsvluHost(cusolver_handle, Nrows, nnz, descrA, sparse.Values(), // sparse.RowPtr(), sparse.ColIdx(), mB.values, tol, reorder, h_x, &singularity)); checkCudaErrors(cusolverSpDcsrlsvqr(cusolver_handle, m, nnz, descrA, d_A, d_A_RowIndices, d_A_ColIndices, d_b, tol, reorder, d_x, &singularity)); checkCudaErrors(cudaDeviceSynchronize()); stop = second(); time_to_solve = stop - start; checkCudaErrors(cudaMemcpy(h_x, d_x, m * sizeof(double), cudaMemcpyDeviceToHost)); // for (int k=0; k<mA.getNumRows(); k++) solution[k] = h_x[k]; checkCudaErrors(cusparseDestroy(cusparse_handle)); checkCudaErrors(cusolverSpDestroy(cusolver_handle)); 
    checkCudaErrors(cudaStreamDestroy(cudaStream));
    checkCudaErrors(cudaFree(d_b));
    checkCudaErrors(cudaFree(d_x));
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_A_RowIndices));
    checkCudaErrors(cudaFree(d_A_ColIndices));
    free(h_x);

    std::cerr << "Time to Build in GPU (second): " << time_to_build << std::endl;
    std::cerr << "Time to Solve in GPU (second): " << time_to_solve << std::endl;
    std::cerr << "done!" << std::endl;
    // ****************************************************************************** //
}
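/* testCuda() discards the singularity flag that cusolverSpDcsrlsvqr fills in.
 * A small sketch of the check that could follow the solve call above:
 * cuSOLVER sets the flag to -1 when A is invertible under tol, otherwise to
 * the index of the first row where a near-zero pivot was detected. */
if (singularity >= 0) {
    fprintf(stderr, "WARNING: matrix is singular at row %d under tol = %g\n",
            singularity, tol);
}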
void CinderCUDASampleApp::update()
{
    generateCUDAImage();
    cudaDeviceSynchronize();
}
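/* update() stalls the entire device every frame. A hypothetical variant, if
 * generateCUDAImage() issued its work on a dedicated stream, would wait on
 * that stream only; g_renderStream is an assumed global, not part of the
 * sample. */
void CinderCUDASampleApp::updateStreamed()
{
    generateCUDAImage();                   // assumed to launch on g_renderStream
    cudaStreamSynchronize(g_renderStream); // wait for this frame's work only
}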
int main(int argc, char **argv) { int N = 0, nz = 0, *I = NULL, *J = NULL; float *val = NULL; const float tol = 1e-5f; const int max_iter = 10000; float *x; float *rhs; float a, b, na, r0, r1; float dot; float *r, *p, *Ax; int k; float alpha, beta, alpham1; printf("Starting [%s]...\n", sSDKname); // This will pick the best possible CUDA capable device cudaDeviceProp deviceProp; int devID = findCudaDevice(argc, (const char **)argv); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); if (!deviceProp.managedMemory) { // This samples requires being run on a device that supports Unified Memory fprintf(stderr, "Unified Memory not supported on this device\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_WAIVED); } // Statistics about the GPU device printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); /* Generate a random tridiagonal symmetric matrix in CSR format */ N = 1048576; nz = (N-2)*3 + 4; cudaMallocManaged((void **)&I, sizeof(int)*(N+1)); cudaMallocManaged((void **)&J, sizeof(int)*nz); cudaMallocManaged((void **)&val, sizeof(float)*nz); genTridiag(I, J, val, N, nz); cudaMallocManaged((void **)&x, sizeof(float)*N); cudaMallocManaged((void **)&rhs, sizeof(float)*N); for (int i = 0; i < N; i++) { rhs[i] = 1.0; x[i] = 0.0; } /* Get handle to the CUBLAS context */ cublasHandle_t cublasHandle = 0; cublasStatus_t cublasStatus; cublasStatus = cublasCreate(&cublasHandle); checkCudaErrors(cublasStatus); /* Get handle to the CUSPARSE context */ cusparseHandle_t cusparseHandle = 0; cusparseStatus_t cusparseStatus; cusparseStatus = cusparseCreate(&cusparseHandle); checkCudaErrors(cusparseStatus); cusparseMatDescr_t descr = 0; cusparseStatus = cusparseCreateMatDescr(&descr); checkCudaErrors(cusparseStatus); cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); // temp memory for CG checkCudaErrors(cudaMallocManaged((void **)&r, N*sizeof(float))); checkCudaErrors(cudaMallocManaged((void **)&p, N*sizeof(float))); checkCudaErrors(cudaMallocManaged((void **)&Ax, N*sizeof(float))); cudaDeviceSynchronize(); for (int i=0; i < N; i++) { r[i] = rhs[i]; } alpha = 1.0; alpham1 = -1.0; beta = 0.0; r0 = 0.; cusparseScsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, descr, val, I, J, x, &beta, Ax); cublasSaxpy(cublasHandle, N, &alpham1, Ax, 1, r, 1); cublasStatus = cublasSdot(cublasHandle, N, r, 1, r, 1, &r1); k = 1; while (r1 > tol*tol && k <= max_iter) { if (k > 1) { b = r1 / r0; cublasStatus = cublasSscal(cublasHandle, N, &b, p, 1); cublasStatus = cublasSaxpy(cublasHandle, N, &alpha, r, 1, p, 1); } else { cublasStatus = cublasScopy(cublasHandle, N, r, 1, p, 1); } cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, descr, val, I, J, p, &beta, Ax); cublasStatus = cublasSdot(cublasHandle, N, p, 1, Ax, 1, &dot); a = r1 / dot; cublasStatus = cublasSaxpy(cublasHandle, N, &a, p, 1, x, 1); na = -a; cublasStatus = cublasSaxpy(cublasHandle, N, &na, Ax, 1, r, 1); r0 = r1; cublasStatus = cublasSdot(cublasHandle, N, r, 1, r, 1, &r1); cudaThreadSynchronize(); printf("iteration = %3d, residual = %e\n", k, sqrt(r1)); 
        k++;
    }

    printf("Final residual: %e\n", sqrt(r1));
    fprintf(stdout, "&&&& uvm_cg test %s\n", (sqrt(r1) < tol) ? "PASSED" : "FAILED");

    float rsum, diff, err = 0.0;

    for (int i = 0; i < N; i++)
    {
        rsum = 0.0;

        for (int j = I[i]; j < I[i+1]; j++)
        {
            rsum += val[j]*x[J[j]];
        }

        diff = fabs(rsum - rhs[i]);

        if (diff > err)
        {
            err = diff;
        }
    }

    cusparseDestroy(cusparseHandle);
    cublasDestroy(cublasHandle);

    cudaFree(I);
    cudaFree(J);
    cudaFree(val);
    cudaFree(x);
    cudaFree(rhs);
    cudaFree(r);
    cudaFree(p);
    cudaFree(Ax);

    cudaDeviceReset();

    printf("Test Summary: Error amount = %f, result = %s\n", err, (k <= max_iter) ? "SUCCESS" : "FAILURE");
    exit((k <= max_iter) ? EXIT_SUCCESS : EXIT_FAILURE);
}
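/* The sample relies on cusparseScsrmv and cudaThreadSynchronize, both of
 * which newer toolkits drop (cusparseScsrmv was removed in CUDA 11;
 * cudaThreadSynchronize is a deprecated alias of cudaDeviceSynchronize).
 * Below is a sketch of the generic cusparseSpMV path that replaces the
 * Ax = A*p product above, reusing the same I/J/val CSR arrays; error
 * checking omitted. Early 11.x toolkits spell the algorithm enum
 * CUSPARSE_MV_ALG_DEFAULT instead. */
cusparseSpMatDescr_t matA;
cusparseDnVecDescr_t vecP, vecAx;
void  *dBuffer    = NULL;
size_t bufferSize = 0;

cusparseCreateCsr(&matA, N, N, nz, I, J, val,
                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                  CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
cusparseCreateDnVec(&vecP,  N, p,  CUDA_R_32F);
cusparseCreateDnVec(&vecAx, N, Ax, CUDA_R_32F);

cusparseSpMV_bufferSize(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                        &alpha, matA, vecP, &beta, vecAx, CUDA_R_32F,
                        CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
cudaMalloc(&dBuffer, bufferSize);
cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
             &alpha, matA, vecP, &beta, vecAx, CUDA_R_32F,
             CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);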
/**
 * Synchronizes the CUDA device in the case of a GPU build
 */
inline void SynchronizeCUDA()
{
#ifdef __CUDA_BACKEND__
    cudaDeviceSynchronize();
#endif
}
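/* Callers of SynchronizeCUDA() cannot observe failures from earlier
 * asynchronous launches. A sketch of a checked variant under the same build
 * guard; it returns 0 (cudaSuccess) in CPU-only builds. */
inline int SynchronizeCUDAChecked()
{
#ifdef __CUDA_BACKEND__
    return (int) cudaDeviceSynchronize();   // 0 == cudaSuccess
#else
    return 0;                               // nothing to wait for without a GPU
#endif
}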
void runAutoTest(int argc, char *argv[]) { printf("[%s] (automated testing w/ readback)\n", sSDKsample); int devID = findCudaDevice(argc, (const char **)argv); // Ensure that SM 2.0 or higher device is available before running checkDeviceMeetComputeSpec(argc, argv); loadDefaultImage(argv[0]); Pixel *d_result; checkCudaErrors(cudaMalloc((void **)&d_result, imWidth*imHeight*sizeof(Pixel))); char *ref_file = NULL; char dump_file[256]; int mode = 0; mode = getCmdLineArgumentInt(argc, (const char **)argv, "mode"); getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); switch (mode) { case 0: g_SobelDisplayMode = SOBELDISPLAY_IMAGE; sprintf(dump_file, "lena_orig.pgm"); break; case 1: g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX; sprintf(dump_file, "lena_tex.pgm"); break; case 2: g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED; sprintf(dump_file, "lena_shared.pgm"); break; default: printf("Invalid Filter Mode File\n"); exit(EXIT_FAILURE); break; } printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]); sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale, blockOp, pointOp); checkCudaErrors(cudaDeviceSynchronize()); unsigned char *h_result = (unsigned char *)malloc(imWidth*imHeight*sizeof(Pixel)); checkCudaErrors(cudaMemcpy(h_result, d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost)); sdkSavePGM(dump_file, h_result, imWidth, imHeight); if (!sdkComparePGM(dump_file, sdkFindFilePath(ref_file, argv[0]), MAX_EPSILON_ERROR, 0.15f, false)) { g_TotalErrors++; } checkCudaErrors(cudaFree(d_result)); free(h_result); if (g_TotalErrors != 0) { printf("Test failed!\n"); exit(EXIT_FAILURE); } printf("Test passed!\n"); exit(EXIT_SUCCESS); }
void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight, GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight, GpuMat& labels, GpuMat& buf, Stream& s) { #if (CUDA_VERSION < 5000) CV_Assert(terminals.type() == CV_32S); #else CV_Assert(terminals.type() == CV_32S || terminals.type() == CV_32F); #endif Size src_size = terminals.size(); CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width)); CV_Assert(leftTransp.type() == terminals.type()); CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width)); CV_Assert(rightTransp.type() == terminals.type()); CV_Assert(top.size() == src_size); CV_Assert(top.type() == terminals.type()); CV_Assert(topLeft.size() == src_size); CV_Assert(topLeft.type() == terminals.type()); CV_Assert(topRight.size() == src_size); CV_Assert(topRight.type() == terminals.type()); CV_Assert(bottom.size() == src_size); CV_Assert(bottom.type() == terminals.type()); CV_Assert(bottomLeft.size() == src_size); CV_Assert(bottomLeft.type() == terminals.type()); CV_Assert(bottomRight.size() == src_size); CV_Assert(bottomRight.type() == terminals.type()); labels.create(src_size, CV_8U); NppiSize sznpp; sznpp.width = src_size.width; sznpp.height = src_size.height; int bufsz; nppSafeCall( nppiGraphcut8GetSize(sznpp, &bufsz) ); ensureSizeIsEnough(1, bufsz, CV_8U, buf); cudaStream_t stream = StreamAccessor::getStream(s); NppStreamHandler h(stream); NppiGraphcutStateHandler state(sznpp, buf.ptr<Npp8u>(), nppiGraphcut8InitAlloc); #if (CUDA_VERSION < 5000) nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(), bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(), static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) ); #else if (terminals.type() == CV_32S) { nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(), bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(), static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) ); } else { nppSafeCall( nppiGraphcut8_32f8u(terminals.ptr<Npp32f>(), leftTransp.ptr<Npp32f>(), rightTransp.ptr<Npp32f>(), top.ptr<Npp32f>(), topLeft.ptr<Npp32f>(), topRight.ptr<Npp32f>(), bottom.ptr<Npp32f>(), bottomLeft.ptr<Npp32f>(), bottomRight.ptr<Npp32f>(), static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) ); } #endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); }
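/* Illustrative call site for the 8-connected overload above (sizes and
 * capacities are assumptions). Note that the left/right capacities must be
 * stored transposed, exactly as the asserts require. */
void graphcut_example()
{
    using namespace cv::gpu;
    const int rows = 480, cols = 640;
    GpuMat terminals(rows, cols, CV_32S), top(rows, cols, CV_32S),
           topLeft(rows, cols, CV_32S), topRight(rows, cols, CV_32S),
           bottom(rows, cols, CV_32S), bottomLeft(rows, cols, CV_32S),
           bottomRight(rows, cols, CV_32S),
           leftTransp(cols, rows, CV_32S), rightTransp(cols, rows, CV_32S);
    // ... fill terminal and edge capacities here ...
    GpuMat labels, buf;
    graphcut(terminals, leftTransp, rightTransp, top, topLeft, topRight,
             bottom, bottomLeft, bottomRight, labels, buf, Stream::Null());
}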
void DeviceMemory::upload(const void *host_ptr_arg, size_t sizeBytes_arg)
{
    create(sizeBytes_arg);
    cudaSafeCall( cudaMemcpy(data_, host_ptr_arg, sizeBytes_, cudaMemcpyHostToDevice) );
    cudaSafeCall( cudaDeviceSynchronize() );
}
int main(int argc, char **argv) { // Start logs printf("%s Starting...\n\n", argv[0]); unsigned int useDoublePrecision; char *precisionChoice; getCmdLineArgumentString(argc, (const char **)argv, "type", &precisionChoice); if (precisionChoice == NULL) { useDoublePrecision = 0; } else { if (!STRCASECMP(precisionChoice, "double")) { useDoublePrecision = 1; } else { useDoublePrecision = 0; } } unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]; float *h_OutputGPU, *d_Output; int dim, pos; double delta, ref, sumDelta, sumRef, L1norm, gpuTime; StopWatchInterface *hTimer = NULL; if (sizeof(INT64) != 8) { printf("sizeof(INT64) != 8\n"); return 0; } // use command-line specified CUDA device, otherwise use device with highest Gflops/s int dev = findCudaDevice(argc, (const char **)argv); sdkCreateTimer(&hTimer); int deviceIndex; checkCudaErrors(cudaGetDevice(&deviceIndex)); cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, deviceIndex)); int version = deviceProp.major * 10 + deviceProp.minor; if (useDoublePrecision && version < 13) { printf("Double precision not supported.\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return 0; } printf("Allocating GPU memory...\n"); checkCudaErrors(cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float))); printf("Allocating CPU memory...\n"); h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float)); printf("Initializing QRNG tables...\n\n"); initQuasirandomGenerator(tableCPU); if (useDoublePrecision) { initTable_SM13(tableCPU); } else { initTable_SM10(tableCPU); } printf("Testing QRNG...\n\n"); checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); int numIterations = 20; for (int i = -1; i < numIterations; i++) { if (i == 0) { checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); } if (useDoublePrecision) { quasirandomGenerator_SM13(d_Output, 0, N); } else { quasirandomGenerator_SM10(d_Output, 0, N); } } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3; printf("quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS); printf("\nReading GPU results...\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost)); printf("Comparing to the CPU results...\n\n"); sumDelta = 0; sumRef = 0; for (dim = 0; dim < QRNG_DIMENSIONS; dim++) for (pos = 0; pos < N; pos++) { ref = getQuasirandomValue63(pos, dim); delta = (double)h_OutputGPU[dim * N + pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } printf("L1 norm: %E\n", sumDelta / sumRef); printf("\nTesting inverseCNDgpu()...\n\n"); checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); for (int i = -1; i < numIterations; i++) { if (i == 0) { checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); } if (useDoublePrecision) { inverseCND_SM13(d_Output, NULL, QRNG_DIMENSIONS * N); } else { inverseCND_SM10(d_Output, NULL, QRNG_DIMENSIONS * N); } } 
checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3; printf("quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128); printf("Reading GPU results...\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost)); printf("\nComparing to the CPU results...\n"); sumDelta = 0; sumRef = 0; unsigned int distance = ((unsigned int)-1) / (QRNG_DIMENSIONS * N + 1); for (pos = 0; pos < QRNG_DIMENSIONS * N; pos++) { unsigned int d = (pos + 1) * distance; ref = MoroInvCNDcpu(d); delta = (double)h_OutputGPU[pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } printf("L1 norm: %E\n\n", L1norm = sumDelta / sumRef); printf("Shutting down...\n"); sdkDeleteTimer(&hTimer); free(h_OutputGPU); checkCudaErrors(cudaFree(d_Output)); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE); }
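/* Both benchmarks above use the same idiom: run one untimed pass (i == -1)
 * to absorb JIT/cache warm-up, then start the timer at i == 0. A generic
 * sketch of the idiom with CUDA events; kernelUnderTest, grid and block are
 * placeholders. */
cudaEvent_t evStart, evStop;
cudaEventCreate(&evStart);
cudaEventCreate(&evStop);
for (int i = -1; i < numIterations; i++) {
    if (i == 0) {                        // warm-up pass done; begin timing
        cudaDeviceSynchronize();
        cudaEventRecord(evStart);
    }
    kernelUnderTest<<<grid, block>>>(/* ... */);
}
cudaEventRecord(evStop);
cudaEventSynchronize(evStop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, evStart, evStop);
printf("average time: %.4f ms/iteration\n", ms / numIterations);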
void DeviceMemory::download(void *host_ptr_arg) const
{
    cudaSafeCall( cudaMemcpy(host_ptr_arg, data_, sizeBytes_, cudaMemcpyDeviceToHost) );
    cudaSafeCall( cudaDeviceSynchronize() );
}
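/* upload() and download() are fully synchronous. A sketch of asynchronous
 * counterparts (assumed extensions, not part of the class): overlap only
 * happens when host_ptr_arg is pinned via cudaMallocHost/cudaHostRegister,
 * otherwise the copy silently degrades to a synchronous one. */
void DeviceMemory::uploadAsync(const void *host_ptr_arg, size_t sizeBytes_arg, cudaStream_t stream)
{
    create(sizeBytes_arg);
    cudaSafeCall( cudaMemcpyAsync(data_, host_ptr_arg, sizeBytes_, cudaMemcpyHostToDevice, stream) );
}

void DeviceMemory::downloadAsync(void *host_ptr_arg, cudaStream_t stream) const
{
    cudaSafeCall( cudaMemcpyAsync(host_ptr_arg, data_, sizeBytes_, cudaMemcpyDeviceToHost, stream) );
    // the caller synchronizes when it needs the data: cudaStreamSynchronize(stream)
}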
void magmablas_ssymm_mgpu_spec( magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, float alpha, float *dA[], magma_int_t ldda, magma_int_t offset, float *dB[], magma_int_t lddb, float beta, float *dC[], magma_int_t lddc, float *dwork[], magma_int_t dworksiz, float *C, magma_int_t ldc, float *work[], magma_int_t ldwork, magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream, magma_event_t redevents[][MagmaMaxGPUs*MagmaMaxGPUs+10],magma_int_t nbevents, magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2], magma_int_t nbcmplx ) { #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda) #define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb) #define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc) #define dwork(dev, i, j) (dwork[dev] + (i) + (j)*lddwork) #define C(i, j) (C + (i) + (j)*ldc) if ( side != MagmaLeft || uplo != MagmaLower ) { fprintf( stderr, "%s: only Left Lower implemented\n", __func__ ); } assert( ldda >= m ); assert( lddb >= m ); assert( lddc >= m ); assert( nstream >= ngpu ); assert( nbevents >= ngpu*ngpu ); float *dwork1[MagmaMaxGPUs]; float *dwork2[MagmaMaxGPUs]; magma_int_t lddwork = lddc; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { dwork1[dev] = dwork[dev]; dwork2[dev] = dwork[dev]+n*lddwork; } assert( dworksiz >= (2*n*lddwork) ); magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_t cstream; magmablasGetKernelStream(&cstream); magma_int_t dev,devperm,myblk,mycolsize,myblkoffst; magma_int_t gdev,gcolsize,gmaster,gngpu; magma_int_t masterdev,lcdev,lccolsize,myngpu; magma_int_t stdev = (offset/nb)%ngpu; magma_int_t blockoffset = offset % nb; magma_int_t fstblksiz = 0; if(blockoffset>0){ fstblksiz = min(m, (nb - blockoffset)); } //magma_int_t nbblk = magma_ceildiv(m,nb); magma_int_t nbblk = magma_ceildiv((m+blockoffset),nb); magma_int_t maxgsize = n*nb*magma_ceildiv(nbblk,ngpu); magma_int_t remm = m- fstblksiz; magma_int_t nbblkoffst = offset/nb; magma_int_t nblstblks = -1; magma_int_t devlstblk = -1; magma_int_t lstblksiz = remm%nb; if(lstblksiz>0){ nblstblks = nbblk%ngpu; devlstblk = (nblstblks-1+ngpu)%ngpu; } magma_int_t nbcmplxactive = 0; magma_int_t cmplxisactive[MagmaMaxGPUs]; magma_int_t gpuisactive[MagmaMaxGPUs]; memset(gpuisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t)); memset(cmplxisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t)); //******************************* // each GPU make a GEMM with the // transpose of its blocks to compute // a final portion of X=A*VT //******************************* /* dB = V*T already ==> dB' = T'*V' * compute T'*V'*X is equal to compute locally (VT)'_i*X_i * then each GPU broadcast its X_i to assemble the full X which is used * to compute W = X - 0.5 * V * T'*V'*X = X - 0.5 * V *dwork3 */ if(ngpu ==1){ magma_setdevice( 0 ); magmablasSetKernelStream( streams[ 0 ][ 0 ] ); // compute X[me] = A*VT = A[me]^tr *VT; magma_sgemm( MagmaTrans, MagmaNoTrans, m, n, m, alpha, dA(0,offset,offset), ldda, dB[0], lddb, beta, dC[0], lddc ); return; } //ngpu>1 for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { masterdev = -1; gnode[cmplxid][MagmaMaxGPUs+1] = -1; myngpu = gnode[cmplxid][MagmaMaxGPUs]; for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; devperm = (dev-stdev+ngpu)%ngpu; myblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 
1:0 ); mycolsize = myblk*nb; myblkoffst = nb*((nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0)); if(dev==stdev){ mycolsize -= blockoffset; myblkoffst += blockoffset; // local index in parent matrix } if((devperm==devlstblk)&&(lstblksiz>0)){ mycolsize -= (nb-(remm%nb)); } mycolsize = min(mycolsize,m); if(mycolsize>0){ if(masterdev==-1) masterdev = dev; //printf("dev %d devperm %d on cmplx %d master %d nbblk %d myblk %d m %d n %d mycolsize %d stdev %d fstblksize %d lastdev %d lastsize %d dA(%d,%d,%d) ==> dwork(%d,%d)\n",dev,devperm,cmplxid,masterdev,nbblk,myblk,m,n,mycolsize,stdev,fstblksiz,devlstblk,remm%nb,dev,offset,myblkoffst,dev,maxgsize*dev); gpuisactive[dev] = mycolsize; magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ dev ] ); magma_sgemm( MagmaTrans, MagmaNoTrans, mycolsize, n, m, alpha, dA(dev,offset,myblkoffst), ldda, dB(dev,0,0), lddb, beta, &dwork[dev][maxgsize*dev], mycolsize ); magma_event_record(redevents[dev][dev*ngpu+dev], streams[dev][dev]); } if(dev == masterdev){ nbcmplxactive = nbcmplxactive +1; cmplxisactive[cmplxid] = 1; gnode[cmplxid][MagmaMaxGPUs+1] = masterdev; } } } /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_queue_sync( streams[ dev ][ dev ] ); } */ //******************************* // each Master GPU has the final // result either by receiving // from CPU of by making the add // by himself, so now it is time // to broadcast over the GPUs of // its board. //******************************* //printf("=======================================================================\n"); //printf(" sending \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; mycolsize = gpuisactive[dev]; if(mycolsize>0){ // I am an active GPU send my portion local // to all active gpu of my cmplex and global to the // active master of the other real and they should // send it out to their actives slaves. 
magma_setdevice( dev ); //============================================== // sending to the master of the active real //============================================== //printf ("\n\n**************GPU %d\n ",dev); //printf (" GPU %d sending to cmplx masters\n",dev); for( magma_int_t k = 0; k < nbcmplx; ++k ) { if(k!=cmplxid){ gmaster = gnode[k][MagmaMaxGPUs+1]; if(gmaster!=-1){ //real is active //printf (" device %d from cmplx %d is sending to master %d on cmplx %d block of size %d event %d\n",dev,cmplxid,gmaster,k,mycolsize,redevents[dev][gmaster*ngpu+dev]); magma_queue_wait_event(streams[ dev ][ gmaster ], redevents[dev][dev*ngpu+dev]); cudaMemcpy2DAsync(&dwork[gmaster][maxgsize*dev], mycolsize*sizeof(float), &dwork[dev][maxgsize*dev], mycolsize*sizeof(float), mycolsize*sizeof(float), n, cudaMemcpyDeviceToDevice, streams[dev][gmaster]); magma_event_record(redevents[dev][gmaster*ngpu+dev], streams[dev][gmaster]); } } } //============================================== // //============================================== // sending to the active GPUs of my real //============================================== //printf (" GPU %d sending internal\n",dev); for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if((lcdev!=dev)&&(lccolsize>0)){ //printf (" device %d from cmplx %d is sending internal to dev %d block of size %d event %d\n",dev,cmplxid,lcdev,mycolsize,redevents[dev][lcdev*ngpu+dev]); magma_queue_wait_event(streams[ dev ][ lcdev ], redevents[dev][dev*ngpu+dev]); cudaMemcpy2DAsync(&dwork[lcdev][maxgsize*dev], mycolsize*sizeof(float), &dwork[dev][maxgsize*dev], mycolsize*sizeof(float), mycolsize*sizeof(float), n, cudaMemcpyDeviceToDevice, streams[dev][lcdev]); magma_event_record(redevents[dev][lcdev*ngpu+dev], streams[dev][lcdev]); } } //============================================== }// end if mycolsize>0 }// for idev }// for cmplxid //printf("=======================================================================\n"); //printf(" master wait and resend internally \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //============================================== // if I am active master so wait receiving contribution // of the GPUs of other real and send it locally //============================================== if(masterdev != -1){ mycolsize = gpuisactive[masterdev]; magma_setdevice( masterdev ); //printf(" GPU %d distributing internal\n",masterdev); for( magma_int_t k = 0; k < nbcmplx; ++k ) { if(k!=cmplxid){ gngpu = gnode[k][MagmaMaxGPUs]; for( magma_int_t g = 0; g < gngpu; ++g ) { gdev = gnode[k][g]; gcolsize = gpuisactive[gdev]; // check if I received from this GPU, // if yes send it to my group if(gcolsize>0){ magma_queue_wait_event(streams[ masterdev ][ gdev ], redevents[gdev][masterdev*ngpu+gdev]); for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if((lcdev!=masterdev)&&(lccolsize>0)){ //printf(" Master %d on cmplx %d waiting on event %d is distributing internal results of %d to lcdev %d block of size %d event %d\n", masterdev,cmplxid,redevents[gdev][masterdev*ngpu+gdev],gdev,lcdev,gcolsize,redevents[masterdev][lcdev*ngpu+gdev]); cudaMemcpy2DAsync(&dwork[lcdev][maxgsize*gdev], gcolsize*sizeof(float), &dwork[masterdev][maxgsize*gdev], gcolsize*sizeof(float), gcolsize*sizeof(float), n, 
cudaMemcpyDeviceToDevice, streams[masterdev][gdev]); magma_event_record(redevents[masterdev][lcdev*ngpu+gdev], streams[masterdev][gdev]); } } } } } } }// if active master //============================================== }// for cmplxid /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_queue_sync( streams[ dev ][ 0 ] ); for( magma_int_t s = 0; s < ngpu; ++s ) { magma_queue_sync( streams[ dev ][ s ] ); } } */ //printf("=======================================================================\n"); //printf(" distributing \n"); //printf("=======================================================================\n"); magma_int_t lcblki,gbblki,gblk,ib; for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; mycolsize = gpuisactive[dev]; if(mycolsize>0){ // I am an active GPU //printf("\n\n==============GPU %d collecting\n",dev); magma_setdevice( dev ); // collect my results first as tyhere is no need to wait to // receive nothing, just wait that my gemm are done. // in theory this should be inside the loop but cuda was not // able to run it first for all gpu and on gpu>0 it was waiting // however it was on different stream so it should run. but maybe // this is because there are too many function call and this make // cuda not handleit so nice. anyway it coul dbe removed when cuda // is able to lunch it first without wait. gdev = dev; gcolsize = gpuisactive[gdev]; if(gcolsize>0){ devperm = (gdev-stdev+ngpu)%ngpu; gblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1:0 ); magmablasSetKernelStream( streams[ dev ][ gdev ] ); magma_queue_wait_event(streams[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]); //printf (" GPU %d stream %d doing slacpy\n",dev,streams[ dev ][ gdev ]); for( magma_int_t blki = 0; blki < gblk; ++blki){ gbblki = (blki*ngpu + devperm)*nb - blockoffset; lcblki = blki*nb; ib = nb;//min(nb,m-gbblki); if(gdev==stdev){ lcblki = blki*nb-blockoffset; if(blki==0){ gbblki = 0; lcblki = 0; ib = nb-blockoffset; } } ib = min(ib,m-gbblki); //printf(" blockoffset %d nbblk %d stdev %d receiving from gdev %d gblk %d gcolsize %d copying blki %d of size ibxn %dx%d from work[%d] to C[%d]\n", blockoffset,nbblk,stdev,gdev,gblk,gcolsize,blki,ib,n,lcblki,gbblki); magmablas_slacpy( MagmaFull, ib, n, &dwork[dev][maxgsize*gdev+lcblki], gcolsize, &dC[dev][gbblki], lddc); }// end blki } for( magma_int_t k = 0; k < nbcmplx; ++k ) { gngpu = gnode[k][MagmaMaxGPUs]; for( magma_int_t g = 0; g < gngpu; ++g ) { gdev = gnode[k][g]; gcolsize = gpuisactive[gdev]; // if gcolsize>0, ==> gpu gdev was active and so // I received from him/computed a portion of dwork, // so go over its gblk and distribute it on dC. if(gdev!=dev){ if(gcolsize>0){ devperm = (gdev-stdev+ngpu)%ngpu; gblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 
1:0 ); magmablasSetKernelStream( streams[ dev ][ gdev ] ); if(k==cmplxid){ //we are on the same group so wait on event issued by gdev for me citing his id magma_queue_wait_event(streams[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]); //printf (" GPU %d stream %d waiting on event %d to collecte from %d the size of gcolsize %d\n",dev,streams[ dev ][ gdev ],redevents[gdev][dev*ngpu+gdev],gdev,gcolsize); }else{ //we are on different group so: //if I am the master wait on the event issued by gdev for me citing his id //else wait event issued by my master for me on the behalf of gdev //printf (" GPU %d stream %d waiting on event %d to collecte from %d the size of gcolsize %d\n",dev,streams[ dev ][ gdev ],redevents[masterdev][dev*ngpu+gdev],gdev,gcolsize); if(dev==masterdev) magma_queue_wait_event(streams[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev]); else magma_queue_wait_event(streams[ dev ][ gdev ], redevents[masterdev][dev*ngpu+gdev]); } //printf (" GPU %d stream %d doing slacpy\n",dev,streams[ dev ][ gdev ]); for( magma_int_t blki = 0; blki < gblk; ++blki){ gbblki = (blki*ngpu + devperm)*nb - blockoffset; lcblki = blki*nb; ib = nb;//min(nb,m-gbblki); if(gdev==stdev){ lcblki = blki*nb-blockoffset; if(blki==0){ gbblki = 0; lcblki = 0; ib = nb-blockoffset; } } ib = min(ib,m-gbblki); //printf(" blockoffset %d nbblk %d stdev %d receiving from gdev %d gblk %d gcolsize %d copying blki %d of size ibxn %dx%d from work[%d] to C[%d]\n", blockoffset,nbblk,stdev,gdev,gblk,gcolsize,blki,ib,n,lcblki,gbblki); magmablas_slacpy( MagmaFull, ib, n, &dwork[dev][maxgsize*gdev+lcblki], gcolsize, &dC[dev][gbblki], lddc); }// end blki }// en gcolsize>0 meaning gdev is active } // end if gdev != dev }// end loop over the g gpus of the cmplx k }//end loop over the real k }// end mycolsize>0 meaning that I am active }// end loop over idev of cmplxid }// end loop of the cmplx for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); cudaDeviceSynchronize(); } // put back the input gpu and its input stream magma_setdevice( cdev ); magmablasSetKernelStream( cstream ); }
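/* The ownership arithmetic repeated throughout the routine above, factored
 * into a helper for reference: in a 1-D block-cyclic layout of nbblk blocks
 * of width nb over ngpu devices starting at device stdev, a device owns
 * nbblk/ngpu blocks plus one extra if its permuted index precedes the
 * remainder. */
static magma_int_t blocks_owned(magma_int_t nbblk, magma_int_t ngpu,
                                magma_int_t dev, magma_int_t stdev)
{
    magma_int_t devperm = (dev - stdev + ngpu) % ngpu;
    return nbblk/ngpu + (nbblk % ngpu > devperm ? 1 : 0);
}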
int main (int argc, char **argv){ unsigned int size = 4000; unsigned int interval = 4000; int *data = (int*)malloc(size * sizeof(int)); int threshold = 127; srand(0); unsigned int count = 0; int val; for (unsigned int s = 0; s < size; ++s) { val = rand() % (2*threshold + 1); if (val > threshold) { data[s] = 1; ++count; } else { data[s] = 0; } } int * output = (int*)malloc(size * sizeof(int)); unsigned int total = 0; cudaSetDevice(0); cudaStream_t stream; cudaError_t error; error = cudaStreamCreate(&stream); // now do the tests // CPU long t1, t2; t1 = ClockGetTime3(); total = nscale::gpu::SelectCPUTesting(data, size, output ); t2 = ClockGetTime3(); printf("cpu: %d total, %lu ms\n", total, t2-t1); // thrust for (unsigned int s = 0; s < size; s++) { if ((s % (size / interval)) == 0) { printf("%d, ", data[s]); } } printf("\n"); t1 = ClockGetTime3(); total = nscale::gpu::SelectThrustScanTesting(data, size, output, stream); error = cudaStreamSynchronize(stream); t2 = ClockGetTime3(); printf("thrust scan: %d total, %lu ms\n", total, t2-t1); for (unsigned int s = 0; s < size; s++) { if ((s % (size / interval)) == 0) { printf("%d, ", output[s]); } } printf("\n"); cudaDeviceSynchronize(); // warp scan unordered // for (unsigned int s = 0; s < size; s++) { // if ((s % 10000) == 0) { // printf("%d, ", data[s]); // } // } t1 = ClockGetTime3(); total = nscale::gpu::SelectWarpScanUnorderedTesting(data, size, output, stream); // error = cudaStreamSynchronize(stream); // cudaDeviceSynchronize(); t2 = ClockGetTime3(); printf("warp scan unordered: %d total, %lu ms\n", total, t2-t1); for (unsigned int s = 0; s < size; s++) { if ((s % (size / interval)) == 0) { printf("%d, ", output[s]); } } printf("\n"); // cudaDeviceSynchronize(); int count2; // warp scan ordered t1 = ClockGetTime3(); total = nscale::gpu::SelectWarpScanOrderedTesting(data, size, output, stream); // cudaDeviceSynchronize(); t2 = ClockGetTime3(); printf("warp scan ordered: %d total, %lu ms\n", total, t2-t1); for (unsigned int s = 0; s < size; s++) { if ((s % (size / interval)) == 0) { printf("%d, ", output[s]); } } printf("\n"); error = cudaStreamDestroy(stream); free(data); free(output); return 0; }
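/* The nscale::gpu::Select* entry points benchmarked above are not shown
 * here. A sketch of the stream compaction the Thrust variant presumably
 * performs (the signature is an assumption): copy the 0/1 flags to the
 * device, compact the nonzero ones on the given stream, and return the
 * count. */
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/system/cuda/execution_policy.h>

struct is_set {
    __host__ __device__ bool operator()(int v) const { return v != 0; }
};

int SelectThrustSketch(const int *h_data, unsigned int size, int *h_output, cudaStream_t stream)
{
    thrust::device_vector<int> d_in(h_data, h_data + size);   // host -> device
    thrust::device_vector<int> d_out(size);
    thrust::device_vector<int>::iterator end =
        thrust::copy_if(thrust::cuda::par.on(stream),
                        d_in.begin(), d_in.end(), d_out.begin(), is_set());
    int total = (int)(end - d_out.begin());
    thrust::copy(d_out.begin(), end, h_output);               // device -> host
    return total;
}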
int main( int argc, char **argv ) { printf("Starting\n"); int size; cudaError_t cudaStat; magma_err_t magmaStat; cublasStatus_t stat; cublasHandle_t handle; int it,i; cublasOperation_t N = 'N'; cublasOperation_t T = 'T'; char N2 = 'N'; char T2 = 'T'; double one = 1., zero=0.; char uplo = 'L'; int info; int err; double* A; double* B; magmaStat = magma_init(); int use_pinned; if(argc > 1) { use_pinned = atoi(argv[1]); } else use_pinned = 0; printf("Setting use_pinned to %d\n", use_pinned); for( size = 256; size <= 8192; size*=2 ) { if(use_pinned) { // allocate pinned memory on CPU err = magma_dmalloc_pinned( &A, size*size ); assert( err == 0 ); err = magma_dmalloc_pinned( &B, size*size ); assert( err == 0 ); } else { // allocate standard memory on CPU A = (double*) malloc( sizeof(double)*size*size ); B = (double*) malloc( sizeof(double)*size*size ); } cudaDeviceSynchronize(); double tInit = read_timer(); double *dA,*dB; // allocate memory on GPU magma_malloc( (void**) &dA, sizeof(double)*size*size ); magma_malloc( (void**) &dB, sizeof(double)*size*size ); cudaDeviceSynchronize(); double tAlloc = read_timer(); fillMatrix(B, size*size); cudaDeviceSynchronize(); double tInit2 = read_timer(); // transfer data to GPU magma_dsetmatrix( size, size, B, size, dB, size ); cudaDeviceSynchronize(); double tTransferToGPU = read_timer(); // matrix multiply magmablas_dgemm('N', 'T', size, size, size, one, dB, size, dB, size, zero, dA, size ); // magma_dgemm is apparently synonymous with magmablas_dgemm cudaDeviceSynchronize(); double tMatMult = read_timer(); // Cholesky decomposition on GPU with GPU interface (called with object on GPU) magma_dpotrf_gpu( 'L', size, dA, size, &info ); cudaDeviceSynchronize(); double tChol = read_timer(); // transfer data back to CPU magma_dgetmatrix( size, size, dA, size, A, size ); cudaDeviceSynchronize(); double tTransferFromGPU = read_timer(); // standard BLAS matrix multiply on CPU dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size ); cudaDeviceSynchronize(); double tMatMultBlas = read_timer(); // Cholesky decomposition on GPU with CPU interface (called with object on CPU) magma_dpotrf( 'L', size, A, size, &info ); cudaDeviceSynchronize(); double tCholCpuInterface = read_timer(); // recreate A = B * B (could just do a save and copy instead....) dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size ); cudaDeviceSynchronize(); double tInit3 = read_timer(); // standard Lapack Cholesky decomposition on CPU dpotrf_(&uplo, &size, A, &size, &info); cudaDeviceSynchronize(); double tCholCpu= read_timer(); printf("====================================================\n"); printf("Timing results for n = %d\n", size); printf("GPU memory allocation time: %f\n", tAlloc - tInit); printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit2); printf("Matrix multiply time (GPU): %f\n", tMatMult - tTransferToGPU); printf("Matrix multiply time (BLAS): %f\n", tMatMultBlas - tTransferToGPU); printf("Cholesky factorization time (GPU w/ GPU interface): %f\n", tChol - tMatMult); printf("Cholesky factorization time (GPU w/ CPU interface): %f\n", tCholCpuInterface - tMatMultBlas); printf("Cholesky factorization time (LAPACK): %f\n", tCholCpu - tInit3); printf("Transfer from GPU time: %f\n", tTransferFromGPU - tChol); if(use_pinned) { magma_free_pinned(A); magma_free_pinned(B); } else { free(A); free(B); } magma_free(dA); magma_free(dB); } return EXIT_SUCCESS; }
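/* The benchmark never inspects info after the Cholesky calls. A sketch of
 * the check (LAPACK convention: 0 = success, < 0 = illegal argument,
 * > 0 = the leading minor of that order is not positive definite): */
magma_dpotrf_gpu('L', size, dA, size, &info);
if (info != 0) {
    fprintf(stderr, "magma_dpotrf_gpu failed: info = %d\n", info);
}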
double do_compute_and_probe(double seconds, MPI_Request* request) { double t1 = 0.0, t2 = 0.0; double test_time = 0.0; int num_tests = 0; double target_seconds_for_compute = 0.0; int flag = 0; MPI_Status status; if (options.num_probes) { target_seconds_for_compute = (double) seconds/options.num_probes; if (DEBUG) fprintf(stderr, "setting target seconds to %f\n", (target_seconds_for_compute * 1e6 )); } else { target_seconds_for_compute = seconds; if (DEBUG) fprintf(stderr, "setting target seconds to %f\n", (target_seconds_for_compute * 1e6 )); } #ifdef _ENABLE_CUDA_KERNEL_ if (options.target == gpu) { if (options.num_probes) { /* Do the dummy compute on GPU only */ do_compute_gpu(target_seconds_for_compute); num_tests = 0; while (num_tests < options.num_probes) { t1 = MPI_Wtime(); MPI_Test(request, &flag, &status); t2 = MPI_Wtime(); test_time += (t2-t1); num_tests++; } } else { do_compute_gpu(target_seconds_for_compute); } } else if (options.target == both) { if (options.num_probes) { /* Do the dummy compute on GPU and CPU*/ do_compute_gpu(target_seconds_for_compute); num_tests = 0; while (num_tests < options.num_probes) { t1 = MPI_Wtime(); MPI_Test(request, &flag, &status); t2 = MPI_Wtime(); test_time += (t2-t1); num_tests++; do_compute_cpu(target_seconds_for_compute); } } else { do_compute_gpu(target_seconds_for_compute); do_compute_cpu(target_seconds_for_compute); } } else #endif if (options.target == cpu) { if (options.num_probes) { num_tests = 0; while (num_tests < options.num_probes) { do_compute_cpu(target_seconds_for_compute); t1 = MPI_Wtime(); MPI_Test(request, &flag, &status); t2 = MPI_Wtime(); test_time += (t2-t1); num_tests++; } } else { do_compute_cpu(target_seconds_for_compute); } } #ifdef _ENABLE_CUDA_KERNEL_ if (options.target == gpu || options.target == both) { cudaDeviceSynchronize(); cudaStreamDestroy(stream); } #endif return test_time; }
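/* Illustrative call site (buf, peer and seconds are assumptions): overlap a
 * nonblocking receive with compute while probing the request as configured
 * by options.num_probes. */
MPI_Request request;
MPI_Status  status;
char buf[1 << 20];
MPI_Irecv(buf, sizeof(buf), MPI_CHAR, peer, 0, MPI_COMM_WORLD, &request);
double probe_overhead = do_compute_and_probe(seconds, &request);
MPI_Wait(&request, &status);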
/*
 * main should only control threads
 *
 * the threads should be invoked on different cores:
 * http://stackoverflow.com/questions/1407786/how-to-set-cpu-affinity-of-a-particular-pthread
 * https://www.google.pl/search?client=ubuntu&channel=fs&q=how+to+schedule+pthreads+through+cores&ie=utf-8&oe=utf-8&gfe_rd=cr&ei=PSudVePFOqeA4AShra2AAQ
 */
int main() {
  cudaDeviceReset();
  cudaDeviceSynchronize();

  // print device properties
  print_device();

  // create pointers to data
  const uint64_t size = N;
  double complex* data_r_host = NULL; // initializing with NULL for debugging purposes
  double complex* data_k_host = NULL; // initializing with NULL for debugging purposes
  DataArray* data_arr_ptr = (DataArray*) malloc((size_t) sizeof(DataArray)); // change to global variable <- easier to code
  create_data_arr(data_arr_ptr, &data_r_host, &data_k_host, size);

  // allocate memory for array of streams
  const uint8_t num_streams = 2; // rewrite on defines?
  streams_arr = (cudaStream_t*) malloc( (size_t) sizeof(cudaStream_t)*num_streams );

  // create threads
  const uint8_t num_threads = 2;
  printf("host thread id\t %u\ndevice thread id %u\n", HOST_THRD, DEVICE_THRD);
  pthread_t* thread_ptr_arr = (pthread_t*) malloc( (size_t) sizeof(pthread_t)*num_threads ); // alternatively pthread_t* thread_ptr_arr[num_threads];

  // init barrier for threads
  pthread_barrier_init(&barrier, NULL, num_threads); // last number tells how many threads should be synchronized by this barrier

  pthread_create(&thread_ptr_arr[HOST_THRD], NULL, host_thread, (void*) data_arr_ptr);
  pthread_create(&thread_ptr_arr[DEVICE_THRD], NULL, device_thread, (void*) data_arr_ptr);
  // for (uint8_t ii = 0; ii < num_threads; ii++) {
  //   pthread_create(thread_ptr_arr[ii], NULL, host_thread, (void*) data_arr_ptr);
  // }

  //cudaStream_t stream1;
  //cudaStream_t stream2;
  //cudaStream_t* streams_arr[2] = {&stream1, &stream2};

  void* status;
  pthread_join(thread_ptr_arr[HOST_THRD], &status);
  pthread_join(thread_ptr_arr[DEVICE_THRD], &status);

  printf("data visible in main thread:\n");
  /*for (uint64_t ii=0; ii < (data_arr_ptr->size <= 32) ? data_arr_ptr->size : 32 ; ii++) {
    printf( "%lu.\t", ii );
    printf( "%lf + %lf\t", creal(data_r_host[ii]), cimag(data_r_host[ii]) );
    printf( "%lf + %lf\n", creal(data_k_host[ii]), cimag(data_k_host[ii]) );
  }*/

  free(thread_ptr_arr);
  free(streams_arr);
  free_data_arr(data_arr_ptr);
  cudaDeviceSynchronize();
  free(data_arr_ptr);

  cudaDeviceReset(); // cudaThreadExit() is deprecated; cudaDeviceReset() is its modern equivalent
  cudaDeviceSynchronize();

  printf("Main: program completed. Exiting...\n");
  return EXIT_SUCCESS;
}
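/* Sketch of the per-core pinning the header comment above links to
 * (Linux-specific; core_id is a placeholder). Could be called on
 * thread_ptr_arr[HOST_THRD] / thread_ptr_arr[DEVICE_THRD] right after
 * pthread_create. */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

static int pin_to_core(pthread_t thread, int core_id)
{
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(core_id, &cpuset);
    return pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
}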
void TEMPLATE2 (CHOLMOD (gpu_final_assembly)) ( cholmod_common *Common, double *Lx, Int psx, Int nscol, Int nsrow, int supernodeUsedGPU, int *iHostBuff, int *iDevBuff, cholmod_gpu_pointers *gpu_p ) { Int iidx, i, j; Int iHostBuff2 ; Int iDevBuff2 ; if ( supernodeUsedGPU ) { /* ------------------------------------------------------------------ */ /* Apply all of the Shur-complement updates, computed on the gpu, to */ /* the supernode. */ /* ------------------------------------------------------------------ */ *iHostBuff = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS; *iDevBuff = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS; if ( nscol * L_ENTRY >= CHOLMOD_POTRF_LIMIT ) { /* If this supernode is going to be factored using the GPU (potrf) * then it will need the portion of the update assembled ont the * CPU. So copy that to a pinned buffer an H2D copy to device. */ /* wait until a buffer is free */ cudaEventSynchronize ( Common->updateCBuffersFree[*iHostBuff] ); /* copy update assembled on CPU to a pinned buffer */ #pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \ private(iidx) if (nscol>32) for ( j=0; j<nscol; j++ ) { for ( i=j; i<nsrow*L_ENTRY; i++ ) { iidx = j*nsrow*L_ENTRY+i; gpu_p->h_Lx[*iHostBuff][iidx] = Lx[psx*L_ENTRY+iidx]; } } /* H2D transfer of update assembled on CPU */ cudaMemcpyAsync ( gpu_p->d_A[1], gpu_p->h_Lx[*iHostBuff], nscol*nsrow*L_ENTRY*sizeof(double), cudaMemcpyHostToDevice, Common->gpuStream[*iDevBuff] ); } Common->ibuffer++; iHostBuff2 = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS; iDevBuff2 = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS; /* wait for all kernels to complete */ cudaEventSynchronize( Common->updateCKernelsComplete ); /* copy assembled Schur-complement updates computed on GPU */ cudaMemcpyAsync ( gpu_p->h_Lx[iHostBuff2], gpu_p->d_A[0], nscol*nsrow*L_ENTRY*sizeof(double), cudaMemcpyDeviceToHost, Common->gpuStream[iDevBuff2] ); if ( nscol * L_ENTRY >= CHOLMOD_POTRF_LIMIT ) { /* with the current implementation, potrf still uses data from the * CPU - so put the fully assembled supernode in a pinned buffer for * fastest access */ /* need both H2D and D2H copies to be complete */ cudaDeviceSynchronize(); /* sum updates from cpu and device on device */ #ifdef REAL sumAOnDevice ( gpu_p->d_A[1], gpu_p->d_A[0], -1.0, nsrow, nscol ); #else sumComplexAOnDevice ( gpu_p->d_A[1], gpu_p->d_A[0], -1.0, nsrow, nscol ); #endif /* place final assembled supernode in pinned buffer */ #pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \ private(iidx) if (nscol>32) for ( j=0; j<nscol; j++ ) { for ( i=j*L_ENTRY; i<nscol*L_ENTRY; i++ ) { iidx = j*nsrow*L_ENTRY+i; gpu_p->h_Lx[*iHostBuff][iidx] -= gpu_p->h_Lx[iHostBuff2][iidx]; } } } else { /* assemble with CPU updates */ cudaDeviceSynchronize(); #pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \ private(iidx) if (nscol>32) for ( j=0; j<nscol; j++ ) { for ( i=j*L_ENTRY; i<nsrow*L_ENTRY; i++ ) { iidx = j*nsrow*L_ENTRY+i; Lx[psx*L_ENTRY+iidx] -= gpu_p->h_Lx[iHostBuff2][iidx]; } } } } return; }
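/* Distilled form of the buffer rotation gpu_final_assembly relies on
 * (names here are placeholders): rotate through NBUF pinned host buffers,
 * wait on a buffer's event before overwriting it, and re-record the event
 * after queuing its async copy so the next reuse waits for the transfer. */
int ibuf = ibuffer % NBUF;
cudaEventSynchronize(bufFree[ibuf]);            /* buffer no longer in flight  */
fill_host_buffer(h_buf[ibuf]);                  /* CPU-side assembly           */
cudaMemcpyAsync(d_dst, h_buf[ibuf], bytes, cudaMemcpyHostToDevice, stream);
cudaEventRecord(bufFree[ibuf], stream);         /* in flight until copy drains */
ibuffer++;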
static int cutorch_synchronize(lua_State *L)
{
  cudaDeviceSynchronize();
  return 0;
}
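/* Sketch of how a binding like this is exposed to Lua (5.1-style API; the
 * registration helper below is an assumption about the surrounding init
 * code, not cutorch's actual module setup). */
static const struct luaL_Reg cutorch_sync_reg[] = {
    {"synchronize", cutorch_synchronize},
    {NULL, NULL}
};

void cutorch_register_sync(lua_State *L)
{
    luaL_register(L, "cutorch", cutorch_sync_reg);
}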
int _tmain(int argc, _TCHAR* argv[]) { uchar4 *h_inputImageRGBA, *d_inputImageRGBA; uchar4 *h_outputImageRGBA, *d_outputImageRGBA; unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred; float *h_filter; int filterWidth; //PreProcess const std::string *filename = new std::string("./cinque_terre_small.jpg"); cv::Mat imageInputRGBA; cv::Mat imageOutputRGBA; //make sure the context initializes ok checkCudaErrors(cudaFree(0)); cv::Mat image = cv::imread(filename->c_str(), CV_LOAD_IMAGE_COLOR); if (image.empty()) { std::cerr << "Couldn't open file: " << filename << std::endl; cv::waitKey(0); exit(1); } cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA); //allocate memory for the output imageOutputRGBA.create(image.rows, image.cols, CV_8UC4); //This shouldn't ever happen given the way the images are created //at least based upon my limited understanding of OpenCV, but better to check if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) { std::cerr << "Images aren't continuous!! Exiting." << std::endl; exit(1); } h_inputImageRGBA = (uchar4 *)imageInputRGBA.ptr<unsigned char>(0); h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr<unsigned char>(0); const size_t numPixels = image.rows * image.cols; //allocate memory on the device for both input and output checkCudaErrors(cudaMalloc(&d_inputImageRGBA, sizeof(uchar4) * numPixels)); checkCudaErrors(cudaMalloc(&d_outputImageRGBA, sizeof(uchar4) * numPixels)); checkCudaErrors(cudaMemset(d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left laying around //copy input array to the GPU checkCudaErrors(cudaMemcpy(d_inputImageRGBA, h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice)); //now create the filter that they will use const int blurKernelWidth = 9; const float blurKernelSigma = 2.; filterWidth = blurKernelWidth; //create and fill the filter we will convolve with h_filter = new float[blurKernelWidth * blurKernelWidth]; float filterSum = 0.f; //for normalization for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) { for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) { float filterValue = expf( -(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma)); h_filter[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] = filterValue; filterSum += filterValue; } } float normalizationFactor = 1.f / filterSum; for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) { for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) { h_filter[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] *= normalizationFactor; } } //blurred checkCudaErrors(cudaMalloc(&d_redBlurred, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMalloc(&d_greenBlurred, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMalloc(&d_blueBlurred, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMemset(d_redBlurred, 0, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMemset(d_greenBlurred, 0, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMemset(d_blueBlurred, 0, sizeof(unsigned char) * numPixels)); allocateMemoryAndCopyToGPU(image.rows, image.cols, h_filter, filterWidth); GpuTimer timer; timer.Start(); //call the students' code your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, image.rows, image.cols, d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth); timer.Stop(); cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); int err = printf("%f msecs.\n", timer.Elapsed()); if (err < 0) { 
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  cleanup();

  //check results and output the blurred image
  //PostProcess
  //copy the output back to the host
  checkCudaErrors(cudaMemcpy(imageOutputRGBA.ptr<unsigned char>(0), d_outputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost));

  cv::Mat imageOutputBGR;
  cv::cvtColor(imageOutputRGBA, imageOutputBGR, CV_RGBA2BGR);
  //output the image
  cv::imwrite("./blurredResult.jpg", imageOutputBGR);
  cv::namedWindow("Display window", CV_WINDOW_NORMAL);
  cv::imshow("Display window", imageOutputBGR);
  cv::waitKey(0);

  //free the device and host allocations made in this harness
  checkCudaErrors(cudaFree(d_redBlurred));
  checkCudaErrors(cudaFree(d_greenBlurred));
  checkCudaErrors(cudaFree(d_blueBlurred));
  checkCudaErrors(cudaFree(d_inputImageRGBA));
  checkCudaErrors(cudaFree(d_outputImageRGBA));
  delete[] h_filter;
  delete filename;

  return 0;
}