int main(int argc,char **argv){ // Print GPU properties //print_properties(); // Files to print the result after the last time step FILE *rho_file; FILE *E_file; rho_file = fopen("rho_final.txt", "w"); E_file = fopen("E_final.txt", "w"); // Construct initial condition for problem ICsinus Config(-1.0, 1.0, -1.0, 1.0); //ICsquare Config(0.5,0.5,gasGam); // Set initial values for Configuration 1 /* Config.set_rho(rhoConfig19); Config.set_pressure(pressureConfig19); Config.set_u(uConfig19); Config.set_v(vConfig19); */ // Determining global border based on left over tiles (a little hack) int globalPadding; globalPadding = (nx+2*border+16)/16; globalPadding = 16*globalPadding - (nx+2*border); //printf("Globalpad: %i\n", globalPadding); // Change border to add padding //border = border + globalPadding/2; // Initiate the matrices for the unknowns in the Euler equations cpu_ptr_2D rho(nx, ny, border,1); cpu_ptr_2D E(nx, ny, border,1); cpu_ptr_2D rho_u(nx, ny, border,1); cpu_ptr_2D rho_v(nx, ny, border,1); cpu_ptr_2D zeros(nx, ny, border,1); // Set initial condition Config.setIC(rho, rho_u, rho_v, E); double timeStart = get_wall_time(); // Test cpu_ptr_2D rho_dummy(nx, ny, border); cpu_ptr_2D E_dummy(nx, ny, border); /* rho_dummy.xmin = -1.0; rho_dummy.ymin = -1.0; E_dummy.xmin = -1.0; E_dummy.ymin = -1.0; */ // Set block and grid sizes dim3 gridBC = dim3(1, 1, 1); dim3 blockBC = dim3(BLOCKDIM_BC,1,1); dim3 gridBlockFlux; dim3 threadBlockFlux; dim3 gridBlockRK; dim3 threadBlockRK; computeGridBlock(gridBlockFlux, threadBlockFlux, nx + 2*border, ny + 2*border, INNERTILEDIM_X, INNERTILEDIM_Y, BLOCKDIM_X, BLOCKDIM_Y); computeGridBlock(gridBlockRK, threadBlockRK, nx + 2*border, ny + 2*border, BLOCKDIM_X_RK, BLOCKDIM_Y_RK, BLOCKDIM_X_RK, BLOCKDIM_Y_RK); int nElements = gridBlockFlux.x*gridBlockFlux.y; // Allocate memory for the GPU pointers gpu_ptr_1D L_device(nElements); gpu_ptr_1D dt_device(1); gpu_ptr_2D rho_device(nx, ny, border); gpu_ptr_2D E_device(nx, ny, border); gpu_ptr_2D rho_u_device(nx, ny, border); gpu_ptr_2D rho_v_device(nx, ny, border); gpu_ptr_2D R0(nx, ny, border); gpu_ptr_2D R1(nx, ny, border); gpu_ptr_2D R2(nx, ny, border); gpu_ptr_2D R3(nx, ny, border); gpu_ptr_2D Q0(nx, ny, border); gpu_ptr_2D Q1(nx, ny, border); gpu_ptr_2D Q2(nx, ny, border); gpu_ptr_2D Q3(nx, ny, border); // Allocate pinned memory on host init_allocate(); // Set BC arguments set_bc_args(BCArgs[0], rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), nx+2*border, ny+2*border, border); set_bc_args(BCArgs[1], Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), nx+2*border, ny+2*border, border); set_bc_args(BCArgs[2], rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), nx+2*border, ny+2*border, border); // Set FLUX arguments set_flux_args(fluxArgs[0], L_device.getRawPtr(), rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), R0.getRawPtr(),R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), nx, ny, border, rho.get_dx(), rho.get_dy(), theta, gasGam, INNERTILEDIM_X, INNERTILEDIM_Y); set_flux_args(fluxArgs[1], L_device.getRawPtr(), Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), R0.getRawPtr(),R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), nx, ny, border, rho.get_dx(), rho.get_dy(), theta, gasGam, INNERTILEDIM_X, INNERTILEDIM_Y); // Set TIME argument set_dt_args(dtArgs, L_device.getRawPtr(), dt_device.getRawPtr(), nElements, rho.get_dx(), rho.get_dy(), cfl_number); 
// Set Rk arguments set_rk_args(RKArgs[0], dt_device.getRawPtr(), rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), R0.getRawPtr(), R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), nx, ny, border); set_rk_args(RKArgs[1], dt_device.getRawPtr(), Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), R0.getRawPtr(), R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), nx, ny, border); L_device.set(FLT_MAX); /* R0.upload(zeros.get_ptr()); R1.upload(zeros.get_ptr()); R2.upload(zeros.get_ptr()); R3.upload(zeros.get_ptr()); Q0.upload(zeros.get_ptr()); Q1.upload(zeros.get_ptr()); Q2.upload(zeros.get_ptr()); Q3.upload(zeros.get_ptr()); */ R0.set(0,0,0,nx,ny,border); R1.set(0,0,0,nx,ny,border); R2.set(0,0,0,nx,ny,border); R3.set(0,0,0,nx,ny,border); Q0.set(0,0,0,nx,ny,border); Q1.set(0,0,0,nx,ny,border); Q2.set(0,0,0,nx,ny,border); Q3.set(0,0,0,nx,ny,border); rho_device.upload(rho.get_ptr()); rho_u_device.upload(rho_u.get_ptr()); rho_v_device.upload(rho_v.get_ptr()); E_device.upload(E.get_ptr()); // Update boudries callCollectiveSetBCPeriodic(gridBC, blockBC, BCArgs[0]); //Create cuda stream cudaStream_t stream1; cudaStreamCreate(&stream1); cudaEvent_t dt_complete; cudaEventCreate(&dt_complete); while (currentTime < timeLength && step < maxStep){ //RK1 //Compute flux callFluxKernel(gridBlockFlux, threadBlockFlux, 0, fluxArgs[0]); // Compute timestep (based on CFL condition) callDtKernel(TIMETHREADS, dtArgs); cudaMemcpyAsync(dt_host, dt_device.getRawPtr(), sizeof(float), cudaMemcpyDeviceToHost, stream1); cudaEventRecord(dt_complete, stream1); // Perform RK1 step callRKKernel(gridBlockRK, threadBlockRK, 0, RKArgs[0]); //Update boudries callCollectiveSetBCPeriodic(gridBC, blockBC, BCArgs[1]); //RK2 // Compute flux callFluxKernel(gridBlockFlux, threadBlockFlux, 1, fluxArgs[1]); //Perform RK2 step callRKKernel(gridBlockRK, threadBlockRK, 1, RKArgs[1]); //cudaEventRecord(srteam_sync, srteam1); callCollectiveSetBCPeriodic(gridBC, blockBC, BCArgs[2]); cudaEventSynchronize(dt_complete); step++; currentTime += *dt_host; // printf("Step: %i, current time: %.6f dt:%.6f\n" , step,currentTime, dt_host[0]); } //cuProfilerStop(); //cudaProfilerStop(); printf("Elapsed time %.5f", get_wall_time() - timeStart); E_device.download(E.get_ptr()); rho_u_device.download(rho_u.get_ptr()); rho_v_device.download(rho_v.get_ptr()); rho_device.download(rho_dummy.get_ptr()); rho_dummy.printToFile(rho_file, true, false); Config.exactSolution(E_dummy, currentTime); E_dummy.printToFile(E_file, true, false); float LinfError = Linf(E_dummy, rho_dummy); float L1Error = L1(E_dummy, rho_dummy); float L1Error2 = L1test(E_dummy, rho_dummy); printf("nx: %i\t Linf error %.9f\t L1 error %.7f L1test erro %.7f", nx, LinfError, L1Error, L1Error2); printf("nx: %i step: %i, current time: %.6f dt:%.6f\n" , nx, step,currentTime, dt_host[0]); /* cudaMemcpy(L_host, L_device, sizeof(float)*(nElements), cudaMemcpyDeviceToHost); for (int i =0; i < nElements; i++) printf(" %.7f ", L_host[i]); */ printf("%s\n", cudaGetErrorString(cudaGetLastError())); return(0); }
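/*
 * The solver above brackets its time loop with get_wall_time(), which is not
 * defined in this listing. A minimal sketch of such a helper, assuming a
 * POSIX host (the name and implementation are illustrative, not the original):
 */
#include <sys/time.h>

static double get_wall_time_sketch(void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);                          // wall-clock time since the epoch
    return (double)tv.tv_sec + (double)tv.tv_usec * 1.0e-6;
}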
void ControlCubeCache::_reSizeCache()
{
    _nLevels = _nextnLevels;
    _levelCube = _nextLevelCube;
    _offset = _nextOffset;
    _nextnLevels = 0;
    _nextLevelCube = 0;

    _dimCube = exp2(_nLevels - _levelCube) + 2 * CUBE_INC;
    _sizeElement = pow(_dimCube, 3);

    int dimV = exp2(_nLevels);
    _minValue = coordinateToIndex(vmml::vector<3,int>(0,0,0), _levelCube, _nLevels);
    _maxValue = coordinateToIndex(vmml::vector<3,int>(dimV-1,dimV-1,dimV-1), _levelCube, _nLevels);

    int dc = exp2(_nLevels - _levelCube);
    vmml::vector<3,int> mn = _cpuCache->getMinCoord();
    vmml::vector<3,int> mx = _cpuCache->getMaxCoord();
    _maxC = mx - mn;
    if ((mx.x() - mn.x()) % dc != 0)
        _maxC[0] += dc;
    if ((mx.y() - mn.y()) % dc != 0)
        _maxC[1] += dc;
    if ((mx.z() - mn.z()) % dc != 0)
        _maxC[2] += dc;

    if (cudaSuccess != cudaSetDevice(_device))
    {
        std::cerr << "Control Cube Cache, error setting device: "
                  << cudaGetErrorString(cudaGetLastError()) << std::endl;
        throw;
    }

    if (_memory != 0)
        if (cudaSuccess != cudaFree((void*)_memory))
        {
            std::cerr << "Control Cube Cache, error resizing cache: "
                      << cudaGetErrorString(cudaGetLastError()) << std::endl;
            throw;
        }

    size_t total = 0;
    size_t free = 0;
    if (cudaSuccess != cudaMemGetInfo(&free, &total))
    {
        std::cerr << "Control Cube Cache, error resizing cache: "
                  << cudaGetErrorString(cudaGetLastError()) << std::endl;
        throw;
    }

    float memorySize = (0.80f * free); // Use 80% of the free device memory

    _maxNumCubes = memorySize / (_sizeElement * sizeof(float));
    if (_maxNumCubes == 0)
    {
        std::cerr << "Control Cube Cache: available memory is not enough: "
                  << memorySize / 1024 / 1024 << " MB" << std::endl;
        throw;
    }

    if (cudaSuccess != cudaMalloc((void**)&_memory, _maxNumCubes * _sizeElement * sizeof(float)))
    {
        std::cerr << "Control Cube Cache, error resizing cache: "
                  << cudaGetErrorString(cudaGetLastError()) << std::endl;
        throw;
    }

    _freeSlots = _maxNumCubes;

    ControlElementCache::_reSizeCache();
}
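/*
 * _dimCube and _sizeElement above are derived with exp2()/pow() on integral
 * quantities. A hedged alternative that avoids floating-point rounding,
 * assuming (_nLevels - _levelCube) is non-negative and small (function names
 * are illustrative only, not part of the original class):
 */
#include <cstddef>

static inline size_t cubeDim(int nLevels, int levelCube, int cubeInc)
{
    size_t dim = (size_t)1 << (nLevels - levelCube);   // exp2(nLevels - levelCube)
    return dim + 2 * (size_t)cubeInc;
}

static inline size_t cubeElements(size_t dimCube)
{
    return dimCube * dimCube * dimCube;                // pow(dimCube, 3)
}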
int APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, CudaNdarray *km, cudnnConvolutionDescriptor_t desc, float alpha, float beta, CudaNdarray **kerns) { cudnnStatus_t err = CUDNN_STATUS_SUCCESS; if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(km)[1]) { PyErr_SetString(PyExc_ValueError, "GpuDnnConv images and kernel must have the same stack size\n"); return 1; } if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1) return 1; if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1) return 1; int nb_dim = CudaNdarray_NDIM(output); #ifdef CONV_INPLACE Py_XDECREF(*kerns); *kerns = km; Py_INCREF(*kerns); #else if (CudaNdarray_prep_output(kerns, nb_dim, CudaNdarray_HOST_DIMS(km)) != 0) return 1; if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km)) return 1; #endif if (c_set_filterNd(*kerns, APPLY_SPECIFIC(kerns)) == -1) return 1; { size_t worksize; void *workspace; cudnnConvolutionBwdFilterAlgo_t chosen_algo; if (CHOOSE_ALGO) { // A new convolution implementation should be selected, based either on // timing or heuristics, if in one of the two following cases : // - The implementation should only be chosen during the first execution // of an apply node and this is the first execution of the apply node. // - The implementation should be chosen as often as necessary and the // shapes of the inputs differ from the last time an implementation // was chosen. bool reuse_previous_algo; if (CHOOSE_ALGO_ONCE) { // Only choose a new implementation of none has been chosen before. reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set); } else { // Reuse the previous implementation if the the kernels and the outputs // have the same shapes as they had when the previous implementation // was selected bool same_shapes = true; for (int i = 0; (i < nb_dim) && same_shapes; i++) { same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] == APPLY_SPECIFIC(previous_input_shape)[i]); same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] == APPLY_SPECIFIC(previous_output_shape)[i]); } reuse_previous_algo = same_shapes; } // If the previously choosen implementation can't be reused, select a // new one based on the shapes of the current inputs if (!reuse_previous_algo) { // Obtain a convolution algorithm appropriate for the input and output // shapes. Either by choosing one according to heuristics or by making // cuDNN time every implementation and choose the best one. if (CHOOSE_ALGO_TIME) { // Time the different implementations to choose the best one int requestedCount = 1; int count; cudnnConvolutionBwdFilterAlgoPerf_t choosen_algo_perf; err = cudnnFindConvolutionBackwardFilterAlgorithm(_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), requestedCount, &count, &choosen_algo_perf); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error selecting convolution algo: " "%s", cudnnGetErrorString(err)); return 1; } chosen_algo = choosen_algo_perf.algo; } else { // Choose the convolution implementation using heuristics based on the // shapes of the inputs and the amount of memory available. 
// Get the amount of available memory size_t free = 0, total = 0; cudaError_t err2 = cudaMemGetInfo(&free, &total); if (err2 != cudaSuccess){ cudaGetLastError(); fprintf(stderr, "Error when trying to find the memory information" " on the GPU: %s\n", cudaGetErrorString(err2)); return 1; } // Use heuristics to choose the implementation err = cudnnGetConvolutionBackwardFilterAlgorithm(_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &chosen_algo); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error selecting convolution algo: %s", cudnnGetErrorString(err)); return 1; } } // Store the shapes of the inputs and kernels as well as the chosen // algorithm for future use. APPLY_SPECIFIC(previous_bwd_f_algo) = chosen_algo; APPLY_SPECIFIC(previous_algo_set) = true; for (int i = 0; i < nb_dim; i++) { APPLY_SPECIFIC(previous_input_shape)[i] = CudaNdarray_HOST_DIMS(input)[i]; APPLY_SPECIFIC(previous_output_shape)[i] = CudaNdarray_HOST_DIMS(output)[i]; } } else { // Reuse the previously chosen convlution implementation chosen_algo = APPLY_SPECIFIC(previous_bwd_f_algo); } } else { chosen_algo = CONV_ALGO; } // The FFT implementation (only in v3 and onward) does not support strides, // 1x1 filters or inputs with a spatial dimension larger than 1024. // If the chosen implementation is FFT, validate that it can be used // on the current data and default on a safe implementation if it // can't. if (chosen_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT && nb_dim == 4) { // Extract the properties of the convolution descriptor int nd; int pad[2]; int stride[2]; int upscale[2]; cudnnConvolutionMode_t mode; cudnnDataType_t data_type; err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error getting convolution properties: %s", cudnnGetErrorString(err)); return 1; } // Extract the spatial size of the filters int filter_h = CudaNdarray_HOST_DIMS(*kerns)[2]; int filter_w = CudaNdarray_HOST_DIMS(*kerns)[3]; // Extract the spatial size of the input int input_h = CudaNdarray_HOST_DIMS(input)[2]; int input_w = CudaNdarray_HOST_DIMS(input)[3]; // Ensure that the selected implementation supports the requested // convolution. Fall back to a safe implementation otherwise. 
if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 || input_w > 1024 || (filter_h == 1 && filter_w == 1)) { chosen_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; } } // Infer required workspace size from the chosen implementation err = cudnnGetConvolutionBackwardFilterWorkspaceSize(_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), chosen_algo, &worksize); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error getting worksize: %s", cudnnGetErrorString(err)); return 1; } // Allocate workspace for the convolution workspace = get_work_mem(worksize); if (workspace == NULL && worksize != 0) return 1; // Perform the convolution err = cudnnConvolutionBackwardFilter( _handle, (void *)&alpha, APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input), APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output), desc, chosen_algo, workspace, worksize, (void *)&beta, APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns)); } if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s", cudnnGetErrorString(err)); return 1; } return 0; }
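/*
 * Every cuDNN call in the routine above is followed by the same status check
 * and PyErr_Format() report. A hedged convenience macro expressing that
 * pattern (an assumption for illustration, not part of the original source;
 * it relies on the Python C API and cuDNN headers already included by the
 * surrounding file, and is only usable in functions returning int):
 */
#define CHECK_CUDNN_STATUS(call, what)                                    \
  do {                                                                    \
    cudnnStatus_t _s = (call);                                            \
    if (_s != CUDNN_STATUS_SUCCESS) {                                     \
      PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: %s: %s",         \
                   (what), cudnnGetErrorString(_s));                      \
      return 1;                                                           \
    }                                                                     \
  } while (0)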
void MFNHashTypePlainCUDA::allocateThreadAndDeviceMemory() { trace_printf("MFNHashTypePlainCUDA::allocateThreadAndDeviceMemory()\n"); /** * Error variable - stores the result of the various mallocs & such. */ cudaError_t err, err2; /** * Flags for calling cudaHostMalloc - will be set to cudaHostAllocMapped * if we are mapping memory to the host with zero copy. */ unsigned int cudaHostMallocFlags = 0; if (this->useZeroCopy) { cudaHostMallocFlags |= cudaHostAllocMapped; } /* * Malloc the device hashlist space. This is the number of available hashes * times the hash length in bytes. The data will be copied later. */ err = cudaMalloc((void **)&this->DeviceHashlistAddress, this->activeHashesProcessed.size() * this->hashLengthBytes); if (err != cudaSuccess) { printf("Unable to allocate %d bytes for device hashlist! Exiting!\n", this->activeHashesProcessed.size() * this->hashLengthBytes); printf("return code: %d\n", err); exit(1); } /* * Allocate the host/device space for the success list (flags for found passwords). * This is a byte per password. To avoid atomic write issues, each password * gets a full addressible byte, and the GPU handles the dependencies between * multiple threads trying to set a flag in the same segment of memory. * * On the host, it will be allocated as mapped memory if we are using zerocopy. * * As this region of memory is frequently copied back to the host, mapping it * improves performance. In theory. */ err = cudaHostAlloc((void **)&this->HostSuccessAddress, this->activeHashesProcessed.size(), cudaHostMallocFlags); if (err != cudaSuccess) { printf("Unable to allocate %d bytes for success flags! Exiting!\n", this->activeHashesProcessed.size()); printf("return code: %d\n", err); exit(1); } // Clear host success flags region - if we are mapping the memory, the GPU // will directly write this. memset(this->HostSuccessAddress, 0, this->activeHashesProcessed.size()); // Allocate memory for the reported flags. this->HostSuccessReportedAddress = new uint8_t [this->activeHashesProcessed.size()]; memset(this->HostSuccessReportedAddress, 0, this->activeHashesProcessed.size()); // If zero copy is in use, get the device pointer for the success data, else // malloc a region of memory on the device. if (this->useZeroCopy) { err = cudaHostGetDevicePointer((void **)&this->DeviceSuccessAddress, this->HostSuccessAddress, 0); err2 = cudaSuccess; } else { err = cudaMalloc((void **)&this->DeviceSuccessAddress, this->activeHashesProcessed.size()); err2 = cudaMemset(this->DeviceSuccessAddress, 0, this->activeHashesProcessed.size()); } if ((err != cudaSuccess) || (err2 != cudaSuccess)) { printf("Unable to allocate %d bytes for device success list! Exiting!\n", this->activeHashesProcessed.size()); printf("return code: %d\n", err); printf("return code: %d\n", err2); exit(1); } /* * Allocate memory for the found passwords. As this is commonly copied * back and forth, it will be made zero copy if requested. * * This requires (number hashes * passwordLength) bytes of data. */ err = cudaHostAlloc((void **)&this->HostFoundPasswordsAddress, this->passwordLength * this->activeHashesProcessed.size() , cudaHostMallocFlags); if (err != cudaSuccess) { printf("Unable to allocate %d bytes for host password list! Exiting!\n", this->passwordLength * this->activeHashesProcessed.size()); printf("return code: %d\n", err); exit(1); } // Clear the host found password space. 
memset(this->HostFoundPasswordsAddress, 0, this->passwordLength * this->activeHashesProcessed.size()); if (this->useZeroCopy) { err = cudaHostGetDevicePointer((void **)&this->DeviceFoundPasswordsAddress, this->HostFoundPasswordsAddress, 0); err2 = cudaSuccess; } else { err = cudaMalloc((void **)&this->DeviceFoundPasswordsAddress, this->passwordLength * this->activeHashesProcessed.size()); err2 = cudaMemset(this->DeviceFoundPasswordsAddress, 0, this->passwordLength * this->activeHashesProcessed.size()); } if ((err != cudaSuccess) || (err2 != cudaSuccess)) { printf("Unable to allocate %d bytes for device password list! Exiting!\n", this->passwordLength * this->activeHashesProcessed.size()); printf("return code: %d\n", err); printf("return code: %d\n", err2); exit(1); } /** * Allocate space for host and device start positions. To improve performance, * this space is now aligned for improved coalescing performance. All the * position 0 elements are together, followed by all the position 1 elements, * etc. * * This memory can be allocated as write combined, as it is not read by * the host ever - only written. Since it is regularly transferred to the * GPU, this should help improve performance. */ err = cudaHostAlloc((void**)&this->HostStartPointAddress, this->TotalKernelWidth * this->passwordLength, cudaHostAllocWriteCombined | cudaHostMallocFlags); err2 = cudaMalloc((void **)&this->DeviceStartPointAddress, this->TotalKernelWidth * this->passwordLength); if ((err != cudaSuccess) || (err2 != cudaSuccess)) { printf("Unable to allocate %d bytes for host/device startpos list! Exiting!\n", this->TotalKernelWidth * this->passwordLength); printf("return code: %d\n", err); printf("return code: %d\n", err2); exit(1); } /** * Allocate space for the device start password values. This is a copy of * the MFNHashTypePlain::HostStartPasswords32 vector for the GPU. */ err = cudaMalloc((void **)&this->DeviceStartPasswords32Address, this->TotalKernelWidth * this->passwordLengthWords); if ((err != cudaSuccess)) { printf("Unable to allocate %d bytes for host/device startpos list! Exiting!\n", this->TotalKernelWidth * this->passwordLengthWords); printf("return code: %d\n", err); exit(1); } /** * Finally, attempt to allocate space for the giant device bitmaps. There * are 4x128MB bitmaps, and any number can be allocated. If they are not * fully allocated, their address is set to null as a indicator to the device * that there is no data present. Attempt to allocate as many as possible. * * This will be accessed regularly, so should probably not be zero copy. * Also, I'm not sure how mapping host memory into multiple threads would * work. Typically, if the GPU doesn't have enough RAM for the full * set of bitmaps, it's a laptop, and therefore may be short on host RAM * for the pinned access. * * If there is an error in allocation, call cudaGetLastError() to clear it - * we know there has been an error, and do not want it to persist. 
*/ err = cudaMalloc((void **)&this->DeviceBitmap128mb_a_Address, 128 * 1024 * 1024); if (err == cudaSuccess) { memalloc_printf("Successfully allocated Bitmap A\n"); } else { memalloc_printf("Unable to allocate 128MB bitmap A\n"); this->DeviceBitmap128mb_a_Address = 0; cudaGetLastError(); } err = cudaMalloc((void **)&this->DeviceBitmap128mb_b_Address, 128 * 1024 * 1024); if (err == cudaSuccess) { memalloc_printf("Successfully allocated Bitmap B\n"); } else { memalloc_printf("Unable to allocate 128MB bitmap B\n"); this->DeviceBitmap128mb_b_Address = 0; cudaGetLastError(); } err = cudaMalloc((void **)&this->DeviceBitmap128mb_c_Address, 128 * 1024 * 1024); if (err == cudaSuccess) { memalloc_printf("Successfully allocated Bitmap C\n"); } else { memalloc_printf("Unable to allocate 128MB bitmap C\n"); this->DeviceBitmap128mb_c_Address = 0; cudaGetLastError(); } err = cudaMalloc((void **)&this->DeviceBitmap128mb_d_Address, 128 * 1024 * 1024); if (err == cudaSuccess) { memalloc_printf("Successfully allocated Bitmap D\n"); } else { memalloc_printf("Unable to allocate 128MB bitmap D\n"); this->DeviceBitmap128mb_d_Address = 0; cudaGetLastError(); } //printf("Thread %d memory allocated successfully\n", this->threadId); }
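/*
 * Each 128 MB bitmap above is allocated with the same "cudaMalloc, fall back
 * to a null pointer, clear the sticky error" pattern. A hedged helper
 * capturing that pattern (a sketch; the function name is not from the
 * original code):
 */
#include <stdint.h>
#include <cuda_runtime.h>

static uint8_t *tryAllocateDeviceBitmap(size_t bytes)
{
    uint8_t *devicePtr = NULL;
    if (cudaMalloc((void **)&devicePtr, bytes) != cudaSuccess) {
        cudaGetLastError();   // clear the sticky error so later checks start clean
        return NULL;          // NULL signals "bitmap not available" to the kernels
    }
    return devicePtr;
}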
int APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, PyGpuArrayObject *km, cudnnConvolutionDescriptor_t desc, double alpha, double beta, PyGpuArrayObject **kerns, PyGpuContextObject *c) { cudnnStatus_t err = CUDNN_STATUS_SUCCESS; float af = alpha, bf = beta; void *alpha_p; void *beta_p; if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) { PyErr_SetString(PyExc_ValueError, "GpuDnnConv images and kernel must have the same stack size"); return 1; } if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1) return 1; if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1) return 1; switch (input->ga.typecode) { case GA_DOUBLE: alpha_p = (void *)α beta_p = (void *)β break; case GA_FLOAT: case GA_HALF: alpha_p = (void *)⁡ beta_p = (void *)&bf; break; default: PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution"); return 1; } #ifdef CONV_INPLACE Py_XDECREF(*kerns); *kerns = km; Py_INCREF(*kerns); #else if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km), km->ga.typecode, GA_C_ORDER, c) != 0) return 1; if (beta != 0.0 && pygpu_move(*kerns, km)) return 1; #endif if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) return 1; cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO; cuda_enter(c->ctx); #ifdef CHOOSE_ALGO static int reuse_algo = 0; static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO; #ifndef CHOOSE_ONCE static size_t prev_img_dims[5] = {0}; static size_t prev_top_dims[5] = {0}; reuse_algo = 1; for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]); reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]); } #endif if (!reuse_algo) { #ifdef CHOOSE_TIME int count; cudnnConvolutionBwdFilterAlgoPerf_t choice; err = cudnnFindConvolutionBackwardFilterAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), 1, &count, &choice); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } algo = choice.algo; #else size_t free = 0, total = 0; cudaError_t err2 = cudaMemGetInfo(&free, &total); if (err2 != cudaSuccess){ cudaGetLastError(); PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory " "information on the GPU: %s\n", cudaGetErrorString(err2)); cuda_exit(c->ctx); return 1; } err = cudnnGetConvolutionBackwardFilterAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } #endif prev_algo = algo; } else { algo = prev_algo; } #ifdef CHOOSE_ONCE reuse_algo = 1; #else for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { prev_img_dims[i] = PyGpuArray_DIM(input, i); prev_top_dims[i] = PyGpuArray_DIM(output, i); } #endif #endif #if CUDNN_VERSION > 3000 if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT) { int nd; int pad[2]; int stride[2]; int upscale[2]; cudnnConvolutionMode_t mode; err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (stride[0] != 1 || stride[1] != 1 || 
PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 || (PyGpuArray_DIM(*kerns, 2) == 1 && PyGpuArray_DIM(*kerns, 3) == 1)) { algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; } } #endif size_t worksize; gpudata *workspace; err = cudnnGetConvolutionBackwardFilterWorkspaceSize( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), algo, &worksize); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (worksize != 0) { workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL); if (workspace == NULL) { PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory"); cuda_exit(c->ctx); return 1; } } cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); err = cudnnConvolutionBackwardFilter_v3( APPLY_SPECIFIC(_handle), alpha_p, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize, beta_p, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns)); if (worksize != 0) c->ops->buffer_release(workspace); cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_exit(c->ctx); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", cudnnGetErrorString(err)); return 1; } return 0; }
cudaError_t GridGpu::retrieveData(complexVector &gData)
{
    gData.resize(m_gridSize * m_gridSize);
    cudaMemcpy(gData.data(), m_d_gData, gData.size() * sizeof(complexGpu),
               cudaMemcpyDeviceToHost);

    return cudaGetLastError();
}
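/*
 * retrieveData() above discards the status returned by cudaMemcpy and reports
 * cudaGetLastError() instead. A hedged free-function variant that propagates
 * the memcpy status directly, reusing the listing's complexGpu/complexVector
 * types (a sketch; not part of the GridGpu API):
 */
static cudaError_t copyGridToHost(const complexGpu *d_src, complexVector &gData,
                                  int gridSize)
{
    gData.resize(gridSize * gridSize);
    return cudaMemcpy(gData.data(), d_src, gData.size() * sizeof(complexGpu),
                      cudaMemcpyDeviceToHost);   // synchronous copy errors surface here
}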
void MFNHashTypePlainCUDA::freeThreadAndDeviceMemory() { trace_printf("MFNHashTypePlainCUDA::freeThreadAndDeviceMemory()\n"); cudaError_t err; // Free all the memory, then look for errors. cudaFree((void *)this->DeviceHashlistAddress); cudaFreeHost((void *)this->HostSuccessAddress); delete[] this->HostSuccessReportedAddress; // Only cudaFree if zeroCopy is in use. if (!this->useZeroCopy) { cudaFree((void *)this->DeviceSuccessAddress); cudaFree((void *)this->DeviceFoundPasswordsAddress); } cudaFreeHost((void *)this->HostFoundPasswordsAddress); cudaFreeHost((void*)this->HostStartPointAddress); cudaFree((void *)this->DeviceStartPointAddress); cudaFree((void *)this->DeviceStartPasswords32Address); // Free salted hashes if in use. if (this->hashAttributes.hashUsesWordlist) { cudaFree((void *)this->DeviceWordlistBlocks); cudaFree((void *)this->DeviceWordlistLengths); } if (this->hashAttributes.hashUsesSalt) { cudaFree((void *)this->DeviceSaltLengthsAddress); cudaFree((void *)this->DeviceSaltValuesAddress); } // Only free the bitmap memory if it has been allocated. if (this->DeviceBitmap256kb_Address) { cudaFree((void *)this->DeviceBitmap256kb_Address); this->DeviceBitmap256kb_Address = 0; } if (this->DeviceBitmap128mb_a_Address) { cudaFree((void *)this->DeviceBitmap128mb_a_Address); this->DeviceBitmap128mb_a_Address = 0; } if (this->DeviceBitmap128mb_b_Address) { cudaFree((void *)this->DeviceBitmap128mb_b_Address); this->DeviceBitmap128mb_b_Address = 0; } if (this->DeviceBitmap128mb_c_Address) { cudaFree((void *)this->DeviceBitmap128mb_c_Address); this->DeviceBitmap128mb_c_Address = 0; } if (this->DeviceBitmap128mb_d_Address) { cudaFree((void *)this->DeviceBitmap128mb_d_Address); this->DeviceBitmap128mb_d_Address = 0; } // Get any error that occurred above and report it. err = cudaGetLastError(); if (err != cudaSuccess) { printf("Thread %d: CUDA error freeing memory: %s. Exiting.\n", this->threadId, cudaGetErrorString( err)); exit(1); } }
/**
 * Check whether CUDA reports a pending error and, if so, fail with msg.
 * @ingroup tools
 */
static inline void checkCudaError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        throw std::runtime_error(std::string(msg) + ": " + cudaGetErrorString(err));
    }
}
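/*
 * Typical usage of checkCudaError() right after a kernel launch (a minimal
 * sketch; the kernel and launch configuration are illustrative only):
 *
 *     scaleKernel<<<grid, block>>>(d_data, n);
 *     checkCudaError("scaleKernel launch");
 */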
int main( int argc,char** argv) { printf("hello world\n"); if (!InitCUDA()) { return 0; } int iter = 1000; int trainnum = 20; bool isProfiler = false; int intProfiler = 0; int testnum = -1; float maxtime = 0.0f; cutGetCmdLineArgumenti(argc, (const char**) argv, "train", &trainnum); cutGetCmdLineArgumenti(argc, (const char**) argv, "iter", &iter); cutGetCmdLineArgumenti(argc, (const char**) argv, "profiler", &intProfiler); cutGetCmdLineArgumenti(argc, (const char**) argv, "test", &testnum); cutGetCmdLineArgumentf(argc, (const char**) argv, "maxtime", &maxtime); printf("%d\n", intProfiler); if(intProfiler) { isProfiler = true; } if(testnum == -1) testnum = trainnum /2; printf("Iter = %d\n", iter); printf("TrainNum = %d\n", trainnum); printf("TestNum = %d\n", testnum); CUT_DEVICE_INIT(argc, argv); cublasStatus status; status = cublasInit(); if(status != CUBLAS_STATUS_SUCCESS) { printf("Can't init cublas\n"); printf("%s\n", cudaGetErrorString(cudaGetLastError())); return -1; } Image* imageList = new Image[trainnum+testnum]; read64("my_optdigits.tra", imageList, trainnum + testnum); const int warmUpTime = 3; if(!isProfiler) { freopen("verbose.txt", "w", stdout); for(int i=0;i< warmUpTime;i++) { runImage(argc, argv, imageList, trainnum < warmUpTime ? trainnum : warmUpTime, 0, 10, false, 0.0f); } freopen("CON", "w", stdout); printf("Warm-up complete.\n\n\n"); } #ifdef _DEBUG freopen("out.txt", "w", stdout); #endif // _DEBUG runImage(argc, argv, imageList, trainnum, testnum, iter, true, maxtime); freopen("CON", "w", stdout); delete[] imageList; //TestReduce(); cublasShutdown(); if(!isProfiler) { CUT_EXIT(argc, argv); } //getchar(); return 0; }
void describe_cuda_error(const char *whence, CUresult e) { // CUresult e2; switch(e){ case CUDA_SUCCESS: sprintf(DEFAULT_ERROR_STRING,"%s: No errors.",whence); NADVISE(DEFAULT_ERROR_STRING); break; #if CUDA_VERSION >= 6050 RUNTIME_ERROR_CASE(CUDA_ERROR_INVALID_GRAPHICS_CONTEXT,"Invalid graphics context") #endif #if CUDA_VERSION > 4000 RUNTIME_ERROR_CASE(CUDA_ERROR_PEER_ACCESS_UNSUPPORTED,"Peer access unsupported") RUNTIME_ERROR_CASE(CUDA_ERROR_INVALID_PTX,"Invalid PTX") RUNTIME_ERROR_CASE(CUDA_ERROR_ILLEGAL_ADDRESS,"Illegal address") RUNTIME_ERROR_CASE(CUDA_ERROR_ASSERT,"Assertion error") RUNTIME_ERROR_CASE(CUDA_ERROR_TOO_MANY_PEERS,"Too many peers") RUNTIME_ERROR_CASE(CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,"Host mem already registered") RUNTIME_ERROR_CASE(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED,"Host mem not registered") RUNTIME_ERROR_CASE(CUDA_ERROR_HARDWARE_STACK_ERROR,"H/W stack error") RUNTIME_ERROR_CASE(CUDA_ERROR_ILLEGAL_INSTRUCTION,"Illegal instruction"); RUNTIME_ERROR_CASE(CUDA_ERROR_MISALIGNED_ADDRESS,"Misaligned address") RUNTIME_ERROR_CASE(CUDA_ERROR_INVALID_ADDRESS_SPACE,"Invalid address space") RUNTIME_ERROR_CASE(CUDA_ERROR_INVALID_PC,"Invalid PC") RUNTIME_ERROR_CASE(CUDA_ERROR_NOT_PERMITTED,"Not permitted") RUNTIME_ERROR_CASE(CUDA_ERROR_NOT_SUPPORTED,"Not supported") #endif // CUDA_VERSION > 4000 RUNTIME_ERROR_CASE(CUDA_ERROR_LAUNCH_FAILED,"Launch failed") RUNTIME_ERROR_CASE(CUDA_ERROR_UNKNOWN,"Unknown error") RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_DEVICE , "Invalid device." ) RUNTIME_ERROR_CASE( CUDA_ERROR_NO_DEVICE , "No device" ) RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_VALUE , "Invalid value." ) RUNTIME_ERROR_CASE(CUDA_ERROR_INVALID_IMAGE,"Invalid Image") RUNTIME_ERROR_CASE(CUDA_ERROR_INVALID_CONTEXT,"Invalid context") #ifdef CUDA_ERROR_NVLINK_UNCORRECTABLE RUNTIME_ERROR_CASE(CUDA_ERROR_NVLINK_UNCORRECTABLE,"uncorrectable NVLink error") #endif // CUDA_ERROR_NVLINK_UNCORRECTABLE //RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_PITCH_VALUE , "Invalid pitch value." ) //RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_SYMBOL , "Invalid symbol." ) //RUNTIME_ERROR_CASE( CUDA_ERROR_MAP_OBJECT_FAILED , "Map buffer object failed." ) //RUNTIME_ERROR_CASE( CUDA_ERROR_UNMAP_OBJECT_FAILED , "Unmap buffer object failed." ) //RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_HOST_POINTER , "Invalid host pointer." ) //RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_DEVICE_POINTER , "Invalid device pointer." ) //RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_TEXTURE , "Invalid texture." ) //RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_TEXTURE_BINDING , "Invalid texture binding." ) RUNTIME_ERROR_CASE( CUDA_ERROR_OUT_OF_MEMORY , "out of memory." ) RUNTIME_ERROR_CASE( CUDA_ERROR_NOT_INITIALIZED , "not initialized." ) RUNTIME_ERROR_CASE( CUDA_ERROR_DEINITIALIZED , "de-initialized." ) RUNTIME_ERROR_CASE( CUDA_ERROR_PROFILER_DISABLED , "profiler is disabled." ) RUNTIME_ERROR_CASE( CUDA_ERROR_PROFILER_NOT_INITIALIZED , "profiler not initialized." ) RUNTIME_ERROR_CASE( CUDA_ERROR_PROFILER_ALREADY_STARTED , "profiler already started." ) RUNTIME_ERROR_CASE( CUDA_ERROR_PROFILER_ALREADY_STOPPED , "profiler already stopped." ) RUNTIME_ERROR_CASE( CUDA_ERROR_CONTEXT_ALREADY_CURRENT , "context already current." ) RUNTIME_ERROR_CASE( CUDA_ERROR_MAP_FAILED , "mapping failure." ) RUNTIME_ERROR_CASE( CUDA_ERROR_UNMAP_FAILED , "unmapping failure." ) RUNTIME_ERROR_CASE( CUDA_ERROR_ARRAY_IS_MAPPED , "array is mapped and cannot be destroyed." ) RUNTIME_ERROR_CASE( CUDA_ERROR_ALREADY_MAPPED , "already mapped." 
) RUNTIME_ERROR_CASE( CUDA_ERROR_NO_BINARY_FOR_GPU , "no binary for GPU." ) RUNTIME_ERROR_CASE( CUDA_ERROR_ALREADY_ACQUIRED , "resource already acquired." ) RUNTIME_ERROR_CASE( CUDA_ERROR_NOT_MAPPED , "resource not mapped." ) RUNTIME_ERROR_CASE( CUDA_ERROR_NOT_MAPPED_AS_ARRAY , "not mapped as array." ) RUNTIME_ERROR_CASE( CUDA_ERROR_NOT_MAPPED_AS_POINTER , "not mapped as pointer." ) RUNTIME_ERROR_CASE( CUDA_ERROR_ECC_UNCORRECTABLE , "uncorrectable ECC error." ) RUNTIME_ERROR_CASE( CUDA_ERROR_UNSUPPORTED_LIMIT , "unsupported limit." ) RUNTIME_ERROR_CASE( CUDA_ERROR_CONTEXT_ALREADY_IN_USE , "context already in use." ) RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_SOURCE , "invalide device kernel source." ) RUNTIME_ERROR_CASE( CUDA_ERROR_FILE_NOT_FOUND , "file not found." ) RUNTIME_ERROR_CASE( CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND , "shared object symbol not found." ) RUNTIME_ERROR_CASE( CUDA_ERROR_SHARED_OBJECT_INIT_FAILED , "shared object init failed." ) RUNTIME_ERROR_CASE( CUDA_ERROR_OPERATING_SYSTEM , "OS call failed." ) RUNTIME_ERROR_CASE( CUDA_ERROR_INVALID_HANDLE , "invalid handle." ) RUNTIME_ERROR_CASE( CUDA_ERROR_NOT_FOUND , "named symbol not found." ) RUNTIME_ERROR_CASE( CUDA_ERROR_NOT_READY , "async operation not completed (not an error)." ) RUNTIME_ERROR_CASE( CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES , "launch out of resources)." ) RUNTIME_ERROR_CASE( CUDA_ERROR_LAUNCH_TIMEOUT , "launch timeout)." ) RUNTIME_ERROR_CASE( CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING , "incompatible texturing)." ) RUNTIME_ERROR_CASE( CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED , "peer access already enabled)." ) RUNTIME_ERROR_CASE( CUDA_ERROR_PEER_ACCESS_NOT_ENABLED , "peer access not enabled)." ) RUNTIME_ERROR_CASE( CUDA_ERROR_CONTEXT_IS_DESTROYED , "current context has been destroyed)." ) RUNTIME_ERROR_CASE( CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE , "primary context already initialized)." ) #ifdef FOOBAR //CUDA_RUNTIME_ERROR( cudaErrorECCUncorrectable , "Uncorrectable ECC error detected." ) CUDA_RUNTIME_ERROR( cudaErrorStartupFailure , "Startup failure." ) #endif // FOOBAR default: sprintf(DEFAULT_ERROR_STRING, "%s: unrecognized cuda error code %d",whence,e); NWARN(DEFAULT_ERROR_STRING); break; } #ifdef FOOBAR e2 = cudaGetLastError(); // clear error #ifdef CAUTIOUS if( e2 != e ){ NERROR1("CAUTIOUS: describe_cuda_error: errors do not match!?"); } #endif /* CAUTIOUS */ #endif // FOOBAR }
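/*
 * The RUNTIME_ERROR_CASE entries above are case labels generated by a macro
 * that is not shown in this listing. A hedged guess at its shape, consistent
 * with the default branch of the switch (an assumption, not the original
 * definition):
 */
#define RUNTIME_ERROR_CASE(code, msg)                                \
    case code:                                                       \
        sprintf(DEFAULT_ERROR_STRING, "%s: %s", whence, msg);        \
        NWARN(DEFAULT_ERROR_STRING);                                 \
        break;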
void CUDARunner::FindBestConfiguration() { unsigned long lowb=16; unsigned long highb=128; unsigned long lowt=16; unsigned long hight=256; unsigned long bestb=16; unsigned long bestt=16; int64 besttime=std::numeric_limits<int64>::max(); if(m_requestedgrid>0 && m_requestedgrid<=65536) { lowb=m_requestedgrid; highb=m_requestedgrid; } if(m_requestedthreads>0 && m_requestedthreads<=65536) { lowt=m_requestedthreads; hight=m_requestedthreads; } for(int numb=lowb; numb<=highb; numb*=2) { for(int numt=lowt; numt<=hight; numt*=2) { AllocateResources(numb,numt); // clear out any existing error cudaError_t err=cudaGetLastError(); err=cudaSuccess; int64 st=GetTimeMillis(); for(int it=0; it<128*256*2 && err==0; it+=(numb*numt)) { cutilSafeCall(cudaMemcpy(m_devin,m_in,sizeof(cuda_in),cudaMemcpyHostToDevice)); cuda_process_helper(m_devin,m_devout,64,6,numb,numt); cutilSafeCall(cudaMemcpy(m_out,m_devout,numb*numt*sizeof(cuda_out),cudaMemcpyDeviceToHost)); err=cudaGetLastError(); if(err!=cudaSuccess) { printf("CUDA error %d\n",err); } } int64 et=GetTimeMillis(); printf("Finding best configuration step end (%d,%d) %"PRI64d"ms prev best=%"PRI64d"ms\n",numb,numt,et-st,besttime); if((et-st)<besttime && err==cudaSuccess) { bestb=numb; bestt=numt; besttime=et-st; } } } m_numb=bestb; m_numt=bestt; AllocateResources(m_numb,m_numt); }
void describe_cuda_driver_error(const char *whence, cudaError_t e) { cudaError_t e2; switch(e){ DRIVER_ERROR_CASE(cudaSuccess,"No driver errors") #if CUDA_VERSION >= 6050 DRIVER_ERROR_CASE(cudaErrorInvalidGraphicsContext,"Invalid graphics context") DRIVER_ERROR_CASE( cudaErrorInvalidPtx , "Invalid PTX." ) #endif DRIVER_ERROR_CASE( cudaErrorInvalidDevice , "Invalid device." ) DRIVER_ERROR_CASE( cudaErrorInvalidValue , "Invalid value." ) DRIVER_ERROR_CASE( cudaErrorInvalidPitchValue , "Invalid pitch value." ) DRIVER_ERROR_CASE( cudaErrorInvalidSymbol , "Invalid symbol." ) DRIVER_ERROR_CASE( cudaErrorMapBufferObjectFailed , "Map buffer object failed." ) DRIVER_ERROR_CASE( cudaErrorUnmapBufferObjectFailed , "Unmap buffer object failed." ) DRIVER_ERROR_CASE( cudaErrorInvalidHostPointer , "Invalid host pointer." ) DRIVER_ERROR_CASE( cudaErrorInvalidDevicePointer , "Invalid device pointer." ) DRIVER_ERROR_CASE( cudaErrorInvalidTexture , "Invalid texture." ) DRIVER_ERROR_CASE( cudaErrorInvalidTextureBinding , "Invalid texture binding." ) DRIVER_ERROR_CASE( cudaErrorInvalidChannelDescriptor , "Invalid channel descriptor." ) DRIVER_ERROR_CASE( cudaErrorInvalidMemcpyDirection , "Invalid memcpy direction." ) DRIVER_ERROR_CASE( cudaErrorAddressOfConstant , "Address of constant error." ) DRIVER_ERROR_CASE( cudaErrorTextureFetchFailed , "Texture fetch failed." ) DRIVER_ERROR_CASE( cudaErrorTextureNotBound , "Texture not bound error." ) DRIVER_ERROR_CASE( cudaErrorSynchronizationError , "Synchronization error." ) DRIVER_ERROR_CASE( cudaErrorInvalidResourceHandle , "Invalid resource handle." ) DRIVER_ERROR_CASE( cudaErrorNotReady , "Not ready error." ) DRIVER_ERROR_CASE( cudaErrorInsufficientDriver , "CUDA runtime is newer than driver." ) DRIVER_ERROR_CASE( cudaErrorSetOnActiveProcess , "Set on active process error." ) DRIVER_ERROR_CASE( cudaErrorNoDevice , "No available CUDA device." ) DRIVER_ERROR_CASE( cudaErrorMissingConfiguration , "Missing configuration error." ) DRIVER_ERROR_CASE( cudaErrorMemoryAllocation, "Memory allocation error." ) DRIVER_ERROR_CASE( cudaErrorInitializationError , "Initialization error." ) DRIVER_ERROR_CASE( cudaErrorLaunchFailure , "Launch failure." ) DRIVER_ERROR_CASE( cudaErrorPriorLaunchFailure , "Prior launch failure." ) DRIVER_ERROR_CASE( cudaErrorLaunchTimeout , "Launch timeout error." ) DRIVER_ERROR_CASE( cudaErrorLaunchOutOfResources , "Launch out of resources error." ) DRIVER_ERROR_CASE( cudaErrorInvalidDeviceFunction , "Invalid device function." ) DRIVER_ERROR_CASE( cudaErrorInvalidConfiguration , "Invalid configuration." ) DRIVER_ERROR_CASE( cudaErrorInvalidFilterSetting , "Invalid filter setting." ) DRIVER_ERROR_CASE( cudaErrorInvalidNormSetting , "Invalid norm setting." 
) DRIVER_ERROR_CASE(cudaErrorMixedDeviceExecution,"Mixed device execution") DRIVER_ERROR_CASE(cudaErrorCudartUnloading,"CUDA runtime unloading") DRIVER_ERROR_CASE(cudaErrorUnknown,"Unknown error condition") DRIVER_ERROR_CASE(cudaErrorNotYetImplemented,"Function not yet implemented") DRIVER_ERROR_CASE(cudaErrorMemoryValueTooLarge,"Memory value too large") DRIVER_ERROR_CASE(cudaErrorInvalidSurface,"Invalid surface") DRIVER_ERROR_CASE(cudaErrorECCUncorrectable,"ECC uncorrectable") DRIVER_ERROR_CASE(cudaErrorSharedObjectSymbolNotFound,"Shared object symbol not found") DRIVER_ERROR_CASE(cudaErrorSharedObjectInitFailed,"Shared object init failed") DRIVER_ERROR_CASE(cudaErrorUnsupportedLimit,"Unsupported limit") DRIVER_ERROR_CASE(cudaErrorDuplicateVariableName,"Duplicate variable name") DRIVER_ERROR_CASE(cudaErrorDuplicateTextureName,"Duplicate texture name") DRIVER_ERROR_CASE(cudaErrorDuplicateSurfaceName,"Duplicate surface name") DRIVER_ERROR_CASE(cudaErrorDevicesUnavailable,"Devices unavailable") DRIVER_ERROR_CASE(cudaErrorInvalidKernelImage,"Invalid kernel image") DRIVER_ERROR_CASE(cudaErrorNoKernelImageForDevice,"No kernel image for device") DRIVER_ERROR_CASE(cudaErrorIncompatibleDriverContext,"Incompatible driver context") DRIVER_ERROR_CASE(cudaErrorPeerAccessAlreadyEnabled,"Peer access already enabled") DRIVER_ERROR_CASE(cudaErrorPeerAccessNotEnabled,"Peer access not enabled") DRIVER_ERROR_CASE(cudaErrorDeviceAlreadyInUse,"Device already in use") DRIVER_ERROR_CASE(cudaErrorProfilerDisabled,"Profiler disabled") DRIVER_ERROR_CASE(cudaErrorProfilerNotInitialized,"Profiler not intialized") DRIVER_ERROR_CASE(cudaErrorProfilerAlreadyStarted,"Profiler already started") DRIVER_ERROR_CASE(cudaErrorProfilerAlreadyStopped,"Profiler already stopped") #if CUDA_VERSION > 4000 DRIVER_ERROR_CASE(cudaErrorAssert,"Assertion error") DRIVER_ERROR_CASE(cudaErrorTooManyPeers,"Too many peers") DRIVER_ERROR_CASE(cudaErrorHostMemoryAlreadyRegistered,"Host mem already registered") DRIVER_ERROR_CASE(cudaErrorHostMemoryNotRegistered,"Host memory not registered") DRIVER_ERROR_CASE(cudaErrorOperatingSystem,"OS error") DRIVER_ERROR_CASE(cudaErrorPeerAccessUnsupported,"Peer access unsupported") DRIVER_ERROR_CASE(cudaErrorLaunchMaxDepthExceeded,"Launch max depth exceeded") DRIVER_ERROR_CASE(cudaErrorLaunchFileScopedTex,"Launch file scoped tex") DRIVER_ERROR_CASE(cudaErrorLaunchFileScopedSurf,"Launch file scoped surf") DRIVER_ERROR_CASE(cudaErrorSyncDepthExceeded,"Sync depth exceeded") DRIVER_ERROR_CASE(cudaErrorLaunchPendingCountExceeded,"Launch pending count exceeded") DRIVER_ERROR_CASE(cudaErrorNotPermitted,"Not permitted") DRIVER_ERROR_CASE(cudaErrorNotSupported,"Not supported") DRIVER_ERROR_CASE(cudaErrorHardwareStackError,"H/W Stack Error") DRIVER_ERROR_CASE(cudaErrorIllegalInstruction,"Illegal instruction") DRIVER_ERROR_CASE(cudaErrorMisalignedAddress,"Mis-aligned address") DRIVER_ERROR_CASE(cudaErrorInvalidAddressSpace,"Invalid address space") DRIVER_ERROR_CASE(cudaErrorInvalidPc,"Invalid PC") DRIVER_ERROR_CASE(cudaErrorIllegalAddress,"Illegal address") #endif // CUDA_VERSION > 4000 DRIVER_ERROR_CASE(cudaErrorStartupFailure,"Startup failure") DRIVER_ERROR_CASE(cudaErrorApiFailureBase,"Unexpected driver error") #ifdef WHAT_CUDA_VERSION // need to fix for cuda 6? // not in cuda 6? CUDA_DRIVER_ERROR( CUDA_ERROR_LAUNCH_FAILED , "launch failed)." ) // not in cuda 6? CUDA_DRIVER_ERROR( CUDA_ERROR_UNKNOWN , "unknown error)." 
) #endif // WHAT_CUDA_VERSION default: sprintf(DEFAULT_ERROR_STRING, "%s: unrecognized cuda error code %d",whence,e); NWARN(DEFAULT_ERROR_STRING); break; } e2 = cudaGetLastError(); // clear error #ifdef CAUTIOUS if( e2 != e ){ sprintf(DEFAULT_ERROR_STRING, "e = %d (0x%x), cudaGetLastError() = %d (0x%x)",e,e,e2,e2); NADVISE(DEFAULT_ERROR_STRING); NERROR1("CAUTIOUS: describe_cuda_driver_error: errors do not match!?"); } #endif /* CAUTIOUS */ }
<<<volume_gridSIZE, volume_blockSIZE>>> ( order, input, output ); #endif flux_term6a<FLOAT_TYPE> <<<flux_gridSIZE, flux_blockSIZE>>> ( order, mesh.device_info, input, output, pen ); flux_term6b<FLOAT_TYPE> <<<flux_gridSIZE, flux_blockSIZE>>> ( order, mesh.device_info, input, output, pen ); #if 0 cudaError_t error = cudaGetLastError(); std::string lastError = cudaGetErrorString(error); std::cout<<lastError<<std::endl; #endif return 0; } int _prec_mvm ( mode_vector<FLOAT_TYPE,int> input, mode_vector<FLOAT_TYPE,int> output ) const { #ifdef USE_PRECONDITIONER
ParallelCUDA() { SetName("Hask"); } void ConvolverObs:: ConvInit() throw (...) { // Get the data we want. m_data = (float *)GetData().ToCUDAArray(); // Set up the filter. //CopyFilter((float *)GetFilter().GetData(), (int)GetRadius()); // Get the temporary arrays. cudaError ret = cudaGetLastError(); if (ret != cudaSuccess) { throw cudaGetErrorString(ret); } m_smoothX = (float *)GetSmoothX().ToCUDAArray(), m_smoothY = (float *)GetSmoothY().ToCUDAArray(); } void ConvolverObs:: Execute() throw (...) { //DoTraceMessage(TIMING, "%s%s%s", "Start (", GetName(), "): Set"); Log("Set"); //cudaError
int main(int argc, char **argv) { uchar4 *h_inputImageRGBA, *d_inputImageRGBA; uchar4 *h_outputImageRGBA, *d_outputImageRGBA; unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred; float *h_filter; int filterWidth; std::string input_file; std::string output_file; std::string reference_file; double perPixelError = 0.0; double globalError = 0.0; bool useEpsCheck = false; switch (argc) { case 2: input_file = std::string(argv[1]); output_file = "HW2_output.png"; reference_file = "HW2_reference.png"; break; case 3: input_file = std::string(argv[1]); output_file = std::string(argv[2]); reference_file = "HW2_reference.png"; break; case 4: input_file = std::string(argv[1]); output_file = std::string(argv[2]); reference_file = std::string(argv[3]); break; case 6: useEpsCheck=true; input_file = std::string(argv[1]); output_file = std::string(argv[2]); reference_file = std::string(argv[3]); perPixelError = atof(argv[4]); globalError = atof(argv[5]); break; default: std::cerr << "Usage: ./HW2 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl; exit(1); } //load the image and give us our input and output pointers preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA, &d_outputImageRGBA, &d_redBlurred, &d_greenBlurred, &d_blueBlurred, &h_filter, &filterWidth, input_file); allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth); GpuTimer timer; timer.Start(); //call the students' code your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, numRows(), numCols(), d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth); timer.Stop(); cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); int err = printf("Your GPU code ran in: %f msecs.\n", timer.Elapsed()); if (err < 0) { //Couldn't print! Probably the student closed stdout - bad news std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl; exit(1); } //check results and output the blurred image size_t numPixels = numRows()*numCols(); //copy the output back to the host checkCudaErrors(cudaMemcpy(h_outputImageRGBA, d_outputImageRGBA__, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost)); postProcess(output_file, h_outputImageRGBA); timer.Start(); referenceCalculation(h_inputImageRGBA, h_outputImageRGBA, numRows(), numCols(), h_filter, filterWidth); timer.Stop(); printf("Your CPU code ran in: %f msecs.\n", timer.Elapsed()); postProcess(reference_file, h_outputImageRGBA); // Cheater easy way with OpenCV //generateReferenceImage(input_file, reference_file, filterWidth); compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError); checkCudaErrors(cudaFree(d_redBlurred)); checkCudaErrors(cudaFree(d_greenBlurred)); checkCudaErrors(cudaFree(d_blueBlurred)); cleanUp(); return 0; }
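/*
 * checkCudaErrors() is used throughout this listing but defined elsewhere. A
 * hedged sketch of a typical definition, modelled on the common CUDA-samples
 * helper (an assumption; the project's actual macro may differ):
 */
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CHECK_CUDA_ERRORS_SKETCH(call)                                  \
  do {                                                                  \
    cudaError_t _e = (call);                                            \
    if (_e != cudaSuccess) {                                            \
      std::fprintf(stderr, "CUDA error \"%s\" at %s:%d\n",              \
                   cudaGetErrorString(_e), __FILE__, __LINE__);         \
      std::exit(EXIT_FAILURE);                                          \
    }                                                                   \
  } while (0)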
TEST_F(CublasWrapperTest, matrixTransMatrixMultiply) { const int numberOfRows = 5; const int numberOfColumns = 3; const int numberOfRows2 = 5; const int numberOfColumns2 = 4; PinnedHostMatrix matrixT(numberOfRows, numberOfColumns); PinnedHostMatrix matrix2(numberOfRows2, numberOfColumns2); matrixT(0, 0) = 1; matrixT(1, 0) = 2; matrixT(2, 0) = 3; matrixT(3, 0) = 4; matrixT(4, 0) = 5; matrixT(0, 1) = 10; matrixT(1, 1) = 20; matrixT(2, 1) = 30; matrixT(3, 1) = 40; matrixT(4, 1) = 50; matrixT(0, 2) = 1.1; matrixT(1, 2) = 2.2; matrixT(2, 2) = 3.3; matrixT(3, 2) = 4.4; matrixT(4, 2) = 5.5; for(int i = 0; i < numberOfRows2; ++i){ matrix2(i, 0) = 6; } for(int i = 0; i < numberOfRows2; ++i){ matrix2(i, 1) = 7; } for(int i = 0; i < numberOfRows2; ++i){ matrix2(i, 2) = 8; } for(int i = 0; i < numberOfRows2; ++i){ matrix2(i, 3) = 9; } DeviceMatrix* matrixTDevice = hostToDeviceStream1.transferMatrix(matrixT); DeviceMatrix* matrix2Device = hostToDeviceStream1.transferMatrix(matrix2); DeviceMatrix* resultDevice = new DeviceMatrix(numberOfColumns, numberOfColumns2); cublasWrapper.matrixTransMatrixMultiply(*matrixTDevice, *matrix2Device, *resultDevice); cublasWrapper.syncStream(); handleCudaStatus(cudaGetLastError(), "Error with matrixTransMatrixMultiply in matrixTransMatrixMultiply: "); HostMatrix* resultHost = deviceToHostStream1.transferMatrix(*resultDevice); cublasWrapper.syncStream(); handleCudaStatus(cudaGetLastError(), "Error with transfer in matrixTransMatrixMultiply: "); EXPECT_EQ(90, (*resultHost)(0, 0)); EXPECT_EQ(105, (*resultHost)(0, 1)); EXPECT_EQ(120, (*resultHost)(0, 2)); EXPECT_EQ(135, (*resultHost)(0, 3)); EXPECT_EQ(900, (*resultHost)(1, 0)); EXPECT_EQ(1050, (*resultHost)(1, 1)); EXPECT_EQ(1200, (*resultHost)(1, 2)); EXPECT_EQ(1350, (*resultHost)(1, 3)); EXPECT_EQ(99, (*resultHost)(2, 0)); EXPECT_EQ(115.5, (*resultHost)(2, 1)); EXPECT_EQ(132, (*resultHost)(2, 2)); EXPECT_EQ(148.5, (*resultHost)(2, 3)); delete matrixTDevice; delete matrix2Device; delete resultDevice; delete resultHost; }
cudaError_t GridGpu::transferData(complexVector &trajData)
{
    cudaMemcpy(m_d_trajData, trajData.data(), trajData.size() * sizeof(complexGpu),
               cudaMemcpyHostToDevice);

    return cudaGetLastError();
}
int _tmain(int argc, _TCHAR* argv[]) { uchar4 *h_inputImageRGBA, *d_inputImageRGBA; uchar4 *h_outputImageRGBA, *d_outputImageRGBA; unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred; float *h_filter; int filterWidth; //PreProcess const std::string *filename = new std::string("./cinque_terre_small.jpg"); cv::Mat imageInputRGBA; cv::Mat imageOutputRGBA; //make sure the context initializes ok checkCudaErrors(cudaFree(0)); cv::Mat image = cv::imread(filename->c_str(), CV_LOAD_IMAGE_COLOR); if (image.empty()) { std::cerr << "Couldn't open file: " << filename << std::endl; cv::waitKey(0); exit(1); } cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA); //allocate memory for the output imageOutputRGBA.create(image.rows, image.cols, CV_8UC4); //This shouldn't ever happen given the way the images are created //at least based upon my limited understanding of OpenCV, but better to check if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) { std::cerr << "Images aren't continuous!! Exiting." << std::endl; exit(1); } h_inputImageRGBA = (uchar4 *)imageInputRGBA.ptr<unsigned char>(0); h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr<unsigned char>(0); const size_t numPixels = image.rows * image.cols; //allocate memory on the device for both input and output checkCudaErrors(cudaMalloc(&d_inputImageRGBA, sizeof(uchar4) * numPixels)); checkCudaErrors(cudaMalloc(&d_outputImageRGBA, sizeof(uchar4) * numPixels)); checkCudaErrors(cudaMemset(d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left laying around //copy input array to the GPU checkCudaErrors(cudaMemcpy(d_inputImageRGBA, h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice)); //now create the filter that they will use const int blurKernelWidth = 9; const float blurKernelSigma = 2.; filterWidth = blurKernelWidth; //create and fill the filter we will convolve with h_filter = new float[blurKernelWidth * blurKernelWidth]; float filterSum = 0.f; //for normalization for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) { for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) { float filterValue = expf( -(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma)); h_filter[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] = filterValue; filterSum += filterValue; } } float normalizationFactor = 1.f / filterSum; for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) { for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) { h_filter[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] *= normalizationFactor; } } //blurred checkCudaErrors(cudaMalloc(&d_redBlurred, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMalloc(&d_greenBlurred, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMalloc(&d_blueBlurred, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMemset(d_redBlurred, 0, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMemset(d_greenBlurred, 0, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMemset(d_blueBlurred, 0, sizeof(unsigned char) * numPixels)); allocateMemoryAndCopyToGPU(image.rows, image.cols, h_filter, filterWidth); GpuTimer timer; timer.Start(); //call the students' code your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, image.rows, image.cols, d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth); timer.Stop(); cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); int err = printf("%f msecs.\n", timer.Elapsed()); if (err < 0) { 
//Couldn't print! Probably the student closed stdout - bad news std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl; exit(1); } cleanup(); //check results and output the blurred image //PostProcess //copy the output back to the host checkCudaErrors(cudaMemcpy(imageOutputRGBA.ptr<unsigned char>(0), d_outputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost)); cv::Mat imageOutputBGR; cv::cvtColor(imageOutputRGBA, imageOutputBGR, CV_RGBA2BGR); //output the image cv::imwrite("./blurredResult.jpg", imageOutputBGR); cv::namedWindow( "Display window", CV_WINDOW_NORMAL); cv::imshow("Display window", imageOutputBGR); cv::waitKey(0); checkCudaErrors(cudaFree(d_redBlurred)); checkCudaErrors(cudaFree(d_greenBlurred)); checkCudaErrors(cudaFree(d_blueBlurred)); return 0; }
void SimCudaHelper::CheckError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    CheckError(err, msg);
}
void testCuda(int m, int n, int nnz, std::vector<int>& rows, std::vector<int>& cols,
              std::vector<double>& values, double* matB)
{
    double tol = 1e-9;
    double start, stop, time_to_build, time_to_solve;
    int cudaDevice = 0;
    checkCudaErrors(cudaSetDevice(cudaDevice));

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, cudaDevice);
    printf("Device Number: %d\n", cudaDevice);
    printf("  Device name: %s\n", prop.name);

    checkCudaErrors(cudaDeviceReset());

    size_t mem_tot = 0;
    size_t mem_free = 0;
    cudaMemGetInfo(&mem_free, &mem_tot);
    printf("\nFree memory: %zu", mem_free);

    MatSparse matA;
    matA.setSize(m, n);

    // Drop entries below the tolerance and convert from one-based to zero-based indexing.
    std::vector<int> I, J;
    std::vector<double> V;
    for (int k = 0; k < nnz; k++) {
        double _val = values[k];
        int i = rows[k];
        int j = cols[k];
        if (fabs(_val) > tol) {
            I.push_back(i - 1);
            J.push_back(j - 1);
            V.push_back(_val);
        }
    }

    start = second();
    matA.fromTruples(I, J, V);
    stop = second();
    time_to_build = stop - start;
    std::cerr << "Time to Build in GPU (seconds): " << time_to_build << std::endl;

    // ******************************** GPU SOLVER ******************************** //

    // --- Initialize cuSOLVER / cuSPARSE
    cusolverSpHandle_t cusolver_handle = NULL;
    checkCudaErrors(cusolverSpCreate(&cusolver_handle));

    cusparseHandle_t cusparse_handle = NULL;
    checkCudaErrors(cusparseCreate(&cusparse_handle));

    cudaStream_t cudaStream = NULL;
    checkCudaErrors(cudaStreamCreate(&cudaStream));

    checkCudaErrors(cusolverSpSetStream(cusolver_handle, cudaStream));
    checkCudaErrors(cusparseSetStream(cusparse_handle, cudaStream));

    cusparseMatDescr_t descrA;
    checkCudaErrors(cusparseCreateMatDescr(&descrA));
    checkCudaErrors(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
    checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO));

    printf("\nAlloc GPU memory...\n");
    double *d_A;         checkCudaErrors(cudaMalloc(&d_A, nnz * sizeof(double)));
    int *d_A_RowIndices; checkCudaErrors(cudaMalloc(&d_A_RowIndices, (m + 1) * sizeof(int)));
    int *d_A_ColIndices; checkCudaErrors(cudaMalloc(&d_A_ColIndices, nnz * sizeof(int)));
    double *d_x;         checkCudaErrors(cudaMalloc(&d_x, m * sizeof(double)));
    double *d_b;         checkCudaErrors(cudaMalloc(&d_b, m * sizeof(double)));
    printf("\nError: %s", cudaGetErrorString(cudaGetLastError()));

    printf("\nCopying data...\n");
    checkCudaErrors(cudaMemcpy(d_A, matA.valuesPtr(), nnz * sizeof(double), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_A_RowIndices, matA.RowPtr(), (m + 1) * sizeof(int), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_A_ColIndices, matA.ColIdxPtr(), nnz * sizeof(int), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_b, matB, m * sizeof(double), cudaMemcpyHostToDevice));

    double *h_x = (double *)malloc(m * sizeof(double));
    printf("\nError: %s", cudaGetErrorString(cudaGetLastError()));

    cudaMemGetInfo(&mem_free, &mem_tot);
    printf("\nFree memory: %zu", mem_free);

    int reorder = 0;
    int singularity = 0;

    start = second();
    //checkCudaErrors(cusolverSpDcsrlsvluHost(cusolver_handle, Nrows, nnz, descrA, sparse.Values(),
    //                sparse.RowPtr(), sparse.ColIdx(), mB.values, tol, reorder, h_x, &singularity));
    checkCudaErrors(cusolverSpDcsrlsvqr(cusolver_handle, m, nnz, descrA, d_A, d_A_RowIndices,
                                        d_A_ColIndices, d_b, tol, reorder, d_x, &singularity));
    checkCudaErrors(cudaDeviceSynchronize());
    stop = second();
    time_to_solve = stop - start;

    checkCudaErrors(cudaMemcpy(h_x, d_x, m * sizeof(double), cudaMemcpyDeviceToHost));
    // for (int k = 0; k < mA.getNumRows(); k++) solution[k] = h_x[k];

    checkCudaErrors(cusparseDestroy(cusparse_handle));
    checkCudaErrors(cusolverSpDestroy(cusolver_handle));
    checkCudaErrors(cudaStreamDestroy(cudaStream));
    checkCudaErrors(cudaFree(d_b));
    checkCudaErrors(cudaFree(d_x));
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_A_RowIndices));
    checkCudaErrors(cudaFree(d_A_ColIndices));
    free(h_x);

    std::cerr << "Time to Build in GPU (seconds): " << time_to_build << std::endl;
    std::cerr << "Time to Solve in GPU (seconds): " << time_to_solve << std::endl;
    std::cerr << "done!";
    // ****************************************************************************** //
}
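// For reference, a minimal, hypothetical call to testCuda() is sketched below. It is based only
// on the signature above: COO triplets are passed with one-based indices (the loop inside
// testCuda subtracts 1), and the right-hand side has m entries. The 3x3 diagonal system here is
// made up purely for illustration.
#include <vector>

int run_tiny_testCuda_example()
{
    const int m = 3, n = 3, nnz = 3;
    std::vector<int>    rows   = {1, 2, 3};       // one-based row indices
    std::vector<int>    cols   = {1, 2, 3};       // one-based column indices
    std::vector<double> values = {2.0, 4.0, 8.0}; // diagonal entries of A
    double b[3] = {2.0, 4.0, 8.0};                // expected solution: x = (1, 1, 1)

    testCuda(m, n, nnz, rows, cols, values, b);
    return 0;
}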
extern int scanhash_groestlcoin(int thr_id, uint32_t *pdata, uint32_t *ptarget, uint32_t max_nonce, uint32_t *hashes_done)
{
    static THREAD uint32_t *foundNounce = nullptr;

    uint32_t start_nonce = pdata[19];
    unsigned int intensity = (device_sm[device_map[thr_id]] > 500) ? 24 : 23;
    uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1U << intensity);
    uint32_t throughput = min(throughputmax, max_nonce - start_nonce) & 0xfffffc00;

    if (opt_benchmark)
        ptarget[7] = 0x0000000f;

    // init
    static THREAD volatile bool init = false;
    if (!init)
    {
        CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
        cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
        cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
        CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id]));
        groestlcoin_cpu_init(thr_id, throughputmax);
        CUDA_SAFE_CALL(cudaMallocHost(&foundNounce, 2 * 4)); // room for two candidate nonces
        init = true;
    }

    // The block header must be byte-swapped to big-endian
    uint32_t endiandata[32];
    for (int kk = 0; kk < 32; kk++)
        be32enc(&endiandata[kk], pdata[kk]);

    // Prepare the context with the byte-swapped block header (the nonce is replaced later)
    groestlcoin_cpu_setBlock(thr_id, endiandata);

    do
    {
        // GPU
        const uint32_t Htarg = ptarget[7];
        groestlcoin_cpu_hash(thr_id, throughput, pdata[19], foundNounce, ptarget[7]);
        if (stop_mining)
        {
            mining_has_stopped[thr_id] = true;
            cudaStreamDestroy(gpustream[thr_id]);
            pthread_exit(nullptr);
        }
        if (foundNounce[0] < 0xffffffff)
        {
            uint32_t tmpHash[8];
            endiandata[19] = SWAP32(foundNounce[0]);
            groestlhash(tmpHash, endiandata);
            if (tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget))
            {
                int res = 1;
                if (opt_benchmark)
                    applog(LOG_INFO, "GPU #%d Found nonce %08x", device_map[thr_id], foundNounce[0]);
                *hashes_done = pdata[19] - start_nonce + throughput;
                if (foundNounce[1] != 0xffffffff)
                {
                    endiandata[19] = SWAP32(foundNounce[1]);
                    groestlhash(tmpHash, endiandata);
                    if (tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget))
                    {
                        pdata[21] = foundNounce[1];
                        res++;
                        if (opt_benchmark)
                            applog(LOG_INFO, "GPU #%d Found second nonce %08x", device_map[thr_id], foundNounce[1]);
                    }
                    else
                    {
                        if (tmpHash[7] != Htarg)
                        {
                            applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNounce[1]);
                        }
                    }
                }
                pdata[19] = foundNounce[0];
                return res;
            }
            else
            {
                if (tmpHash[7] != Htarg)
                {
                    applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNounce[0]);
                }
            }
        }
        pdata[19] += throughput;
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
        {
            applog(LOG_ERR, "GPU #%d: %s", device_map[thr_id], cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput)));

    *hashes_done = pdata[19] - start_nonce;
    return 0;
}
int main(int argc, char **argv)
{
    unsigned int *inputVals;
    unsigned int *inputPos;
    unsigned int *outputVals;
    unsigned int *outputPos;

    size_t numElems;

    std::string input_file;
    std::string template_file;
    std::string output_file;
    std::string reference_file = "red_eye_effect.gold";
    double perPixelError = 0.0;
    double globalError   = 0.0;
    bool useEpsCheck = false;

    switch (argc)
    {
    case 3:
        input_file    = std::string(argv[1]);
        template_file = std::string(argv[2]);
        output_file   = "HW4_output.png";
        break;
    case 4:
        input_file    = std::string(argv[1]);
        template_file = std::string(argv[2]);
        output_file   = std::string(argv[3]);
        break;
    default:
        std::cerr << "Usage: ./HW4 input_file template_file [output_filename]" << std::endl;
        exit(1);
    }

    //load the image and give us our input and output pointers
    preProcess(&inputVals, &inputPos, &outputVals, &outputPos, numElems, input_file, template_file);

    GpuTimer timer;
    timer.Start();

    //call the students' code
    your_sort(inputVals, inputPos, outputVals, outputPos, numElems);

    timer.Stop();
    cudaDeviceSynchronize();
    checkCudaErrors(cudaGetLastError());
    printf("\n");
    int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

    if (err < 0)
    {
        //Couldn't print! Probably the student closed stdout - bad news
        std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
        exit(1);
    }

    //check results and output the red-eye corrected image
    postProcess(outputVals, outputPos, numElems, output_file);

    // check code moved from HW4.cu
    /****************************************************************************
     * You can use the code below to help with debugging, but make sure to      *
     * comment it out again before submitting your assignment for grading,      *
     * otherwise this code will take too much time and make it seem like your   *
     * GPU implementation isn't fast enough.                                    *
     *                                                                          *
     * This code MUST RUN BEFORE YOUR CODE in case you accidentally change      *
     * the input values when implementing your radix sort.                      *
     *                                                                          *
     * This code performs the reference radix sort on the host and compares     *
     * your sorted values to the reference.                                     *
     *                                                                          *
     * Thrust containers are used for copying memory from the GPU               *
     ****************************************************************************/

    thrust::device_ptr<unsigned int> d_inputVals(inputVals);
    thrust::device_ptr<unsigned int> d_inputPos(inputPos);

    thrust::host_vector<unsigned int> h_inputVals(d_inputVals, d_inputVals + numElems);
    thrust::host_vector<unsigned int> h_inputPos(d_inputPos, d_inputPos + numElems);

    thrust::host_vector<unsigned int> h_outputVals(numElems);
    thrust::host_vector<unsigned int> h_outputPos(numElems);

    reference_calculation(&h_inputVals[0], &h_inputPos[0], &h_outputVals[0], &h_outputPos[0], numElems);
    //postProcess(valsPtr, posPtr, numElems, reference_file);

    compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);

    thrust::device_ptr<unsigned int> d_outputVals(outputVals);
    thrust::device_ptr<unsigned int> d_outputPos(outputPos);

    thrust::host_vector<unsigned int> h_yourOutputVals(d_outputVals, d_outputVals + numElems);
    thrust::host_vector<unsigned int> h_yourOutputPos(d_outputPos, d_outputPos + numElems);

    checkResultsExact(&h_outputVals[0], &h_yourOutputVals[0], numElems);
    //checkResultsExact(&h_outputPos[0], &h_yourOutputPos[0], numElems);

    checkCudaErrors(cudaFree(inputVals));
    checkCudaErrors(cudaFree(inputPos));
    checkCudaErrors(cudaFree(outputVals));
    checkCudaErrors(cudaFree(outputPos));

    return 0;
}
int main(int argc, char **argv)
{
    printf("Computing Game Of Life On %d x %d Board.\n", DIM_X, DIM_Y);

    int *host_current, *host_future, *host_future_naive, *host_future_cached;
    int *gpu_current, *gpu_future;
    clock_t start, stop;

    cudaMallocHost((void**) &host_current, DIM_X * DIM_Y * sizeof(int));
    cudaMallocHost((void**) &host_future, DIM_X * DIM_Y * sizeof(int));
    cudaMallocHost((void**) &host_future_naive, DIM_X * DIM_Y * sizeof(int));
    cudaMallocHost((void**) &host_future_cached, DIM_X * DIM_Y * sizeof(int));
    assert(cudaGetLastError() == cudaSuccess);

    cudaMalloc((void**) &gpu_current, DIM_X * DIM_Y * sizeof(int));
    cudaMalloc((void**) &gpu_future, DIM_X * DIM_Y * sizeof(int));
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    assert(cudaGetLastError() == cudaSuccess);

    fill_board(host_current, 40);
    add_glider(host_current);
    cudaMemcpy(gpu_current, host_current, DIM_X * DIM_Y * sizeof(int), cudaMemcpyHostToDevice);
    // print_board(host_current);

    float time_naive, time_cached, time_cpu;
    for (int i = 1; i < STEPS; i++)
    {
        printf("=========\n");

        start = clock();
        naive_game_of_life_wrapper(gpu_current, gpu_future);
        cudaMemcpy(host_future_naive, gpu_future, DIM_X * DIM_Y * sizeof(int), cudaMemcpyDeviceToHost);
        stop = clock();
        time_naive = (float)(stop - start) / CLOCKS_PER_SEC;
        printf("Time for Naive GPU To Compute Next Phase: %.5f s\n", time_naive);

        start = clock();
        cached_game_of_life_wrapper(gpu_current, gpu_future);
        cudaMemcpy(host_future_cached, gpu_future, DIM_X * DIM_Y * sizeof(int), cudaMemcpyDeviceToHost);
        stop = clock();
        time_cached = (float)(stop - start) / CLOCKS_PER_SEC;
        printf("Time for Cached GPU To Compute Next Phase: %.5f s\n", time_cached);

        start = clock();
        update_board(host_current, host_future);
        stop = clock();
        time_cpu = (float)(stop - start) / CLOCKS_PER_SEC;
        printf("Time for CPU To Compute Next Phase: %.5f s\n", time_cpu);

        printf("speedup for naive = %.2f; speedup for cached = %.2f; speedup for cached over naive = %.2f\n",
               time_cpu / time_naive, time_cpu / time_cached, time_naive / time_cached);

        check_boards(host_future, host_future_naive);
        check_boards(host_future, host_future_cached);

        // Swap current/future buffers on both host and device for the next step.
        int *temp;
        temp = host_current; host_current = host_future; host_future = temp;
        temp = gpu_current;  gpu_current = gpu_future;   gpu_future = temp;
    }

    // Memory allocated with cudaMallocHost must be released with cudaFreeHost, not cudaFree.
    cudaFreeHost(host_future);
    cudaFreeHost(host_future_naive);
    cudaFreeHost(host_future_cached);
    cudaFreeHost(host_current);
    cudaFree(gpu_current);
    cudaFree(gpu_future);
    return 0;
}
float WFIRFilterCuda::cudaFilter( WLEMData::ScalarT* const output, const WLEMData::ScalarT* const input,
                const WLEMData::ScalarT* const previous, size_t channels, size_t samples,
                const WLEMData::ScalarT* const coeffs, size_t coeffSize )
{
    CuScalarT *dev_in = NULL;
    size_t pitchIn;
    CuScalarT *dev_prev = NULL;
    size_t pitchPrev;
    CuScalarT *dev_out = NULL;
    size_t pitchOut;
    CuScalarT *dev_co = NULL;

    try
    {
        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_in, &pitchIn, samples * sizeof( CuScalarT ), channels ) );
        CudaThrowsCall( cudaMemcpy2D( dev_in, pitchIn, input, samples * sizeof( CuScalarT ),
                        samples * sizeof( CuScalarT ), channels, cudaMemcpyHostToDevice ) );

        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_prev, &pitchPrev, coeffSize * sizeof( CuScalarT ), channels ) );
        CudaThrowsCall( cudaMemcpy2D( dev_prev, pitchPrev, previous, coeffSize * sizeof( CuScalarT ),
                        coeffSize * sizeof( CuScalarT ), channels, cudaMemcpyHostToDevice ) );

        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_out, &pitchOut, samples * sizeof( CuScalarT ), channels ) );

        CudaThrowsCall( cudaMalloc( ( void** )&dev_co, coeffSize * sizeof( CuScalarT ) ) );
        CudaThrowsCall( cudaMemcpy( dev_co, coeffs, coeffSize * sizeof( CuScalarT ), cudaMemcpyHostToDevice ) );
    }
    catch( const WException& e )
    {
        wlog::error( CLASS ) << e.what();
        if( dev_in )
        {
            CudaSafeCall( cudaFree( ( void* )dev_in ) );
        }
        if( dev_prev )
        {
            CudaSafeCall( cudaFree( ( void* )dev_prev ) );
        }
        if( dev_out )
        {
            CudaSafeCall( cudaFree( ( void* )dev_out ) );
        }
        if( dev_co )
        {
            CudaSafeCall( cudaFree( ( void* )dev_co ) );
        }
        throw WLBadAllocException( "Could not allocate CUDA memory!" );
    }

    size_t threadsPerBlock = 32;
    size_t blocksPerGrid = ( samples + threadsPerBlock - 1 ) / threadsPerBlock;
    size_t sharedMem = coeffSize * sizeof( CuScalarT );

    cudaEvent_t start, stop;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );

    cudaEventRecord( start, 0 );
    cuFirFilter( blocksPerGrid, threadsPerBlock, sharedMem, dev_out, dev_in, dev_prev, channels, samples,
                 dev_co, coeffSize, pitchOut, pitchIn, pitchPrev );
    cudaError_t kernelError = cudaGetLastError();
    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );

    float elapsedTime;
    cudaEventElapsedTime( &elapsedTime, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );

    try
    {
        if( kernelError != cudaSuccess )
        {
            const std::string err( cudaGetErrorString( kernelError ) );
            throw WException( "CUDA kernel failed: " + err );
        }
        CudaThrowsCall( cudaMemcpy2D( output, samples * sizeof( CuScalarT ), dev_out, pitchOut,
                        samples * sizeof( CuScalarT ), channels, cudaMemcpyDeviceToHost ) );
    }
    catch( const WException& e )
    {
        wlog::error( CLASS ) << e.what();
        elapsedTime = -1.0;
    }

    CudaSafeCall( cudaFree( ( void* )dev_in ) );
    CudaSafeCall( cudaFree( ( void* )dev_prev ) );
    CudaSafeCall( cudaFree( ( void* )dev_out ) );
    CudaSafeCall( cudaFree( ( void* )dev_co ) );

    if( elapsedTime > -1.0 )
    {
        return elapsedTime;
    }
    else
    {
        throw WException( "Error in cudaFilter()" );
    }
}
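// cudaFilter() above stores each channel as one row of a cudaMallocPitch allocation, so any
// kernel that consumes these buffers must step between rows using the pitch in bytes rather
// than in samples. The cuFirFilter kernel itself is not shown in this listing; the sketch below
// is purely illustrative (names and the scaling operation are invented) and only demonstrates
// the pitched-row indexing pattern.
__global__ void scale_pitched_rows( float* out, const float* in, size_t channels, size_t samples,
                                     size_t pitchOut, size_t pitchIn, float factor )
{
    const size_t sample = blockIdx.x * blockDim.x + threadIdx.x;
    if( sample >= samples )
        return;
    for( size_t ch = 0; ch < channels; ++ch )
    {
        // Advance by 'pitch' bytes per row, then index samples within the row.
        const float* rowIn = ( const float* )( ( const char* )in + ch * pitchIn );
        float* rowOut = ( float* )( ( char* )out + ch * pitchOut );
        rowOut[sample] = factor * rowIn[sample];
    }
}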
//--------------------------------------------------------------------------
// CUDA init
//--------------------------------------------------------------------------
bool CUDAContext::configInit( )
{
#ifdef EQUALIZER_USE_CUDA
    cudaDeviceProp props;
    uint32_t device = getPipe()->getDevice();

    // Setup the CUDA device
    if( device == LB_UNDEFINED_UINT32 )
    {
        device = _getFastestDeviceID();
        LBWARN << "No CUDA device, using the fastest device: " << device << std::endl;
    }

    int device_count = 0;
    cudaGetDeviceCount( &device_count );
    LBINFO << "CUDA devices found: " << device_count << std::endl;
    LBASSERT( static_cast< uint32_t >( device_count ) > device );
    if( static_cast< uint32_t >( device_count ) <= device )
    {
        sendError( ERROR_CUDACONTEXT_DEVICE_NOTFOUND )
            << lexical_cast< std::string >( device );
        return false;
    }

    // We assume GL interop here, otherwise use cudaSetDevice( device );
    // Attention: this call requires a valid GL context!
    cudaGLSetGLDevice( device );

    int usedDevice = static_cast< int >( device );
#ifdef _WIN32
    HGPUNV handle = 0;

    if( !WGLEW_NV_gpu_affinity )
    {
        LBWARN << "WGL_NV_gpu_affinity unsupported, ignoring device setting"
               << std::endl;
        return true;
    }

    if( !wglEnumGpusNV( device, &handle ))
    {
        LBWARN << "wglEnumGpusNV failed : " << lunchbox::sysError << std::endl;
        return false;
    }

    cudaWGLGetDevice( &usedDevice, handle );
#else
    cudaGetDevice( &usedDevice );
#endif
    // Compare the requested device against the device actually selected.
    LBASSERT( device == static_cast< uint32_t >( usedDevice ));

    cudaGetDeviceProperties( &props, usedDevice );
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err )
    {
        sendError( ERROR_CUDACONTEXT_INIT_FAILED )
            << std::string( cudaGetErrorString( err ));
        return false;
    }

    LBINFO << "Using CUDA device: " << device << std::endl;
    return true;
#else
    sendError( ERROR_CUDACONTEXT_MISSING_SUPPORT );
    return false;
#endif
}
static void check_status() { throw_(cudaGetLastError()); }
/*===========================================================================*/ cudaError_t LastError() { return cudaGetLastError(); }
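// The check_status() and LastError() helpers above both funnel cudaGetLastError() into the
// caller's error-handling scheme; the throw_() helper they rely on is not shown here. Below is
// a minimal, hypothetical stand-in, assuming the intent is simply to turn a non-success status
// into a C++ exception; the real throw_() in this codebase may attach more context.
#include <cuda_runtime.h>
#include <stdexcept>

// Hypothetical equivalent of throw_(): raises if the given CUDA status is not cudaSuccess.
inline void throwOnCudaError( cudaError_t status )
{
    if ( status != cudaSuccess )
    {
        throw std::runtime_error( cudaGetErrorString( status ) );
    }
}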
int main(int argc, char **argv)
{
    uchar4        *h_rgbaImage, *d_rgbaImage;
    unsigned char *h_greyImage, *d_greyImage;

    std::string input_file;
    std::string output_file;
    std::string reference_file;
    double perPixelError = 0.0;
    double globalError   = 0.0;
    bool useEpsCheck = false;

    switch (argc)
    {
    case 2:
        input_file = std::string(argv[1]);
        output_file = "HW1_output.png";
        reference_file = "HW1_reference.png";
        break;
    case 3:
        input_file = std::string(argv[1]);
        output_file = std::string(argv[2]);
        reference_file = "HW1_reference.png";
        break;
    case 4:
        input_file = std::string(argv[1]);
        output_file = std::string(argv[2]);
        reference_file = std::string(argv[3]);
        break;
    case 6:
        useEpsCheck = true;
        input_file = std::string(argv[1]);
        output_file = std::string(argv[2]);
        reference_file = std::string(argv[3]);
        perPixelError = atof(argv[4]);
        globalError = atof(argv[5]);
        break;
    default:
        std::cerr << "Usage: ./HW1 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
        exit(1);
    }

    //load the image and give us our input and output pointers
    preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file);

    GpuTimer timer;
    timer.Start();
    //call the students' code
    lineDetect(h_rgbaImage, d_rgbaImage, d_greyImage, numRows(), numCols());
    timer.Stop();
    cudaDeviceSynchronize();
    checkCudaErrors(cudaGetLastError());

    int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

    if (err < 0)
    {
        //Couldn't print! Probably the student closed stdout - bad news
        std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
        exit(1);
    }

    size_t numPixels = numRows() * numCols();
    checkCudaErrors(cudaMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char) * numPixels, cudaMemcpyDeviceToHost));

    //check results and output the grey image
    postProcess(output_file, h_greyImage);

    referenceCalculation(h_rgbaImage, h_greyImage, numRows(), numCols());

    postProcess(reference_file, h_greyImage);

    //generateReferenceImage(input_file, reference_file);
    compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);

    cleanup();

    return 0;
}
int main(int argc, char **argv)
{
    float        *d_luminance;
    unsigned int *d_cdf;

    size_t numRows, numCols;
    unsigned int numBins;

    std::string input_file;
    std::string output_file;
    std::string reference_file;
    double perPixelError = 0.0;
    double globalError   = 0.0;
    bool useEpsCheck = false;

    switch (argc)
    {
    case 2:
        input_file = std::string(argv[1]);
        output_file = "HW3_output.png";
        reference_file = "HW3_reference.png";
        break;
    case 3:
        input_file = std::string(argv[1]);
        output_file = std::string(argv[2]);
        reference_file = "HW3_reference.png";
        break;
    case 4:
        input_file = std::string(argv[1]);
        output_file = std::string(argv[2]);
        reference_file = std::string(argv[3]);
        break;
    case 6:
        useEpsCheck = true;
        input_file = std::string(argv[1]);
        output_file = std::string(argv[2]);
        reference_file = std::string(argv[3]);
        perPixelError = atof(argv[4]);
        globalError = atof(argv[5]);
        break;
    default:
        std::cerr << "Usage: ./HW3 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
        exit(1);
    }

    //load the image and give us our input and output pointers
    preProcess(&d_luminance, &d_cdf, &numRows, &numCols, &numBins, input_file);

    GpuTimer timer;
    float min_logLum, max_logLum;
    min_logLum = 0.f;
    max_logLum = 1.f;
    timer.Start();
    //call the students' code
    your_histogram_and_prefixsum(d_luminance, d_cdf, min_logLum, max_logLum, numRows, numCols, numBins);
    timer.Stop();
    cudaDeviceSynchronize();
    checkCudaErrors(cudaGetLastError());

    int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

    if (err < 0)
    {
        //Couldn't print! Probably the student closed stdout - bad news
        std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
        exit(1);
    }

    float *h_luminance = (float *) malloc(sizeof(float) * numRows * numCols);
    unsigned int *h_cdf = (unsigned int *) malloc(sizeof(unsigned int) * numBins);

    checkCudaErrors(cudaMemcpy(h_luminance, d_luminance, numRows * numCols * sizeof(float), cudaMemcpyDeviceToHost));

    //check results and output the tone-mapped image
    postProcess(output_file, numRows, numCols, min_logLum, max_logLum);

    for (size_t i = 1; i < numCols * numRows; ++i)
    {
        min_logLum = std::min(h_luminance[i], min_logLum);
        max_logLum = std::max(h_luminance[i], max_logLum);
    }

    referenceCalculation(h_luminance, h_cdf, numRows, numCols, numBins, min_logLum, max_logLum);

    checkCudaErrors(cudaMemcpy(d_cdf, h_cdf, sizeof(unsigned int) * numBins, cudaMemcpyHostToDevice));

    //check results and output the tone-mapped image
    postProcess(reference_file, numRows, numCols, min_logLum, max_logLum);

    cleanupGlobalMemory();

    compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);

    return 0;
}
int kmeans_cuda(bool kmpp, float tolerance, float yinyang_t, uint32_t samples_size,
                uint16_t features_size, uint32_t clusters_size, uint32_t seed,
                uint32_t device, int32_t verbosity, const float *samples,
                float *centroids, uint32_t *assignments) {
  DEBUG("arguments: %d %.3f %.2f %" PRIu32 " %" PRIu16 " %" PRIu32 " %" PRIu32 " %" PRIu32 " %" PRIi32 " %p %p %p\n",
        kmpp, tolerance, yinyang_t, samples_size, features_size, clusters_size,
        seed, device, verbosity, samples, centroids, assignments);
  auto check_result = check_args(
      tolerance, yinyang_t, samples_size, features_size, clusters_size,
      samples, centroids, assignments);
  if (check_result != kmcudaSuccess) {
    return check_result;
  }
  if (cudaSetDevice(device) != cudaSuccess) {
    return kmcudaNoSuchDevice;
  }

  void *device_samples;
  size_t device_samples_size = samples_size;
  device_samples_size *= features_size * sizeof(float);
  CUMALLOC(device_samples, device_samples_size, "samples");
  CUMEMCPY(device_samples, samples, device_samples_size, cudaMemcpyHostToDevice);
  unique_devptr device_samples_sentinel(device_samples);

  void *device_centroids;
  size_t centroids_size = clusters_size * features_size * sizeof(float);
  CUMALLOC(device_centroids, centroids_size, "centroids");
  unique_devptr device_centroids_sentinel(device_centroids);

  void *device_assignments;
  size_t assignments_size = samples_size * sizeof(uint32_t);
  CUMALLOC(device_assignments, assignments_size, "assignments");
  unique_devptr device_assignments_sentinel(device_assignments);

  void *device_assignments_prev;
  CUMALLOC(device_assignments_prev, assignments_size, "assignments_prev");
  unique_devptr device_assignments_prev_sentinel(device_assignments_prev);

  void *device_ccounts;
  CUMALLOC(device_ccounts, clusters_size * sizeof(uint32_t), "ccounts");
  unique_devptr device_ccounts_sentinel(device_ccounts);

  uint32_t yinyang_groups = yinyang_t * clusters_size;
  DEBUG("yinyang groups: %" PRIu32 "\n", yinyang_groups);
  void *device_assignments_yy = NULL, *device_bounds_yy = NULL,
       *device_drifts_yy = NULL, *device_passed_yy = NULL,
       *device_centroids_yy = NULL;
  if (yinyang_groups >= 1) {
    CUMALLOC(device_assignments_yy, clusters_size * sizeof(uint32_t), "yinyang assignments");
    size_t yyb_size = samples_size;
    yyb_size *= (yinyang_groups + 1) * sizeof(float);
    CUMALLOC(device_bounds_yy, yyb_size, "yinyang bounds");
    CUMALLOC(device_drifts_yy, centroids_size + clusters_size * sizeof(float), "yinyang drifts");
    CUMALLOC(device_passed_yy, assignments_size, "yinyang passed");
    size_t yyc_size = yinyang_groups * features_size * sizeof(float);
    if (yyc_size + (clusters_size + yinyang_groups) * sizeof(uint32_t) <= assignments_size) {
      device_centroids_yy = device_passed_yy;
    } else {
      CUMALLOC(device_centroids_yy, yyc_size, "yinyang group centroids");
    }
  }
  unique_devptr device_centroids_yinyang_sentinel(
      (device_centroids_yy != device_passed_yy) ? device_centroids_yy : NULL);
  unique_devptr device_assignments_yinyang_sentinel(device_assignments_yy);
  unique_devptr device_bounds_yinyang_sentinel(device_bounds_yy);
  unique_devptr device_drifts_yinyang_sentinel(device_drifts_yy);
  unique_devptr device_passed_yinyang_sentinel(device_passed_yy);

  if (verbosity > 1) {
    RETERR(print_memory_stats());
  }

  RETERR(kmeans_cuda_setup(samples_size, features_size, clusters_size,
                           yinyang_groups, device, verbosity),
         DEBUG("kmeans_cuda_setup failed: %s\n",
               cudaGetErrorString(cudaGetLastError())));
  RETERR(kmeans_init_centroids(
             static_cast<KMCUDAInitMethod>(kmpp), samples_size, features_size,
             clusters_size, seed, verbosity,
             reinterpret_cast<float*>(device_samples), device_assignments,
             reinterpret_cast<float*>(device_centroids)),
         DEBUG("kmeans_init_centroids failed: %s\n",
               cudaGetErrorString(cudaGetLastError())));
  RETERR(kmeans_cuda_yy(
             tolerance, yinyang_groups, samples_size, clusters_size,
             features_size, verbosity,
             reinterpret_cast<float*>(device_samples),
             reinterpret_cast<float*>(device_centroids),
             reinterpret_cast<uint32_t*>(device_ccounts),
             reinterpret_cast<uint32_t*>(device_assignments_prev),
             reinterpret_cast<uint32_t*>(device_assignments),
             reinterpret_cast<uint32_t*>(device_assignments_yy),
             reinterpret_cast<float*>(device_centroids_yy),
             reinterpret_cast<float*>(device_bounds_yy),
             reinterpret_cast<float*>(device_drifts_yy),
             reinterpret_cast<uint32_t*>(device_passed_yy)),
         DEBUG("kmeans_cuda_internal failed: %s\n",
               cudaGetErrorString(cudaGetLastError())));

  CUMEMCPY(centroids, device_centroids, centroids_size, cudaMemcpyDeviceToHost);
  CUMEMCPY(assignments, device_assignments, assignments_size, cudaMemcpyDeviceToHost);
  DEBUG("return kmcudaSuccess\n");
  return kmcudaSuccess;
}
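// A minimal host-side call sketch for kmeans_cuda(), based only on the signature above. The
// sample data, cluster count, and seed are illustrative; error handling is reduced to checking
// the returned status against kmcudaSuccess.
#include <cstdint>
#include <cstdio>
#include <vector>

int run_kmeans_example()
{
    const uint32_t samples_size = 1000;   // number of samples
    const uint16_t features_size = 4;     // dimensions per sample
    const uint32_t clusters_size = 8;     // k

    std::vector<float> samples(static_cast<size_t>(samples_size) * features_size, 0.5f);
    std::vector<float> centroids(static_cast<size_t>(clusters_size) * features_size);
    std::vector<uint32_t> assignments(samples_size);

    // kmpp selects the centroid init method (cast to KMCUDAInitMethod above);
    // yinyang_t controls how many Yinyang refinement groups are used.
    int status = kmeans_cuda(true, 0.01f, 0.1f, samples_size, features_size, clusters_size,
                             /*seed=*/1234, /*device=*/0, /*verbosity=*/1,
                             samples.data(), centroids.data(), assignments.data());
    if (status != kmcudaSuccess) {
        std::fprintf(stderr, "kmeans_cuda failed with status %d\n", status);
        return 1;
    }
    return 0;
}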