int allocate_managed_buffer(char **buffer)
{
#ifdef _ENABLE_CUDA_
    cudaError_t cuerr = cudaSuccess;
#endif

    switch (options.accel) {
#ifdef _ENABLE_CUDA_
        case cuda:
            cuerr = cudaMallocManaged((void **)buffer, MYBUFSIZE, cudaMemAttachGlobal);
            if (cudaSuccess != cuerr) {
                fprintf(stderr, "Could not allocate managed memory\n");
                return 1;
            }
            break;
#endif
        default:
            fprintf(stderr, "Managed memory is not supported for this accelerator type\n");
            return 1;
    }

    return 0;
}
// Handles the allocation and creation of an object.
// Something like:
//   X *x = new X();
// behaves as:
//   void *temp = operator new(sizeof(X));
//   X *x = new (temp) X;   // placement-new runs the constructor
void *operator new(size_t len)
{
    void *ptr;
    cudaMallocManaged(&ptr, len);
    cudaDeviceSynchronize();
    return ptr;
}
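The override above only covers allocation; the same pattern pairs it with a global operator delete that returns the block with cudaFree. A minimal sketch, assuming every pointer it sees came from the operator new above; synchronizing first guards against freeing a block a kernel is still using.

// Hedged sketch: the matching global operator delete, assuming all
// objects were allocated through the managed operator new above.
void operator delete(void *ptr) noexcept
{
    cudaDeviceSynchronize();  // make sure no kernel still touches ptr
    cudaFree(ptr);
}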
void *MemoryPool::pop(size_t s, int loc)
{
    void *addr = nullptr;

    if ((s > MIN_BLOCK_SIZE) && (s < MAX_BLOCK_SIZE)) {
        locker_.lock();
        // find a pooled block that is not smaller than the demanded size
        auto pt = pool_.lower_bound(s);
        if (pt != pool_.end()) {
            size_t ts = 0;
            std::tie(ts, addr) = *pt;
            if (ts < s * 2) {
                // close enough in size: take the block out of the pool
                s = ts;
                pool_.erase(pt);
                pool_depth_ -= s;
            } else {
                // block is much larger than needed; allocate fresh instead
                addr = nullptr;
            }
        }
        locker_.unlock();
    }

    if (addr == nullptr) {
        try {
#ifdef __CUDA__
            SP_DEVICE_CALL(cudaMallocManaged(&addr, s));
#else
            addr = malloc(s);
#endif
        } catch (std::bad_alloc const &error) {
            THROW_EXCEPTION_BAD_ALLOC(s);
        }
    }
    return addr;
}
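pop's natural counterpart returns blocks to the pool instead of freeing them. A minimal sketch of such a push, assuming the same locker_, pool_, and pool_depth_ members as above; MAX_POOL_DEPTH is a hypothetical cap on how much memory the pool may retain.

// Hedged sketch of a matching MemoryPool::push; MAX_POOL_DEPTH is a
// hypothetical cap, everything else mirrors the members used by pop.
void MemoryPool::push(void *addr, size_t s)
{
    if ((s > MIN_BLOCK_SIZE) && (s < MAX_BLOCK_SIZE) &&
        (pool_depth_ + s < MAX_POOL_DEPTH)) {
        locker_.lock();
        pool_.emplace(s, addr);  // keep the block for later reuse
        pool_depth_ += s;
        locker_.unlock();
    } else {
#ifdef __CUDA__
        SP_DEVICE_CALL(cudaFree(addr));
#else
        free(addr);
#endif
    }
}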
Mesh::Mesh(const std::vector<Vec3>& vertices, const std::vector<Triangle>& faces)
    : vertices(), faces(), bvh(),
      vertexCount(static_cast<uint32_t>(vertices.size())),
      faceCount(static_cast<uint32_t>(faces.size()))
{
    Vec3* dVertices = nullptr;
    cudaMallocManaged(&dVertices, sizeof(Vec3) * vertexCount);
    // managed memory is host-accessible, so a host-to-host copy suffices
    cudaMemcpy(dVertices, vertices.data(), sizeof(Vec3) * vertexCount, cudaMemcpyHostToHost);
    this->vertices.reset(dVertices, vertexCount);

    Triangle* dFaces = nullptr;
    cudaMallocManaged(&dFaces, sizeof(Triangle) * faceCount);
    cudaMemcpy(dFaces, faces.data(), sizeof(Triangle) * faceCount, cudaMemcpyHostToHost);
    this->faces.reset(dFaces, faceCount);
}
void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const
{
    void *ptr = NULL;
    CUDA_SAFE_CALL(cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal));
    return ptr;
}
static void SetUpTestCase()
{
    cudaMallocManaged((void**)&data, sizeof(data_type) * N, cudaMemAttachGlobal);
    std::iota(data, data + N, 1);
    std::shuffle(data, data + N, std::mt19937{std::random_device{}()});
}
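A fixture that allocates managed memory in SetUpTestCase would normally release it in the matching TearDownTestCase. A minimal sketch, assuming data is the same static fixture member used above:

// Hedged sketch of the matching TearDownTestCase for the fixture above.
static void TearDownTestCase()
{
    cudaFree(data);
    data = nullptr;
}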
int main()
{
    int *c;
    GlobalState *gs = new GlobalState;
    CHECK(cudaMallocManaged(&c, sizeof(int)));
    *c = 0;
    cudaFree(c);  // release the managed allocation before exit
    delete gs;
    return 0;
}
int lpm_add(struct lpm_table *table, void *prefix, unsigned int prefix_len,
    unsigned int id, void *ptr, struct ixmap_desc *desc)
{
    unsigned int index;
    struct lpm_node *node;
    struct lpm_entry *entry;
    unsigned int range, mask;
    int i, ret, entry_allocated = 0;
    cudaError_t ret_cuda;

    index = lpm_index(prefix, 0, 16);

    if (prefix_len > 16) {
        node = &table->node[index];
        ret = _lpm_add(table, prefix, prefix_len, id, ptr, desc, node, 16);
        if (ret < 0)
            goto err_lpm_add;
    } else {
        range = 1 << (16 - prefix_len);
        mask = ~(range - 1);
        index &= mask;

        for (i = 0; i < range; i++, entry_allocated++) {
            node = &table->node[index | i];

            ret_cuda = cudaMallocManaged((void **)&entry,
                sizeof(struct lpm_entry), cudaMemAttachGlobal);
            if (ret_cuda != cudaSuccess)
                goto err_lpm_add_self;

            entry->ptr = ptr;
            ret = lpm_entry_insert(table, &node->head, id, prefix_len, &entry->list);
            if (ret < 0)
                goto err_entry_insert;

            continue;

err_entry_insert:
            cudaFree(entry);
            goto err_lpm_add_self;
        }
    }

    return 0;

err_lpm_add_self:
    for (i = 0; i < entry_allocated; i++) {
        node = &table->node[index | i];
        lpm_entry_delete(table, &node->head, id, prefix_len);
    }
err_lpm_add:
    return -1;
}
KOKKOS_INLINE_FUNCTION
static T* my_alloc(const int sz)
{
#if defined(__CUDACC__) && defined(CUDA_VERSION) && (6000 <= CUDA_VERSION) && \
    defined(KOKKOS_USE_CUDA_UVM) && !defined(__CUDA_ARCH__)
    T* m;
    cudaMallocManaged((void**)&m, sz * sizeof(T), cudaMemAttachGlobal);
#else
    T* m = static_cast<T*>(operator new(sz * sizeof(T)));
#endif
    return m;
}
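A deallocator for this allocator has to mirror the same preprocessor branches, since the pointer may come from either cudaMallocManaged or operator new. A minimal sketch; the name my_free is hypothetical.

// Hedged sketch: a matching my_free mirroring the two allocation
// branches above; the function name is hypothetical.
KOKKOS_INLINE_FUNCTION
static void my_free(T* m)
{
#if defined(__CUDACC__) && defined(CUDA_VERSION) && (6000 <= CUDA_VERSION) && \
    defined(KOKKOS_USE_CUDA_UVM) && !defined(__CUDA_ARCH__)
    cudaFree(m);
#else
    operator delete(m);
#endif
}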
static void *THCUVAAllocator_alloc(void* ctx, ptrdiff_t size)
{
    if (size < 0)
        THError("Invalid memory size: %ld", size);

    if (size == 0)
        return NULL;

    // See J.1.1 of the CUDA_C_Programming_Guide.pdf for UVA and coherence
    // rules on various compute capabilities.
    void* ptr;
    THCudaCheck(cudaMallocManaged(&ptr, size, cudaMemAttachGlobal));
    return ptr;
}
static ucs_status_t ucp_perf_cuda_alloc_managed(ucx_perf_context_t *perf,
    size_t length, void **address_p, ucp_mem_h *memh_p, int non_blk_flag)
{
    cudaError_t cerr;

    cerr = cudaMallocManaged(address_p, length, cudaMemAttachGlobal);
    if (cerr != cudaSuccess) {
        return UCS_ERR_NO_MEMORY;
    }

    return UCS_OK;
}
int allocate_buffer(void **buffer, size_t size, enum accel_type type)
{
    if (options.target == cpu || options.target == both) {
        allocate_host_arrays();
    }

    size_t alignment = sysconf(_SC_PAGESIZE);
#ifdef _ENABLE_CUDA_
    cudaError_t cuerr = cudaSuccess;
#endif

    switch (type) {
        case none:
            return posix_memalign(buffer, alignment, size);
#ifdef _ENABLE_CUDA_
        case cuda:
            cuerr = cudaMalloc(buffer, size);
            return (cudaSuccess != cuerr) ? 1 : 0;
        case managed:
            cuerr = cudaMallocManaged(buffer, size, cudaMemAttachGlobal);
            return (cudaSuccess != cuerr) ? 1 : 0;
#endif
#ifdef _ENABLE_OPENACC_
        case openacc:
            *buffer = acc_malloc(size);
            return (NULL == *buffer) ? 1 : 0;
#endif
        default:
            return 1;
    }
}
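Each case in allocate_buffer implies a different release call. A minimal free_buffer sketch mirroring the switch above; the function name is hypothetical.

/* Hedged sketch: a matching free_buffer that mirrors the allocation
 * switch above; the function name is hypothetical. */
void free_buffer(void *buffer, enum accel_type type)
{
    switch (type) {
        case none:
            free(buffer);  /* posix_memalign memory is freed with free() */
            break;
#ifdef _ENABLE_CUDA_
        case cuda:
        case managed:
            cudaFree(buffer);
            break;
#endif
#ifdef _ENABLE_OPENACC_
        case openacc:
            acc_free(buffer);
            break;
#endif
        default:
            break;
    }
}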
CUDA_TYPED_TEST_P(ScanCUDA, exclusive_inplace_offset)
{
    using T = typename Info<TypeParam>::data_type;
    using Function = typename Info<TypeParam>::function;
    T* data;
    cudaMallocManaged((void**)&data, sizeof(T) * N, cudaMemAttachGlobal);
    std::copy_n(ScanCUDA<TypeParam>::data, N, data);
    RAJA::exclusive_scan_inplace(typename Info<TypeParam>::exec(),
                                 data,
                                 data + N,
                                 Function{},
                                 T(2));
    ASSERT_TRUE(check_exclusive<Function>(data, ScanCUDA<TypeParam>::data, T(2)));
    cudaFree(data);
}
inline void* allocate(size_t num_bytes)
{
    // switch to our device
    scoped_device set_current_device(device());

    void* result = nullptr;
    cudaError_t error = cudaMallocManaged(&result, num_bytes, cudaMemAttachGlobal);
    if (error != cudaSuccess) {
        throw thrust::system_error(error, thrust::cuda_category(),
                                   "managed_resource::allocate(): cudaMallocManaged");
    }
    return result;
}
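A memory resource pairs allocate with deallocate. A minimal sketch of the matching deallocate, assuming the same scoped_device helper and error-reporting convention as above:

// Hedged sketch of the matching managed_resource::deallocate, assuming
// the same scoped_device helper and error convention as allocate.
inline void deallocate(void* ptr, size_t /*num_bytes*/)
{
    // switch to our device before freeing
    scoped_device set_current_device(device());

    cudaError_t error = cudaFree(ptr);
    if (error != cudaSuccess) {
        throw thrust::system_error(error, thrust::cuda_category(),
                                   "managed_resource::deallocate(): cudaFree");
    }
}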
void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const
{
    void *ptr = NULL;

    Kokkos::Impl::num_uvm_allocations += 1;
    if (Kokkos::Impl::num_uvm_allocations > 65536) {
        Kokkos::Impl::num_uvm_allocations = 0;  // reset to 0 before throwing exception
        Kokkos::Impl::throw_runtime_exception(
            "CudaUVM error: The maximum limit of UVM allocations is 65536");
    }

    CUDA_SAFE_CALL(cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal));
    return ptr;
}
CUDA_TYPED_TEST_P(ScanCUDA, exclusive)
{
    using T = typename Info<TypeParam>::data_type;
    using Function = typename Info<TypeParam>::function;
    T* out;
    cudaMallocManaged((void**)&out, sizeof(T) * N, cudaMemAttachGlobal);
    RAJA::exclusive_scan(typename Info<TypeParam>::exec(),
                         ScanCUDA<TypeParam>::data,
                         ScanCUDA<TypeParam>::data + N,
                         out,
                         Function{});
    ASSERT_TRUE(check_exclusive<Function>(out, ScanCUDA<TypeParam>::data));
    cudaFree(out);
}
ChannelInfo::ChannelInfo(const std::vector<Channels> &channels, bool use_gpu)
    : use_gpu(use_gpu)
{
    num_channels = (int)channels.size();
    radiance_dimension = -1;
    num_total_dimensions = compute_num_channels(channels);

    if (use_gpu) {
#ifdef __CUDACC__
        checkCuda(cudaMallocManaged(&this->channels, channels.size() * sizeof(Channels)));
#else
        assert(false);
#endif
    } else {
        this->channels = new Channels[channels.size()];
    }

    for (int i = 0; i < (int)channels.size(); i++) {
        if (channels[i] == Channels::radiance) {
            if (radiance_dimension != -1) {
                throw std::runtime_error("Duplicated radiance channel");
            }
            radiance_dimension = i;
        }
        this->channels[i] = channels[i];
    }
}
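The constructor's two ownership paths (cudaMallocManaged vs. new[]) imply a destructor that branches the same way. A minimal sketch; the original destructor is not shown here, so this only mirrors the constructor:

// Hedged sketch of a matching destructor, mirroring the constructor's
// use_gpu branches above; the original destructor is not shown.
ChannelInfo::~ChannelInfo()
{
    if (use_gpu) {
#ifdef __CUDACC__
        checkCuda(cudaFree(this->channels));
#endif
    } else {
        delete[] this->channels;
    }
}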
static int _lpm_add(struct lpm_table *table, void *prefix, unsigned int prefix_len,
    unsigned int id, void *ptr, struct ixmap_desc *desc,
    struct lpm_node *parent, unsigned int offset)
{
    struct lpm_node *node;
    struct lpm_entry *entry;
    unsigned int index;
    unsigned int range, mask;
    int i, ret, entry_allocated = 0;
    cudaError_t ret_cuda;

    if (!parent->next_table) {
        ret_cuda = cudaMallocManaged((void **)&parent->next_table,
            sizeof(struct lpm_node) * TABLE_SIZE_8, cudaMemAttachGlobal);
        if (ret_cuda != cudaSuccess)
            goto err_table_alloc;

        for (i = 0; i < TABLE_SIZE_8; i++) {
            node = &parent->next_table[i];
            lpm_init_node(node);
        }
    }

    index = lpm_index(prefix, offset, 8);

    if (prefix_len - offset > 8) {
        node = &parent->next_table[index];
        ret = _lpm_add(table, prefix, prefix_len, id, ptr, desc, node, offset + 8);
        if (ret < 0)
            goto err_lpm_add;
    } else {
        range = 1 << (8 - (prefix_len - offset));
        mask = ~(range - 1);
        index &= mask;

        /* count allocated entries so the error path can roll them back */
        for (i = 0; i < range; i++, entry_allocated++) {
            node = &parent->next_table[index | i];

            ret_cuda = cudaMallocManaged((void **)&entry,
                sizeof(struct lpm_entry), cudaMemAttachGlobal);
            if (ret_cuda != cudaSuccess)
                goto err_lpm_add_self;

            entry->ptr = ptr;
            ret = lpm_entry_insert(table, &node->head, id, prefix_len, &entry->list);
            if (ret < 0)
                goto err_entry_insert;

            continue;

err_entry_insert:
            cudaFree(entry);
            goto err_lpm_add_self;
        }
    }

    return 0;

err_lpm_add_self:
    for (i = 0; i < entry_allocated; i++) {
        node = &parent->next_table[index | i];
        lpm_entry_delete(table, &node->head, id, prefix_len);
    }
err_lpm_add:
    /* only release the sub-table if every slot is still unused */
    for (i = 0; i < TABLE_SIZE_8; i++) {
        node = &parent->next_table[i];
        if (node->next_table || !list_empty(&node->head)) {
            goto err_table_alloc;
        }
    }
    cudaFree(parent->next_table);
    parent->next_table = NULL;
err_table_alloc:
    return -1;
}
int main(int argc, char** argv)
{
    double* A;
    double* B;
    double* C;
    double alpha = 1.0;
    double beta = 0.0;
    int i;
    struct timeval t1, t2, t3, t4;
    const int SEED = 1;
    const int METHOD = 0;
    const int BRNG = VSL_BRNG_MCG31;
    VSLStreamStatePtr stream;
    int errcode;
    cublasStatus_t status;
    cublasHandle_t handle;
    double a = 0.0, b = 1.0; // uniform distribution between 0 and 1

    errcode = vslNewStream(&stream, BRNG, SEED);

    int width = 100;
    if (argc > 1) {
        width = atoi(argv[1]);
    }

    /* Allocate managed memory for A, B, and C */
    if (cudaMallocManaged(&A, width * width * sizeof(double)) != cudaSuccess) {
        fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
        return EXIT_FAILURE;
    }
    if (cudaMallocManaged(&B, width * width * sizeof(double)) != cudaSuccess) {
        fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
        return EXIT_FAILURE;
    }
    if (cudaMallocManaged(&C, width * width * sizeof(double)) != cudaSuccess) {
        fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");
        return EXIT_FAILURE;
    }

    /* Generate width * width random numbers between 0 and 1 to fill
       matrices A and B. */
    errcode = vdRngUniform(METHOD, stream, width * width, A, a, b);
    CheckVslError(errcode);
    errcode = vdRngUniform(METHOD, stream, width * width, B, a, b);
    CheckVslError(errcode);

    /* Now prepare the call to CUBLAS */
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }

    gettimeofday(&t3, NULL);

    /* Perform calculation */
    status = cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, width, width, width,
                         &alpha, A, width, B, width, &beta, C, width);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }
    cudaDeviceSynchronize();

    gettimeofday(&t4, NULL);

    const double time = (double)(t4.tv_sec - t3.tv_sec) + 1e-6 * (t4.tv_usec - t3.tv_usec);
    /* a DGEMM performs 2 * width^3 floating-point operations */
    const double Gflops = 2. * width * width * width / time * 1e-9;
    printf("Call to cublasDgemm took %lf seconds\n", time);
    printf("Gflops: %lf\n", Gflops);

    cudaFree(A);
    cudaFree(B);
    cudaFree(C);

    status = cublasDestroy(handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! shutdown error\n");
        return EXIT_FAILURE;
    }

    return 0;
}
int main(int argc, char **argv)
{
    int N = 0, nz = 0, *I = NULL, *J = NULL;
    float *val = NULL;
    const float tol = 1e-5f;
    const int max_iter = 10000;
    float *x;
    float *rhs;
    float a, b, na, r0, r1;
    float dot;
    float *r, *p, *Ax;
    int k;
    float alpha, beta, alpham1;

    printf("Starting [%s]...\n", sSDKname);

    // This will pick the best possible CUDA capable device
    cudaDeviceProp deviceProp;
    int devID = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

#if defined(__APPLE__) || defined(MACOSX)
    fprintf(stderr, "Unified Memory not currently supported on OS X\n");
    cudaDeviceReset();
    exit(EXIT_WAIVED);
#endif

    if (sizeof(void *) != 8) {
        fprintf(stderr, "Unified Memory requires compiling for a 64-bit system.\n");
        cudaDeviceReset();
        exit(EXIT_WAIVED);
    }

    if (((deviceProp.major << 4) + deviceProp.minor) < 0x30) {
        fprintf(stderr, "%s requires Compute Capability of SM 3.0 or higher to run.\nexiting...\n",
                argv[0]);
        cudaDeviceReset();
        exit(EXIT_WAIVED);
    }

    // Statistics about the GPU device
    printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

    /* Generate a random tridiagonal symmetric matrix in CSR format */
    N = 1048576;
    nz = (N-2)*3 + 4;

    cudaMallocManaged((void **)&I, sizeof(int)*(N+1));
    cudaMallocManaged((void **)&J, sizeof(int)*nz);
    cudaMallocManaged((void **)&val, sizeof(float)*nz);

    genTridiag(I, J, val, N, nz);

    cudaMallocManaged((void **)&x, sizeof(float)*N);
    cudaMallocManaged((void **)&rhs, sizeof(float)*N);

    for (int i = 0; i < N; i++) {
        rhs[i] = 1.0;
        x[i] = 0.0;
    }

    /* Get handle to the CUBLAS context */
    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);
    checkCudaErrors(cublasStatus);

    /* Get handle to the CUSPARSE context */
    cusparseHandle_t cusparseHandle = 0;
    cusparseStatus_t cusparseStatus;
    cusparseStatus = cusparseCreate(&cusparseHandle);
    checkCudaErrors(cusparseStatus);

    cusparseMatDescr_t descr = 0;
    cusparseStatus = cusparseCreateMatDescr(&descr);
    checkCudaErrors(cusparseStatus);

    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

    // temp memory for CG
    checkCudaErrors(cudaMallocManaged((void **)&r, N*sizeof(float)));
    checkCudaErrors(cudaMallocManaged((void **)&p, N*sizeof(float)));
    checkCudaErrors(cudaMallocManaged((void **)&Ax, N*sizeof(float)));

    cudaDeviceSynchronize();

    for (int i = 0; i < N; i++) {
        r[i] = rhs[i];
    }

    alpha = 1.0;
    alpham1 = -1.0;
    beta = 0.0;
    r0 = 0.;

    cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz,
                   &alpha, descr, val, I, J, x, &beta, Ax);

    cublasSaxpy(cublasHandle, N, &alpham1, Ax, 1, r, 1);
    cublasStatus = cublasSdot(cublasHandle, N, r, 1, r, 1, &r1);

    k = 1;

    while (r1 > tol*tol && k <= max_iter) {
        if (k > 1) {
            b = r1 / r0;
            cublasStatus = cublasSscal(cublasHandle, N, &b, p, 1);
            cublasStatus = cublasSaxpy(cublasHandle, N, &alpha, r, 1, p, 1);
        } else {
            cublasStatus = cublasScopy(cublasHandle, N, r, 1, p, 1);
        }

        cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz,
                       &alpha, descr, val, I, J, p, &beta, Ax);
        cublasStatus = cublasSdot(cublasHandle, N, p, 1, Ax, 1, &dot);
        a = r1 / dot;

        cublasStatus = cublasSaxpy(cublasHandle, N, &a, p, 1, x, 1);
        na = -a;
        cublasStatus = cublasSaxpy(cublasHandle, N, &na, Ax, 1, r, 1);

        r0 = r1;
        cublasStatus = cublasSdot(cublasHandle, N, r, 1, r, 1, &r1);
        // ensure the device is done before the host reads the residual
        cudaDeviceSynchronize();
        printf("iteration = %3d, residual = %e\n", k, sqrt(r1));
        k++;
    }

    printf("Final residual: %e\n", sqrt(r1));

    fprintf(stdout, "&&&& uvm_cg test %s\n", (sqrt(r1) < tol) ? "PASSED" : "FAILED");

    float rsum, diff, err = 0.0;

    for (int i = 0; i < N; i++) {
        rsum = 0.0;

        for (int j = I[i]; j < I[i+1]; j++) {
            rsum += val[j]*x[J[j]];
        }

        diff = fabs(rsum - rhs[i]);

        if (diff > err) {
            err = diff;
        }
    }

    cusparseDestroy(cusparseHandle);
    cublasDestroy(cublasHandle);

    cudaFree(I);
    cudaFree(J);
    cudaFree(val);
    cudaFree(x);
    cudaFree(rhs);
    cudaFree(r);
    cudaFree(p);
    cudaFree(Ax);

    cudaDeviceReset();

    printf("Test Summary:  Error amount = %f, result = %s\n",
           err, (k <= max_iter) ? "SUCCESS" : "FAILURE");
    exit((k <= max_iter) ? EXIT_SUCCESS : EXIT_FAILURE);
}
// Main ------------------------------------------------------------------------------------------
int main(int argc, char **argv) {

    const Params p(argc, argv);
    CUDASetup    setcuda(p.device);
    Timer        timer;
    cudaError_t  cudaStatus;

    // Allocate
    timer.start("Allocation");
    int n_flow_vectors = read_input_size(p);
    int best_model     = -1;
    int best_outliers  = n_flow_vectors;
#ifdef CUDA_8_0
    flowvector *h_flow_vector_array;
    cudaStatus = cudaMallocManaged(&h_flow_vector_array, n_flow_vectors * sizeof(flowvector));
    int *h_random_numbers;
    cudaStatus = cudaMallocManaged(&h_random_numbers, 2 * p.max_iter * sizeof(int));
    int *h_model_candidate;
    cudaStatus = cudaMallocManaged(&h_model_candidate, p.max_iter * sizeof(int));
    int *h_outliers_candidate;
    cudaStatus = cudaMallocManaged(&h_outliers_candidate, p.max_iter * sizeof(int));
    float *h_model_param_local;
    cudaStatus = cudaMallocManaged(&h_model_param_local, 4 * p.max_iter * sizeof(float));
    std::atomic_int *h_g_out_id;
    cudaStatus = cudaMallocManaged(&h_g_out_id, sizeof(std::atomic_int));
    // with Unified Memory, the device pointers alias the host pointers
    flowvector *     d_flow_vector_array  = h_flow_vector_array;
    int *            d_random_numbers     = h_random_numbers;
    int *            d_model_candidate    = h_model_candidate;
    int *            d_outliers_candidate = h_outliers_candidate;
    float *          d_model_param_local  = h_model_param_local;
    std::atomic_int *d_g_out_id           = h_g_out_id;
    std::atomic_int *worklist;
    cudaStatus = cudaMallocManaged(&worklist, sizeof(std::atomic_int));
#else
    flowvector *h_flow_vector_array  = (flowvector *)malloc(n_flow_vectors * sizeof(flowvector));
    int *       h_random_numbers     = (int *)malloc(2 * p.max_iter * sizeof(int));
    int *       h_model_candidate    = (int *)malloc(p.max_iter * sizeof(int));
    int *       h_outliers_candidate = (int *)malloc(p.max_iter * sizeof(int));
    float *     h_model_param_local  = (float *)malloc(4 * p.max_iter * sizeof(float));
    std::atomic_int *h_g_out_id      = (std::atomic_int *)malloc(sizeof(std::atomic_int));
    flowvector *d_flow_vector_array;
    cudaStatus = cudaMalloc((void**)&d_flow_vector_array, n_flow_vectors * sizeof(flowvector));
    int *d_random_numbers;
    cudaStatus = cudaMalloc((void**)&d_random_numbers, 2 * p.max_iter * sizeof(int));
    int *d_model_candidate;
    cudaStatus = cudaMalloc((void**)&d_model_candidate, p.max_iter * sizeof(int));
    int *d_outliers_candidate;
    cudaStatus = cudaMalloc((void**)&d_outliers_candidate, p.max_iter * sizeof(int));
    float *d_model_param_local;
    cudaStatus = cudaMalloc((void**)&d_model_param_local, 4 * p.max_iter * sizeof(float));
    int *d_g_out_id;
    cudaStatus = cudaMalloc((void**)&d_g_out_id, sizeof(int));
    ALLOC_ERR(h_flow_vector_array, h_random_numbers, h_model_candidate, h_outliers_candidate,
        h_model_param_local, h_g_out_id);
#endif
    CUDA_ERR();
    cudaDeviceSynchronize();
    timer.stop("Allocation");
    timer.print("Allocation", 1);

    // Initialize
    timer.start("Initialization");
    const int max_gpu_threads = setcuda.max_gpu_threads();
    read_input(h_flow_vector_array, h_random_numbers, p);
    cudaDeviceSynchronize();
    timer.stop("Initialization");
    timer.print("Initialization", 1);

#ifndef CUDA_8_0
    // Copy to device
    timer.start("Copy To Device");
    cudaStatus = cudaMemcpy(d_flow_vector_array, h_flow_vector_array,
        n_flow_vectors * sizeof(flowvector), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_random_numbers, h_random_numbers,
        2 * p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_model_candidate, h_model_candidate,
        p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_outliers_candidate, h_outliers_candidate,
        p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_model_param_local, h_model_param_local,
        4 * p.max_iter * sizeof(float), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_g_out_id, h_g_out_id, sizeof(int), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    CUDA_ERR();
    timer.stop("Copy To Device");
    timer.print("Copy To Device", 1);
#endif

    for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {

        // Reset
        memset((void *)h_model_candidate, 0, p.max_iter * sizeof(int));
        memset((void *)h_outliers_candidate, 0, p.max_iter * sizeof(int));
        memset((void *)h_model_param_local, 0, 4 * p.max_iter * sizeof(float));
#ifdef CUDA_8_0
        h_g_out_id[0].store(0);
        if (p.alpha < 0.0 || p.alpha > 1.0) { // Dynamic partitioning
            worklist[0].store(0);
        }
#else
        h_g_out_id[0] = 0;
        cudaStatus = cudaMemcpy(d_model_candidate, h_model_candidate,
            p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
        cudaStatus = cudaMemcpy(d_outliers_candidate, h_outliers_candidate,
            p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
        cudaStatus = cudaMemcpy(d_model_param_local, h_model_param_local,
            4 * p.max_iter * sizeof(float), cudaMemcpyHostToDevice);
        cudaStatus = cudaMemcpy(d_g_out_id, h_g_out_id, sizeof(int), cudaMemcpyHostToDevice);
        CUDA_ERR();
#endif
        cudaDeviceSynchronize();

        if (rep >= p.n_warmup)
            timer.start("Kernel");

        // Launch GPU threads
        // Kernel launch
        if (p.n_gpu_blocks > 0) {
            assert(p.n_gpu_threads <= max_gpu_threads &&
                "The thread block size is greater than the maximum thread block size that can be used on this device");
            cudaStatus = call_RANSAC_kernel_block(p.n_gpu_blocks, p.n_gpu_threads, n_flow_vectors,
                p.max_iter, p.error_threshold, p.convergence_threshold, p.max_iter, p.alpha,
                d_model_param_local, d_flow_vector_array, d_random_numbers, d_model_candidate,
                d_outliers_candidate, (int*)d_g_out_id, sizeof(int)
#ifdef CUDA_8_0
                + sizeof(int), (int*)worklist
#endif
                );
            CUDA_ERR();
        }

        // Launch CPU threads
        std::thread main_thread(run_cpu_threads, h_model_candidate, h_outliers_candidate,
            h_model_param_local, h_flow_vector_array, n_flow_vectors, h_random_numbers, p.max_iter,
            p.error_threshold, p.convergence_threshold, h_g_out_id, p.n_threads, p.max_iter, p.alpha
#ifdef CUDA_8_0
            , worklist);
#else
            );
#endif

        cudaDeviceSynchronize();
        main_thread.join();

        if (rep >= p.n_warmup)
            timer.stop("Kernel");

#ifndef CUDA_8_0
        // Copy back
        if (rep >= p.n_warmup)
            timer.start("Copy Back and Merge");
        int d_candidates = 0;
        if (p.alpha < 1.0) {
            cudaStatus = cudaMemcpy(&d_candidates, d_g_out_id, sizeof(int),
                cudaMemcpyDeviceToHost);
            cudaStatus = cudaMemcpy(&h_model_candidate[h_g_out_id[0]], d_model_candidate,
                d_candidates * sizeof(int), cudaMemcpyDeviceToHost);
            cudaStatus = cudaMemcpy(&h_outliers_candidate[h_g_out_id[0]], d_outliers_candidate,
                d_candidates * sizeof(int), cudaMemcpyDeviceToHost);
            CUDA_ERR();
        }
        h_g_out_id[0] += d_candidates;
        cudaDeviceSynchronize();
        if (rep >= p.n_warmup)
            timer.stop("Copy Back and Merge");
#endif

        // Post-processing (chooses the best model among the candidates)
        if (rep >= p.n_warmup)
            timer.start("Kernel");
        for (int i = 0; i < h_g_out_id[0]; i++) {
            if (h_outliers_candidate[i] < best_outliers) {
                best_outliers = h_outliers_candidate[i];
                best_model    = h_model_candidate[i];
            }
        }
        if (rep >= p.n_warmup)
            timer.stop("Kernel");
    }