Example #1
int
allocate_managed_buffer (char ** buffer)
{
#ifdef _ENABLE_CUDA_
    cudaError_t cuerr = cudaSuccess;
#endif

    switch (options.accel) {
#ifdef _ENABLE_CUDA_
        case cuda:
            cuerr = cudaMallocManaged((void **)buffer, MYBUFSIZE, cudaMemAttachGlobal);

            if (cudaSuccess != cuerr) {
                fprintf(stderr, "Could not allocate device memory\n");
                return 1;
            }
            break;
#endif
        default:
            fprintf(stderr, "Could not allocate device memory\n");
            return 1;

    }
    return 0;
}
Example #2
 // handles the allocation and creation of an object:
 // something like:
 // X *x = new X()  ~  void *temp = operator new(sizeof(X)); X *x = new (temp) X();
 void *operator new(size_t len)
 {
   void *ptr;
   cudaMallocManaged(&ptr, len);
   cudaDeviceSynchronize();
   return ptr;
 }
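The override above only handles allocation; the usual counterpart (a minimal sketch assumed here, not part of the original snippet) releases the managed block in the matching global operator delete:

 void operator delete(void *ptr) noexcept
 {
   // memory obtained from cudaMallocManaged must go back through cudaFree
   cudaFree(ptr);
 }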
Example #3
void *MemoryPool::pop(size_t s, int loc) {
    void *addr = nullptr;

    if ((s > MIN_BLOCK_SIZE) && (s < MAX_BLOCK_SIZE)) {
        locker_.lock();

        // find MemoryPool block which is not smaller than demand size
        auto pt = pool_.lower_bound(s);

        if (pt != pool_.end()) {
            size_t ts = 0;
            std::tie(ts, addr) = *pt;
            if (ts < s * 2) {
                s = ts;
                pool_.erase(pt);
                pool_depth_ -= s;
            } else {
                addr = nullptr;
            }
        }
        locker_.unlock();
    }

    if (addr == nullptr) {
        try {
#ifdef __CUDA__
            SP_DEVICE_CALL(cudaMallocManaged(&addr, s));
#else
            addr = malloc(s);
#endif
        } catch (std::bad_alloc const &error) { THROW_EXCEPTION_BAD_ALLOC(s); }
    }
    return addr;
}
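A complementary push (a hypothetical sketch, not from the original project) would return blocks to the cache instead of freeing them immediately; pool_, pool_depth_ and locker_ are assumed to behave as the bookkeeping in pop() above suggests:

void MemoryPool::push(void *addr, size_t s) {
    // Cache only the block sizes that pop() will search for; assumes pool_
    // allows duplicate size keys (e.g. std::multimap<size_t, void *>).
    if ((s > MIN_BLOCK_SIZE) && (s < MAX_BLOCK_SIZE)) {
        locker_.lock();
        pool_.emplace(s, addr);
        pool_depth_ += s;
        locker_.unlock();
    } else {
#ifdef __CUDA__
        SP_DEVICE_CALL(cudaFree(addr));
#else
        free(addr);
#endif
    }
}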
Example #4
File: mesh.cpp  Project: LBeyer/YACPT
Mesh::Mesh(const std::vector<Vec3>& vertices, const std::vector<Triangle>& faces)
	: vertices(),
	faces(),
	bvh(),
	vertexCount(static_cast<uint32_t>(vertices.size())),
	faceCount(static_cast<uint32_t>(faces.size()))
{
	Vec3* dVertices = nullptr;
	cudaMallocManaged(&dVertices, sizeof(Vec3) * vertexCount);
	cudaMemcpy(dVertices, vertices.data(), sizeof(Vec3) * vertexCount, cudaMemcpyHostToHost);
	this->vertices.reset(dVertices, vertexCount);

	Triangle* dFaces = nullptr;
	cudaMallocManaged(&dFaces, sizeof(Triangle) * faceCount);
	cudaMemcpy(dFaces, faces.data(), sizeof(Triangle) * faceCount, cudaMemcpyHostToHost);
	this->faces.reset(dFaces, faceCount);
}
Example #5
void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const
{
  void * ptr = NULL;

  CUDA_SAFE_CALL( cudaMallocManaged( &ptr, arg_alloc_size , cudaMemAttachGlobal ) );

  return ptr ;
}
Example #6
File: test-scan.cpp  Project: LLNL/RAJA
 static void SetUpTestCase()
 {
   cudaMallocManaged((void**)&data,
                     sizeof(data_type) * N,
                     cudaMemAttachGlobal);
   std::iota(data, data + N, 1);
   std::shuffle(data, data + N, std::mt19937{std::random_device{}()});
 }
Example #7
int main()
{
    int *c;
    GlobalState *gs = new GlobalState;
    CHECK(cudaMallocManaged(&c, sizeof(int)));
    *c = 0;
    return 0;
}
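For comparison, a self-contained minimal pattern (a sketch, not taken from the original project) allocates managed memory, updates it from a kernel, synchronizes before the host reads it again, and frees it:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void increment(int *c) { ++(*c); }

int main()
{
    int *c = nullptr;
    if (cudaMallocManaged(&c, sizeof(int)) != cudaSuccess) {
        return 1;
    }
    *c = 0;                    // host initializes the managed allocation
    increment<<<1, 1>>>(c);    // device updates the same pointer
    cudaDeviceSynchronize();   // synchronize before the host touches it again
    printf("c = %d\n", *c);    // prints 1
    cudaFree(c);
    return 0;
}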
Example #8
File: lpm.c  Project: edenden/ixmap-cuda
int lpm_add(struct lpm_table *table, void *prefix,
            unsigned int prefix_len, unsigned int id,
            void *ptr, struct ixmap_desc *desc)
{
    unsigned int index;
    struct lpm_node *node;
    struct lpm_entry *entry;
    unsigned int range, mask;
    int i, ret, entry_allocated = 0;
    cudaError_t ret_cuda;

    index = lpm_index(prefix, 0, 16);

    if(prefix_len > 16) {
        node = &table->node[index];
        ret = _lpm_add(table, prefix, prefix_len, id,
                       ptr, desc, node, 16);
        if(ret < 0)
            goto err_lpm_add;
    } else {
        range = 1 << (16 - prefix_len);
        mask = ~(range - 1);
        index &= mask;

        for(i = 0; i < range; i++, entry_allocated++) {
            node = &table->node[index | i];

            ret_cuda = cudaMallocManaged((void **)&entry,
                                         sizeof(struct lpm_entry), cudaMemAttachGlobal);
            if(ret_cuda != cudaSuccess)
                goto err_lpm_add_self;

            entry->ptr = ptr;

            ret = lpm_entry_insert(table, &node->head, id,
                                   prefix_len, &entry->list);
            if(ret < 0)
                goto err_entry_insert;

            continue;
err_entry_insert:
            cudaFree(entry);
            goto err_lpm_add_self;
        }
    }

    return 0;

err_lpm_add_self:
    for(i = 0; i < entry_allocated; i++) {
        node = &table->node[index | i];
        lpm_entry_delete(table, &node->head, id, prefix_len);
    }
err_lpm_add:
    return -1;
}
Example #9
    KOKKOS_INLINE_FUNCTION
    static T* my_alloc(const int sz) {
#if defined(__CUDACC__) && defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_USE_CUDA_UVM) && !defined( __CUDA_ARCH__ )
      T* m;
      cudaMallocManaged( (void**) &m, sz*sizeof(T), cudaMemAttachGlobal );
#else
      T* m = static_cast<T* >(operator new(sz*sizeof(T)));
#endif
      return m;
    }
Example #10
static void *THCUVAAllocator_alloc(void* ctx, ptrdiff_t size) {
  if (size < 0) THError("Invalid memory size: %ld", size);

  if (size == 0) return NULL;

  // See J.1.1 of the CUDA_C_Programming_Guide.pdf for UVA and coherence rules
  // on various compute capabilities.
  void* ptr;
  THCudaCheck(cudaMallocManaged(&ptr, size, cudaMemAttachGlobal));
  return ptr;
}
Example #11
File: cuda_alloc.c  Project: openucx/ucx
static ucs_status_t ucp_perf_cuda_alloc_managed(ucx_perf_context_t *perf,
                                                size_t length, void **address_p,
                                                ucp_mem_h *memh_p, int non_blk_flag)
{
    cudaError_t cerr;

    cerr = cudaMallocManaged(address_p, length, cudaMemAttachGlobal);
    if (cerr != cudaSuccess) {
        return UCS_ERR_NO_MEMORY;
    }

    return UCS_OK;
}
Example #12
int
allocate_buffer (void ** buffer, size_t size, enum accel_type type)
{
    if (options.target == cpu || options.target == both) {
        allocate_host_arrays();
    }

    size_t alignment = sysconf(_SC_PAGESIZE);
#ifdef _ENABLE_CUDA_
    cudaError_t cuerr = cudaSuccess;
#endif

    switch (type) {
        case none:
            return posix_memalign(buffer, alignment, size);
#ifdef _ENABLE_CUDA_
        case cuda:
            cuerr = cudaMalloc(buffer, size);
            if (cudaSuccess != cuerr) {
                return 1;
            }

            else {
                return 0;
            }
        case managed:
            cuerr = cudaMallocManaged(buffer, size, cudaMemAttachGlobal);
            if (cudaSuccess != cuerr) {
                return 1;
            }

            else {
                return 0;
            }
#endif
#ifdef _ENABLE_OPENACC_
        case openacc:
            *buffer = acc_malloc(size);
            if (NULL == *buffer) {
                return 1;
            }

            else {
                return 0;
            }
#endif
        default:
            return 1;
    }
}
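A release routine matching the allocator above (a hedged sketch; the real benchmark may structure its cleanup differently) dispatches on the same accel_type:

void
free_buffer (void * buffer, enum accel_type type)
{
    switch (type) {
        case none:
            free(buffer);
            break;
#ifdef _ENABLE_CUDA_
        case cuda:      /* cudaFree releases both cudaMalloc and */
        case managed:   /* cudaMallocManaged allocations */
            cudaFree(buffer);
            break;
#endif
#ifdef _ENABLE_OPENACC_
        case openacc:
            acc_free(buffer);
            break;
#endif
        default:
            break;
    }
}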
Example #13
File: test-scan.cpp  Project: LLNL/RAJA
CUDA_TYPED_TEST_P(ScanCUDA, exclusive_inplace_offset)
{
  using T = typename Info<TypeParam>::data_type;
  using Function = typename Info<TypeParam>::function;

  T* data;
  cudaMallocManaged((void**)&data, sizeof(T) * N, cudaMemAttachGlobal);
  std::copy_n(ScanCUDA<TypeParam>::data, N, data);

  RAJA::exclusive_scan_inplace(
      typename Info<TypeParam>::exec(), data, data + N, Function{}, T(2));

  ASSERT_TRUE(check_exclusive<Function>(data, ScanCUDA<TypeParam>::data, T(2)));
  cudaFree(data);
}
Example #14
    inline void* allocate(size_t num_bytes)
    {
      // switch to our device
      scoped_device set_current_device(device());

      void* result = nullptr;
  
      cudaError_t error = cudaMallocManaged(&result, num_bytes, cudaMemAttachGlobal);
  
      if(error != cudaSuccess)
      {
        throw thrust::system_error(error, thrust::cuda_category(), "managed_resource::allocate(): cudaMallocManaged");
      }
  
      return result;
    }
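The corresponding deallocate (a minimal sketch under the same assumptions: the scoped_device helper and Thrust error reporting shown above) releases the block with cudaFree and reports failures the same way:

    inline void deallocate(void* ptr, size_t)
    {
      // switch to our device, mirroring allocate()
      scoped_device set_current_device(device());

      cudaError_t error = cudaFree(ptr);

      if(error != cudaSuccess)
      {
        throw thrust::system_error(error, thrust::cuda_category(), "managed_resource::deallocate(): cudaFree");
      }
    }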
Example #15
void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const
{

  void * ptr = NULL;

  Kokkos::Impl::num_uvm_allocations += 1 ;

  if ( Kokkos::Impl::num_uvm_allocations > 65536 ) {
    Kokkos::Impl::num_uvm_allocations = 0 ; //Reset to 0 before throwing exception
    Kokkos::Impl::throw_runtime_exception( "CudaUVM error: The maximum limit of UVM allocations is 65536" ) ;
  }

  CUDA_SAFE_CALL( cudaMallocManaged( &ptr, arg_alloc_size , cudaMemAttachGlobal ) );

  return ptr ;
}
Example #16
File: test-scan.cpp  Project: LLNL/RAJA
CUDA_TYPED_TEST_P(ScanCUDA, exclusive)
{
  using T = typename Info<TypeParam>::data_type;
  using Function = typename Info<TypeParam>::function;

  T* out;
  cudaMallocManaged((void**)&out, sizeof(T) * N, cudaMemAttachGlobal);

  RAJA::exclusive_scan(typename Info<TypeParam>::exec(),
                       ScanCUDA<TypeParam>::data,
                       ScanCUDA<TypeParam>::data + N,
                       out,
                       Function{});

  ASSERT_TRUE(check_exclusive<Function>(out, ScanCUDA<TypeParam>::data));
  cudaFree(out);
}
Example #17
ChannelInfo::ChannelInfo(const std::vector<Channels> &channels, bool use_gpu) : use_gpu(use_gpu) {
    num_channels = (int)channels.size();
    radiance_dimension = -1;
    num_total_dimensions = compute_num_channels(channels);
    if (use_gpu) {
#ifdef __CUDACC__
        checkCuda(cudaMallocManaged(&this->channels, channels.size() * sizeof(Channels)));
#else
        assert(false);
#endif
    } else {
        this->channels = new Channels[channels.size()];
    }
    for (int i = 0; i < (int)channels.size(); i++) {
        if (channels[i] == Channels::radiance) {
            if (radiance_dimension != -1) {
                throw std::runtime_error("Duplicated radiance channel");
            }
            radiance_dimension = i;
        }
        this->channels[i] = channels[i];
    }
}
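A matching teardown (a hypothetical sketch; the original project may release the array elsewhere) mirrors the allocation branch, since a buffer from cudaMallocManaged must be released with cudaFree rather than delete[]:

ChannelInfo::~ChannelInfo() {
    if (use_gpu) {
#ifdef __CUDACC__
        checkCuda(cudaFree(channels));
#else
        assert(false);
#endif
    } else {
        delete[] channels;
    }
}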
Example #18
File: lpm.c  Project: edenden/ixmap-cuda
static int _lpm_add(struct lpm_table *table, void *prefix,
                    unsigned int prefix_len, unsigned int id,
                    void *ptr, struct ixmap_desc *desc,
                    struct lpm_node *parent, unsigned int offset)
{
    struct lpm_node *node;
    struct lpm_entry *entry;
    unsigned int index;
    unsigned int range, mask;
    int i, ret, entry_allocated = 0;
    cudaError_t ret_cuda;

    if(!parent->next_table) {
        ret_cuda = cudaMallocManaged((void **)&parent->next_table,
                                     sizeof(struct lpm_node) * TABLE_SIZE_8, cudaMemAttachGlobal);
        if(ret_cuda != cudaSuccess)
            goto err_table_alloc;

        for(i = 0; i < TABLE_SIZE_8; i++) {
            node = &parent->next_table[i];
            lpm_init_node(node);
        }
    }

    index = lpm_index(prefix, offset, 8);

    if(prefix_len - offset > 8) {
        node = &parent->next_table[index];
        ret = _lpm_add(table, prefix, prefix_len, id,
                       ptr, desc, node, offset + 8);
        if(ret < 0)
            goto err_lpm_add;
    } else {
        range = 1 << (8 - (prefix_len - offset));
        mask = ~(range - 1);
        index &= mask;

        for(i = 0; i < range; i++, entry_allocated++) {
            node = &parent->next_table[index | i];

            ret_cuda = cudaMallocManaged((void **)&entry,
                                         sizeof(struct lpm_entry), cudaMemAttachGlobal);
            if(ret_cuda != cudaSuccess)
                goto err_lpm_add_self;

            entry->ptr = ptr;

            ret = lpm_entry_insert(table, &node->head, id,
                                   prefix_len, &entry->list);
            if(ret < 0)
                goto err_entry_insert;

            continue;
err_entry_insert:
            cudaFree(entry);
            goto err_lpm_add_self;
        }
    }

    return 0;

err_lpm_add_self:
    for(i = 0; i < entry_allocated; i++) {
        node = &parent->next_table[index | i];
        lpm_entry_delete(table, &node->head, id, prefix_len);
    }
err_lpm_add:
    for(i = 0; i < TABLE_SIZE_8; i++) {
        node = &parent->next_table[i];
        if(node->next_table || !list_empty(&node->head)) {
            goto err_table_alloc;
        }
    }
    cudaFree(parent->next_table);
    parent->next_table = NULL;
err_table_alloc:
    return -1;
}
Example #19
int main(int argc, char** argv){
    double* A;
    double* B;
    double* C;
    
    double alpha = 1.0;
    double beta = 0.0;
    int i;        
    struct timeval t1,t2, t3, t4;
    
    const int SEED = 1;
    const int METHOD = 0;
    const int BRNG = VSL_BRNG_MCG31;
    VSLStreamStatePtr stream;
    int errcode;
    
    cublasStatus_t status;
    cublasHandle_t handle;
    
    double a=0.0, b= 1.0; // Uniform distribution between 0 and 1
    
    errcode = vslNewStream(&stream, BRNG, SEED);
    
    int width = 100;
    if (argc > 1){
        width = atoi(argv[1]);
    }
    /* Allocate memory for A, B, and C */
    if (cudaMallocManaged(&A, width * width * sizeof(double)) != cudaSuccess){
        fprintf(stderr, "!!!! device memory alocation error (allocate A)\n");
        return EXIT_FAILURE;
    }
    if (cudaMallocManaged(&B, width * width * sizeof(double)) != cudaSuccess){
        fprintf(stderr, "!!!! device memory alocation error (allocate B)\n");
        return EXIT_FAILURE;
    }
    if (cudaMallocManaged(&C, width * width * sizeof(double)) != cudaSuccess){
        fprintf(stderr, "!!!! device memory alocation error (allocate C)\n");
        return EXIT_FAILURE;
    }
    /* Generate width * width random numbers between 0 and 1 to fill matrices A and B. */
    errcode = vdRngUniform(METHOD, stream, width * width, A, a, b);
    CheckVslError(errcode);
    errcode = vdRngUniform(METHOD, stream, width * width, B, a, b);
    CheckVslError(errcode);
    
    /* Now prepare the call to CUBLAS */
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }
    gettimeofday(&t3, NULL);

    
    /* Perform calculation */
    status = cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, width, width, width, &alpha, A,
        width, B, width, &beta, C, width);
    if (status != CUBLAS_STATUS_SUCCESS){
        fprintf(stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }
    cudaDeviceSynchronize(); 
    gettimeofday(&t4, NULL);
    const double time = (double) (t4.tv_sec - t3.tv_sec) + 1e-6 * (t4.tv_usec -
        t3.tv_usec);
    const double Gflops = 2. * width * width * width / (double) time * 1e-9;
    printf("Call to cublasDGEMM took %lf\n", time);
    printf("Gflops: %lf\n", Gflops);
    cudaFree(A);
    cudaFree(B);
    cudaFree(C);
    
    status = cublasDestroy(handle);
    if (status != CUBLAS_STATUS_SUCCESS){
        fprintf(stderr, "!!!! shutdown error\n");
        return EXIT_FAILURE;
    }
    
    return 0;
}
Example #20
int main(int argc, char **argv)
{
    int N = 0, nz = 0, *I = NULL, *J = NULL;
    float *val = NULL;
    const float tol = 1e-5f;
    const int max_iter = 10000;
    float *x;
    float *rhs;
    float a, b, na, r0, r1;
    float dot;
    float *r, *p, *Ax;
    int k;
    float alpha, beta, alpham1;

    printf("Starting [%s]...\n", sSDKname);

    // This will pick the best possible CUDA capable device
    cudaDeviceProp deviceProp;
    int devID = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

#if defined(__APPLE__) || defined(MACOSX)
    fprintf(stderr, "Unified Memory not currently supported on OS X\n");
    cudaDeviceReset();
    exit(EXIT_WAIVED);
#endif

    if (sizeof(void *) != 8)
    {
        fprintf(stderr, "Unified Memory requires compiling for a 64-bit system.\n");
        cudaDeviceReset();
        exit(EXIT_WAIVED);
    }

    if (((deviceProp.major << 4) + deviceProp.minor) < 0x30)
    {
        fprintf(stderr, "%s requires Compute Capability of SM 3.0 or higher to run.\nexiting...\n", argv[0]);

        cudaDeviceReset();
        exit(EXIT_WAIVED);
    }

    // Statistics about the GPU device
    printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

    /* Generate a random tridiagonal symmetric matrix in CSR format */
    N = 1048576;
    nz = (N-2)*3 + 4;

    cudaMallocManaged((void **)&I, sizeof(int)*(N+1));
    cudaMallocManaged((void **)&J, sizeof(int)*nz);
    cudaMallocManaged((void **)&val, sizeof(float)*nz);

    genTridiag(I, J, val, N, nz);

    cudaMallocManaged((void **)&x, sizeof(float)*N);
    cudaMallocManaged((void **)&rhs, sizeof(float)*N);

    for (int i = 0; i < N; i++)
    {
        rhs[i] = 1.0;
        x[i] = 0.0;
    }

    /* Get handle to the CUBLAS context */
    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);

    checkCudaErrors(cublasStatus);

    /* Get handle to the CUSPARSE context */
    cusparseHandle_t cusparseHandle = 0;
    cusparseStatus_t cusparseStatus;
    cusparseStatus = cusparseCreate(&cusparseHandle);

    checkCudaErrors(cusparseStatus);

    cusparseMatDescr_t descr = 0;
    cusparseStatus = cusparseCreateMatDescr(&descr);

    checkCudaErrors(cusparseStatus);

    cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);

    // temp memory for CG
    checkCudaErrors(cudaMallocManaged((void **)&r, N*sizeof(float)));
    checkCudaErrors(cudaMallocManaged((void **)&p, N*sizeof(float)));
    checkCudaErrors(cudaMallocManaged((void **)&Ax, N*sizeof(float)));

    cudaDeviceSynchronize();

    for (int i=0; i < N; i++)
    {
        r[i] = rhs[i];
    }

    alpha = 1.0;
    alpham1 = -1.0;
    beta = 0.0;
    r0 = 0.;

    cusparseScsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, descr, val, I, J, x, &beta, Ax);

    cublasSaxpy(cublasHandle, N, &alpham1, Ax, 1, r, 1);
    cublasStatus = cublasSdot(cublasHandle, N, r, 1, r, 1, &r1);

    k = 1;

    while (r1 > tol*tol && k <= max_iter)
    {
        if (k > 1)
        {
            b = r1 / r0;
            cublasStatus = cublasSscal(cublasHandle, N, &b, p, 1);
            cublasStatus = cublasSaxpy(cublasHandle, N, &alpha, r, 1, p, 1);
        }
        else
        {
            cublasStatus = cublasScopy(cublasHandle, N, r, 1, p, 1);
        }

        cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, descr, val, I, J, p, &beta, Ax);
        cublasStatus = cublasSdot(cublasHandle, N, p, 1, Ax, 1, &dot);
        a = r1 / dot;

        cublasStatus = cublasSaxpy(cublasHandle, N, &a, p, 1, x, 1);
        na = -a;
        cublasStatus = cublasSaxpy(cublasHandle, N, &na, Ax, 1, r, 1);

        r0 = r1;
        cublasStatus = cublasSdot(cublasHandle, N, r, 1, r, 1, &r1);
        cudaDeviceSynchronize();
        printf("iteration = %3d, residual = %e\n", k, sqrt(r1));
        k++;
    }

    printf("Final residual: %e\n",sqrt(r1));

    fprintf(stdout,"&&&& uvm_cg test %s\n", (sqrt(r1) < tol) ? "PASSED" : "FAILED");

    float rsum, diff, err = 0.0;

    for (int i = 0; i < N; i++)
    {
        rsum = 0.0;

        for (int j = I[i]; j < I[i+1]; j++)
        {
            rsum += val[j]*x[J[j]];
        }

        diff = fabs(rsum - rhs[i]);

        if (diff > err)
        {
            err = diff;
        }
    }

    cusparseDestroy(cusparseHandle);
    cublasDestroy(cublasHandle);

    cudaFree(I);
    cudaFree(J);
    cudaFree(val);
    cudaFree(x);
    cudaFree(r);
    cudaFree(p);
    cudaFree(Ax);

    cudaDeviceReset();

    printf("Test Summary:  Error amount = %f, result = %s\n", err, (k <= max_iter) ? "SUCCESS" : "FAILURE");
    exit((k <= max_iter) ? EXIT_SUCCESS : EXIT_FAILURE);
}
Example #21
File: main.cpp  Project: jjiantong/chai
// Main ------------------------------------------------------------------------------------------
int main(int argc, char **argv) {

    const Params p(argc, argv);
    CUDASetup    setcuda(p.device);
    Timer        timer;
    cudaError_t  cudaStatus;

    // Allocate
    timer.start("Allocation");
    int n_flow_vectors = read_input_size(p);
    int best_model     = -1;
    int best_outliers  = n_flow_vectors;
#ifdef CUDA_8_0
    flowvector *h_flow_vector_array;
    cudaStatus = cudaMallocManaged(&h_flow_vector_array, n_flow_vectors * sizeof(flowvector));
    int *h_random_numbers;
    cudaStatus = cudaMallocManaged(&h_random_numbers, 2 * p.max_iter * sizeof(int));
    int *h_model_candidate;
    cudaStatus = cudaMallocManaged(&h_model_candidate, p.max_iter * sizeof(int));
    int *h_outliers_candidate;
    cudaStatus = cudaMallocManaged(&h_outliers_candidate, p.max_iter * sizeof(int));
    float *h_model_param_local;
    cudaStatus = cudaMallocManaged(&h_model_param_local, 4 * p.max_iter * sizeof(float));
    std::atomic_int *h_g_out_id;
    cudaStatus = cudaMallocManaged(&h_g_out_id, sizeof(std::atomic_int));
    flowvector *     d_flow_vector_array  = h_flow_vector_array;
    int *            d_random_numbers     = h_random_numbers;
    int *            d_model_candidate    = h_model_candidate;
    int *            d_outliers_candidate = h_outliers_candidate;
    float *          d_model_param_local  = h_model_param_local;
    std::atomic_int *d_g_out_id           = h_g_out_id;
    std::atomic_int * worklist;
    cudaStatus = cudaMallocManaged(&worklist, sizeof(std::atomic_int));
#else
    flowvector *     h_flow_vector_array  = (flowvector *)malloc(n_flow_vectors * sizeof(flowvector));
    int *            h_random_numbers     = (int *)malloc(2 * p.max_iter * sizeof(int));
    int *            h_model_candidate    = (int *)malloc(p.max_iter * sizeof(int));
    int *            h_outliers_candidate = (int *)malloc(p.max_iter * sizeof(int));
    float *          h_model_param_local  = (float *)malloc(4 * p.max_iter * sizeof(float));
    std::atomic_int *h_g_out_id           = (std::atomic_int *)malloc(sizeof(std::atomic_int));
    flowvector *     d_flow_vector_array;
    cudaStatus = cudaMalloc((void**)&d_flow_vector_array, n_flow_vectors * sizeof(flowvector));
    int *            d_random_numbers;
    cudaStatus = cudaMalloc((void**)&d_random_numbers, 2 * p.max_iter * sizeof(int));
    int *            d_model_candidate;
    cudaStatus = cudaMalloc((void**)&d_model_candidate, p.max_iter * sizeof(int));
    int *            d_outliers_candidate;
    cudaStatus = cudaMalloc((void**)&d_outliers_candidate, p.max_iter * sizeof(int));
    float *          d_model_param_local;
    cudaStatus = cudaMalloc((void**)&d_model_param_local, 4 * p.max_iter * sizeof(float));
    int *d_g_out_id;
    cudaStatus = cudaMalloc((void**)&d_g_out_id, sizeof(int));
    ALLOC_ERR(h_flow_vector_array, h_random_numbers, h_model_candidate, h_outliers_candidate, h_model_param_local, h_g_out_id);
#endif
    CUDA_ERR();
    cudaDeviceSynchronize();
    timer.stop("Allocation");
    timer.print("Allocation", 1);

    // Initialize
    timer.start("Initialization");
    const int max_gpu_threads = setcuda.max_gpu_threads();
    read_input(h_flow_vector_array, h_random_numbers, p);
    cudaDeviceSynchronize();
    timer.stop("Initialization");
    timer.print("Initialization", 1);

#ifndef CUDA_8_0
    // Copy to device
    timer.start("Copy To Device");
    cudaStatus = cudaMemcpy(d_flow_vector_array, h_flow_vector_array, n_flow_vectors * sizeof(flowvector), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_random_numbers, h_random_numbers, 2 * p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_model_candidate, h_model_candidate, p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_outliers_candidate, h_outliers_candidate, p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_model_param_local, h_model_param_local, 4 * p.max_iter * sizeof(float), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_g_out_id, h_g_out_id, sizeof(int), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    CUDA_ERR();
    timer.stop("Copy To Device");
    timer.print("Copy To Device", 1);
#endif

    for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {

        // Reset
        memset((void *)h_model_candidate, 0, p.max_iter * sizeof(int));
        memset((void *)h_outliers_candidate, 0, p.max_iter * sizeof(int));
        memset((void *)h_model_param_local, 0, 4 * p.max_iter * sizeof(float));
#ifdef CUDA_8_0
        h_g_out_id[0].store(0);
        if(p.alpha < 0.0 || p.alpha > 1.0) { // Dynamic partitioning
            worklist[0].store(0);
        }
#else
        h_g_out_id[0] = 0;
        cudaStatus = cudaMemcpy(d_model_candidate, h_model_candidate, p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
        cudaStatus = cudaMemcpy(d_outliers_candidate, h_outliers_candidate, p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
        cudaStatus = cudaMemcpy(d_model_param_local, h_model_param_local, 4 * p.max_iter * sizeof(float), cudaMemcpyHostToDevice);
        cudaStatus = cudaMemcpy(d_g_out_id, h_g_out_id, sizeof(int), cudaMemcpyHostToDevice);
        CUDA_ERR();
#endif
        cudaDeviceSynchronize();

        if(rep >= p.n_warmup)
            timer.start("Kernel");

        // Launch GPU threads
        // Kernel launch
        if(p.n_gpu_blocks > 0) {
            assert(p.n_gpu_threads <= max_gpu_threads && 
                "The thread block size is greater than the maximum thread block size that can be used on this device");
            cudaStatus = call_RANSAC_kernel_block(p.n_gpu_blocks, p.n_gpu_threads, n_flow_vectors, p.max_iter, 
                p.error_threshold, p.convergence_threshold, p.max_iter, p.alpha, d_model_param_local, 
                d_flow_vector_array, d_random_numbers, d_model_candidate, d_outliers_candidate, (int*)d_g_out_id, 
                sizeof(int)
#ifdef CUDA_8_0
                + sizeof(int), (int*)worklist
#endif
                );
            CUDA_ERR();
        }
        // Launch CPU threads
        std::thread main_thread(run_cpu_threads, h_model_candidate, h_outliers_candidate, h_model_param_local,
            h_flow_vector_array, n_flow_vectors, h_random_numbers, p.max_iter, p.error_threshold,
            p.convergence_threshold, h_g_out_id, p.n_threads, p.max_iter, p.alpha
#ifdef CUDA_8_0
            ,
            worklist);
#else
            );
#endif

        cudaDeviceSynchronize();
        main_thread.join();

        if(rep >= p.n_warmup)
            timer.stop("Kernel");

#ifndef CUDA_8_0
        // Copy back
        if(rep >= p.n_warmup)
            timer.start("Copy Back and Merge");
        int d_candidates = 0;
        if(p.alpha < 1.0) {
            cudaStatus = cudaMemcpy(&d_candidates, d_g_out_id, sizeof(int), cudaMemcpyDeviceToHost);
            cudaStatus = cudaMemcpy(&h_model_candidate[h_g_out_id[0]], d_model_candidate, d_candidates * sizeof(int), cudaMemcpyDeviceToHost);
            cudaStatus = cudaMemcpy(&h_outliers_candidate[h_g_out_id[0]], d_outliers_candidate, d_candidates * sizeof(int), cudaMemcpyDeviceToHost);
            CUDA_ERR();
        }
        h_g_out_id[0] += d_candidates;
        cudaDeviceSynchronize();
        if(rep >= p.n_warmup)
            timer.stop("Copy Back and Merge");
#endif

        // Post-processing (chooses the best model among the candidates)
        if(rep >= p.n_warmup)
            timer.start("Kernel");
        for(int i = 0; i < h_g_out_id[0]; i++) {
            if(h_outliers_candidate[i] < best_outliers) {
                best_outliers = h_outliers_candidate[i];
                best_model    = h_model_candidate[i];
            }
        }
        if(rep >= p.n_warmup)
            timer.stop("Kernel");
    }