예제 #1
0
void _XMP_reduce_gpu_CLAUSE(void *dev_addr, int count, int datatype, int op) {
  // setup information
  MPI_Datatype mpi_datatype = MPI_DATATYPE_NULL;
  size_t datatype_size = 0;
  MPI_Op mpi_op;
  _XMP_setup_reduce_type(&mpi_datatype, &datatype_size, datatype);
  _XMP_setup_reduce_op(&mpi_op, op);

  size_t size = datatype_size * count;
  void *host_buf = _XMP_alloc(size);
  cudaError_t e;

  // copy dev to host
  e = cudaMemcpy(host_buf, dev_addr, size, cudaMemcpyDeviceToHost);
  cudaErrorCheck(e);

  // reduce
  MPI_Allreduce(MPI_IN_PLACE, host_buf, count, mpi_datatype, mpi_op, *((MPI_Comm *)(_XMP_get_execution_nodes())->comm));

  // copy host to dev
  e = cudaMemcpy(dev_addr, host_buf, size, cudaMemcpyHostToDevice);
  cudaErrorCheck(e);

  _XMP_free(host_buf);
}
예제 #2
0
void _XMP_reduce_gpu_NODES_ENTIRE(_XMP_nodes_t *nodes, void *dev_addr, int count, int datatype, int op)
{
  if (count == 0) {
    return; // FIXME not good implementation
  }
  if (!nodes->is_member) {
    return;
  }

  // setup information
  MPI_Datatype mpi_datatype = MPI_DATATYPE_NULL;
  size_t datatype_size = 0;
  MPI_Op mpi_op;
  _XMP_setup_reduce_type(&mpi_datatype, &datatype_size, datatype);
  _XMP_setup_reduce_op(&mpi_op, op);

  size_t size = datatype_size * count;
  void *host_buf = _XMP_alloc(size);
  cudaError_t e;

  // copy dev to host
  e = cudaMemcpy(host_buf, dev_addr, size, cudaMemcpyDeviceToHost);
  cudaErrorCheck(e);

  MPI_Allreduce(MPI_IN_PLACE, host_buf, count, mpi_datatype, mpi_op, *((MPI_Comm *)nodes->comm));

  // copy host to dev
  e = cudaMemcpy(dev_addr, host_buf, size, cudaMemcpyHostToDevice);
  cudaErrorCheck(e);

  _XMP_free(host_buf);
}
예제 #3
0
void GpuSplineAlgorithm1::set_scatterers(Scatterers::s_ptr new_scatterers) {
    auto scatterers = std::dynamic_pointer_cast<SplineScatterers>(new_scatterers);
    if (!scatterers) {
        throw std::runtime_error("Cast to SplineScatterers failed!");
    }

    m_num_splines = scatterers->num_scatterers();
    if (m_num_splines <= 0) {
        throw std::runtime_error("No scatterers");
    }
    m_spline_degree = scatterers->spline_degree;

    if (m_spline_degree > MAX_SPLINE_DEGREE) {
        throw std::runtime_error("maximum spline degree supported is " + std::to_string(MAX_SPLINE_DEGREE));
    }

    m_num_cs = scatterers->get_num_control_points();
    std::cout << "Num spline scatterers: " << m_num_splines << std::endl;
    std::cout << "Allocating memory on host for reorganizing spline data\n";


    // device memory to hold x, y, z components of all spline control points
    const size_t total_num_cs = m_num_splines*m_num_cs;
    const size_t cs_num_bytes = total_num_cs*sizeof(float);
    m_control_xs = DeviceBufferRAII<float>::u_ptr(new DeviceBufferRAII<float>(cs_num_bytes));
    m_control_ys = DeviceBufferRAII<float>::u_ptr(new DeviceBufferRAII<float>(cs_num_bytes));
    m_control_zs = DeviceBufferRAII<float>::u_ptr(new DeviceBufferRAII<float>(cs_num_bytes));

    // store the control points with correct memory layout of the host
    std::vector<float> host_control_xs(total_num_cs);
    std::vector<float> host_control_ys(total_num_cs);
    std::vector<float> host_control_zs(total_num_cs);
    std::vector<float> host_control_as(m_num_splines);

    for (size_t spline_no = 0; spline_no < m_num_splines; spline_no++) {
        host_control_as[spline_no] = scatterers->amplitudes[spline_no];
        for (size_t i = 0; i < m_num_cs; i++) {
            const size_t offset = spline_no + i*m_num_splines;
            host_control_xs[offset] = scatterers->control_points[spline_no][i].x;
            host_control_ys[offset] = scatterers->control_points[spline_no][i].y;
            host_control_zs[offset] = scatterers->control_points[spline_no][i].z;
        }
    }
    
    // copy control points to GPU memory.
    cudaErrorCheck( cudaMemcpy(m_control_xs->data(), host_control_xs.data(), cs_num_bytes, cudaMemcpyHostToDevice) );
    cudaErrorCheck( cudaMemcpy(m_control_ys->data(), host_control_ys.data(), cs_num_bytes, cudaMemcpyHostToDevice) );
    cudaErrorCheck( cudaMemcpy(m_control_zs->data(), host_control_zs.data(), cs_num_bytes, cudaMemcpyHostToDevice) );

    // device memory to hold x, y, z, a components of rendered splines
    size_t rendered_num_bytes = m_num_splines*sizeof(float);
    m_fixed_alg->m_device_point_xs = DeviceBufferRAII<float>::u_ptr(new DeviceBufferRAII<float>(rendered_num_bytes));
    m_fixed_alg->m_device_point_ys = DeviceBufferRAII<float>::u_ptr(new DeviceBufferRAII<float>(rendered_num_bytes));
    m_fixed_alg->m_device_point_zs = DeviceBufferRAII<float>::u_ptr(new DeviceBufferRAII<float>(rendered_num_bytes));
    m_fixed_alg->m_device_point_as = DeviceBufferRAII<float>::u_ptr(new DeviceBufferRAII<float>(rendered_num_bytes));
    m_fixed_alg->m_num_scatterers = m_num_splines;

    // copy amplitudes directly from host memory.
    cudaErrorCheck( cudaMemcpy(m_fixed_alg->m_device_point_as->data(), host_control_as.data(), rendered_num_bytes, cudaMemcpyHostToDevice) );

    // Store the knot vector.
    m_common_knots = scatterers->knot_vector;
}
예제 #4
0
void GpuSplineAlgorithm1::set_scan_sequence(ScanSequence::s_ptr new_scan_sequence) {
    //EventTimerRAII event_timer;
    //event_timer.restart();

    // all lines in the scan sequence have the same timestamp
    if (!has_equal_timestamps(new_scan_sequence)) {
        throw std::runtime_error("scan sequences must currently have equal timestamps");
    }

    m_fixed_alg->set_scan_sequence(new_scan_sequence);

    // Ensure that set_scatterers() has been called first
    if (m_common_knots.size() == 0) {
        throw std::runtime_error("set_scatterers() must be called before set_scan_sequence");
    }

    const auto num_lines = new_scan_sequence->get_num_lines();
    if (num_lines <= 0) {
        throw std::runtime_error("No scanlines");
    }
    // HACK: using parameter value from first scanline
    const float PARAMETER_VAL = new_scan_sequence->get_scanline(0).get_timestamp();



    // evaluate the basis functions and upload to constant memory - always at max degree+1
    // basis functions that are non-zero at any parameter value.
    int cs_idx_start, cs_idx_end;
    std::tie(cs_idx_start, cs_idx_end) = bspline_storve::get_lower_upper_inds(m_common_knots,
                                                                              PARAMETER_VAL,
                                                                              m_spline_degree);
    const auto num_nonzero = cs_idx_end-cs_idx_start+1;
    if (num_nonzero != m_spline_degree+1) throw std::logic_error("illegal number of non-zero basis functions");

    // evaluate all basis functions since it will be checked that the ones supposed to
    // be zero in fact are zero.
    std::vector<float> host_basis_functions(m_num_cs); // TODO: move to set_scatterers()?
    for (int i = 0; i < m_num_cs; i++) {
        host_basis_functions[i] = bspline_storve::bsplineBasis(i, m_spline_degree, PARAMETER_VAL, m_common_knots);
    }
    
    if (!sanity_check_spline_lower_upper_bound(host_basis_functions, cs_idx_start, cs_idx_end)) {
        throw std::runtime_error("b-spline basis bounds failed sanity check");
    }
    
    // only copy the non-zero-basis functions
    const auto src_ptr = host_basis_functions.data() + cs_idx_start;
    if (!splineAlg1_updateConstantMemory(src_ptr, num_nonzero*sizeof(float))) {
        throw std::runtime_error("Failed copying to symbol memory");   
    }
    
    int num_threads = 128;
    int num_blocks = round_up_div(m_num_splines, num_threads);
    //dim3 grid_size(num_blocks, 1, 1);
    //dim3 block_size(num_threads, 1, 1);
    
    const cudaStream_t cuda_stream = 0;
    launch_RenderSplineKernel(num_blocks, num_threads, cuda_stream,
                              m_control_xs->data(),
                              m_control_ys->data(),
                              m_control_zs->data(),
                              m_fixed_alg->m_device_point_xs->data(),
                              m_fixed_alg->m_device_point_ys->data(),
                              m_fixed_alg->m_device_point_zs->data(),
                              cs_idx_start,
                              cs_idx_end,
                              m_num_splines);
    cudaErrorCheck( cudaDeviceSynchronize() );
    //auto ms = event_timer.stop();
    //std::cout << "GPU spline alg.1 : set_scan_sequence(): rendering spline scatterers took " << ms << " millisec.\n";
}