// Reduce a device buffer across all nodes of the current execution set.
//
// The MPI reduction operates on host memory, so the device data is staged
// through a temporary host buffer: device -> host copy, in-place
// MPI_Allreduce over the executing nodes' communicator, host -> device copy.
//
// dev_addr - device pointer to the data being reduced (input and output)
// count    - number of elements of `datatype` at dev_addr
// datatype - XMP datatype code, translated by _XMP_setup_reduce_type()
// op       - XMP reduction-operator code, translated by _XMP_setup_reduce_op()
void _XMP_reduce_gpu_CLAUSE(void *dev_addr, int count, int datatype, int op) {
  // Nothing to reduce: return early, matching the convention of
  // _XMP_reduce_gpu_NODES_ENTIRE (avoids a zero-byte alloc/copy round trip).
  if (count == 0) {
    return;
  }

  // setup information
  MPI_Datatype mpi_datatype = MPI_DATATYPE_NULL;
  size_t datatype_size = 0;
  MPI_Op mpi_op;
  _XMP_setup_reduce_type(&mpi_datatype, &datatype_size, datatype);
  _XMP_setup_reduce_op(&mpi_op, op);

  size_t size = datatype_size * count;
  void *host_buf = _XMP_alloc(size);
  cudaError_t e;

  // copy dev to host
  e = cudaMemcpy(host_buf, dev_addr, size, cudaMemcpyDeviceToHost);
  cudaErrorCheck(e);

  // reduce in place on the host buffer over the executing nodes' communicator
  MPI_Allreduce(MPI_IN_PLACE, host_buf, count, mpi_datatype, mpi_op,
                *((MPI_Comm *)(_XMP_get_execution_nodes())->comm));

  // copy host to dev
  e = cudaMemcpy(dev_addr, host_buf, size, cudaMemcpyHostToDevice);
  cudaErrorCheck(e);

  _XMP_free(host_buf);
}
// Reduce a device buffer across the members of an explicit node set.
// The reduction is staged through host memory: D2H copy, in-place
// MPI_Allreduce on the node set's communicator, then H2D copy back.
void _XMP_reduce_gpu_NODES_ENTIRE(_XMP_nodes_t *nodes, void *dev_addr, int count, int datatype, int op) {
  // FIXME not good implementation
  if (count == 0) {
    return;
  }

  // Nodes outside this set take no part in the reduction.
  if (!nodes->is_member) {
    return;
  }

  // Translate the XMP datatype/operator codes into their MPI equivalents.
  MPI_Datatype dtype = MPI_DATATYPE_NULL;
  size_t elem_size = 0;
  MPI_Op reduce_op;
  _XMP_setup_reduce_type(&dtype, &elem_size, datatype);
  _XMP_setup_reduce_op(&reduce_op, op);

  size_t nbytes = elem_size * count;
  void *staging = _XMP_alloc(nbytes);

  // Stage the device data on the host.
  cudaError_t status = cudaMemcpy(staging, dev_addr, nbytes, cudaMemcpyDeviceToHost);
  cudaErrorCheck(status);

  // Reduce in place over this node set's communicator.
  MPI_Allreduce(MPI_IN_PLACE, staging, count, dtype, reduce_op, *((MPI_Comm *)nodes->comm));

  // Write the reduced result back to the device.
  status = cudaMemcpy(dev_addr, staging, nbytes, cudaMemcpyHostToDevice);
  cudaErrorCheck(status);

  _XMP_free(staging);
}
// Accept a new set of spline scatterers and upload their control points to
// the GPU in a struct-of-arrays layout (all x components contiguous, then y,
// then z; within each component, control point i of every spline is stored
// contiguously across splines).
void GpuSplineAlgorithm1::set_scatterers(Scatterers::s_ptr new_scatterers) {
    auto spline_scatterers = std::dynamic_pointer_cast<SplineScatterers>(new_scatterers);
    if (!spline_scatterers) {
        throw std::runtime_error("Cast to SplineScatterers failed!");
    }

    m_num_splines = spline_scatterers->num_scatterers();
    if (m_num_splines <= 0) {
        throw std::runtime_error("No scatterers");
    }
    m_spline_degree = spline_scatterers->spline_degree;
    if (m_spline_degree > MAX_SPLINE_DEGREE) {
        throw std::runtime_error("maximum spline degree supported is " + std::to_string(MAX_SPLINE_DEGREE));
    }
    m_num_cs = spline_scatterers->get_num_control_points();

    std::cout << "Num spline scatterers: " << m_num_splines << std::endl;
    std::cout << "Allocating memory on host for reorganizing spline data\n";

    // Device buffers for the x, y, z components of every control point.
    const size_t total_num_cs = m_num_splines * m_num_cs;
    const size_t cs_num_bytes = total_num_cs * sizeof(float);
    m_control_xs = DeviceBufferRAII<float>::u_ptr(new DeviceBufferRAII<float>(cs_num_bytes));
    m_control_ys = DeviceBufferRAII<float>::u_ptr(new DeviceBufferRAII<float>(cs_num_bytes));
    m_control_zs = DeviceBufferRAII<float>::u_ptr(new DeviceBufferRAII<float>(cs_num_bytes));

    // Host-side staging buffers laid out exactly as the device expects them.
    std::vector<float> xs_host(total_num_cs);
    std::vector<float> ys_host(total_num_cs);
    std::vector<float> zs_host(total_num_cs);
    std::vector<float> as_host(m_num_splines);
    for (size_t spline_no = 0; spline_no < m_num_splines; spline_no++) {
        as_host[spline_no] = spline_scatterers->amplitudes[spline_no];
        for (size_t cs_no = 0; cs_no < m_num_cs; cs_no++) {
            // Interleave across splines: component cs_no of all splines is contiguous.
            const size_t dst = cs_no * m_num_splines + spline_no;
            xs_host[dst] = spline_scatterers->control_points[spline_no][cs_no].x;
            ys_host[dst] = spline_scatterers->control_points[spline_no][cs_no].y;
            zs_host[dst] = spline_scatterers->control_points[spline_no][cs_no].z;
        }
    }

    // Upload the reorganized control points to the device.
    cudaErrorCheck( cudaMemcpy(m_control_xs->data(), xs_host.data(), cs_num_bytes, cudaMemcpyHostToDevice) );
    cudaErrorCheck( cudaMemcpy(m_control_ys->data(), ys_host.data(), cs_num_bytes, cudaMemcpyHostToDevice) );
    cudaErrorCheck( cudaMemcpy(m_control_zs->data(), zs_host.data(), cs_num_bytes, cudaMemcpyHostToDevice) );

    // Device buffers for the x, y, z, a components of the rendered splines
    // (one fixed point scatterer per spline), owned by the wrapped fixed algorithm.
    const size_t rendered_num_bytes = m_num_splines * sizeof(float);
    m_fixed_alg->m_device_point_xs = DeviceBufferRAII<float>::u_ptr(new DeviceBufferRAII<float>(rendered_num_bytes));
    m_fixed_alg->m_device_point_ys = DeviceBufferRAII<float>::u_ptr(new DeviceBufferRAII<float>(rendered_num_bytes));
    m_fixed_alg->m_device_point_zs = DeviceBufferRAII<float>::u_ptr(new DeviceBufferRAII<float>(rendered_num_bytes));
    m_fixed_alg->m_device_point_as = DeviceBufferRAII<float>::u_ptr(new DeviceBufferRAII<float>(rendered_num_bytes));
    m_fixed_alg->m_num_scatterers = m_num_splines;

    // Amplitudes are time-independent, so they can be uploaded once here.
    cudaErrorCheck( cudaMemcpy(m_fixed_alg->m_device_point_as->data(), as_host.data(), rendered_num_bytes, cudaMemcpyHostToDevice) );

    // Keep the knot vector for basis-function evaluation in set_scan_sequence().
    m_common_knots = spline_scatterers->knot_vector;
}
// Set the scan sequence and render all spline scatterers into fixed point
// scatterers for the wrapped fixed algorithm.
//
// Preconditions:
//  - set_scatterers() must have been called first (knot vector and control
//    points must already be on the device).
//  - All scanlines must share the same timestamp: the splines are evaluated
//    at a single parameter value for the whole sequence.
//
// Throws std::runtime_error on precondition violations or upload failure,
// std::logic_error if the basis-function count is inconsistent.
void GpuSplineAlgorithm1::set_scan_sequence(ScanSequence::s_ptr new_scan_sequence) {
    //EventTimerRAII event_timer;
    //event_timer.restart();

    // all lines in the scan sequence have the same timestamp
    if (!has_equal_timestamps(new_scan_sequence)) {
        throw std::runtime_error("scan sequences must currently have equal timestamps");
    }

    m_fixed_alg->set_scan_sequence(new_scan_sequence);

    // Ensure that set_scatterers() has been called first
    if (m_common_knots.size() == 0) {
        throw std::runtime_error("set_scatterers() must be called before set_scan_sequence");
    }

    const auto num_lines = new_scan_sequence->get_num_lines();
    if (num_lines <= 0) {
        throw std::runtime_error("No scanlines");
    }
    // HACK: using parameter value from first scanline
    const float PARAMETER_VAL = new_scan_sequence->get_scanline(0).get_timestamp();

    // evaluate the basis functions and upload to constant memory - always at max degree+1
    // basis functions that are non-zero at any parameter value.
    int cs_idx_start, cs_idx_end;
    std::tie(cs_idx_start, cs_idx_end) = bspline_storve::get_lower_upper_inds(m_common_knots, PARAMETER_VAL, m_spline_degree);
    const auto num_nonzero = cs_idx_end - cs_idx_start + 1;
    if (num_nonzero != m_spline_degree + 1) {
        throw std::logic_error("illegal number of non-zero basis functions");
    }

    // evaluate all basis functions since it will be checked that the ones supposed to
    // be zero in fact are zero.
    std::vector<float> host_basis_functions(m_num_cs); // TODO: move to set_scatterers()?
    for (int i = 0; i < m_num_cs; i++) {
        host_basis_functions[i] = bspline_storve::bsplineBasis(i, m_spline_degree, PARAMETER_VAL, m_common_knots);
    }
    if (!sanity_check_spline_lower_upper_bound(host_basis_functions, cs_idx_start, cs_idx_end)) {
        throw std::runtime_error("b-spline basis bounds failed sanity check");
    }

    // only copy the non-zero basis functions
    const auto src_ptr = host_basis_functions.data() + cs_idx_start;
    if (!splineAlg1_updateConstantMemory(src_ptr, num_nonzero*sizeof(float))) {
        throw std::runtime_error("Failed copying to symbol memory");
    }

    int num_threads = 128;
    int num_blocks = round_up_div(m_num_splines, num_threads);
    const cudaStream_t cuda_stream = 0;
    launch_RenderSplineKernel(num_blocks, num_threads, cuda_stream,
                              m_control_xs->data(),
                              m_control_ys->data(),
                              m_control_zs->data(),
                              m_fixed_alg->m_device_point_xs->data(),
                              m_fixed_alg->m_device_point_ys->data(),
                              m_fixed_alg->m_device_point_zs->data(),
                              cs_idx_start,
                              cs_idx_end,
                              m_num_splines);
    // A kernel launch does not report errors directly: check for
    // launch-configuration errors right away, then synchronize to surface
    // any asynchronous execution errors from the kernel itself.
    cudaErrorCheck( cudaGetLastError() );
    cudaErrorCheck( cudaDeviceSynchronize() );
    //auto ms = event_timer.stop();
    //std::cout << "GPU spline alg.1 : set_scan_sequence(): rendering spline scatterers took " << ms << " millisec.\n";
}