template <typename ValueType>
void OCLAcceleratorMatrixMCSR<ValueType>::ApplyAdd(const BaseVector<ValueType> &in, const ValueType scalar,
                                                   BaseVector<ValueType> *out) const {

  if (this->get_nnz() > 0) {

    assert(in.get_size() >= 0);
    assert(out->get_size() >= 0);
    assert(in.get_size() == this->get_ncol());
    assert(out->get_size() == this->get_nrow());

    const OCLAcceleratorVector<ValueType> *cast_in  = dynamic_cast<const OCLAcceleratorVector<ValueType>*>(&in);
    OCLAcceleratorVector<ValueType> *cast_out       = dynamic_cast<OCLAcceleratorVector<ValueType>*>(out);

    assert(cast_in  != NULL);
    assert(cast_out != NULL);

    int nrow = this->get_nrow();

    cl_int err;
    cl_event ocl_event;
    size_t localWorkSize[1];
    size_t globalWorkSize[1];

    // Set arguments for kernel call
    err  = clSetKernelArg(CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 0, sizeof(int),       (void *) &nrow);
    err |= clSetKernelArg(CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 1, sizeof(cl_mem),    (void *) this->mat_.row_offset);
    err |= clSetKernelArg(CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 2, sizeof(cl_mem),    (void *) this->mat_.col);
    err |= clSetKernelArg(CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 3, sizeof(cl_mem),    (void *) this->mat_.val);
    err |= clSetKernelArg(CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 4, sizeof(ValueType), (void *) &scalar);
    err |= clSetKernelArg(CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 5, sizeof(cl_mem),    (void *) cast_in->vec_);
    err |= clSetKernelArg(CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 6, sizeof(cl_mem),    (void *) cast_out->vec_);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    // Determine local work size for kernel call (half the device maximum)
    localWorkSize[0]  = this->local_backend_.OCL_max_work_group_size;
    localWorkSize[0] /= 2;
    // Determine global work size for kernel call (round up to a multiple of the local size)
    globalWorkSize[0] = (size_t(nrow / localWorkSize[0]) + 1) * localWorkSize[0];

    // Start kernel run
    err = clEnqueueNDRangeKernel(OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                 CL_KERNEL_MCSR_ADD_SPMV_SCALAR,
                                 1,
                                 NULL,
                                 &globalWorkSize[0],
                                 &localWorkSize[0],
                                 0,
                                 NULL,
                                 &ocl_event);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    // Wait for kernel run to finish
    err = clWaitForEvents(1, &ocl_event);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    // Release event when kernel run finished
    err = clReleaseEvent(ocl_event);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

  }

}
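// The launch above computes out += scalar * A * in for the MCSR (modified CSR)
// format. As a reference for what CL_KERNEL_MCSR_ADD_SPMV_SCALAR is expected to
// produce, here is a minimal host-side sketch. It assumes the usual MCSR
// convention (diagonal entries stored in val[0..nrow-1], off-diagonal entries in
// CSR layout addressed by row_offset/col); the function name host_mcsr_apply_add
// is hypothetical and not part of the library - illustrative only.
template <typename ValueType>
void host_mcsr_apply_add(int nrow,
                         const int *row_offset, const int *col, const ValueType *val,
                         ValueType scalar, const ValueType *in, ValueType *out) {

  for (int ai = 0; ai < nrow; ++ai) {

    // diagonal entry of row ai (stored in the first nrow entries of val)
    out[ai] += scalar * val[ai] * in[ai];

    // off-diagonal entries of row ai (CSR-like layout behind the diagonal block)
    for (int aj = row_offset[ai]; aj < row_offset[ai+1]; ++aj)
      out[ai] += scalar * val[aj] * in[col[aj]];

  }

}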
template <typename ValueType>
void OCLAcceleratorMatrixBCSR<ValueType>::Apply(const BaseVector<ValueType> &in, BaseVector<ValueType> *out) const {

  /*
  assert(in.get_size() >= 0);
  assert(out->get_size() >= 0);
  assert(in.get_size() == this->get_ncol());
  assert(out->get_size() == this->get_nrow());

  const OCLAcceleratorVector<ValueType> *cast_in  = dynamic_cast<const OCLAcceleratorVector<ValueType>*>(&in);
  OCLAcceleratorVector<ValueType> *cast_out       = dynamic_cast<OCLAcceleratorVector<ValueType>*>(out);

  assert(cast_in  != NULL);
  assert(cast_out != NULL);
  */

  // BCSR Apply is not implemented for the OpenCL backend
  FATAL_ERROR(__FILE__, __LINE__);

  // to avoid compiler warnings (never reached after FATAL_ERROR)
  int err = 0;
  CHECK_OCL_ERROR(err, __FILE__, __LINE__);

}
template <typename ValueType>
void OCLAcceleratorMatrixCOO<ValueType>::ApplyAdd(const BaseVector<ValueType> &in, const ValueType scalar,
                                                  BaseVector<ValueType> *out) const {

  // TODO: some devices hang while waiting for the COO_SPMV_FLAT event to finish;
  // this is a bug we are fixing in a future release

  if (this->get_nnz() > 0) {

    assert(in.get_size() >= 0);
    assert(out->get_size() >= 0);
    assert(in.get_size() == this->get_ncol());
    assert(out->get_size() == this->get_nrow());

    const OCLAcceleratorVector<ValueType> *cast_in  = dynamic_cast<const OCLAcceleratorVector<ValueType>*>(&in);
    OCLAcceleratorVector<ValueType> *cast_out       = dynamic_cast<OCLAcceleratorVector<ValueType>*>(out);

    assert(cast_in  != NULL);
    assert(cast_out != NULL);

    // very small matrices are not supported by this kernel
    assert(this->get_nnz() > OPENCL_WARPSIZE);

    // ----------------------------------------------------------
    // Modified and adapted from CUSP 0.3.1,
    // http://code.google.com/p/cusp-library/
    // NVIDIA, APACHE LICENSE 2.0
    // ----------------------------------------------------------
    // see __spmv_coo_flat(...)
    // ----------------------------------------------------------
    // CHANGELOG
    // - adapted interface
    // ----------------------------------------------------------

    // TODO: move to an extra file - max_active_blocks, warp_size, block_size

    const unsigned int BLOCK_SIZE = this->local_backend_.OCL_max_work_group_size;
    //    const unsigned int MAX_BLOCKS = this->local_backend_.GPU_max_blocks;
    const unsigned int MAX_BLOCKS = 32; // cusp::detail::device::arch::max_active_blocks(spmv_coo_flat_kernel<IndexType, ValueType, BLOCK_SIZE, UseCache>, BLOCK_SIZE, (size_t) 0);

    const unsigned int WARPS_PER_BLOCK = BLOCK_SIZE / OPENCL_WARPSIZE;

    const unsigned int num_units  = this->get_nnz() / OPENCL_WARPSIZE;
    const unsigned int num_warps  = std::min(num_units, WARPS_PER_BLOCK * MAX_BLOCKS);
    const unsigned int num_blocks = (num_warps + (WARPS_PER_BLOCK-1)) / WARPS_PER_BLOCK; // (N + (granularity - 1)) / granularity
    const unsigned int num_iters  = (num_units + (num_warps-1)) / num_warps;

    const unsigned int interval_size = OPENCL_WARPSIZE * num_iters;

    const int tail = num_units * OPENCL_WARPSIZE; // do the last few nonzeros separately (fewer than this->local_backend_.GPU_wrap elements)

    const unsigned int active_warps = (interval_size == 0) ? 0 : ((tail + (interval_size-1)) / interval_size);

    cl_mem *temp_rows = NULL;
    cl_mem *temp_vals = NULL;

    allocate_ocl<int>      (active_warps, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &temp_rows);
    allocate_ocl<ValueType>(active_warps, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &temp_vals);

    //    LOG_INFO("active_warps = " << active_warps);
    //    LOG_INFO("tail = " << tail);
    //    LOG_INFO("interval_size = " << interval_size);
    //    LOG_INFO("num_iters = " << num_iters);
    //    LOG_INFO("num_blocks = " << num_blocks);
    //    LOG_INFO("num_warps = " << num_warps);
    //    LOG_INFO("num_units = " << num_units);
    //    LOG_INFO("WARPS_PER_BLOCK = " << WARPS_PER_BLOCK);
    //    LOG_INFO("MAX_BLOCKS = " << MAX_BLOCKS);
    //    LOG_INFO("BLOCK_SIZE = " << BLOCK_SIZE);
    //    LOG_INFO("WARP_SIZE = " << WARP_SIZE);
    //    LOG_INFO("WARP_SIZE = " << this->local_backend_.GPU_wrap);

    cl_int err;
    cl_event ocl_event;
    size_t localWorkSize[1];
    size_t globalWorkSize[1];

    // Set arguments for kernel call
    err  = clSetKernelArg(CL_KERNEL_COO_SPMV_FLAT, 0, sizeof(int),       (void *) &tail);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_FLAT, 1, sizeof(int),       (void *) &interval_size);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_FLAT, 2, sizeof(cl_mem),    (void *) this->mat_.row);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_FLAT, 3, sizeof(cl_mem),    (void *) this->mat_.col);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_FLAT, 4, sizeof(cl_mem),    (void *) this->mat_.val);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_FLAT, 5, sizeof(ValueType), (void *) &scalar);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_FLAT, 6, sizeof(cl_mem),    (void *) cast_in->vec_);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_FLAT, 7, sizeof(cl_mem),    (void *) cast_out->vec_);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_FLAT, 8, sizeof(cl_mem),    (void *) temp_rows);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_FLAT, 9, sizeof(cl_mem),    (void *) temp_vals);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    // Determine local work size for kernel call
    localWorkSize[0] = BLOCK_SIZE;
    // Determine global work size for kernel call
    globalWorkSize[0] = num_blocks * localWorkSize[0];

    // Start kernel run
    err = clEnqueueNDRangeKernel(OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                 CL_KERNEL_COO_SPMV_FLAT,
                                 1,
                                 NULL,
                                 &globalWorkSize[0],
                                 &localWorkSize[0],
                                 0,
                                 NULL,
                                 &ocl_event);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    // Wait for kernel run to finish
    err = clWaitForEvents(1, &ocl_event);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    // Release event when kernel run finished
    err = clReleaseEvent(ocl_event);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    // Set arguments for kernel call
    err  = clSetKernelArg(CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 0, sizeof(int),    (void *) &active_warps);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 1, sizeof(cl_mem), (void *) temp_rows);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 2, sizeof(cl_mem), (void *) temp_vals);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 3, sizeof(cl_mem), (void *) cast_out->vec_);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    // Determine global work size for kernel call (single work-group)
    globalWorkSize[0] = localWorkSize[0];

    // Start kernel run
    err = clEnqueueNDRangeKernel(OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                 CL_KERNEL_COO_SPMV_REDUCE_UPDATE,
                                 1,
                                 NULL,
                                 &globalWorkSize[0],
                                 &localWorkSize[0],
                                 0,
                                 NULL,
                                 &ocl_event);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    // Wait for kernel run to finish
    err = clWaitForEvents(1, &ocl_event);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    // Release event when kernel run finished
    err = clReleaseEvent(ocl_event);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    int nnz = this->get_nnz();

    // Set arguments for kernel call
    err  = clSetKernelArg(CL_KERNEL_COO_SPMV_SERIAL, 0, sizeof(int),       (void *) &nnz);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_SERIAL, 1, sizeof(cl_mem),    (void *) this->mat_.row);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_SERIAL, 2, sizeof(cl_mem),    (void *) this->mat_.col);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_SERIAL, 3, sizeof(cl_mem),    (void *) this->mat_.val);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_SERIAL, 4, sizeof(ValueType), (void *) &scalar);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_SERIAL, 5, sizeof(cl_mem),    (void *) cast_in->vec_);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_SERIAL, 6, sizeof(cl_mem),    (void *) cast_out->vec_);
    err |= clSetKernelArg(CL_KERNEL_COO_SPMV_SERIAL, 7, sizeof(int),       (void *) &tail);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    // Determine local work size for kernel call (single work-item)
    localWorkSize[0] = 1;
    // Determine global work size for kernel call
    globalWorkSize[0] = 1;

    // Start kernel run
    err = clEnqueueNDRangeKernel(OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                 CL_KERNEL_COO_SPMV_SERIAL,
                                 1,
                                 NULL,
                                 &globalWorkSize[0],
                                 &localWorkSize[0],
                                 0,
                                 NULL,
                                 &ocl_event);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    // Wait for kernel run to finish
    err = clWaitForEvents(1, &ocl_event);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    // Release event when kernel run finished
    err = clReleaseEvent(ocl_event);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    free_ocl(&temp_rows);
    free_ocl(&temp_vals);

  }

}
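// Taken together, the three kernel launches above compute out += scalar * A * in
// for a COO matrix: CL_KERNEL_COO_SPMV_FLAT processes the first `tail` nonzeros
// in warp-sized intervals and leaves one partial (row, value) carry-out per warp
// in temp_rows/temp_vals, CL_KERNEL_COO_SPMV_REDUCE_UPDATE folds those partial
// results into out, and CL_KERNEL_COO_SPMV_SERIAL handles the remaining
// nnz - tail nonzeros. A minimal host-side sketch of the overall semantics
// (the function name host_coo_apply_add is hypothetical, not part of the
// library - illustrative only):
template <typename ValueType>
void host_coo_apply_add(int nnz,
                        const int *row, const int *col, const ValueType *val,
                        ValueType scalar, const ValueType *in, ValueType *out) {

  // The device splits [0, tail) across warps and [tail, nnz) into the serial
  // kernel; on the host the whole range reduces to a single loop.
  for (int i = 0; i < nnz; ++i)
    out[row[i]] += scalar * val[i] * in[col[i]];

}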