// Copy this COO matrix from another BaseMatrix of the same format.
// Source on the OCL backend -> device-to-device copy of row/col/val;
// source on the host        -> delegate to CopyFromHost;
// anything else is unsupported and aborts.
void OCLAcceleratorMatrixCOO<ValueType>::CopyFrom(const BaseMatrix<ValueType> &src) {

  // copy only in the same format
  assert(this->get_mat_format() == src.get_mat_format());

  const OCLAcceleratorMatrixCOO<ValueType> *ocl_src =
    dynamic_cast<const OCLAcceleratorMatrixCOO<ValueType>*> (&src);

  if (ocl_src != NULL) {

    // OCL to OCL copy
    if (this->get_nnz() == 0)
      this->AllocateCOO(src.get_nnz(), src.get_nrow(), src.get_ncol());

    assert((this->get_nnz() == src.get_nnz()) &&
           (this->get_nrow() == src.get_nrow()) &&
           (this->get_ncol() == src.get_ncol()));

    if (this->get_nnz() > 0) {

      // Device-to-device copies (internal copy) of the three COO arrays
      ocl_dev2dev<int>(this->get_nnz(),          // size
                       ocl_src->mat_.row,        // src
                       this->mat_.row,           // dst
                       OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_dev2dev<int>(this->get_nnz(),          // size
                       ocl_src->mat_.col,        // src
                       this->mat_.col,           // dst
                       OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_dev2dev<ValueType>(this->get_nnz(),    // size
                             ocl_src->mat_.val,  // src
                             this->mat_.val,     // dst
                             OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

    }

  } else {

    // CPU to OCL
    const HostMatrix<ValueType> *host_src =
      dynamic_cast<const HostMatrix<ValueType>*> (&src);

    if (host_src != NULL) {

      this->CopyFromHost(*host_src);

    } else {

      LOG_INFO("Error unsupported OCL matrix type");
      this->info();
      src.info();
      FATAL_ERROR(__FILE__, __LINE__);

    }

  }

}
// Copy this MCSR matrix into another BaseMatrix of the same format.
// Destination on the OCL backend -> device-to-device copy; destination on the
// host -> delegate to CopyToHost; anything else is unsupported and aborts.
//
// BUG FIX: the allocation guard previously read
//   if (this->get_nnz() == 0) ocl_cast_mat->AllocateMCSR(dst->get_nnz(), ...)
// i.e. it allocated the destination from the destination's own (typically
// zero) sizes, and only when the *source* was empty. In the normal case
// (non-empty source, empty destination) nothing was allocated and the
// subsequent asserts / device copies ran against an unallocated matrix.
// The destination must be allocated from the source's dimensions when the
// destination is empty — matching the CopyFrom counterpart.
void OCLAcceleratorMatrixMCSR<ValueType>::CopyTo(BaseMatrix<ValueType> *dst) const {

  OCLAcceleratorMatrixMCSR<ValueType> *ocl_cast_mat;
  HostMatrix<ValueType> *host_cast_mat;

  // copy only in the same format
  assert(this->get_mat_format() == dst->get_mat_format());

  // OCL to OCL copy
  if ((ocl_cast_mat = dynamic_cast<OCLAcceleratorMatrixMCSR<ValueType>*> (dst)) != NULL) {

    ocl_cast_mat->set_backend(this->local_backend_);

    // Allocate the destination with the source's dimensions when it is empty
    if (ocl_cast_mat->get_nnz() == 0)
      ocl_cast_mat->AllocateMCSR(this->get_nnz(), this->get_nrow(), this->get_ncol());

    assert((this->get_nnz() == dst->get_nnz()) &&
           (this->get_nrow() == dst->get_nrow()) &&
           (this->get_ncol() == dst->get_ncol()));

    if (this->get_nnz() > 0) {

      // must be within same opencl context
      ocl_dev2dev<int>(this->get_nrow()+1,              // size
                       this->mat_.row_offset,           // src
                       ocl_cast_mat->mat_.row_offset,   // dst
                       OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_dev2dev<int>(this->get_nnz(),                 // size
                       this->mat_.col,                  // src
                       ocl_cast_mat->mat_.col,          // dst
                       OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_dev2dev<ValueType>(this->get_nnz(),           // size
                             this->mat_.val,            // src
                             ocl_cast_mat->mat_.val,    // dst
                             OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

    }

  } else {

    // OCL to CPU
    if ((host_cast_mat = dynamic_cast<HostMatrix<ValueType>*> (dst)) != NULL) {

      this->CopyToHost(host_cast_mat);

    } else {

      LOG_INFO("Error unsupported OCL matrix type");
      this->info();
      dst->info();
      FATAL_ERROR(__FILE__, __LINE__);

    }

  }

}
// Copy this (device) COO matrix into a host COO matrix.
//
// CONSISTENCY FIX: the size-match assert was nested inside the
// `if (this->get_nnz() > 0)` guard, unlike every sibling copy routine
// (CopyFrom / CopyTo / CopyFromHost) where it runs unconditionally.
// With the old placement a preallocated destination with mismatching
// dimensions passed silently whenever the source was empty. The assert is
// now hoisted out of the guard to match the rest of the file.
void OCLAcceleratorMatrixCOO<ValueType>::CopyToHost(HostMatrix<ValueType> *dst) const {

  HostMatrixCOO<ValueType> *cast_mat;

  // copy only in the same format
  assert(this->get_mat_format() == dst->get_mat_format());

  // OCL to CPU copy
  if ((cast_mat = dynamic_cast<HostMatrixCOO<ValueType>*> (dst)) != NULL) {

    cast_mat->set_backend(this->local_backend_);

    if (dst->get_nnz() == 0)
      cast_mat->AllocateCOO(this->get_nnz(), this->get_nrow(), this->get_ncol());

    assert((this->get_nnz() == dst->get_nnz()) &&
           (this->get_nrow() == dst->get_nrow()) &&
           (this->get_ncol() == dst->get_ncol()));

    if (this->get_nnz() > 0) {

      // Copy each COO array from device to host memory
      ocl_dev2host<int>(this->get_nnz(),        // size
                        this->mat_.row,         // src
                        cast_mat->mat_.row,     // dst
                        OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_dev2host<int>(this->get_nnz(),        // size
                        this->mat_.col,         // src
                        cast_mat->mat_.col,     // dst
                        OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_dev2host<ValueType>(this->get_nnz(),  // size
                              this->mat_.val,   // src
                              cast_mat->mat_.val, // dst
                              OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

    }

  } else {

    LOG_INFO("Error unsupported OCL matrix type");
    this->info();
    dst->info();
    FATAL_ERROR(__FILE__, __LINE__);

  }

}
// Copy a host MCSR matrix into this (device) MCSR matrix.
// Allocates this matrix from the source dimensions when currently empty,
// then transfers row_offset (nrow+1 entries), col and val host-to-device.
void OCLAcceleratorMatrixMCSR<ValueType>::CopyFromHost(const HostMatrix<ValueType> &src) {

  // copy only in the same format
  assert(this->get_mat_format() == src.get_mat_format());

  const HostMatrixMCSR<ValueType> *host_src =
    dynamic_cast<const HostMatrixMCSR<ValueType>*> (&src);

  // CPU to OCL copy
  if (host_src != NULL) {

    if (this->get_nnz() == 0)
      this->AllocateMCSR(src.get_nnz(), src.get_nrow(), src.get_ncol());

    assert((this->get_nnz() == src.get_nnz()) &&
           (this->get_nrow() == src.get_nrow()) &&
           (this->get_ncol() == src.get_ncol()));

    if (this->get_nnz() > 0) {

      ocl_host2dev<int>((this->get_nrow()+1),          // size
                        host_src->mat_.row_offset,     // src
                        this->mat_.row_offset,         // dst
                        OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_host2dev<int>(this->get_nnz(),               // size
                        host_src->mat_.col,            // src
                        this->mat_.col,                // dst
                        OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_host2dev<ValueType>(this->get_nnz(),         // size
                              host_src->mat_.val,      // src
                              this->mat_.val,          // dst
                              OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

    }

  } else {

    LOG_INFO("Error unsupported OCL matrix type");
    this->info();
    src.info();
    FATAL_ERROR(__FILE__, __LINE__);

  }

}
// out = out + scalar * (this * in), via the MCSR scalar SpMV-add kernel.
// Both vectors must live on the OCL backend; no-op when the matrix is empty.
void OCLAcceleratorMatrixMCSR<ValueType>::ApplyAdd(const BaseVector<ValueType> &in, const ValueType scalar,
                                                   BaseVector<ValueType> *out) const {

  if (this->get_nnz() > 0) {

    assert(in. get_size() >= 0);
    assert(out->get_size() >= 0);
    assert(in. get_size() == this->get_ncol());
    assert(out->get_size() == this->get_nrow());

    const OCLAcceleratorVector<ValueType> *cast_in =
      dynamic_cast<const OCLAcceleratorVector<ValueType>*> (&in);
    OCLAcceleratorVector<ValueType> *cast_out =
      dynamic_cast<OCLAcceleratorVector<ValueType>*> (out);

    assert(cast_in  != NULL);
    assert(cast_out != NULL);

    int nrow = this->get_nrow();

    cl_int   status;
    cl_event event;

    // Bind kernel arguments: (nrow, row_offset, col, val, scalar, in, out).
    // NOTE: the mat_ members store pointers to cl_mem objects, hence the
    // direct cast instead of taking an address here (project convention).
    status  = clSetKernelArg( CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 0, sizeof(int),       (void *) &nrow );
    status |= clSetKernelArg( CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 1, sizeof(cl_mem),    (void *) this->mat_.row_offset );
    status |= clSetKernelArg( CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 2, sizeof(cl_mem),    (void *) this->mat_.col );
    status |= clSetKernelArg( CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 3, sizeof(cl_mem),    (void *) this->mat_.val );
    status |= clSetKernelArg( CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 4, sizeof(ValueType), (void *) &scalar );
    status |= clSetKernelArg( CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 5, sizeof(cl_mem),    (void *) cast_in->vec_ );
    status |= clSetKernelArg( CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 6, sizeof(cl_mem),    (void *) cast_out->vec_ );
    CHECK_OCL_ERROR( status, __FILE__, __LINE__ );

    // Half the maximum work-group size, one work-item per row, rounded up
    // to a whole number of groups.
    size_t local_size[1];
    size_t global_size[1];
    local_size[0] = this->local_backend_.OCL_max_work_group_size;
    local_size[0] /= 2;
    global_size[0] = ( size_t( nrow / local_size[0] ) + 1 ) * local_size[0];

    // Launch the kernel and block until it completes.
    status = clEnqueueNDRangeKernel( OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                     CL_KERNEL_MCSR_ADD_SPMV_SCALAR,
                                     1,
                                     NULL,
                                     &global_size[0],
                                     &local_size[0],
                                     0,
                                     NULL,
                                     &event);
    CHECK_OCL_ERROR( status, __FILE__, __LINE__ );

    status = clWaitForEvents( 1, &event );
    CHECK_OCL_ERROR( status, __FILE__, __LINE__ );

    status = clReleaseEvent( event );
    CHECK_OCL_ERROR( status, __FILE__, __LINE__ );

  }

}
// Allocate device storage for an nnz-entry COO matrix of size nrow x ncol,
// zero-initialize all three arrays, and record the dimensions.
// Releases any previous allocation first; nnz == 0 leaves the matrix empty.
void OCLAcceleratorMatrixCOO<ValueType>::AllocateCOO(const int nnz, const int nrow, const int ncol) {

  assert(nnz  >= 0);
  assert(ncol >= 0);
  assert(nrow >= 0);

  if (this->get_nnz() > 0)
    this->Clear();

  if (nnz > 0) {

    // Three parallel arrays: row index, column index, value.
    allocate_ocl<int>      (nnz, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &this->mat_.row);
    allocate_ocl<int>      (nnz, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &this->mat_.col);
    allocate_ocl<ValueType>(nnz, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &this->mat_.val);

    // One work-item per entry, rounded up to whole work-groups.
    size_t local_size[1];
    size_t global_size[1];
    local_size[0]  = this->local_backend_.OCL_max_work_group_size;
    global_size[0] = ( size_t( nnz / local_size[0] ) + 1 ) * local_size[0];

    // Zero-fill all device arrays.
    ocl_set_to<int>(CL_KERNEL_SET_TO_INT,
                    OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                    local_size[0], global_size[0], nnz, 0, mat_.row);

    ocl_set_to<int>(CL_KERNEL_SET_TO_INT,
                    OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                    local_size[0], global_size[0], nnz, 0, mat_.col);

    ocl_set_to<ValueType>(CL_KERNEL_SET_TO,
                          OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                          local_size[0], global_size[0], nnz, 0.0, mat_.val);

    this->nrow_ = nrow;
    this->ncol_ = ncol;
    this->nnz_  = nnz;

  }

}
// Allocate device storage for an MCSR matrix (row_offset: nrow+1 entries,
// col/val: nnz entries), zero-initialize all arrays, record the dimensions.
//
// BUG FIX: the row_offset launch size was computed as
//   size_t( nrow+1 / localWorkSize[0] )
// which, by operator precedence, is `nrow + (1/lws)` == `nrow` — so the
// global work size became (nrow+1)*lws work-items instead of
// ceil((nrow+1)/lws)*lws, enqueueing a massively oversized kernel launch.
// Parenthesized to (nrow+1)/lws, matching the nnz-sized launches below.
void OCLAcceleratorMatrixMCSR<ValueType>::AllocateMCSR(const int nnz, const int nrow, const int ncol) {

  assert(nnz  >= 0);
  assert(ncol >= 0);
  assert(nrow >= 0);

  if (this->get_nnz() > 0)
    this->Clear();

  if (nnz > 0) {

    allocate_ocl<int>      (nrow+1, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &this->mat_.row_offset);
    allocate_ocl<int>      (nnz,    OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &this->mat_.col);
    allocate_ocl<ValueType>(nnz,    OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &this->mat_.val);

    size_t localWorkSize[1];
    size_t globalWorkSize[1];

    localWorkSize[0] = this->local_backend_.OCL_max_work_group_size;

    // Zero-fill row_offset: nrow+1 entries, rounded up to whole work-groups.
    globalWorkSize[0] = ( size_t( (nrow+1) / localWorkSize[0] ) + 1 ) * localWorkSize[0];

    ocl_set_to<int>(CL_KERNEL_SET_TO_INT,
                    OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                    localWorkSize[0], globalWorkSize[0], nrow+1, 0, mat_.row_offset);

    // Zero-fill col and val: nnz entries each.
    globalWorkSize[0] = ( size_t( nnz / localWorkSize[0] ) + 1 ) * localWorkSize[0];

    ocl_set_to<int>(CL_KERNEL_SET_TO_INT,
                    OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                    localWorkSize[0], globalWorkSize[0], nnz, 0, mat_.col);

    ocl_set_to<ValueType>(CL_KERNEL_SET_TO,
                          OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                          localWorkSize[0], globalWorkSize[0], nnz, 0.0, mat_.val);

    this->nrow_ = nrow;
    this->ncol_ = ncol;
    this->nnz_  = nnz;

  }

}
// out = out + scalar * (this * in) for COO storage, using the CUSP-style
// flat segmented-reduction SpMV: a warp-parallel FLAT kernel over the bulk
// of the nonzeros, a REDUCE_UPDATE pass over per-warp partial sums, and a
// SERIAL kernel for the tail entries that do not fill a whole warp.
//
// TODO some devices hang up while waiting for COO_SPMV_FLAT event to finish
// this is a bug we are fixing in some future release
void OCLAcceleratorMatrixCOO<ValueType>::ApplyAdd(const BaseVector<ValueType> &in, const ValueType scalar,
                                                  BaseVector<ValueType> *out) const {

  if (this->get_nnz() > 0) {

    assert(in. get_size() >= 0);
    assert(out->get_size() >= 0);
    assert(in. get_size() == this->get_ncol());
    assert(out->get_size() == this->get_nrow());

    const OCLAcceleratorVector<ValueType> *cast_in =
      dynamic_cast<const OCLAcceleratorVector<ValueType>*> (&in);
    OCLAcceleratorVector<ValueType> *cast_out =
      dynamic_cast<OCLAcceleratorVector<ValueType>*> (out);

    assert(cast_in  != NULL);
    assert(cast_out != NULL);

    // do not support super small matrices
    assert(this->get_nnz() > OPENCL_WARPSIZE);

    // ----------------------------------------------------------
    // Modified and adopted from CUSP 0.3.1,
    // http://code.google.com/p/cusp-library/
    // NVIDIA, APACHE LICENSE 2.0
    // ----------------------------------------------------------
    // see __spmv_coo_flat(...)
    // ----------------------------------------------------------
    // CHANGELOG
    // - adopted interface
    // ----------------------------------------------------------

    // TODO
    // move in extra file - max_active_blocks, warp_size, block_size
    const unsigned int BLOCK_SIZE = this->local_backend_.OCL_max_work_group_size;
    // const unsigned int MAX_BLOCKS = this->local_backend_.GPU_max_blocks;
    const unsigned int MAX_BLOCKS = 32; // cusp::detail::device::arch::max_active_blocks(...)

    const unsigned int WARPS_PER_BLOCK = BLOCK_SIZE / OPENCL_WARPSIZE;

    // Partition the nonzeros into warp-sized units; each active warp
    // processes `interval_size` consecutive entries.
    const unsigned int num_units  = this->get_nnz() / OPENCL_WARPSIZE;
    const unsigned int num_warps  = std::min(num_units, WARPS_PER_BLOCK * MAX_BLOCKS);
    const unsigned int num_blocks = (num_warps + (WARPS_PER_BLOCK-1)) / WARPS_PER_BLOCK;
    const unsigned int num_iters  = (num_units + (num_warps-1)) / num_warps;

    const unsigned int interval_size = OPENCL_WARPSIZE * num_iters;

    // do the last few nonzeros separately (fewer than one warp's worth)
    const int tail = num_units * OPENCL_WARPSIZE;

    const unsigned int active_warps =
      (interval_size == 0) ? 0 : ((tail + (interval_size-1)) / interval_size);

    // Scratch buffers holding one partial (row, value) pair per active warp.
    cl_mem *temp_rows = NULL;
    cl_mem *temp_vals = NULL;

    allocate_ocl<int>      (active_warps, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &temp_rows);
    allocate_ocl<ValueType>(active_warps, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &temp_vals);

    cl_int   status;
    cl_event event;
    size_t   local_size[1];
    size_t   global_size[1];

    // --- Pass 1: warp-parallel FLAT kernel over the first `tail` entries ---
    status  = clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 0, sizeof(int),       (void *) &tail );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 1, sizeof(int),       (void *) &interval_size );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 2, sizeof(cl_mem),    (void *) this->mat_.row );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 3, sizeof(cl_mem),    (void *) this->mat_.col );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 4, sizeof(cl_mem),    (void *) this->mat_.val );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 5, sizeof(ValueType), (void *) &scalar );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 6, sizeof(cl_mem),    (void *) cast_in->vec_ );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 7, sizeof(cl_mem),    (void *) cast_out->vec_ );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 8, sizeof(cl_mem),    (void *) temp_rows );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 9, sizeof(cl_mem),    (void *) temp_vals );
    CHECK_OCL_ERROR( status, __FILE__, __LINE__ );

    local_size[0]  = BLOCK_SIZE;
    global_size[0] = num_blocks * local_size[0];

    status = clEnqueueNDRangeKernel( OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                     CL_KERNEL_COO_SPMV_FLAT,
                                     1, NULL, &global_size[0], &local_size[0],
                                     0, NULL, &event);
    CHECK_OCL_ERROR( status, __FILE__, __LINE__ );

    status = clWaitForEvents( 1, &event );
    CHECK_OCL_ERROR( status, __FILE__, __LINE__ );

    status = clReleaseEvent( event );
    CHECK_OCL_ERROR( status, __FILE__, __LINE__ );

    // --- Pass 2: reduce the per-warp partial sums into the output vector ---
    status  = clSetKernelArg( CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 0, sizeof(int),    (void *) &active_warps );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 1, sizeof(cl_mem), (void *) temp_rows );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 2, sizeof(cl_mem), (void *) temp_vals );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 3, sizeof(cl_mem), (void *) cast_out->vec_ );
    CHECK_OCL_ERROR( status, __FILE__, __LINE__ );

    // Single work-group reduction.
    global_size[0] = local_size[0];

    status = clEnqueueNDRangeKernel( OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                     CL_KERNEL_COO_SPMV_REDUCE_UPDATE,
                                     1, NULL, &global_size[0], &local_size[0],
                                     0, NULL, &event);
    CHECK_OCL_ERROR( status, __FILE__, __LINE__ );

    status = clWaitForEvents( 1, &event );
    CHECK_OCL_ERROR( status, __FILE__, __LINE__ );

    status = clReleaseEvent( event );
    CHECK_OCL_ERROR( status, __FILE__, __LINE__ );

    // --- Pass 3: serial kernel for the tail entries [tail, nnz) ---
    int nnz = this->get_nnz();

    status  = clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 0, sizeof(int),       (void *) &nnz );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 1, sizeof(cl_mem),    (void *) this->mat_.row );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 2, sizeof(cl_mem),    (void *) this->mat_.col );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 3, sizeof(cl_mem),    (void *) this->mat_.val );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 4, sizeof(ValueType), (void *) &scalar );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 5, sizeof(cl_mem),    (void *) cast_in->vec_ );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 6, sizeof(cl_mem),    (void *) cast_out->vec_ );
    status |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 7, sizeof(int),       (void *) &tail );
    CHECK_OCL_ERROR( status, __FILE__, __LINE__ );

    // One work-item handles the whole tail sequentially.
    local_size[0]  = 1;
    global_size[0] = 1;

    status = clEnqueueNDRangeKernel( OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                     CL_KERNEL_COO_SPMV_SERIAL,
                                     1, NULL, &global_size[0], &local_size[0],
                                     0, NULL, &event);
    CHECK_OCL_ERROR( status, __FILE__, __LINE__ );

    status = clWaitForEvents( 1, &event );
    CHECK_OCL_ERROR( status, __FILE__, __LINE__ );

    status = clReleaseEvent( event );
    CHECK_OCL_ERROR( status, __FILE__, __LINE__ );

    free_ocl(&temp_rows);
    free_ocl(&temp_vals);

  }

}
// Allocate device storage for a HYB (ELL + COO) matrix and zero-initialize
// every array. ELL must satisfy ell_nnz == ell_max_row * nrow; the COO part
// holds the overflow entries. Total nnz is the sum of both parts.
void OCLAcceleratorMatrixHYB<ValueType>::AllocateHYB(const int ell_nnz, const int coo_nnz, const int ell_max_row,
                                                     const int nrow, const int ncol) {

  assert( ell_nnz     >= 0);
  assert( coo_nnz     >= 0);
  assert( ell_max_row >= 0);
  assert( ncol        >= 0);
  assert( nrow        >= 0);

  if (this->get_nnz() > 0)
    this->Clear();

  if (ell_nnz + coo_nnz > 0) {

    size_t local_size[1];
    size_t global_size[1];

    local_size[0]  = this->local_backend_.OCL_max_work_group_size;
    global_size[0] = ( size_t( ell_nnz / local_size[0] ) + 1 ) * local_size[0];

    // --- ELL part ---
    assert(ell_nnz == ell_max_row*nrow);

    allocate_ocl<int>      (ell_nnz, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &this->mat_.ELL.col);
    allocate_ocl<ValueType>(ell_nnz, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &this->mat_.ELL.val);

    ocl_set_to<int>(CL_KERNEL_SET_TO_INT,
                    OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                    local_size[0], global_size[0], ell_nnz, 0, mat_.ELL.col);

    ocl_set_to<ValueType>(CL_KERNEL_SET_TO,
                          OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                          local_size[0], global_size[0], ell_nnz, 0.0, mat_.ELL.val);

    this->mat_.ELL.max_row = ell_max_row;
    this->ell_nnz_ = ell_nnz;

    // --- COO part ---
    global_size[0] = ( size_t( coo_nnz / local_size[0] ) + 1 ) * local_size[0];

    allocate_ocl<int>      (coo_nnz, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &this->mat_.COO.row);
    allocate_ocl<int>      (coo_nnz, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &this->mat_.COO.col);
    allocate_ocl<ValueType>(coo_nnz, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &this->mat_.COO.val);

    ocl_set_to<int>(CL_KERNEL_SET_TO_INT,
                    OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                    local_size[0], global_size[0], coo_nnz, 0, mat_.COO.row);

    ocl_set_to<int>(CL_KERNEL_SET_TO_INT,
                    OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                    local_size[0], global_size[0], coo_nnz, 0, mat_.COO.col);

    ocl_set_to<ValueType>(CL_KERNEL_SET_TO,
                          OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                          local_size[0], global_size[0], coo_nnz, 0.0, mat_.COO.val);

    this->nrow_ = nrow;
    this->ncol_ = ncol;
    this->nnz_  = ell_nnz + coo_nnz;

  }

}
// Copy this HYB matrix from another BaseMatrix of the same format.
// OCL source -> device-to-device copy of the ELL (col/val) and COO
// (row/col/val) parts; host source -> delegate to CopyFromHost.
void OCLAcceleratorMatrixHYB<ValueType>::CopyFrom(const BaseMatrix<ValueType> &src) {

  // copy only in the same format
  assert(this->get_mat_format() == src.get_mat_format());

  const OCLAcceleratorMatrixHYB<ValueType> *ocl_src =
    dynamic_cast<const OCLAcceleratorMatrixHYB<ValueType>*> (&src);

  if (ocl_src != NULL) {

    // OCL to OCL copy
    if (this->get_nnz() == 0)
      this->AllocateHYB(ocl_src->get_ell_nnz(), ocl_src->get_coo_nnz(),
                        ocl_src->get_ell_max_row(),
                        ocl_src->get_nrow(), ocl_src->get_ncol());

    assert((this->get_nnz() == src.get_nnz()) &&
           (this->get_nrow() == src.get_nrow()) &&
           (this->get_ncol() == src.get_ncol()));

    if (this->get_ell_nnz() > 0) {

      // ELL part — must be within same opencl context
      ocl_dev2dev<int>(this->get_ell_nnz(),            // size
                       ocl_src->mat_.ELL.col,          // src
                       this->mat_.ELL.col,             // dst
                       OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_dev2dev<ValueType>(this->get_ell_nnz(),      // size
                             ocl_src->mat_.ELL.val,    // src
                             this->mat_.ELL.val,       // dst
                             OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

    }

    if (this->get_coo_nnz() > 0) {

      // COO part — must be within same opencl context
      ocl_dev2dev<int>(this->get_coo_nnz(),            // size
                       ocl_src->mat_.COO.row,          // src
                       this->mat_.COO.row,             // dst
                       OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_dev2dev<int>(this->get_coo_nnz(),            // size
                       ocl_src->mat_.COO.col,          // src
                       this->mat_.COO.col,             // dst
                       OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_dev2dev<ValueType>(this->get_coo_nnz(),      // size
                             ocl_src->mat_.COO.val,    // src
                             this->mat_.COO.val,       // dst
                             OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

    }

  } else {

    // CPU to OCL
    const HostMatrix<ValueType> *host_src =
      dynamic_cast<const HostMatrix<ValueType>*> (&src);

    if (host_src != NULL) {

      this->CopyFromHost(*host_src);

    } else {

      LOG_INFO("Error unsupported OCL matrix type");
      this->info();
      src.info();
      FATAL_ERROR(__FILE__, __LINE__);

    }

  }

}