template <typename ValueType>
void OCLAcceleratorMatrixMCSR<ValueType>::ApplyAdd(const BaseVector<ValueType> &in, const ValueType scalar,
                                                  BaseVector<ValueType> *out) const {

  if (this->get_nnz() > 0) {

    assert(in.  get_size() >= 0);
    assert(out->get_size() >= 0);
    assert(in.  get_size() == this->get_ncol());
    assert(out->get_size() == this->get_nrow());
    
    const OCLAcceleratorVector<ValueType> *cast_in = dynamic_cast<const OCLAcceleratorVector<ValueType>*> (&in) ; 
    OCLAcceleratorVector<ValueType> *cast_out      = dynamic_cast<      OCLAcceleratorVector<ValueType>*> (out) ; 
    
    assert(cast_in  != NULL);
    assert(cast_out != NULL);

    int nrow = this->get_nrow();

    cl_int    err;
    cl_event  ocl_event;
    size_t    localWorkSize[1];
    size_t    globalWorkSize[1];

    // Set arguments for kernel call
    err  = clSetKernelArg( CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 0, sizeof(int),       (void *) &nrow );
    err |= clSetKernelArg( CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 1, sizeof(cl_mem),    (void *) this->mat_.row_offset );
    err |= clSetKernelArg( CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 2, sizeof(cl_mem),    (void *) this->mat_.col );
    err |= clSetKernelArg( CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 3, sizeof(cl_mem),    (void *) this->mat_.val );
    err |= clSetKernelArg( CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 4, sizeof(ValueType), (void *) &scalar );
    err |= clSetKernelArg( CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 5, sizeof(cl_mem),    (void *) cast_in->vec_ );
    err |= clSetKernelArg( CL_KERNEL_MCSR_ADD_SPMV_SCALAR, 6, sizeof(cl_mem),    (void *) cast_out->vec_ );
    CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

    // Use half of the maximum work-group size as the local size and round
    // the global size up to the next multiple of it, so every row is covered
    localWorkSize[0]  = this->local_backend_.OCL_max_work_group_size;
    localWorkSize[0] /= 2;
    globalWorkSize[0] = ( size_t( nrow / localWorkSize[0] ) + 1 ) * localWorkSize[0];
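    // Illustrative sizing (assumed values): with OCL_max_work_group_size = 256
    // and nrow = 1000, localWorkSize = 128 and
    // globalWorkSize = (1000 / 128 + 1) * 128 = 1024 >= nrow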

    // Start kernel run
    err = clEnqueueNDRangeKernel( OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                  CL_KERNEL_MCSR_ADD_SPMV_SCALAR,
                                  1,
                                  NULL,
                                  &globalWorkSize[0],
                                  &localWorkSize[0],
                                  0,
                                  NULL,
                                  &ocl_event);
    CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

    // Wait for kernel run to finish
    err = clWaitForEvents( 1, &ocl_event );
    CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

    // Release event when kernel run finished
    err = clReleaseEvent( ocl_event );
    CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

  }

}
template <typename ValueType>
void OCLAcceleratorMatrixBCSR<ValueType>::Apply(const BaseVector<ValueType> &in, BaseVector<ValueType> *out) const {
/*
  assert(in.  get_size() >= 0);
  assert(out->get_size() >= 0);
  assert(in.  get_size() == this->get_ncol());
  assert(out->get_size() == this->get_nrow());


  const OCLAcceleratorVector<ValueType> *cast_in = dynamic_cast<const OCLAcceleratorVector<ValueType>*> (&in) ; 
  OCLAcceleratorVector<ValueType> *cast_out      = dynamic_cast<      OCLAcceleratorVector<ValueType>*> (out) ; 

  assert(cast_in != NULL);
  assert(cast_out!= NULL);
*/
  // BCSR SpMV is not implemented in the OpenCL backend
  FATAL_ERROR(__FILE__, __LINE__);

  // to avoid compiler warnings (unreachable)
  cl_int err = CL_SUCCESS;
  CHECK_OCL_ERROR(err, __FILE__, __LINE__);

}
template <typename ValueType>
void OCLAcceleratorMatrixCOO<ValueType>::ApplyAdd(const BaseVector<ValueType> &in, const ValueType scalar,
                                                        BaseVector<ValueType> *out) const {

  // TODO some devices hang while waiting for the COO_SPMV_FLAT event to finish;
  // this is a bug that will be fixed in a future release
  if (this->get_nnz() > 0) {

    assert(in.  get_size() >= 0);
    assert(out->get_size() >= 0);
    assert(in.  get_size() == this->get_ncol());
    assert(out->get_size() == this->get_nrow());
    
    
    const OCLAcceleratorVector<ValueType> *cast_in = dynamic_cast<const OCLAcceleratorVector<ValueType>*> (&in) ; 
    OCLAcceleratorVector<ValueType> *cast_out      = dynamic_cast<      OCLAcceleratorVector<ValueType>*> (out) ; 
    
    assert(cast_in  != NULL);
    assert(cast_out != NULL);

    // do not support super small matrices
    assert(this->get_nnz() > OPENCL_WARPSIZE); 
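    // (with nnz <= OPENCL_WARPSIZE, num_units below would be 0 and the flat
    // kernel would have no complete warp of work to process)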

    // ----------------------------------------------------------
    // Modified and adapted from CUSP 0.3.1, 
    // http://code.google.com/p/cusp-library/
    // NVIDIA, APACHE LICENSE 2.0
    // ----------------------------------------------------------
    // see __spmv_coo_flat(...)
    // ----------------------------------------------------------
    // CHANGELOG
    // - adapted interface
    // ----------------------------------------------------------  

    // TODO move max_active_blocks, warp_size and block_size into a separate file

    const unsigned int BLOCK_SIZE = this->local_backend_.OCL_max_work_group_size;
    //    const unsigned int MAX_BLOCKS = this->local_backend_.GPU_max_blocks;

    const unsigned int MAX_BLOCKS = 32; //  cusp::detail::device::arch::max_active_blocks(spmv_coo_flat_kernel<IndexType, ValueType, BLOCK_SIZE, UseCache>, BLOCK_SIZE, (size_t) 0);

    const unsigned int WARPS_PER_BLOCK = BLOCK_SIZE / OPENCL_WARPSIZE;


    const unsigned int num_units  = this->get_nnz() / OPENCL_WARPSIZE; 
    const unsigned int num_warps  = std::min(num_units, WARPS_PER_BLOCK * MAX_BLOCKS);
    const unsigned int num_blocks = (num_warps + (WARPS_PER_BLOCK-1)) / WARPS_PER_BLOCK; // (N + (granularity - 1)) / granularity
    const unsigned int num_iters  = (num_units +  (num_warps-1)) / num_warps;
    
    const unsigned int interval_size = OPENCL_WARPSIZE * num_iters;

    const int tail = num_units * OPENCL_WARPSIZE; // do the last few nonzeros separately (fewer than OPENCL_WARPSIZE elements)

    const unsigned int active_warps = (interval_size == 0) ? 0 : ((tail + (interval_size-1))/interval_size);
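    // Illustrative sizing (assumed values, not fixed): with OPENCL_WARPSIZE = 32,
    // BLOCK_SIZE = 256 and nnz = 100000:
    //   WARPS_PER_BLOCK = 256 / 32              = 8
    //   num_units       = 100000 / 32           = 3125
    //   num_warps       = min(3125, 8 * 32)     = 256
    //   num_blocks      = (256 + 7) / 8         = 32
    //   num_iters       = (3125 + 255) / 256    = 13
    //   interval_size   = 32 * 13               = 416
    //   tail            = 3125 * 32             = 100000 (nnz rounded down to a multiple of 32)
    //   active_warps    = (100000 + 415) / 416  = 241
    // Each active warp produces one partial (row, value) pair in
    // temp_rows/temp_vals, merged later by COO_SPMV_REDUCE_UPDATE.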

    cl_mem *temp_rows = NULL;
    cl_mem *temp_vals = NULL;

    allocate_ocl<int>      (active_warps, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &temp_rows);
    allocate_ocl<ValueType>(active_warps, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &temp_vals);

    //  LOG_INFO("active_warps = " << active_warps);
    //  LOG_INFO("tail =" << tail);
    //  LOG_INFO("interval_size =" << interval_size);
    //  LOG_INFO("num_iters =" << num_iters);
    //  LOG_INFO("num_blocks =" << num_blocks);
    //  LOG_INFO("num_warps =" << num_warps);
    //  LOG_INFO("num_units =" << num_units);
    //  LOG_INFO("WARPS_PER_BLOCK =" << WARPS_PER_BLOCK);
    //  LOG_INFO("MAX_BLOCKS =" << MAX_BLOCKS);
    //  LOG_INFO("BLOCK_SIZE =" << BLOCK_SIZE);
    //  LOG_INFO("WARP_SIZE =" << WARP_SIZE);
    //  LOG_INFO("WARP_SIZE =" << this->local_backend_.GPU_wrap);

    cl_int    err;
    cl_event  ocl_event;
    size_t    localWorkSize[1];
    size_t    globalWorkSize[1];

    // Set arguments for kernel call
    err  = clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 0, sizeof(int),       (void *) &tail );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 1, sizeof(int),       (void *) &interval_size );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 2, sizeof(cl_mem),    (void *) this->mat_.row );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 3, sizeof(cl_mem),    (void *) this->mat_.col );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 4, sizeof(cl_mem),    (void *) this->mat_.val );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 5, sizeof(ValueType), (void *) &scalar );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 6, sizeof(cl_mem),    (void *) cast_in->vec_ );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 7, sizeof(cl_mem),    (void *) cast_out->vec_ );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 8, sizeof(cl_mem),    (void *) temp_rows );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 9, sizeof(cl_mem),    (void *) temp_vals );
    CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

    // Determine local work size for kernel call
    localWorkSize[0]  = BLOCK_SIZE;
    // Determine global work size for kernel call
    globalWorkSize[0] = num_blocks * localWorkSize[0];

    // Start kernel run
    err = clEnqueueNDRangeKernel( OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                  CL_KERNEL_COO_SPMV_FLAT,
                                  1,
                                  NULL,
                                  &globalWorkSize[0],
                                  &localWorkSize[0],
                                  0,
                                  NULL,
                                  &ocl_event);
    CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

    // Wait for kernel run to finish
    err = clWaitForEvents( 1, &ocl_event );
    CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

    // Release event when kernel run finished
    err = clReleaseEvent( ocl_event );
    CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

    // Set arguments for kernel call
    err  = clSetKernelArg( CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 0, sizeof(int),    (void *) &active_warps );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 1, sizeof(cl_mem), (void *) temp_rows );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 2, sizeof(cl_mem), (void *) temp_vals );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 3, sizeof(cl_mem), (void *) cast_out->vec_ );
    CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

    // Determine global work size for kernel call - the reduction runs in a single work-group
    globalWorkSize[0] = localWorkSize[0];

    // Start kernel run
    err = clEnqueueNDRangeKernel( OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                  CL_KERNEL_COO_SPMV_REDUCE_UPDATE,
                                  1,
                                  NULL,
                                  &globalWorkSize[0],
                                  &localWorkSize[0],
                                  0,
                                  NULL,
                                  &ocl_event);
    CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

    // Wait for kernel run to finish
    err = clWaitForEvents( 1, &ocl_event );
    CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

    // Release event when kernel run finished
    err = clReleaseEvent( ocl_event );
    CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

    int nnz = this->get_nnz();
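    // The serial kernel processes the leftover nonzeros in [tail, nnz) that
    // do not fill a complete warp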

    // Set arguments for kernel call
    err  = clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 0, sizeof(int),       (void *) &nnz );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 1, sizeof(cl_mem),    (void *) this->mat_.row );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 2, sizeof(cl_mem),    (void *) this->mat_.col );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 3, sizeof(cl_mem),    (void *) this->mat_.val );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 4, sizeof(ValueType), (void *) &scalar );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 5, sizeof(cl_mem),    (void *) cast_in->vec_ );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 6, sizeof(cl_mem),    (void *) cast_out->vec_ );
    err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 7, sizeof(int),       (void *) &tail );
    CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

    // Determine local work size for kernel call (a single work-item)
    localWorkSize[0]  = 1;
    // Determine global work size for kernel call
    globalWorkSize[0] = 1;

    // Start kernel run
    err = clEnqueueNDRangeKernel( OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                  CL_KERNEL_COO_SPMV_SERIAL,
                                  1,
                                  NULL,
                                  &globalWorkSize[0],
                                  &localWorkSize[0],
                                  0,
                                  NULL,
                                  &ocl_event);
    CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

    // Wait for kernel run to finish
    err = clWaitForEvents( 1, &ocl_event );
    CHECK_OCL_ERROR( err, __FILE__, __LINE__ );

    // Release event when kernel run finished
    err = clReleaseEvent( ocl_event );
    CHECK_OCL_ERROR( err, __FILE__, __LINE__ ); 
    
    free_ocl(&temp_rows);
    free_ocl(&temp_vals);

  }

}