예제 #1
0
void generate_scalar_swap(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void swap( \n");
  source.append("          __global "); source.append(numeric_string); source.append(" * s1, \n");
  source.append("          __global "); source.append(numeric_string); source.append(" * s2) \n");
  source.append("{ \n");
  source.append("  "); source.append(numeric_string); source.append(" tmp = *s2; \n");
  source.append("  *s2 = *s1; \n");
  source.append("  *s1 = tmp; \n");
  source.append("} \n");
}
예제 #2
0
void generate_fft_complex_to_real(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void complex_to_real(__global "); source.append(numeric_string); source.append("2 *in, \n");
  source.append("  __global "); source.append(numeric_string); source.append("  *out, \n");
  source.append("  unsigned int size) { \n");
  source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))  \n");
  source.append("    out[i] = in[i].x; \n");
  source.append("} \n");
}
예제 #3
0
void generate_fft_div_vec_scalar(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void fft_div_vec_scalar(__global "); source.append(numeric_string); source.append("2 *input1, \n");
  source.append("  unsigned int size, \n");
  source.append("  "); source.append(numeric_string); source.append(" factor) { \n");
  source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))  \n");
  source.append("    input1[i] /= factor; \n");
  source.append("} \n");
}
예제 #4
0
void generate_asbs_impl3(StringT & source, char sign_a, char sign_b, asbs_config const & cfg, bool mult_alpha, bool mult_beta)
{
  source.append("      *s1 "); source.append(cfg.assign_op); source.append(1, sign_a); source.append(" *s2 ");
  if (mult_alpha)
    source.append("* alpha ");
  else
    source.append("/ alpha ");
  if (cfg.b != VIENNACL_ASBS_NONE)
  {
    source.append(1, sign_b); source.append(" *s3 ");
    if (mult_beta)
      source.append("* beta");
    else
      source.append("/ beta");
  }
  source.append("; \n");
}
예제 #5
0
void generate_asbs_impl2(StringT & source, char sign_a, char sign_b, asbs_config const & cfg)
{
  source.append("    if (options2 & (1 << 1)) { \n");
  if (cfg.b != VIENNACL_ASBS_NONE)
  {
    source.append("     if (options3 & (1 << 1)) \n");
    generate_asbs_impl3(source, sign_a, sign_b, cfg, false, false);
    source.append("     else \n");
    generate_asbs_impl3(source, sign_a, sign_b, cfg, false, true);
  }
  else
    generate_asbs_impl3(source, sign_a, sign_b, cfg, false, true);
  source.append("    } else { \n");
  if (cfg.b != VIENNACL_ASBS_NONE)
  {
    source.append("     if (options3 & (1 << 1)) \n");
    generate_asbs_impl3(source, sign_a, sign_b, cfg, true, false);
    source.append("     else \n");
    generate_asbs_impl3(source, sign_a, sign_b, cfg, true, true);
  }
  else
    generate_asbs_impl3(source, sign_a, sign_b, cfg, true, true);
  source.append("    } \n");

}
예제 #6
0
void generate_amg_pmis2_reset_state(StringT & source)
{

 source.append("__kernel void amg_pmis2_reset_state( \n");
 source.append("  __global unsigned int *point_types, \n");
 source.append("  unsigned int size) { \n");

 source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
 source.append("    if (point_types[i] != 1) point_types[i] = 0;\n"); // mind mapping of POINT_TYPE_COARSE and POINT_TYPE_UNDECIDED
 source.append("  } \n");

 source.append("} \n");
}
예제 #7
0
void generate_amg_agg_merge_undecided_2(StringT & source)
{

 source.append(" __kernel void amg_agg_merge_undecided_2( \n");
 source.append("  __global unsigned int *point_types, \n");
 source.append("  unsigned int size) { \n");

 source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
 source.append("    if (point_types[i] == 0) point_types[i] = 2; \n"); // POINT_TYPE_UNDECIDED to POINT_TYPE_FINE

 source.append("} \n");
}
예제 #8
0
void generate_fft_real_to_complex(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void real_to_complex(__global "); source.append(numeric_string); source.append(" *in, \n");
  source.append("  __global "); source.append(numeric_string); source.append("2 *out, \n");
  source.append("  unsigned int size) { \n");
  source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
  source.append("    "); source.append(numeric_string); source.append("2 val = 0; \n");
  source.append("    val.x = in[i]; \n");
  source.append("    out[i] = val; \n");
  source.append("  } \n");
  source.append("} \n");
}
예제 #9
0
void generate_fft_mult_vec(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void fft_mult_vec(__global const "); source.append(numeric_string); source.append("2 *input1, \n");
  source.append("  __global const "); source.append(numeric_string); source.append("2 *input2, \n");
  source.append("  __global "); source.append(numeric_string); source.append("2 *output, \n");
  source.append("  unsigned int size) { \n");
  source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
  source.append("    "); source.append(numeric_string); source.append("2 in1 = input1[i]; \n");
  source.append("    "); source.append(numeric_string); source.append("2 in2 = input2[i]; \n");

  source.append("    output[i] = ("); source.append(numeric_string); source.append("2)(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x); \n");
  source.append("  } \n");
  source.append("} \n");
}
예제 #10
0
void generate_amg_pmis2_init_workdata(StringT & source)
{

 source.append("__kernel void amg_pmis2_init_workdata( \n");
 source.append("  __global unsigned int       *work_state, \n");
 source.append("  __global unsigned int       *work_random, \n");
 source.append("  __global unsigned int       *work_index, \n");
 source.append("  __global unsigned int const *point_types, \n");
 source.append("  __global unsigned int const *random_weights, \n");
 source.append("  unsigned int size) { \n");

 source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
 source.append("    switch (point_types[i]) { \n");
 source.append("    case 0:  work_state[i] = 1; break; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED
 source.append("    case 1:  work_state[i] = 2; break; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE
 source.append("    case 2:  work_state[i] = 0; break; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE

 source.append("    default: break; // do nothing \n");
 source.append("    } \n");

 source.append("    work_random[i] = random_weights[i]; \n");
 source.append("    work_index[i]  = i; \n");
 source.append("  } \n");
 source.append("} \n");
}
예제 #11
0
void generate_ilu_scale_kernel_2(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void ilu_scale_kernel_2( \n");
  source.append("  __global unsigned int const *R_row_indices, \n");
  source.append("  __global unsigned int const *R_col_indices, \n");
  source.append("  __global "); source.append(numeric_string); source.append("       *R_elements, \n");
  source.append("  unsigned int R_size1, \n");
  source.append("  __global "); source.append(numeric_string); source.append(" const *D_elements) { \n");

  source.append("  for (unsigned int row  = get_global_id(0); \n");
  source.append("                    row  < R_size1; \n");
  source.append("                    row += get_global_size(0)) \n");
  source.append("  { \n");
  source.append("    unsigned int row_begin = R_row_indices[row]; \n");
  source.append("    unsigned int row_end   = R_row_indices[row+1]; \n");

  source.append("    "); source.append(numeric_string); source.append(" D_row = D_elements[row]; \n");
  source.append("    for (unsigned int j=row_begin; j<row_end; ++j) \n");
  source.append("      R_elements[j] *= D_row * D_elements[R_col_indices[j]]; \n");

  source.append("  } \n");
  source.append("} \n");
}
예제 #12
0
void generate_hyb_vec_mul(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void vec_mul( \n");
  source.append("  const __global int* ell_coords, \n");
  source.append("  const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
  source.append("  const __global uint* csr_rows, \n");
  source.append("  const __global uint* csr_cols, \n");
  source.append("  const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
  source.append("  const __global "); source.append(numeric_string); source.append(" * x, \n");
  source.append("  uint4 layout_x, \n");
  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append("  uint4 layout_result, \n");
  source.append("  unsigned int row_num, \n");
  source.append("  unsigned int internal_row_num, \n");
  source.append("  unsigned int items_per_row, \n");
  source.append("  unsigned int aligned_items_per_row) \n");
  source.append("{ \n");
  source.append("  uint glb_id = get_global_id(0); \n");
  source.append("  uint glb_sz = get_global_size(0); \n");

  source.append("  for (uint row_id = glb_id; row_id < row_num; row_id += glb_sz) { \n");
  source.append("    "); source.append(numeric_string); source.append(" sum = 0; \n");

  source.append("    uint offset = row_id; \n");
  source.append("    for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
  source.append("      "); source.append(numeric_string); source.append(" val = ell_elements[offset]; \n");

  source.append("      if (val != ("); source.append(numeric_string); source.append(")0) { \n");
  source.append("        int col = ell_coords[offset]; \n");
  source.append("        sum += (x[col * layout_x.y + layout_x.x] * val); \n");
  source.append("      } \n");

  source.append("    } \n");

  source.append("    uint col_begin = csr_rows[row_id]; \n");
  source.append("    uint col_end   = csr_rows[row_id + 1]; \n");

  source.append("    for (uint item_id = col_begin; item_id < col_end; item_id++) {  \n");
  source.append("      sum += (x[csr_cols[item_id] * layout_x.y + layout_x.x] * csr_elements[item_id]); \n");
  source.append("    } \n");

  source.append("    result[row_id * layout_result.y + layout_result.x] = sum; \n");
  source.append("  } \n");
  source.append("} \n");
}
예제 #13
0
void generate_fft_bluestein_post(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bluestein_post(__global "); source.append(numeric_string); source.append("2 *Z, \n");
  source.append("                             __global "); source.append(numeric_string); source.append("2 *out, \n");
  source.append("                             unsigned int size) \n");
  source.append("{ \n");
  source.append("  unsigned int glb_id = get_global_id(0); \n");
  source.append("  unsigned int glb_sz = get_global_size(0); \n");

  source.append("  unsigned int double_size = size << 1; \n");
  source.append("  "); source.append(numeric_string); source.append(" sn_a, cs_a; \n");
  source.append("  const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");

  source.append("  for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
  source.append("    unsigned int rm = i * i % (double_size); \n");
  source.append("    "); source.append(numeric_string); source.append(" angle = ("); source.append(numeric_string); source.append(")rm / size * (-NUM_PI); \n");

  source.append("    sn_a = sincos(angle, &cs_a); \n");

  source.append("    "); source.append(numeric_string); source.append("2 b_i = ("); source.append(numeric_string); source.append("2)(cs_a, sn_a); \n");
  source.append("    out[i] = ("); source.append(numeric_string); source.append("2)(Z[i].x * b_i.x - Z[i].y * b_i.y, Z[i].x * b_i.y + Z[i].y * b_i.x); \n");
  source.append("  } \n");
  source.append("} \n");
}
예제 #14
0
void generate_fft_vandermonde_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void vandermonde_prod(__global "); source.append(numeric_string); source.append(" *vander, \n");
  source.append("  __global "); source.append(numeric_string); source.append(" *vector, \n");
  source.append("  __global "); source.append(numeric_string); source.append(" *result, \n");
  source.append("  uint size) { \n");
  source.append("  for (uint i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
  source.append("    "); source.append(numeric_string); source.append(" mul = vander[i]; \n");
  source.append("    "); source.append(numeric_string); source.append(" pwr = 1; \n");
  source.append("    "); source.append(numeric_string); source.append(" val = 0; \n");

  source.append("    for (uint j = 0; j < size; j++) { \n");
  source.append("      val = val + pwr * vector[j]; \n");
  source.append("      pwr *= mul; \n");
  source.append("    } \n");

  source.append("    result[i] = val; \n");
  source.append("  } \n");
  source.append("} \n");
}
예제 #15
0
void generate_asbs_impl(StringT & source, std::string const & numeric_string, asbs_config const & cfg)
{
  source.append("__kernel void as");
  if (cfg.b != VIENNACL_ASBS_NONE)
    source.append("bs");
  if (cfg.assign_op != "=")
    source.append("_s");

  if (cfg.a == VIENNACL_ASBS_CPU)
    source.append("_cpu");
  else if (cfg.a == VIENNACL_ASBS_GPU)
    source.append("_gpu");

  if (cfg.b == VIENNACL_ASBS_CPU)
    source.append("_cpu");
  else if (cfg.b == VIENNACL_ASBS_GPU)
    source.append("_gpu");
  source.append("( \n");
  source.append("  __global "); source.append(numeric_string); source.append(" * s1, \n");
  source.append(" \n");
  if (cfg.a == VIENNACL_ASBS_CPU)
  {
    source.append("  "); source.append(numeric_string); source.append(" fac2, \n");
  }
  else if (cfg.a == VIENNACL_ASBS_GPU)
  {
    source.append("  __global "); source.append(numeric_string); source.append(" * fac2, \n");
  }
  source.append("  unsigned int options2, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
  source.append("  __global const "); source.append(numeric_string); source.append(" * s2");

  if (cfg.b != VIENNACL_ASBS_NONE)
  {
    source.append(", \n\n");
    if (cfg.b == VIENNACL_ASBS_CPU)
    {
      source.append("  "); source.append(numeric_string); source.append(" fac3, \n");
    }
    else if (cfg.b == VIENNACL_ASBS_GPU)
    {
      source.append("  __global "); source.append(numeric_string); source.append(" * fac3, \n");
    }
    source.append("  unsigned int options3, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
    source.append("  __global const "); source.append(numeric_string); source.append(" * s3");
  }
  source.append(") \n{ \n");

  if (cfg.a == VIENNACL_ASBS_CPU)
  {
    source.append("  "); source.append(numeric_string); source.append(" alpha = fac2; \n");
  }
  else if (cfg.a == VIENNACL_ASBS_GPU)
  {
    source.append("  "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n");
  }
  source.append(" \n");

  if (cfg.b == VIENNACL_ASBS_CPU)
  {
    source.append("  "); source.append(numeric_string); source.append(" beta = fac3; \n");
  }
  else if (cfg.b == VIENNACL_ASBS_GPU)
  {
    source.append("  "); source.append(numeric_string); source.append(" beta = fac3[0]; \n");
  }

  source.append("  if (options2 & (1 << 0)) { \n");
  if (cfg.b != VIENNACL_ASBS_NONE)
  {
    source.append("   if (options3 & (1 << 0)) { \n");
    generate_asbs_impl2(source, '-', '-', cfg);
    source.append("   } else { \n");
    generate_asbs_impl2(source, '-', '+', cfg);
    source.append("   } \n");
  }
  else
    generate_asbs_impl2(source, '-', '+', cfg);
  source.append("  } else { \n");
  if (cfg.b != VIENNACL_ASBS_NONE)
  {
    source.append("   if (options3 & (1 << 0)) { \n");
    generate_asbs_impl2(source, '+', '-', cfg);
    source.append("   } else { \n");
    generate_asbs_impl2(source, '+', '+', cfg);
    source.append("   } \n");
  }
  else
    generate_asbs_impl2(source, '+', '+', cfg);

  source.append("  } \n");
  source.append("} \n");
}
예제 #16
0
void generate_sliced_ell_vec_mul(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void vec_mul( \n");
  source.append("  __global const unsigned int * columns_per_block, \n");
  source.append("  __global const unsigned int * column_indices, \n");
  source.append("  __global const unsigned int * block_start, \n");
  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append("  __global const "); source.append(numeric_string); source.append(" * x, \n");
  source.append("  uint4 layout_x, \n");
  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append("  uint4 layout_result) \n");
  source.append("{ \n");
  source.append("  uint local_id   = get_local_id(0); \n");
  source.append("  uint local_size = get_local_size(0); \n");
  source.append("  uint num_rows   = layout_result.z; \n");

  source.append("  for (uint block_idx = get_group_id(0); block_idx <= num_rows / local_size; block_idx += get_num_groups(0)) { \n");
  source.append("    "); source.append(numeric_string); source.append(" sum = 0; \n");

  source.append("    uint row    = block_idx * local_size + local_id; \n");
  source.append("    uint offset = block_start[block_idx]; \n");
  source.append("    uint num_columns = columns_per_block[block_idx]; \n");
  source.append("    for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
  source.append("      uint index = offset + item_id * local_size + local_id; \n");
  source.append("      "); source.append(numeric_string); source.append(" val = elements[index]; \n");
  source.append("      sum += val ? (x[column_indices[index] * layout_x.y + layout_x.x] * val) : 0; \n");
  source.append("    } \n");

  source.append("    if (row < num_rows) \n");
  source.append("      result[row * layout_result.y + layout_result.x] = sum; \n");
  source.append("  } \n");
  source.append("} \n");
}
예제 #17
0
void generate_icc_extract_L_2(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void extract_L_2( \n");
  source.append("  __global unsigned int const *A_row_indices, \n");
  source.append("  __global unsigned int const *A_col_indices, \n");
  source.append("  __global "); source.append(numeric_string); source.append(" const *A_elements, \n");
  source.append("  unsigned int A_size1, \n");
  source.append("  __global unsigned int const *L_row_indices, \n");
  source.append("  __global unsigned int       *L_col_indices, \n");
  source.append("  __global "); source.append(numeric_string); source.append(" *L_elements) { \n");

  source.append("  for (unsigned int row  = get_global_id(0); \n");
  source.append("                    row  < A_size1; \n");
  source.append("                    row += get_global_size(0)) \n");
  source.append("  { \n");
  source.append("    unsigned int row_begin = A_row_indices[row]; \n");
  source.append("    unsigned int row_end   = A_row_indices[row+1]; \n");

  source.append("    unsigned int index_L = L_row_indices[row]; \n");
  source.append("    for (unsigned int j=row_begin; j<row_end; ++j) { \n");
  source.append("      unsigned int col = A_col_indices[j]; \n");
  source.append("      "); source.append(numeric_string); source.append(" value = A_elements[j]; \n");

  source.append("      if (col <= row) { \n");
  source.append("        L_col_indices[index_L] = col; \n");
  source.append("        L_elements[index_L]    = value; \n");
  source.append("        ++index_L; \n");
  source.append("      } \n");
  source.append("    } \n");

  source.append("  } \n");
  source.append("} \n");
}
예제 #18
0
void generate_icc_extract_L_1(StringT & source)
{
  source.append("__kernel void extract_L_1( \n");
  source.append("  __global unsigned int const *A_row_indices, \n");
  source.append("  __global unsigned int const *A_col_indices, \n");
  source.append("  unsigned int A_size1, \n");
  source.append("  __global unsigned int *L_row_indices) { \n");

  source.append("  for (unsigned int row  = get_global_id(0); \n");
  source.append("                    row  < A_size1; \n");
  source.append("                    row += get_global_size(0)) \n");
  source.append("  { \n");
  source.append("    unsigned int row_begin = A_row_indices[row]; \n");
  source.append("    unsigned int row_end   = A_row_indices[row+1]; \n");

  source.append("    unsigned int num_entries_L = 0; \n");
  source.append("    for (unsigned int j=row_begin; j<row_end; ++j) { \n");
  source.append("      unsigned int col = A_col_indices[j]; \n");
  source.append("      if (col <= row) ++num_entries_L; \n");
  source.append("    } \n");

  source.append("    L_row_indices[row] = num_entries_L;   \n");
  source.append("  } \n");
  source.append("} \n");
}
예제 #19
0
void generate_ilu_form_neumann_matrix_kernel(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void ilu_form_neumann_matrix_kernel( \n");
  source.append("  __global unsigned int const *R_row_indices, \n");
  source.append("  __global unsigned int const *R_col_indices, \n");
  source.append("  __global "); source.append(numeric_string); source.append(" *R_elements, \n");
  source.append("  unsigned int R_size1, \n");
  source.append("  __global "); source.append(numeric_string); source.append(" *D_elements) { \n");

  source.append("  for (unsigned int row  = get_global_id(0); \n");
  source.append("                    row  < R_size1; \n");
  source.append("                    row += get_global_size(0)) \n");
  source.append("  { \n");
  source.append("    unsigned int row_begin = R_row_indices[row]; \n");
  source.append("    unsigned int row_end   = R_row_indices[row+1]; \n");

  // Part 1: Extract and set diagonal entry
  source.append("    "); source.append(numeric_string); source.append(" diag = D_elements[row]; \n");
  source.append("    for (unsigned int j=row_begin; j<row_end; ++j) { \n");
  source.append("      unsigned int col = R_col_indices[j]; \n");
  source.append("      if (col == row) { \n");
  source.append("        diag = R_elements[j]; \n");
  source.append("        R_elements[j] = 0; \n");
  source.append("        break; \n");
  source.append("      } \n");
  source.append("    } \n");
  source.append("    D_elements[row] = diag; \n");

  // Part 2: Scale
  source.append("    for (unsigned int j=row_begin; j<row_end; ++j) \n");
  source.append("      R_elements[j] /= -diag; \n");

  source.append("  } \n");
  source.append("} \n");
}
예제 #20
0
void generate_ilu_level_scheduling_substitute(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void level_scheduling_substitute( \n");
  source.append("  __global const unsigned int * row_index_array, \n");
  source.append("  __global const unsigned int * row_indices, \n");
  source.append("  __global const unsigned int * column_indices, \n");
  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append("  __global "); source.append(numeric_string); source.append(" * vec, \n");
  source.append("  unsigned int size) \n");
  source.append("{ \n");
  source.append("  for (unsigned int row  = get_global_id(0); \n");
  source.append("                    row  < size; \n");
  source.append("                    row += get_global_size(0)) \n");
  source.append("  { \n");
  source.append("    unsigned int eq_row = row_index_array[row]; \n");
  source.append("    "); source.append(numeric_string); source.append(" vec_entry = vec[eq_row]; \n");
  source.append("    unsigned int row_end = row_indices[row+1]; \n");

  source.append("    for (unsigned int j = row_indices[row]; j < row_end; ++j) \n");
  source.append("      vec_entry -= vec[column_indices[j]] * elements[j]; \n");

  source.append("    vec[eq_row] = vec_entry; \n");
  source.append("  } \n");
  source.append("} \n");
}
예제 #21
0
void generate_ilu_chow_patel_sweep_kernel(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void ilu_chow_patel_sweep_kernel( \n");
  source.append("  __global unsigned int const *L_row_indices, \n");
  source.append("  __global unsigned int const *L_col_indices, \n");
  source.append("  __global "); source.append(numeric_string); source.append("       *L_elements, \n");
  source.append("  __global "); source.append(numeric_string); source.append(" const *L_backup, \n");
  source.append("  unsigned int L_size1, \n");

  source.append("  __global "); source.append(numeric_string); source.append(" const *aij_L, \n");

  source.append("  __global unsigned int const *U_trans_row_indices, \n");
  source.append("  __global unsigned int const *U_trans_col_indices, \n");
  source.append("  __global "); source.append(numeric_string); source.append("       *U_trans_elements, \n");
  source.append("  __global "); source.append(numeric_string); source.append(" const *U_trans_backup, \n");

  source.append("  __global "); source.append(numeric_string); source.append(" const *aij_U_trans) { \n");

  source.append("  for (unsigned int row  = get_global_id(0); \n");
  source.append("                    row  < L_size1; \n");
  source.append("                    row += get_global_size(0)) \n");
  source.append("  { \n");

  //
  // Update L:
  //
  source.append("    unsigned int row_L_start = L_row_indices[row]; \n");
  source.append("    unsigned int row_L_end   = L_row_indices[row + 1]; \n");

  source.append("    for (unsigned int j = row_L_start; j < row_L_end; ++j) { \n");
  source.append("      unsigned int col = L_col_indices[j]; \n");

  source.append("      if (col == row) continue; \n");

  source.append("      unsigned int row_U_start = U_trans_row_indices[col]; \n");
  source.append("      unsigned int row_U_end   = U_trans_row_indices[col + 1]; \n");

  source.append("      unsigned int index_U = row_U_start; \n");
  source.append("      unsigned int col_U = (index_U < row_U_end) ? U_trans_col_indices[index_U] : L_size1; \n");

  source.append("      "); source.append(numeric_string); source.append(" sum = 0; \n");
  source.append("      for (unsigned int k = row_L_start; k < j; ++k) { \n");
  source.append("        unsigned int col_L = L_col_indices[k]; \n");

  source.append("        while (col_U < col_L) { \n");
  source.append("          ++index_U; \n");
  source.append("          col_U = U_trans_col_indices[index_U]; \n");
  source.append("        } \n");

  source.append("        if (col_U == col_L) \n");
  source.append("          sum += L_backup[k] * U_trans_backup[index_U]; \n");
  source.append("      } \n");

  // update l_ij:
  source.append("      L_elements[j] = (aij_L[j] - sum) / U_trans_backup[row_U_end - 1]; \n");
  source.append("    } \n");

  //
  // Update U:
  //
  source.append("    unsigned int row_U_start = U_trans_row_indices[row]; \n");
  source.append("    unsigned int row_U_end   = U_trans_row_indices[row + 1]; \n");

  source.append("    for (unsigned int j = row_U_start; j < row_U_end; ++j) { \n");
  source.append("      unsigned int col = U_trans_col_indices[j]; \n");

  source.append("      row_L_start = L_row_indices[col]; \n");
  source.append("      row_L_end   = L_row_indices[col + 1]; \n");

  // compute \sum_{k=1}^{j-1} l_ik u_kj
  source.append("      unsigned int index_L = row_L_start; \n");
  source.append("      unsigned int col_L = (index_L < row_L_end) ? L_col_indices[index_L] : L_size1; \n");
  source.append("      "); source.append(numeric_string); source.append(" sum = 0; \n");
  source.append("      for (unsigned int k = row_U_start; k < j; ++k) { \n");
  source.append("        unsigned int col_U = U_trans_col_indices[k]; \n");

  // find element in L:
  source.append("        while (col_L < col_U) { \n");
  source.append("          ++index_L; \n");
  source.append("          col_L = L_col_indices[index_L]; \n");
  source.append("        } \n");

  source.append("        if (col_U == col_L) \n");
  source.append("          sum += L_backup[index_L] * U_trans_backup[k]; \n");
  source.append("      } \n");

  // update U_ij:
  source.append("      U_trans_elements[j] = aij_U_trans[j] - sum; \n");
  source.append("    } \n");

  source.append("  } \n");
  source.append("} \n");
}
예제 #22
0
void generate_fft_reverse_inplace(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void reverse_inplace(__global "); source.append(numeric_string); source.append(" *vec, uint size) { \n");
  source.append("  for (uint i = get_global_id(0); i < (size >> 1); i+=get_global_size(0)) { \n");
  source.append("    "); source.append(numeric_string); source.append(" val1 = vec[i]; \n");
  source.append("    "); source.append(numeric_string); source.append(" val2 = vec[size - i - 1]; \n");

  source.append("    vec[i] = val2; \n");
  source.append("    vec[size - i - 1] = val1; \n");
  source.append("  } \n");
  source.append("} \n");
}
예제 #23
0
void generate_fft_transpose_inplace(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void transpose_inplace(__global "); source.append(numeric_string); source.append("2* input, \n");
  source.append("  unsigned int row_num, \n");
  source.append("  unsigned int col_num) { \n");
  source.append("  unsigned int size = row_num * col_num; \n");
  source.append("  for (unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
  source.append("    unsigned int row = i / col_num; \n");
  source.append("    unsigned int col = i - row*col_num; \n");

  source.append("    unsigned int new_pos = col * row_num + row; \n");

  source.append("    if (i < new_pos) { \n");
  source.append("      "); source.append(numeric_string); source.append("2 val = input[i]; \n");
  source.append("      input[i] = input[new_pos]; \n");
  source.append("      input[new_pos] = val; \n");
  source.append("    } \n");
  source.append("  } \n");
  source.append("} \n");
}
예제 #24
0
void generate_icc_chow_patel_sweep_kernel(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void icc_chow_patel_sweep_kernel( \n");
  source.append("  __global unsigned int const *L_row_indices, \n");
  source.append("  __global unsigned int const *L_col_indices, \n");
  source.append("  __global "); source.append(numeric_string); source.append("       *L_elements, \n");
  source.append("  __global "); source.append(numeric_string); source.append(" const *L_backup, \n");
  source.append("  unsigned int L_size1, \n");

  source.append("  __global "); source.append(numeric_string); source.append(" const *aij_L) { \n");

  source.append("  for (unsigned int row  = get_global_id(0); \n");
  source.append("                    row  < L_size1; \n");
  source.append("                    row += get_global_size(0)) \n");
  source.append("  { \n");

  //
  // Update L:
  //
  source.append("    unsigned int row_Li_start = L_row_indices[row]; \n");
  source.append("    unsigned int row_Li_end   = L_row_indices[row + 1]; \n");

  source.append("    for (unsigned int i = row_Li_start; i < row_Li_end; ++i) { \n");
  source.append("      unsigned int col = L_col_indices[i]; \n");

  source.append("      unsigned int row_Lj_start = L_row_indices[col]; \n");
  source.append("      unsigned int row_Lj_end   = L_row_indices[col + 1]; \n");

  source.append("      unsigned int index_Lj = row_Lj_start; \n");
  source.append("      unsigned int col_Lj = L_col_indices[index_Lj]; \n");

  source.append("      "); source.append(numeric_string); source.append(" s = aij_L[i]; \n");
  source.append("      for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li) { \n");
  source.append("        unsigned int col_Li = L_col_indices[index_Li]; \n");

  source.append("        while (col_Lj < col_Li) { \n");
  source.append("          ++index_Lj; \n");
  source.append("          col_Lj = L_col_indices[index_Lj]; \n");
  source.append("        } \n");

  source.append("        if (col_Lj == col_Li) \n");
  source.append("          s -= L_backup[index_Li] * L_backup[index_Lj]; \n");
  source.append("      } \n");

  // update l_ij:
  source.append("      L_elements[i] = (row == col) ? sqrt(s) : (s / L_backup[row_Lj_end - 1]); \n");
  source.append("    } \n");

  source.append("  } \n");
  source.append("} \n");
}
예제 #25
0
void generate_fft_zero2(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void zero2(__global "); source.append(numeric_string); source.append("2 *input1, \n");
  source.append("  __global "); source.append(numeric_string); source.append("2 *input2, \n");
  source.append("  unsigned int size) { \n");
  source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
  source.append("    input1[i] = 0; \n");
  source.append("    input2[i] = 0; \n");
  source.append("  } \n");
  source.append("} \n");
}
void generate_vec_mul(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void vec_mul( \n");
  source.append("  __global const unsigned int * row_jumper, \n");
  source.append("  __global const unsigned int * row_indices, \n");
  source.append("  __global const unsigned int * column_indices, \n");
  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append("  uint nonzero_rows, \n");
  source.append("  __global const "); source.append(numeric_string); source.append(" * x, \n");
  source.append("  uint4 layout_x, \n");
  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append("  uint4 layout_result) \n");
  source.append("{ \n");
  source.append("  for (unsigned int i = get_global_id(0); i < nonzero_rows; i += get_global_size(0)) \n");
  source.append("  { \n");
  source.append("    "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
  source.append("    unsigned int row_end = row_jumper[i+1]; \n");
  source.append("    for (unsigned int j = row_jumper[i]; j < row_end; ++j) \n");
  source.append("      dot_prod += elements[j] * x[column_indices[j] * layout_x.y + layout_x.x]; \n");
  source.append("    result[row_indices[i] * layout_result.y + layout_result.x] = dot_prod; \n");
  source.append("  } \n");
  source.append(" } \n");
}
예제 #27
0
void generate_fft_bluestein_pre(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bluestein_pre(__global "); source.append(numeric_string); source.append("2 *input, \n");
  source.append("  __global "); source.append(numeric_string); source.append("2 *A, \n");
  source.append("  __global "); source.append(numeric_string); source.append("2 *B, \n");
  source.append("  unsigned int size, \n");
  source.append("  unsigned int ext_size \n");
  source.append("  ) { \n");
  source.append("  unsigned int glb_id = get_global_id(0); \n");
  source.append("  unsigned int glb_sz = get_global_size(0); \n");

  source.append("  unsigned int double_size = size << 1; \n");

  source.append("  "); source.append(numeric_string); source.append(" sn_a, cs_a; \n");
  source.append("  const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");

  source.append("  for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
  source.append("    unsigned int rm = i * i % (double_size); \n");
  source.append("    "); source.append(numeric_string); source.append(" angle = ("); source.append(numeric_string); source.append(")rm / size * NUM_PI; \n");

  source.append("    sn_a = sincos(-angle, &cs_a); \n");

  source.append("    "); source.append(numeric_string); source.append("2 a_i = ("); source.append(numeric_string); source.append("2)(cs_a, sn_a); \n");
  source.append("    "); source.append(numeric_string); source.append("2 b_i = ("); source.append(numeric_string); source.append("2)(cs_a, -sn_a); \n");

  source.append("    A[i] = ("); source.append(numeric_string); source.append("2)(input[i].x * a_i.x - input[i].y * a_i.y, input[i].x * a_i.y + input[i].y * a_i.x); \n");
  source.append("    B[i] = b_i; \n");

          // very bad instruction, to be fixed
  source.append("    if (i) \n");
  source.append("      B[ext_size - i] = b_i; \n");
  source.append("  } \n");
  source.append("} \n");
}
예제 #28
0
void generate_amg_pmis2_max_neighborhood(StringT & source)
{

 source.append("__kernel void amg_pmis2_max_neighborhood( \n");
 source.append("  __global unsigned int       *work_state, \n");
 source.append("  __global unsigned int       *work_random, \n");
 source.append("  __global unsigned int       *work_index, \n");
 source.append("  __global unsigned int       *work_state2, \n");
 source.append("  __global unsigned int       *work_random2, \n");
 source.append("  __global unsigned int       *work_index2, \n");
 source.append("  __global unsigned int const *influences_row, \n");
 source.append("  __global unsigned int const *influences_id, \n");
 source.append("  unsigned int size) { \n");

 source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");

 // load
 source.append("    unsigned int state  = work_state[i]; \n");
 source.append("    unsigned int random = work_random[i]; \n");
 source.append("    unsigned int index  = work_index[i]; \n");

 // max
 source.append("    unsigned int j_stop = influences_row[i + 1]; \n");
 source.append("    for (unsigned int j = influences_row[i]; j < j_stop; ++j) { \n");
 source.append("      unsigned int influenced_point_id = influences_id[j]; \n");

 // lexigraphical triple-max (not particularly pretty, but does the job):
 source.append("      if (state < work_state[influenced_point_id]) { \n");
 source.append("        state  = work_state[influenced_point_id]; \n");
 source.append("        random = work_random[influenced_point_id]; \n");
 source.append("        index  = work_index[influenced_point_id]; \n");
 source.append("      } else if (state == work_state[influenced_point_id]) { \n");
 source.append("        if (random < work_random[influenced_point_id]) { \n");
 source.append("          state  = work_state[influenced_point_id]; \n");
 source.append("          random = work_random[influenced_point_id]; \n");
 source.append("          index  = work_index[influenced_point_id]; \n");
 source.append("        } else if (random == work_random[influenced_point_id]) { \n");
 source.append("          if (index < work_index[influenced_point_id]) { \n");
 source.append("            state  = work_state[influenced_point_id]; \n");
 source.append("            random = work_random[influenced_point_id]; \n");
 source.append("            index  = work_index[influenced_point_id]; \n");
 source.append("          } \n");
 source.append("        } \n");
 source.append("      } \n");

 source.append("    }\n"); //for

 // store
 source.append("    work_state2[i]  = state; \n");
 source.append("    work_random2[i] = random; \n");
 source.append("    work_index2[i]  = index; \n");
 source.append("  } \n");
 source.append("} \n");
}
예제 #29
0
  void generate_hyb_matrix_dense_matrix_mul(StringT & source, std::string const & numeric_string,
                                            bool B_transposed, bool B_row_major, bool C_row_major)
  {
    source.append("__kernel void ");
    source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
    source.append("( \n");
    source.append("  const __global int* ell_coords, \n");
    source.append("  const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
    source.append("  const __global uint* csr_rows, \n");
    source.append("  const __global uint* csr_cols, \n");
    source.append("  const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
    source.append("  unsigned int row_num, \n");
    source.append("  unsigned int internal_row_num, \n");
    source.append("  unsigned int items_per_row, \n");
    source.append("  unsigned int aligned_items_per_row, \n");
    source.append("    __global const "); source.append(numeric_string); source.append("* d_mat, \n");
    source.append("    unsigned int d_mat_row_start, \n");
    source.append("    unsigned int d_mat_col_start, \n");
    source.append("    unsigned int d_mat_row_inc, \n");
    source.append("    unsigned int d_mat_col_inc, \n");
    source.append("    unsigned int d_mat_row_size, \n");
    source.append("    unsigned int d_mat_col_size, \n");
    source.append("    unsigned int d_mat_internal_rows, \n");
    source.append("    unsigned int d_mat_internal_cols, \n");
    source.append("    __global "); source.append(numeric_string); source.append(" * result, \n");
    source.append("    unsigned int result_row_start, \n");
    source.append("    unsigned int result_col_start, \n");
    source.append("    unsigned int result_row_inc, \n");
    source.append("    unsigned int result_col_inc, \n");
    source.append("    unsigned int result_row_size, \n");
    source.append("    unsigned int result_col_size, \n");
    source.append("    unsigned int result_internal_rows, \n");
    source.append("    unsigned int result_internal_cols) { \n");

    source.append("  uint glb_id = get_global_id(0); \n");
    source.append("  uint glb_sz = get_global_size(0); \n");

    source.append("  for (uint result_col = 0; result_col < result_col_size; ++result_col) { \n");
    source.append("   for (uint row_id = glb_id; row_id < row_num; row_id += glb_sz) { \n");
    source.append("    "); source.append(numeric_string); source.append(" sum = 0; \n");

    source.append("    uint offset = row_id; \n");
    source.append("    for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
    source.append("      "); source.append(numeric_string); source.append(" val = ell_elements[offset]; \n");

    source.append("      if (val != ("); source.append(numeric_string); source.append(")0) { \n");
    source.append("        int col = ell_coords[offset]; \n");
    if (B_transposed && B_row_major)
      source.append("      sum += d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) * d_mat_internal_cols +  d_mat_col_start +        col * d_mat_col_inc                        ] * val; \n");
    else if (B_transposed && !B_row_major)
      source.append("      sum += d_mat[ (d_mat_row_start + result_col * d_mat_row_inc)                       + (d_mat_col_start +        col * d_mat_col_inc) * d_mat_internal_rows ] * val; \n");
    else if (!B_transposed && B_row_major)
      source.append("      sum += d_mat[ (d_mat_row_start +        col * d_mat_row_inc) * d_mat_internal_cols +  d_mat_col_start + result_col * d_mat_col_inc                        ] * val; \n");
    else
      source.append("      sum += d_mat[ (d_mat_row_start +        col * d_mat_row_inc)                       + (d_mat_col_start + result_col * d_mat_col_inc) * d_mat_internal_rows ] * val; \n");
    source.append("      } \n");

    source.append("    } \n");

    source.append("    uint col_begin = csr_rows[row_id]; \n");
    source.append("    uint col_end   = csr_rows[row_id + 1]; \n");

    source.append("    for (uint item_id = col_begin; item_id < col_end; item_id++) {  \n");
    if (B_transposed && B_row_major)
      source.append("      sum += d_mat[ (d_mat_row_start +        result_col * d_mat_row_inc) * d_mat_internal_cols +  d_mat_col_start + csr_cols[item_id] * d_mat_col_inc                        ] * csr_elements[item_id]; \n");
    else if (B_transposed && !B_row_major)
      source.append("      sum += d_mat[ (d_mat_row_start +        result_col * d_mat_row_inc)                       + (d_mat_col_start + csr_cols[item_id] * d_mat_col_inc) * d_mat_internal_rows ] * csr_elements[item_id]; \n");
    else if (!B_transposed && B_row_major)
      source.append("      sum += d_mat[ (d_mat_row_start + csr_cols[item_id] * d_mat_row_inc) * d_mat_internal_cols +  d_mat_col_start +        result_col * d_mat_col_inc                        ] * csr_elements[item_id]; \n");
    else
      source.append("      sum += d_mat[ (d_mat_row_start + csr_cols[item_id] * d_mat_row_inc)                       + (d_mat_col_start +        result_col * d_mat_col_inc) * d_mat_internal_rows ] * csr_elements[item_id]; \n");
    source.append("    } \n");

    if (C_row_major)
      source.append("      result[ (result_row_start + row_id * result_row_inc) * result_internal_cols + result_col_start + result_col * result_col_inc ] = sum; \n");
    else
      source.append("      result[ (result_row_start + row_id * result_row_inc)                        + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = sum; \n");
    source.append("   } \n");
    source.append("  } \n");
    source.append("} \n");
  }
예제 #30
0
void generate_ilu_scale_kernel_1(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void ilu_scale_kernel_1( \n");
  source.append("  __global unsigned int const *A_row_indices, \n");
  source.append("  __global unsigned int const *A_col_indices, \n");
  source.append("  __global "); source.append(numeric_string); source.append(" const *A_elements, \n");
  source.append("  unsigned int A_size1, \n");
  source.append("  __global "); source.append(numeric_string); source.append("       *D_elements) { \n");

  source.append("  for (unsigned int row  = get_global_id(0); \n");
  source.append("                    row  < A_size1; \n");
  source.append("                    row += get_global_size(0)) \n");
  source.append("  { \n");
  source.append("    unsigned int row_begin = A_row_indices[row]; \n");
  source.append("    unsigned int row_end   = A_row_indices[row+1]; \n");

  source.append("    for (unsigned int j=row_begin; j<row_end; ++j) { \n");
  source.append("      unsigned int col = A_col_indices[j]; \n");

  source.append("      if (col == row) { \n");
  source.append("        D_elements[row] = 1 / sqrt(fabs(A_elements[j])); \n");
  source.append("        break; \n");
  source.append("      } \n");
  source.append("    } \n");

  source.append("  } \n");
  source.append("} \n");
}