Esempio n. 1
0
int main() {
  {
    struct map map;

    map_init(&map);

    char key1[] = "This is a really long key.",
         key2[] = "Another key.";

    assertError("map_set (1)", map_set(&map, key1, sizeof key1, (void*)123));
    assertEq("Key is now retrievable", map_get(&map, key1, sizeof key1), (void*)123);

    assertError("map_set (2)", map_set(&map, key2, sizeof key2, (void*)456));
    assertEq("New key is now retrievable", map_get(&map, key2, sizeof key2), (void*)456);
    assertEq("Old key is still retrievable (1)", map_get(&map, key1, sizeof key1), (void*)123);

    assertError("map_set existing", map_set(&map, key1, sizeof key1, (void*)789));
    assertEq("Key has updated value", map_get(&map, key1, sizeof key1), (void*)789);
    assertEq("Old key is still retrievable (2)", map_get(&map, key2, sizeof key2), (void*)456);

    assertError("map_set small key (1)", map_set(&map, (void*)1, 0, (void*)10));
    assertError("map_set small key (2)", map_set(&map, (void*)2, 0, (void*)20));
    assertEq("Can retrieve short key (1)", map_get(&map, (void*)1, 0), (void*)10);
    assertEq("Can retrieve short key (2)", map_get(&map, (void*)2, 0), (void*)20);

    map_free(&map);
  }
}
static void customcl_gpu_gemm(const int ctx_id, const int M,
                       const int N, const int K ,
                       const Dtype* A, const Dtype* B, Dtype* C) {
  // implement transpose.
  //std::cout << "addr " << B << std::endl;
  //std::cout << "MNK " << M << " " << N << " " << K << std::endl;
  if(!customcl_is_setup) {
    caffe::customcl_setup();
    customcl_is_setup = true;
  }
  auto queue = viennacl::ocl::get_context(ctx_id).get_queue().handle().get();

  const int align = 32;
  int oK = (K + align - 1) / align * align;
  int oM = (M + align - 1) / align * align;
  int oN = (N + align - 1) / align * align;

  Dtype* copy_buffer = (Dtype*)copy_ptr;
  if(sizeof(Dtype) * oK * oM > MAX_BUFFER_DIM) {
    throw "customcl_gpu_gemm: maximum buffer size exceeded.";
  }

  if(sizeof(Dtype) * oN * oK > MAX_BUFFER_DIM) {
    throw "customcl_gpu_gemm: maximum buffer size exceeded.";
  }

  customcl_copy_matrix(ctx_id,
      A, K, M,
      copy_buffer, oK, oM);

#if CUSTOM_GEMM_VERIFICATION == true
  clEnqueueMapBuffer(
      queue,
      (cl_mem) copy_buffer,
      CL_TRUE,    // blocking map
      CL_MAP_READ,
      0,
      oK * oM * sizeof(Dtype),
      0, 0, 0,
      &err
  );
  SAMPLE_CHECK_ERRORS(err);

  std::cout << "[verify copy] " << std::endl;
  for(size_t i = 0; i < oK; i++) {
    for(size_t j = 0; j < oM; j++) {
      if(i < K and j < M) {
        assertEq(((Dtype*)host_copy_buffer)[j * oK + i], A[j * K + i]);
      }else{
        assertEq(((Dtype*)host_copy_buffer)[j * oK + i], (Dtype)0.);
      }
    }
  }
#endif

  Dtype* trans_buffer = (Dtype*)transpose_ptr;

  customcl_transpose_matrix(ctx_id,
      B, K, N,
      trans_buffer, oN, oK);

#if CUSTOM_GEMM_VERIFICATION == true
  clEnqueueMapBuffer(
      queue,
      (cl_mem) trans_buffer,
      CL_TRUE,    // blocking map
      CL_MAP_READ,
      0,
      N * K * sizeof(Dtype),
      0, 0, 0,
      &err
  );
  SAMPLE_CHECK_ERRORS(err);

  std::cout << "[verifying] " << std::endl;
  for(size_t i = 0; i < N; i++) {
    for(size_t j = 0; j < K; j++) {
      if(B[i * K + j] != host_trans_buffer[j * oN + i]) {
        throw "verifcation failed";
      }
    }
  }
#endif


  cl_kernel kernel = gemm_exec.handle().get();

  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &copy_buffer);
  SAMPLE_CHECK_ERRORS(err);

  err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &trans_buffer);
  SAMPLE_CHECK_ERRORS(err);

  err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &result_ptr);
  SAMPLE_CHECK_ERRORS(err);

  err = clSetKernelArg(kernel, 3, sizeof(int), &oM);
  SAMPLE_CHECK_ERRORS(err);

  err = clSetKernelArg(kernel, 4, sizeof(int), &oK);
  SAMPLE_CHECK_ERRORS(err);

  err = clSetKernelArg(kernel, 5, sizeof(int), &oN);
  SAMPLE_CHECK_ERRORS(err);

  size_t local_size[2] = {16, 16};
  size_t global_size[2] = {oM / 2, oN / 2};

  err = clEnqueueNDRangeKernel(
      queue,
      kernel,
      2,
      0,
      global_size,
      local_size,
      0, 0, NULL
  );
  SAMPLE_CHECK_ERRORS(err);

  err = clFinish(queue);
  SAMPLE_CHECK_ERRORS(err);

  // copy to output mem.
  customcl_copy_matrix(ctx_id,
      (Dtype*)result_ptr, oN, oM,
      C, N, M);
}