// Feeds n pairs of random 32-bit ints to the "compiler_bitcast_int2_to_long"
// kernel and checks that every 64-bit output element has exactly the same
// bytes as the corresponding input pair.
// NOTE(review): presumably the kernel bitcasts each int2 to a long - confirm
// against the kernel source.
void compiler_bitcast_int2_to_long(void)
{
  const size_t n = 64;  // number of work items, one 64-bit result each
  const int v = 2;      // 32-bit ints that make up one 64-bit value
  int src[n * v];

  // Setup kernel and buffers
  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_bitcast", "compiler_bitcast_int2_to_long");
  OCL_CREATE_BUFFER(buf[0], 0, sizeof(src), NULL);
  OCL_CREATE_BUFFER(buf[1], 0, sizeof(src), NULL);
  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
  globals[0] = n;
  locals[0] = 16;

  // Random input data
  for (size_t i = 0; i < n * v; ++i) {
    src[i] = (int)rand();
  }

  OCL_MAP_BUFFER(0);
  memcpy(buf_data[0], src, sizeof(src));
  OCL_UNMAP_BUFFER(0);

  // Run the kernel on GPU
  OCL_NDRANGE(1);

  // Compare
  OCL_MAP_BUFFER(1);
  const uint64_t *result = (const uint64_t *)buf_data[1];
  for (size_t i = 0; i < n; ++i) {
    // Build the reference with memcpy: reading the int array through a
    // uint64_t pointer would break strict aliasing and could be misaligned.
    // Assumes sizeof(uint64_t) == v * sizeof(int).
    uint64_t expected;
    memcpy(&expected, &src[i * v], sizeof(expected));
    OCL_ASSERT(result[i] == expected);
    //printf("ref is 0x%lx, result is 0x%lx\n", expected, result[i]);
  }
  OCL_UNMAP_BUFFER(1);
}
static void test_exec(const char* kernel_name)
{
  const size_t n = 160;

  // Setup kernel and buffers
  OCL_CREATE_KERNEL_FROM_FILE("compiler_basic_arithmetic", kernel_name);
std::cout <<"kernel name: " << kernel_name << std::endl;
  buf_data[0] = (T*) malloc(sizeof(T) * n);
  buf_data[1] = (T*) malloc(sizeof(T) * n);
  for (uint32_t i = 0; i < n; ++i) ((T*)buf_data[0])[i] = (T) rand();
  for (uint32_t i = 0; i < n; ++i) ((T*)buf_data[1])[i] = (T) rand();
  if(op == TEST_OP_DIV || op == TEST_OP_REM) {
    for (uint32_t i = 0; i < n; ++i) {
      if(((T*)buf_data[1])[i] == 0)
       ((T*)buf_data[1])[i] = (T) 1;
    }
  }
  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(T), buf_data[0]);
  OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, n * sizeof(T), buf_data[1]);
  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(T), NULL);

  // Run the kernel
  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
  globals[0] = n;
  locals[0] = 16;
  OCL_NDRANGE(1);

  // Check result
  OCL_MAP_BUFFER(2);
  if(op == TEST_OP_SUB) {
    for (uint32_t i = 0; i < n; ++i)
      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] - ((T*)buf_data[1])[i]));
  } else if(op == TEST_OP_ADD) {
    for (uint32_t i = 0; i < n; ++i)
      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] + ((T*)buf_data[1])[i]));
  } else if(op == TEST_OP_MUL) {
    for (uint32_t i = 0; i < n; ++i)
      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] * ((T*)buf_data[1])[i]));
  } else if(op == TEST_OP_DIV) {
    for (uint32_t i = 0; i < n; ++i)
      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] / ((T*)buf_data[1])[i]));
  } else {
    for (uint32_t i = 0; i < n; ++i)
      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] % ((T*)buf_data[1])[i]));
  }
  free(buf_data[0]);
  free(buf_data[1]);
  buf_data[0] = buf_data[1] = NULL;
}
// Launches a single work item of the "compiler_local_slm1" kernel and checks
// that the first two 64-bit values in the output buffer differ by exactly 4.
// NOTE(review): presumably these values are addresses/offsets of two local
// (SLM) variables - confirm against the kernel source.
void compiler_local_slm1(void)
{
  const size_t slot_count = 2;  // the check below reads two uint64_t slots

  // Kernel and its only argument: the output buffer.
  OCL_CREATE_KERNEL_FROM_FILE("compiler_local_slm", "compiler_local_slm1");
  OCL_CREATE_BUFFER(buf[0], 0, sizeof(uint64_t) * slot_count, NULL);
  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);

  // One work group containing one work item.
  locals[0] = 1;
  globals[0] = 1;
  OCL_NDRANGE(1);

  // Verify the distance between the two reported values.
  OCL_MAP_BUFFER(0);
  uint64_t * ptr = (uint64_t*)buf_data[0];
  OCL_ASSERT((ptr[1] -ptr[0])  == 4);
  OCL_UNMAP_BUFFER(0);
}
// Runs the "compiler_local_slm" kernel over 32 work items in groups of 16 and
// checks that output element i equals (i % 16) + 3 + (i / 16).
void compiler_local_slm(void)
{
  const size_t item_count = 32;

  // Kernel and its only argument: the output buffer.
  OCL_CREATE_KERNEL_FROM_FILE("compiler_local_slm", "compiler_local_slm");
  OCL_CREATE_BUFFER(buf[0], 0, sizeof(uint32_t) * item_count, NULL);
  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);

  // The constant 16 in the expected value matches this work-group size.
  locals[0] = 16;
  globals[0] = item_count;
  OCL_NDRANGE(1);

  OCL_MAP_BUFFER(0);
  uint32_t i = 0;
  while (i < item_count) {
    // Debug aid: std::cout << ((int32_t*)buf_data[0])[i] << std::endl;
    OCL_ASSERT(((int32_t*)buf_data[0])[i] == (i%16 + 2 + 1+ i/16));
    ++i;
  }
  OCL_UNMAP_BUFFER(0);
}
// Convert 64-bit integers to shorter integers.
// Uploads the values 0, -1, ..., -(n-1) as 64-bit inputs, runs the
// "compiler_long_convert_2" kernel and checks that the 8-bit, 16-bit and
// 32-bit output buffers all hold the same values.
void compiler_long_convert_2(void)
{
  const size_t n = 16;
  int64_t src[n];

  // Setup kernel and buffers
  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_convert", "compiler_long_convert_2");
  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(char), NULL);
  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(short), NULL);
  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(int64_t), NULL);
  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
  globals[0] = n;
  locals[0] = 16;

  // Deterministic inputs: small non-positive values that fit every target
  // type.
  for (int32_t i = 0; i < (int32_t) n; ++i) {
    src[i] = -i;
  }
  OCL_MAP_BUFFER(3);
  memcpy(buf_data[3], src, sizeof(src));
  OCL_UNMAP_BUFFER(3);

  // Run the kernel on GPU
  OCL_NDRANGE(1);

  // Compare
  OCL_MAP_BUFFER(0);
  OCL_MAP_BUFFER(1);
  OCL_MAP_BUFFER(2);
  // Read the 8-bit results as signed char: the signedness of plain char is
  // implementation-defined, and with an unsigned char the comparison against
  // -i would fail for every i > 0.
  const signed char *dst1 = (const signed char *)buf_data[0];
  const short *dst2 = (const short *)buf_data[1];
  const int *dst3 = (const int *)buf_data[2];
  for (int32_t i = 0; i < (int32_t) n; ++i) {
    //printf("%x %x %x\n", dst1[i], dst2[i], dst3[i]);
    OCL_ASSERT(dst1[i] == -i);
    OCL_ASSERT(dst2[i] == -i);
    OCL_ASSERT(dst3[i] == -i);
  }
  OCL_UNMAP_BUFFER(0);
  OCL_UNMAP_BUFFER(1);
  OCL_UNMAP_BUFFER(2);
}
// Convert 64-bit integers to 32-bit floats.
// Uploads the values 0, -1, ..., -(count-1) as 64-bit inputs, runs the
// "compiler_long_convert_to_float" kernel and checks that each float output
// compares equal to the corresponding input.
void compiler_long_convert_to_float(void)
{
  const size_t count = 16;
  int64_t src[count];

  // Kernel, output buffer (arg 0) and input buffer (arg 1).
  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_convert", "compiler_long_convert_to_float");
  OCL_CREATE_BUFFER(buf[0], 0, sizeof(float) * count, NULL);
  OCL_CREATE_BUFFER(buf[1], 0, sizeof(src), NULL);
  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
  locals[0] = 16;
  globals[0] = count;

  // Deterministic inputs: small non-positive integers.
  for (int32_t i = 0; i < (int32_t) count; ++i)
    src[i] = -(int64_t)i;

  // Upload the inputs.
  OCL_MAP_BUFFER(1);
  memcpy(buf_data[1], src, sizeof(src));
  OCL_UNMAP_BUFFER(1);

  // Run the kernel on GPU
  OCL_NDRANGE(1);

  // Compare. The input buffer is mapped here as well, although only the
  // host-side copy in src is read.
  OCL_MAP_BUFFER(0);
  OCL_MAP_BUFFER(1);
  float *dst = (float *)buf_data[0];
  int32_t i = 0;
  while (i < (int32_t) count) {
    //printf("%f\n", dst[i]);
    OCL_ASSERT(dst[i] == src[i]);
    ++i;
  }
  OCL_UNMAP_BUFFER(0);
  OCL_UNMAP_BUFFER(1);
}