Example #1
bool run_add() {
  
  constexpr size_t N = 64;
  std::vector<T> host_input(N);
  std::vector<T> host_expected(N);
  for (size_t i = 0; i < N; ++i) {
    host_input[i] = (T)i;
    host_expected[i] = host_input[i] + host_input[i];
  }

  T* input1;
  hipMalloc(&input1, N * sizeof(T));
  hipMemcpy(input1, host_input.data(), host_input.size()*sizeof(T), hipMemcpyHostToDevice);


  T* input2;
  hipMalloc(&input2, N * sizeof(T));
  hipMemcpy(input2, host_input.data(), host_input.size()*sizeof(T), hipMemcpyHostToDevice);


  // Single-work-item launch: the kernel itself is expected to loop over all N elements.
  constexpr unsigned int blocks = 1;
  constexpr unsigned int threads_per_block = 1;
  hipLaunchKernelGGL(add<T>, dim3(blocks), dim3(threads_per_block), 0, 0, input1, input2, N);

  hipMemcpy(host_input.data(), input1, host_input.size()*sizeof(T), hipMemcpyDeviceToHost);

  bool equal = true;
  for (size_t i = 0; i < N; i++) {
    equal &= (host_input[i] == host_expected[i]);
  }

  hipFree(input1);
  hipFree(input2);
  return equal;
}
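// The add<T> kernel is not shown in this example. A minimal sketch consistent with the
// host code above (single-thread launch, result written back into the first operand);
// this is an assumption, not the original kernel:
template <typename T>
__global__ void add(T* in_out, const T* in2, size_t n) {
    // One thread walks the whole array, matching the dim3(1)/dim3(1) launch.
    for (size_t i = 0; i < n; ++i) {
        in_out[i] = in_out[i] + in2[i];
    }
}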
int test_gl2(size_t N) {

    size_t Nbytes = N*sizeof(int);

    int *A_d, *B_d, *C_d;
    int *A_h, *B_h, *C_h;

    HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N);



    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);


    // Full vadd in one large chunk, to get things started:
    HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
    HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));

    hipLaunchKernel(vectorADD2, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);

    HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));

    HIPCHECK (hipDeviceSynchronize());

    HipTest::checkVectorADD(A_h, B_h, C_h, N);

    HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);

    return 0;

}
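// vectorADD2 is defined elsewhere in the original test. A plausible grid-stride version,
// written against current HIP and assuming int elements as in test_gl2 (sketch only):
__global__ void vectorADD2(const int* A, const int* B, int* C, size_t N) {
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = blockDim.x * gridDim.x;
    for (; i < N; i += stride) {
        C[i] = A[i] + B[i];
    }
}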
Example #3
bool run_rint() {
    double *A, *Ad;
    double *B, *Bd;
    A = new double[N];
    B = new double[N];
    for (int i = 0; i < N; i++) {
        A[i] = 1.345;
    }
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernelGGL(test_rint, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    int passed = 1;
    for (int i = 0; i < N; i++) {
        // Use the host rint() as the reference so the check matches the device
        // function under test (round() differs for halfway cases).
        double x = rint(A[i]);
        if (B[i] != x) {
            passed = 0;
        }
    }

    delete[] A;
    delete[] B;
    hipFree(Ad);
    hipFree(Bd);

    assert(passed == 1);
    return passed == 1;
}
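// test_rint is not shown; one thread per element applying the device rint(), which is
// what the host-side check above expects (a sketch, assuming the dim3(N) launch above):
__global__ void test_rint(double* A, double* B) {
    int tid = threadIdx.x;
    B[tid] = rint(A[tid]);
}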
int main(){
    int A=0, *Ad;
    // Only a single int is exchanged with the device.
    hipMalloc((void**)&Ad, sizeof(int));
    hipMemcpy(Ad, &A, sizeof(int), hipMemcpyHostToDevice);
    hipLaunchKernel(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, 0, Ad);
    hipMemcpy(&A, Ad, sizeof(int), hipMemcpyDeviceToHost);
    hipFree(Ad);
}
Example #5
int main(int argc, char *argv[])
{ int warpSize, pshift;
  hipDeviceProp_t devProp;
  hipGetDeviceProperties(&devProp, 0);
  if (strncmp(devProp.name, "Fiji", 4) == 0) { warpSize = 64; pshift = 6; }
  else { warpSize = 32; pshift = 5; }
  unsigned int Num_Threads_per_Block      = 512;
  unsigned int Num_Blocks_per_Grid        = 1;
  unsigned int Num_Warps_per_Block        = Num_Threads_per_Block/warpSize;
  unsigned int Num_Warps_per_Grid         = (Num_Threads_per_Block*Num_Blocks_per_Grid)/warpSize;
  unsigned int* host_ballot = (unsigned int*)malloc(Num_Warps_per_Grid*sizeof(unsigned int));
  unsigned int* device_ballot; 
  HIP_ASSERT(hipMalloc((void**)&device_ballot, Num_Warps_per_Grid*sizeof(unsigned int)));
  int divergent_count =0;
  for (int i=0; i<Num_Warps_per_Grid; i++) host_ballot[i] = 0;

 
  HIP_ASSERT(hipMemcpy(device_ballot, host_ballot, Num_Warps_per_Grid*sizeof(unsigned int), hipMemcpyHostToDevice));

  hipLaunchKernel(gpu_ballot, dim3(Num_Blocks_per_Grid),dim3(Num_Threads_per_Block),0,0, device_ballot,Num_Warps_per_Block,pshift);


  HIP_ASSERT(hipMemcpy(host_ballot, device_ballot, Num_Warps_per_Grid*sizeof(unsigned int), hipMemcpyDeviceToHost));
  for (int i = 0; i < Num_Warps_per_Grid; i++) {
     if ((host_ballot[i] == 0) || (host_ballot[i] / warpSize == warpSize))
        std::cout << "Warp " << i << " is convergent - predicate true for "
                  << host_ballot[i] / warpSize << " threads\n";
     else {
        std::cout << "Warp " << i << " is divergent - predicate true for "
                  << host_ballot[i] / warpSize << " threads\n";
        divergent_count++;
     }
  }

  free(host_ballot);
  HIP_ASSERT(hipFree(device_ballot));

  if (divergent_count == 1) printf("PASSED\n"); else printf("FAILED\n");
  return EXIT_SUCCESS;

}
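// The gpu_ballot kernel is not shown in this example. The sketch below is one plausible
// shape, not the original: each thread adds the popcount of its warp's ballot mask, so a
// fully convergent warp accumulates warpSize * warpSize, matching the host-side check.
// The predicate (tid - 245 > 0) is illustrative only.
__global__ void gpu_ballot(unsigned int* device_ballot, int Num_Warps_per_Block, int pshift) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int warp_num = threadIdx.x >> pshift;
    atomicAdd(&device_ballot[blockIdx.x * Num_Warps_per_Block + warp_num],
              __popcll(__ballot(tid - 245 > 0)));
}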
bool run_lround() {
    double *A, *Ad;
    long int *B, *Bd;
    A = new double[N];
    B = new long int[N];
    for (int i = 0; i < N; i++) {
        A[i] = 1.345;
    }
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, N * sizeof(long int));
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernel(test_lround, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, N * sizeof(long int), hipMemcpyDeviceToHost);
    int passed = 1;
    for (int i = 0; i < N; i++) {
        // lround() is the host counterpart of the device function under test.
        long int x = lround(A[i]);
        if (B[i] != x) {
            passed = 0;
        }
    }
    delete[] A;
    delete[] B;
    hipFree(Ad);
    hipFree(Bd);
    assert(passed == 1);
    return passed == 1;
}
bool run_rnorm() {
    double *A, *Ad, *B, *Bd;
    A = new double[N];
    B = new double[N];
    double val = 0.0;
    for (int i = 0; i < N; i++) {
        A[i] = 1.0;
        B[i] = 0.0;
        val += 1.0;
    }
    val = 1 / sqrt(val);  // expected rnorm of a vector of N ones
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernel(test_rnorm, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    int passed = (fabs(B[0] - val) < 0.000001);
    delete[] A;
    delete[] B;
    hipFree(Ad);
    hipFree(Bd);
    assert(passed == 1);
    return passed == 1;
}
bool run_rnorm3d() {
    double *A, *Ad, *B, *Bd, *C, *Cd, *D, *Dd;
    A = new double[N];
    B = new double[N];
    C = new double[N];
    D = new double[N];
    double val = 0.0;
    for (int i = 0; i < N; i++) {
        A[i] = 1.0;
        B[i] = 2.0;
        C[i] = 3.0;
    }
    val = 1 / sqrt(1.0 + 4.0 + 9.0);  // expected rnorm3d(1.0, 2.0, 3.0)
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMalloc((void**)&Cd, SIZE);
    hipMalloc((void**)&Dd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernel(test_rnorm3d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
    hipMemcpy(D, Dd, SIZE, hipMemcpyDeviceToHost);
    int passed = 1;
    for (int i = 0; i < N; i++) {
        if (fabs(D[i] - val) > 0.000001) {
            passed = 0;
        }
    }
    delete[] A;
    delete[] B;
    delete[] C;
    delete[] D;
    hipFree(Ad);
    hipFree(Bd);
    hipFree(Cd);
    hipFree(Dd);
    assert(passed == 1);
    return passed == 1;
}
bool run_erfinv() {
    double *A, *Ad, *B, *Bd;
    A = new double[N];
    B = new double[N];
    for (int i = 0; i < N; i++) {
        A[i] = -0.6;
        B[i] = 0.0;
    }
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernel(test_erfinv, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    int passed = 1;
    for (int i = 0; i < N; i++) {
        // B is expected to reproduce A (e.g. an erfinv/erf round trip on the device).
        if (fabs(B[i] - A[i]) > 0.000001) {
            passed = 0;
        }
    }
    delete[] A;
    delete[] B;
    hipFree(Ad);
    hipFree(Bd);
    assert(passed == 1);
    return passed == 1;
}
bool run_sincos() {
    double *A, *Ad, *B, *C, *Bd, *Cd;
    A = new double[N];
    B = new double[N];
    C = new double[N];
    for (int i = 0; i < N; i++) {
        A[i] = 1.0;
    }
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMalloc((void**)&Cd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernel(test_sincos, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
    int passed = 1;
    for (int i = 0; i < N; i++) {
        // Both the sine and the cosine output must match for every element.
        if (B[i] != sin(1.0) || C[i] != cos(1.0)) {
            passed = 0;
        }
    }
    delete[] A;
    delete[] B;
    delete[] C;
    hipFree(Ad);
    hipFree(Bd);
    hipFree(Cd);
    assert(passed == 1);
    return passed == 1;
}
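// test_sincos is not shown here; a minimal per-thread sketch using the double-precision
// device sincos() overload (an assumption consistent with the host-side checks):
__global__ void test_sincos(double* A, double* B, double* C) {
    int tid = threadIdx.x;
    sincos(A[tid], &B[tid], &C[tid]);
}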
void run(size_t size, hipStream_t stream1, hipStream_t stream2){
	float *Ah, *Bh, *Cd, *Dd, *Eh;
	float *Ahh, *Bhh, *Cdd, *Ddd, *Ehh;

	HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault));
	HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault));
	HIPCHECK(hipMalloc(&Cd, size));
	HIPCHECK(hipMalloc(&Dd, size));
	HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault));
	HIPCHECK(hipHostMalloc((void**)&Ahh, size, hipHostMallocDefault));
	HIPCHECK(hipHostMalloc((void**)&Bhh, size, hipHostMallocDefault));
	HIPCHECK(hipMalloc(&Cdd, size));
	HIPCHECK(hipMalloc(&Ddd, size));
	HIPCHECK(hipHostMalloc((void**)&Ehh, size, hipHostMallocDefault));

	HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream1));
	HIPCHECK(hipMemcpyAsync(Bhh, Ahh, size, hipMemcpyHostToHost, stream2));
	HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream1));
	HIPCHECK(hipMemcpyAsync(Cdd, Bhh, size, hipMemcpyHostToDevice, stream2));
	hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream1, Cd);
	hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream2, Cdd);
	HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream1));
	HIPCHECK(hipMemcpyAsync(Ddd, Cdd, size, hipMemcpyDeviceToDevice, stream2));
	HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream1));
	HIPCHECK(hipMemcpyAsync(Ehh, Ddd, size, hipMemcpyDeviceToHost, stream2));
	HIPCHECK(hipDeviceSynchronize());
	HIPASSERT(Eh[10] == Ah[10] + 1.0f);
	HIPASSERT(Ehh[10] == Ahh[10] + 1.0f);
}
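// Inc is defined elsewhere in the original test. Given the dim3(N/500)/dim3(500) launch
// and the Eh[10] == Ah[10] + 1.0f assertion, a plausible sketch (assumption) is a plain
// element-wise increment:
__global__ void Inc(float* data) {
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    data[tx] += 1.0f;
}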
int main(int argc, char *argv[])
{ int warpSize, pshift;
  hipDeviceProp_t devProp;
  hipGetDeviceProperties(&devProp, 0);
  if (strncmp(devProp.name, "Fiji", 4) == 0) {
    warpSize = 64;
    pshift = 6;
  }
  else { warpSize = 32; pshift = 5; }
  int anycount =0;
  int allcount =0;
  int Num_Threads_per_Block      = 1024;
  int Num_Blocks_per_Grid        = 1;
  int Num_Warps_per_Block        = Num_Threads_per_Block/warpSize;
  int Num_Warps_per_Grid         = (Num_Threads_per_Block*Num_Blocks_per_Grid)/warpSize;

  int * host_any  = ( int*)malloc(Num_Warps_per_Grid*sizeof(int));
  int * host_all  = ( int*)malloc(Num_Warps_per_Grid*sizeof(int));
  int *device_any;
  int *device_all;
  HIP_ASSERT(hipMalloc((void**)&device_any,Num_Warps_per_Grid*sizeof( int)));
  HIP_ASSERT(hipMalloc((void**)&device_all,Num_Warps_per_Grid*sizeof(int)));
for (int i=0; i<Num_Warps_per_Grid; i++)
{
	host_any[i] = 0;
	host_all[i] = 0;
}
  HIP_ASSERT(hipMemcpy(device_any, host_any, Num_Warps_per_Grid*sizeof(int), hipMemcpyHostToDevice));
  HIP_ASSERT(hipMemcpy(device_all, host_all, Num_Warps_per_Grid*sizeof(int), hipMemcpyHostToDevice));

  hipLaunchKernel(warpvote, dim3(Num_Blocks_per_Grid),dim3(Num_Threads_per_Block),0,0, device_any, device_all ,Num_Warps_per_Block,pshift);


  HIP_ASSERT(hipMemcpy(host_any, device_any, Num_Warps_per_Grid*sizeof(int), hipMemcpyDeviceToHost));
  HIP_ASSERT(hipMemcpy(host_all, device_all, Num_Warps_per_Grid*sizeof(int), hipMemcpyDeviceToHost));
  for (int i=0; i<Num_Warps_per_Grid; i++) {

    printf("warp no. %d __any = %d \n",i,host_any[i]);
    printf("warp no. %d __all = %d \n",i,host_all[i]);

    if (host_all[i]!=1) ++allcount;
#if defined (__HIP_PLATFORM_HCC__) &&  !defined ( NVCC_COMPAT )
    if (host_any[i]!=64) ++anycount;
#else
    if (host_any[i]!=1) ++anycount;
#endif
}

#if defined (__HIP_PLATFORM_HCC__) && !defined ( NVCC_COMPAT )
    if (anycount == 1 && allcount == 1) printf("PASSED\n"); else printf("FAILED\n");
#else
    if (anycount == 0 && allcount == 1) printf("PASSED\n"); else printf("FAILED\n");
#endif

  free(host_any);
  free(host_all);
  HIP_ASSERT(hipFree(device_any));
  HIP_ASSERT(hipFree(device_all));

  return EXIT_SUCCESS;

}
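// The warpvote kernel is not shown. The sketch below is one plausible shape, not the
// original: each thread stores its warp's __any()/__all() result in a per-warp slot.
// The predicate is illustrative, and the platform-dependent return value of __any() is
// what the #ifdef checks in main() account for.
__global__ void warpvote(int* device_any, int* device_all, int Num_Warps_per_Block, int pshift) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int warp_num = threadIdx.x >> pshift;
    device_any[blockIdx.x * Num_Warps_per_Block + warp_num] = __any(tid - 77);
    device_all[blockIdx.x * Num_Warps_per_Block + warp_num] = __all(tid - 77);
}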
int main() {
    hipLaunchKernelGGL(
        compileDoublePrecisionMathOnDevice,
        dim3(1, 1, 1),
        dim3(1, 1, 1),
        0,
        0,
        1);
    passed();
}
    void operator()(dim3 *grid_dim, dim3 *block_dim, int x, int y, int z)
    {
        if (y >= 4) {
            *block_dim = dim3(128, 4, 1);
        } else {
            *block_dim = dim3(512, 1, 1);
        }

        grid_dim->x = divide_and_round_up(x, block_dim->x);
        grid_dim->y = divide_and_round_up(y, block_dim->y);
        grid_dim->z = divide_and_round_up(z, block_dim->z);
    }
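// divide_and_round_up is assumed to be the usual ceiling-division helper used to cover
// x/y/z elements with the chosen block size; a sketch of what it presumably looks like:
inline unsigned int divide_and_round_up(unsigned int value, unsigned int divisor) {
    return (value + divisor - 1) / divisor;
}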
void BlockArrangement::ArrangePrefer3dLocality(dim3* grid, dim3* block,
                                               const uint3& volume_size)
{
    if (!grid || !block)
        return;

    int bw = 8;
    int bh = 8;
    int bd = 8;
    *block = dim3(bw, bh, bd);
    *grid = dim3((volume_size.x + bw - 1) / bw, (volume_size.y + bh - 1) / bh,
                 (volume_size.z + bd - 1) / bd);
}
Example #16
void runTest(int argc, char **argv)
{
    hipDeviceProp_t deviceProp;
    deviceProp.major = 0;
    deviceProp.minor = 0;
    int dev = 0;

    hipGetDeviceProperties(&deviceProp, dev);

    // Statistics about the GPU device
    printf("> GPU device has %d Multi-Processors, "
           "SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

    int version = (deviceProp.major * 0x10 + deviceProp.minor);

    unsigned int numThreads = 256;
    unsigned int numBlocks = 64;
    unsigned int numData = 11;
    unsigned int memSize = sizeof(int) * numData;

    //allocate mem for the result on host side
    int *hOData = (int *) malloc(memSize);

    //initialize the memory
    for (unsigned int i = 0; i < numData; i++)
        hOData[i] = 0;

    //To make the AND and XOR tests generate something other than 0...
    hOData[8] = hOData[10] = 0xff;

    // allocate device memory for result
    int *dOData;
    hipMalloc((void **) &dOData, memSize);
    // copy host memory to device to initialize to zero
    hipMemcpy(dOData, hOData, memSize,hipMemcpyHostToDevice);

    // execute the kernel
    hipLaunchKernel(testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData);

    //Copy result from device to host
    hipMemcpy(hOData,dOData, memSize,hipMemcpyDeviceToHost);

    // Compute reference solution
    testResult = computeGold(hOData, numThreads * numBlocks);

    // Cleanup memory
    free(hOData);
    hipFree(dOData);
}
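// testKernel is not listed here. In this kind of atomics sample each thread exercises one
// atomic operation per output slot; the reduced sketch below shows only a few of the
// eleven slots and is an illustration, not the original kernel:
__global__ void testKernel(int* g_odata) {
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
    atomicAdd(&g_odata[0], 10);           // arithmetic atomics
    atomicMax(&g_odata[3], tid);
    atomicAnd(&g_odata[8], 2 * tid + 7);  // bitwise atomics on the 0xff-seeded slots
    atomicXor(&g_odata[10], tid);
}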
Example #17
bool run_rnorm4d() {
    double *A, *Ad, *B, *Bd, *C, *Cd, *D, *Dd, *E, *Ed;
    A = new double[N];
    B = new double[N];
    C = new double[N];
    D = new double[N];
    E = new double[N];
    double val = 0.0;
    for (int i = 0; i < N; i++) {
        A[i] = 1.0;
        B[i] = 2.0;
        C[i] = 3.0;
        D[i] = 4.0;
    }
    val = 1 / sqrt(1.0 + 4.0 + 9.0 + 16.0);
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMalloc((void**)&Cd, SIZE);
    hipMalloc((void**)&Dd, SIZE);
    hipMalloc((void**)&Ed, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Dd, D, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernelGGL(test_rnorm4d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
    hipMemcpy(E, Ed, SIZE, hipMemcpyDeviceToHost);
    int passed = 1;
    for (int i = 0; i < N; i++) {
        if (fabs(E[i] - val) > 0.000001) {
            passed = 0;
        }
    }

    delete[] A;
    delete[] B;
    delete[] C;
    delete[] D;
    delete[] E;
    hipFree(Ad);
    hipFree(Bd);
    hipFree(Cd);
    hipFree(Dd);
    hipFree(Ed);

    assert(passed == 1);
    return passed == 1;
}
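// test_rnorm4d is not shown; a per-thread sketch built on the device rnorm4d() builtin,
// which matches the reference value computed above (an assumption, not the original):
__global__ void test_rnorm4d(double* A, double* B, double* C, double* D, double* E) {
    int tid = threadIdx.x;
    E[tid] = rnorm4d(A[tid], B[tid], C[tid], D[tid]);
}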
Example #18
void CUDARenderer::windowResize(int width, int height) 
{
	skipStep = true;
	h_info.setWidthHeight(width, height);
	camera.width = h_info.width;
	camera.height = h_info.height;
	setImageInfo(h_info);
	dimBlock = dim3(THREAD_DIM, THREAD_DIM, 1);
	dimGrid = dim3(h_info.width / dimBlock.x + (h_info.width % dimBlock.x > 0), h_info.height / dimBlock.y + (h_info.height % dimBlock.y > 0), 1);

	deleteTexture();
	OGLtexture = Texture2D(GL_TEXTURE_2D);
	initTexture();
}
Example #19
void CUDARenderer::initCUDA()
{
	gpuErrchk(cudaSetDevice(0));
	gpuErrchk(cudaGLSetGLDevice(0));
	gpuErrchk(cudaMalloc((void **)&d_scene, sizeof(SlowScene)));
	gpuErrchk(cudaMemcpy(d_scene, &h_scene, sizeof(SlowScene), cudaMemcpyHostToDevice));

	setImageInfo(h_info);
	gpuErrchk(cudaDeviceSetLimit(cudaLimitStackSize, 1024 * 10));  // runtime-API call, so gpuErrchk receives a cudaError_t

	dimBlock = dim3(THREAD_DIM, THREAD_DIM, 1);
	dimGrid = dim3(h_info.width / dimBlock.x + (h_info.width % dimBlock.x > 0), h_info.height / dimBlock.y + (h_info.height % dimBlock.y > 0), 1);

	snapshot.init();
}
Example #20
int main() {
    size_t Nbytes = N * sizeof(int);
    int numDevices = 0;
    int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d;
    int *A_h, *B_h, *C_h;
    hipStream_t s;

    HIPCHECK(hipGetDeviceCount(&numDevices));
    if (numDevices > 1) {
        HIPCHECK(hipSetDevice(0));
        unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
        HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
        HIPCHECK(hipSetDevice(1));
        HIPCHECK(hipMalloc(&X_d, Nbytes));
        HIPCHECK(hipMalloc(&Y_d, Nbytes));
        HIPCHECK(hipMalloc(&Z_d, Nbytes));


        HIPCHECK(hipSetDevice(0));
        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
        HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
        HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
        HipTest::checkVectorADD(A_h, B_h, C_h, N);

        HIPCHECK(hipSetDevice(1));
        HIPCHECK(hipStreamCreate(&s));
        HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes, s));
        HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes, s));

        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
        HIPCHECK(hipMemcpyDtoHAsync(C_h, (hipDeviceptr_t)Z_d, Nbytes, s));
        HIPCHECK(hipStreamSynchronize(s));
        HIPCHECK(hipDeviceSynchronize());

        HipTest::checkVectorADD(A_h, B_h, C_h, N);
        HIPCHECK(hipStreamDestroy(s));
        HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
        HIPCHECK(hipFree(X_d));
        HIPCHECK(hipFree(Y_d));
        HIPCHECK(hipFree(Z_d));
    }

    passed();
}
/**
 * Override
 */
Animable_I<uchar>* VagueGrayProvider::createAnimable()
    {
    // Animation;
    float dt = 2 * PI / 1000;

    // Dimension
    int dw = 16 * 60 * 2;
    int dh = 16 * 60;

    // CUDA grid
    dim3 dg = dim3(32, 2, 1);   // to be tuned; depends on the GPU
    dim3 db = dim3(16, 16, 1);  // to be tuned; depends on the GPU
    Grid grid(dg,db);

    return new VagueGray(grid,dw, dh, dt);
    }
int main()
{
	float *A, *Ad;
	A = (float*)malloc(size);  // host buffer must be allocated before it is filled (size is assumed to be len * sizeof(float))
	for(int i=0;i<len;i++)
	{
		A[i] = 1.0f;
	}
	Ad = (float*)mallocHip(size);
	memcpyHipH2D(Ad, A, size);
	hipLaunchKernel(HIP_KERNEL_NAME(Kern), dim3(len/1024), dim3(1024), 0, 0, Ad);
	memcpyHipD2H(A, Ad, size);
	for(int i=0;i<len;i++)
	{
		assert(A[i] == 2.0f);
	}
	free(A);
}
Example #23
        __host__ __device__
        static void supported_path(unsigned int num_blocks, unsigned int block_size, size_t num_dynamic_smem_bytes, cudaStream_t stream, task_type task)
        {
#if __BULK_HAS_CUDART__
#  ifndef __CUDA_ARCH__
          cudaConfigureCall(dim3(num_blocks), dim3(block_size), num_dynamic_smem_bytes, stream);
          cudaSetupArgument(task, 0);
          bulk::detail::throw_on_error(cudaLaunch(super_t::global_function_pointer()), "after cudaLaunch in triple_chevron_launcher::launch()");
#  else
          void *param_buffer = cudaGetParameterBuffer(alignment_of<task_type>::value, sizeof(task_type));
          std::memcpy(param_buffer, &task, sizeof(task_type));
          bulk::detail::throw_on_error(cudaLaunchDevice(reinterpret_cast<void*>(super_t::global_function_pointer()), param_buffer, dim3(num_blocks), dim3(block_size), num_dynamic_smem_bytes, stream),
                                       "after cudaLaunchDevice in triple_chevron_launcher::launch()");
#  endif // __CUDA_ARCH__
#endif // __BULK_HAS_CUDART__
        }
Example #24
int main(int argc, char *argv[])
{
	float *A_d, *C_d;
	float *A_h, *C_h;
	size_t N = 1000000;
	size_t Nbytes = N * sizeof(float);

	hipDeviceProp_t props;
	CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/));
	printf ("info: running on device %s\n", props.name);

	printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
	A_h = (float*)malloc(Nbytes);
	CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
	C_h = (float*)malloc(Nbytes);
	CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
	// Fill with Phi + i
    for (size_t i=0; i<N; i++)
	{
		A_h[i] = 1.618f + i;
	}

	printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
	CHECK(hipMalloc(&A_d, Nbytes));
	CHECK(hipMalloc(&C_d, Nbytes));


	printf ("info: copy Host2Device\n");
    CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));

	const unsigned blocks = 512;
	const unsigned threadsPerBlock = 256;

	printf ("info: launch 'vector_square' kernel\n");
	hipLaunchKernel(HIP_KERNEL_NAME(vector_square), dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);

	printf ("info: copy Device2Host\n");
    CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));

	printf ("info: check result\n");
    for (size_t i=0; i<N; i++)  {
		if (C_h[i] != A_h[i] * A_h[i]) {
			CHECK(hipErrorUnknown);
		}
	}
	printf ("PASSED!\n");
}
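// vector_square is defined elsewhere in the sample. A grid-stride sketch consistent with
// the C_h[i] == A_h[i] * A_h[i] check above (assumption):
__global__ void vector_square(float* C_d, const float* A_d, size_t N) {
    size_t offset = blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = blockDim.x * gridDim.x;
    for (size_t i = offset; i < N; i += stride) {
        C_d[i] = A_d[i] * A_d[i];
    }
}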
void BlockArrangement::ArrangeLinear(dim3* grid, dim3* block,
                                     int num_of_elements)
{
    if (!grid || !block)
        return;

    int max_threads = dev_prop_->maxThreadsPerBlock;
    int num_of_blocks = (num_of_elements + max_threads - 1) / max_threads;
    num_of_blocks = std::max(1, num_of_blocks);

    int num_of_threads = max_threads;
    if (num_of_blocks == 1)
        num_of_threads = std::max(1, num_of_elements);

    *block = dim3(num_of_threads, 1, 1);
    *grid = dim3(num_of_blocks, 1, 1);
}
Example #26
void runbench(double *cd, long size){
	if( memory_ratio>UNROLL_ITERATIONS ){
		fprintf(stderr, "ERROR: memory_ratio exceeds UNROLL_ITERATIONS\n");
		exit(1);
	}

	const long compute_grid_size = size/(UNROLLED_MEMORY_ACCESSES)/2;
	const int BLOCK_SIZE = 256;
	const int TOTAL_BLOCKS = compute_grid_size/BLOCK_SIZE;
	const long long computations = 2*(long long)(COMP_ITERATIONS)*REGBLOCK_SIZE*compute_grid_size;
	const long long memoryoperations = (long long)(COMP_ITERATIONS)*compute_grid_size;

	dim3 dimBlock(BLOCK_SIZE, 1, 1);
    dim3 dimGrid(TOTAL_BLOCKS, 1, 1);
	hipEvent_t start, stop;

	initializeEvents(&start, &stop);
	hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< float, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1.0f, (float*)cd);
	float kernel_time_mad_sp = finalizeEvents(start, stop);

	initializeEvents(&start, &stop);
	hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< double, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1.0, cd);
	float kernel_time_mad_dp = finalizeEvents(start, stop);

	initializeEvents(&start, &stop);
	hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< int, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1, (int*)cd);
	float kernel_time_mad_int = finalizeEvents(start, stop);

	const double memaccesses_ratio = (double)(memory_ratio)/UNROLL_ITERATIONS;
	const double computations_ratio = 1.0-memaccesses_ratio;

	printf("         %4d,   %8.3f,%8.2f,%8.2f,%7.2f,   %8.3f,%8.2f,%8.2f,%7.2f,  %8.3f,%8.2f,%8.2f,%7.2f\n",
		UNROLL_ITERATIONS-memory_ratio,
		(computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(float)),
		kernel_time_mad_sp,
		(computations_ratio*(double)computations)/kernel_time_mad_sp*1000./(double)(1000*1000*1000),
		(memaccesses_ratio*(double)memoryoperations*sizeof(float))/kernel_time_mad_sp*1000./(1000.*1000.*1000.),
		(computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(double)),
		kernel_time_mad_dp,
		(computations_ratio*(double)computations)/kernel_time_mad_dp*1000./(double)(1000*1000*1000),
		(memaccesses_ratio*(double)memoryoperations*sizeof(double))/kernel_time_mad_dp*1000./(1000.*1000.*1000.),
		(computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(int)),
		kernel_time_mad_int,
		(computations_ratio*(double)computations)/kernel_time_mad_int*1000./(double)(1000*1000*1000),
		(memaccesses_ratio*(double)memoryoperations*sizeof(int))/kernel_time_mad_int*1000./(1000.*1000.*1000.) );
}
void BlockArrangement::ArrangeRowScan(dim3* grid, dim3* block, 
                                      const uint3& volume_size)
{
    if (!grid || !block || !volume_size.x)
        return;

    int max_threads = dev_prop_->maxThreadsPerBlock >> 1;
    int bw = volume_size.x;
    int bh = std::min(max_threads / bw, static_cast<int>(volume_size.y));
    int bd = std::min(max_threads / bw / bh, static_cast<int>(volume_size.z));
    if (!bh || !bd)
        return;

    *block = dim3(bw, bh, bd);
    *grid = dim3((volume_size.x + bw - 1) / bw, (volume_size.y + bh - 1) / bh,
                 (volume_size.z + bd - 1) / bd);
}
Example #28
int main(){
    setup();
    int *A, *Ad;
    for(int i=0;i<NUM_SIZE;i++){
        A = (int*)malloc(size[i]);
        valSet(A, 1, size[i]);
        hipMalloc(&Ad, size[i]);
        std::cout<<"Malloc success at size: "<<size[i]<<std::endl;

        for(int j=0;j<NUM_ITER;j++){
            std::cout<<"Iter: "<<j<<std::endl;
            hipMemcpy(Ad, A, size[i], hipMemcpyHostToDevice);
            hipLaunchKernel(Add, dim3(1), dim3(size[i]/sizeof(int)), 0, 0, Ad);
            hipMemcpy(A, Ad, size[i], hipMemcpyDeviceToHost);
        }

        hipDeviceSynchronize();
        hipFree(Ad);
        free(A);
    }
}
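// Add, setup(), valSet() and the size[] table are defined elsewhere in the original test.
// A minimal sketch of the kernel implied by the one-block launch above (an assumption:
// each thread increments its own element):
__global__ void Add(int* Ad) {
    int tx = threadIdx.x;
    Ad[tx] += 1;
}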
Example #29
int main(){
    float *A, *Ad, *B, *Bd, *C, *Cd;
    A = new float[LEN];
    B = new float[LEN];
    C = new float[LEN];
    for(uint32_t i=0;i<LEN;i++){
        A[i] = i*1.0f;
        B[i] = i*1.0f;
        C[i] = i*1.0f;
    }

    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMalloc((void**)&Cd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernel(getSqAbs, dim3(1), dim3(LEN), 0, 0, Ad, Bd, Cd);
    hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
    std::cout<<A[11]<<" "<<B[11]<<" "<<C[11]<<std::endl;

    delete[] A;
    delete[] B;
    delete[] C;
    hipFree(Ad);
    hipFree(Bd);
    hipFree(Cd);
}
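// getSqAbs is not shown. Reading A and B as the real and imaginary parts of a complex
// value (squared absolute value) is one plausible interpretation of the name; this is an
// assumption, not the original kernel:
__global__ void getSqAbs(float* A, float* B, float* C) {
    int tx = threadIdx.x;
    C[tx] = A[tx] * A[tx] + B[tx] * B[tx];
}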
void BlockArrangement::ArrangeSequential(dim3* grid, dim3* block,
                                         const uint3& volume_size)
{
    if (!block || !grid || !volume_size.x)
        return;

    int max_threads = dev_prop_->maxThreadsPerBlock;
    int bw = max_threads;
    int bh = 1;
    int bd = 1;

    int elements = volume_size.x * volume_size.y * volume_size.z;
    int blocks = std::max(1, elements / (bw * 2));

    const int kMaxBlocks = 64;
    blocks = std::min(kMaxBlocks, blocks);

    *block = dim3(bw, bh, bd);
    *grid = dim3(blocks, 1, 1);
}