bool run_rnorm(){
double *A, *Ad, *B, *Bd;
A = new double[N];
B = new double[N];
double val = 0.0;
for(int i=0;i<N;i++){
A[i] = 1.0;
B[i] = 0.0;
val += 1.0;
}
val = 1/sqrt(val);
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipLaunchKernel(test_rnorm, dim3(1), dim3(N), 0, 0, Ad, Bd);
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
int passed = 0;
for(int i=0;i<512;i++){
    if(B[0] - val < 0.000001){
        passed = 1;
    }
}
free(A);
if(passed == 1){
    return true;
}
assert(passed == 1);
return false;
}
bool run_erfinv(){
double *A, *Ad, *B, *Bd;
A = new double[N];
B = new double[N];
for(int i=0;i<N;i++){
A[i] = -0.6;
B[i] = 0.0;
}
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipLaunchKernel(test_erfinv, dim3(1), dim3(N), 0, 0, Ad, Bd);
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
int passed = 0;
for(int i=0;i<512;i++){
    if(B[i] - A[i] < 0.000001){
        passed = 1;
    }
}
free(A);
if(passed == 1){
    return true;
}
assert(passed == 1);
return false;
}
bool run_rnorm3d(){
double *A, *Ad, *B, *Bd, *C, *Cd, *D, *Dd;
A = new double[N];
B = new double[N];
C = new double[N];
D = new double[N];
double val = 0.0;
for(int i=0;i<N;i++){
A[i] = 1.0;
B[i] = 2.0;
C[i] = 3.0;
}
val = 1/sqrt(1.0 + 4.0 + 9.0);
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMalloc((void**)&Cd, SIZE);
hipMalloc((void**)&Dd, SIZE);
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
hipLaunchKernel(test_rnorm3d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
hipMemcpy(D, Dd, SIZE, hipMemcpyDeviceToHost);
int passed = 0;
for(int i=0;i<512;i++){
    if(D[i] - val < 0.000001){
        passed = 1;
    }
}
free(A);
if(passed == 1){
    return true;
}
assert(passed == 1);
return false;
}
bool run_sincos(){
double *A, *Ad, *B, *C, *Bd, *Cd;
A = new double[N];
B = new double[N];
C = new double[N];
for(int i=0;i<N;i++){
A[i] = 1.0;
}
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMalloc((void**)&Cd, SIZE);
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipLaunchKernel(test_sincos, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
int passed = 0;
for(int i=0;i<512;i++){
    if(B[i] == sin(1.0)){
        passed = 1;
    }
}
passed = 0;
for(int i=0;i<512;i++){
    if(C[i] == cos(1.0)){
        passed = 1;
    }
}
free(A);
if(passed == 1){
    return true;
}
assert(passed == 1);
return false;
}
int test_gl2(size_t N) {

    size_t Nbytes = N*sizeof(int);

    int *A_d, *B_d, *C_d;
    int *A_h, *B_h, *C_h;

    HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N);



    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);


    // Full vadd in one large chunk, to get things started:
    HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
    HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));

    hipLaunchKernel(vectorADD2, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);

    HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));

    HIPCHECK (hipDeviceSynchronize());

    HipTest::checkVectorADD(A_h, B_h, C_h, N);

    return 0;

}
int main(){
    int A=0, *Ad;
    hipMalloc((void**)&Ad, SIZE);
    hipMemcpy(Ad, &A, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernel(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, 0, Ad);
    hipMemcpy(&A, Ad, SIZE, hipMemcpyDeviceToHost);
}
Example #7
0
bool run_rint() {
    double *A, *Ad;
    double *B, *Bd;
    A = new double[N];
    B = new double[N];
    for (int i = 0; i < N; i++) {
        A[i] = 1.345;
    }
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernelGGL(test_rint, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
        double x = round(A[i]);
        if (B[i] == x) {
            passed = 1;
        }
    }

    delete[] A;
    delete[] B;
    hipFree(Ad);
    hipFree(Bd);

    if (passed == 1) {
        return true;
    }
    assert(passed == 1);
    return false;
}
Example #8
0
int main(int argc, char *argv[])
{ int warpSize, pshift;
  hipDeviceProp_t devProp;
  hipDeviceGetProperties(&devProp, 0);
  if(strncmp(devProp.name,"Fiji",1)==0)  {warpSize =64; pshift =6;}
  else {warpSize =32; pshift =5;}
  unsigned int Num_Threads_per_Block      = 512;
  unsigned int Num_Blocks_per_Grid        = 1;
  unsigned int Num_Warps_per_Block        = Num_Threads_per_Block/warpSize;
  unsigned int Num_Warps_per_Grid         = (Num_Threads_per_Block*Num_Blocks_per_Grid)/warpSize;
  unsigned int* host_ballot = (unsigned int*)malloc(Num_Warps_per_Grid*sizeof(unsigned int));
  unsigned int* device_ballot; 
  HIP_ASSERT(hipMalloc((void**)&device_ballot, Num_Warps_per_Grid*sizeof(unsigned int)));
  int divergent_count =0;
  for (int i=0; i<Num_Warps_per_Grid; i++) host_ballot[i] = 0;

 
  HIP_ASSERT(hipMemcpy(device_ballot, host_ballot, Num_Warps_per_Grid*sizeof(unsigned int), hipMemcpyHostToDevice));

  hipLaunchKernel(gpu_ballot, dim3(Num_Blocks_per_Grid),dim3(Num_Threads_per_Block),0,0, device_ballot,Num_Warps_per_Block,pshift);


  HIP_ASSERT(hipMemcpy(host_ballot, device_ballot, Num_Warps_per_Grid*sizeof(unsigned int), hipMemcpyDeviceToHost));
  for (int i=0; i<Num_Warps_per_Grid; i++) {

     if ((host_ballot[i] == 0)||(host_ballot[i]/warpSize == warpSize)) std::cout << "Warp " << i << " IS convergent- Predicate true for " << host_ballot[i]/warpSize << " threads\n";

     else {std::cout << "Warp " << i << " IS divergent - Predicate true for " << host_ballot[i]/warpSize<< " threads\n";
	  divergent_count++;}
}

if (divergent_count==1) printf("PASSED\n"); else printf("FAILED\n");
  return EXIT_SUCCESS;

}
bool run_lround(){
double *A, *Ad;
long int *B, *Bd;
A = new double[N];
B = new long int[N];
for(int i=0;i<N;i++){
A[i] = 1.345;
}
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, N*sizeof(long int));
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipLaunchKernel(test_lround, dim3(1), dim3(N), 0, 0, Ad, Bd);
hipMemcpy(B, Bd, N*sizeof(long int), hipMemcpyDeviceToHost);
int passed = 0;
for(int i=0;i<512;i++){
    long int x = round(A[i]);
    if(B[i] == x){
        passed = 1;
    }
}
free(A);
if(passed == 1){
    return true;
}
assert(passed == 1);
return false;
}
Example #10
0
bool run_add() {
  
  constexpr size_t N = 64;
  std::vector<T> host_input(N);
  std::vector<T> host_expected(N);
  for (int i = 0; i < N; ++i) {
    host_input[i] = (T)i;
    host_expected[i] = host_input[i] + host_input[i];
  }

  T* input1;
  hipMalloc(&input1, N * sizeof(T));
  hipMemcpy(input1, host_input.data(), host_input.size()*sizeof(T), hipMemcpyHostToDevice);


  T* input2;
  hipMalloc(&input2, N * sizeof(T));
  hipMemcpy(input2, host_input.data(), host_input.size()*sizeof(T), hipMemcpyHostToDevice);


  constexpr unsigned int blocks = 1;
  constexpr unsigned int threads_per_block = 1;
  hipLaunchKernelGGL(add<T>, dim3(blocks), dim3(threads_per_block), 0, 0, input1, input2, N);

  hipMemcpy(host_input.data(), input1, host_input.size()*sizeof(T), hipMemcpyDeviceToHost);

  bool equal = true;
  for (int i = 0; i < N; i++) {
    equal &= (host_input[i] == host_expected[i]);
  }
  return equal;
}
int main(){
    float *Ad, *A;
    hipHostMalloc((void**)&A, size);
    hipMalloc((void**)&Ad, size);
    assert(hipSuccess == hipMemcpy(Ad, A, size, hipMemcpyHostToDevice));
    assert(hipSuccess == hipMemcpy(A, Ad, size, hipMemcpyDeviceToHost));
    passed();
}
int main(int argc, char *argv[])
{ int warpSize, pshift;
  hipDeviceProp_t devProp;
  hipGetDeviceProperties(&devProp, 0);
  if(strncmp(devProp.name,"Fiji",1)==0)
{ warpSize =64;
  pshift =6;
}
  else {warpSize =32; pshift=5;}
  int anycount =0;
  int allcount =0;
  int Num_Threads_per_Block      = 1024;
  int Num_Blocks_per_Grid        = 1;
  int Num_Warps_per_Block        = Num_Threads_per_Block/warpSize;
  int Num_Warps_per_Grid         = (Num_Threads_per_Block*Num_Blocks_per_Grid)/warpSize;

  int * host_any  = ( int*)malloc(Num_Warps_per_Grid*sizeof(int));
  int * host_all  = ( int*)malloc(Num_Warps_per_Grid*sizeof(int));
  int *device_any;
  int *device_all;
  HIP_ASSERT(hipMalloc((void**)&device_any,Num_Warps_per_Grid*sizeof( int)));
  HIP_ASSERT(hipMalloc((void**)&device_all,Num_Warps_per_Grid*sizeof(int)));
for (int i=0; i<Num_Warps_per_Grid; i++)
{
	host_any[i] = 0;
	host_all[i] = 0;
}
  HIP_ASSERT(hipMemcpy(device_any, host_any,sizeof(int), hipMemcpyHostToDevice));
  HIP_ASSERT(hipMemcpy(device_all, host_all,sizeof(int), hipMemcpyHostToDevice));

  hipLaunchKernel(warpvote, dim3(Num_Blocks_per_Grid),dim3(Num_Threads_per_Block),0,0, device_any, device_all ,Num_Warps_per_Block,pshift);


  HIP_ASSERT(hipMemcpy(host_any, device_any, Num_Warps_per_Grid*sizeof(int), hipMemcpyDeviceToHost));
  HIP_ASSERT(hipMemcpy(host_all, device_all, Num_Warps_per_Grid*sizeof(int), hipMemcpyDeviceToHost));
  for (int i=0; i<Num_Warps_per_Grid; i++) {

    printf("warp no. %d __any = %d \n",i,host_any[i]);
    printf("warp no. %d __all = %d \n",i,host_all[i]);

    if (host_all[i]!=1) ++allcount;
#if defined (__HIP_PLATFORM_HCC__) &&  !defined ( NVCC_COMPAT )
    if (host_any[i]!=64) ++anycount;
#else
    if (host_any[i]!=1) ++anycount;
#endif
}

#if defined (__HIP_PLATFORM_HCC__) &&  !defined ( NVCC_COMPAT )
    if (anycount == 1 && allcount ==1) printf("PASSED\n"); else printf("FAILED\n");
#else
    if (anycount == 0 && allcount ==1) printf("PASSED\n"); else printf("FAILED\n");
#endif

  return EXIT_SUCCESS;

}
int main(){
    int A=0, *Ad;
    hipMalloc((void**)&Ad, SIZE);
    hipMemcpy(Ad, &A, SIZE, hipMemcpyHostToDevice);
    dim3 dimGrid, dimBlock;
    dimGrid.x = 1, dimGrid.y =1, dimGrid.z = 1;
    dimBlock.x = 1, dimBlock.y = 1, dimGrid.z = 1;
    hipLaunchKernel(HIP_KERNEL_NAME(Iter), dimGrid, dimBlock, 0, 0, Ad);
    hipMemcpy(&A, Ad, SIZE, hipMemcpyDeviceToHost);
}
Example #14
0
bool run_rnorm4d() {
    double *A, *Ad, *B, *Bd, *C, *Cd, *D, *Dd, *E, *Ed;
    A = new double[N];
    B = new double[N];
    C = new double[N];
    D = new double[N];
    E = new double[N];
    double val = 0.0;
    for (int i = 0; i < N; i++) {
        A[i] = 1.0;
        B[i] = 2.0;
        C[i] = 3.0;
        D[i] = 4.0;
    }
    val = 1 / sqrt(1.0 + 4.0 + 9.0 + 16.0);
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMalloc((void**)&Cd, SIZE);
    hipMalloc((void**)&Dd, SIZE);
    hipMalloc((void**)&Ed, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Dd, D, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernelGGL(test_rnorm4d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
    hipMemcpy(E, Ed, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
        if (E[i] - val < 0.000001) {
            passed = 1;
        }
    }

    delete[] A;
    delete[] B;
    delete[] C;
    delete[] D;
    delete[] E;
    hipFree(Ad);
    hipFree(Bd);
    hipFree(Cd);
    hipFree(Dd);
    hipFree(Ed);

    if (passed == 1) {
        return true;
    }
    assert(passed == 1);
    return false;
}
Example #15
0
void runTest(int argc, char **argv)
{
    hipDeviceProp_t deviceProp;
    deviceProp.major = 0;
    deviceProp.minor = 0;
    int dev = 0;

    hipDeviceGetProperties(&deviceProp, dev);

    // Statistics about the GPU device
    printf("> GPU device has %d Multi-Processors, "
           "SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

    int version = (deviceProp.major * 0x10 + deviceProp.minor);

    unsigned int numThreads = 256;
    unsigned int numBlocks = 64;
    unsigned int numData = 11;
    unsigned int memSize = sizeof(int) * numData;

    //allocate mem for the result on host side
    int *hOData = (int *) malloc(memSize);

    //initialize the memory
    for (unsigned int i = 0; i < numData; i++)
        hOData[i] = 0;

    //To make the AND and XOR tests generate something other than 0...
    hOData[8] = hOData[10] = 0xff;

    // allocate device memory for result
    int *dOData;
    hipMalloc((void **) &dOData, memSize);
    // copy host memory to device to initialize to zero
    hipMemcpy(dOData, hOData, memSize,hipMemcpyHostToDevice);

    // execute the kernel
    hipLaunchKernel(testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData);

    //Copy result from device to host
    hipMemcpy(hOData,dOData, memSize,hipMemcpyDeviceToHost);

    // Compute reference solution
    testResult = computeGold(hOData, numThreads * numBlocks);

    // Cleanup memory
    free(hOData);
    hipFree(dOData);
}
Example #16
0
int main() {
    size_t Nbytes = N * sizeof(int);
    int numDevices = 0;
    int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d;
    int *A_h, *B_h, *C_h;
    hipStream_t s;

    HIPCHECK(hipGetDeviceCount(&numDevices));
    if (numDevices > 1) {
        HIPCHECK(hipSetDevice(0));
        unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
        HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
        HIPCHECK(hipSetDevice(1));
        HIPCHECK(hipMalloc(&X_d, Nbytes));
        HIPCHECK(hipMalloc(&Y_d, Nbytes));
        HIPCHECK(hipMalloc(&Z_d, Nbytes));


        HIPCHECK(hipSetDevice(0));
        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
        HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
        HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
        HipTest::checkVectorADD(A_h, B_h, C_h, N);

        HIPCHECK(hipSetDevice(1));
        HIPCHECK(hipStreamCreate(&s));
        HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes, s));
        HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes, s));

        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
        HIPCHECK(hipMemcpyDtoHAsync(C_h, (hipDeviceptr_t)Z_d, Nbytes, s));
        HIPCHECK(hipStreamSynchronize(s));
        HIPCHECK(hipDeviceSynchronize());

        HipTest::checkVectorADD(A_h, B_h, C_h, N);
        HIPCHECK(hipStreamDestroy(s));
        HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
        HIPCHECK(hipFree(X_d));
        HIPCHECK(hipFree(Y_d));
        HIPCHECK(hipFree(Z_d));
    }

    passed();
}
Example #17
0
int main(int argc, char *argv[])
{
	float *A_d, *C_d;
	float *A_h, *C_h;
	size_t N = 1000000;
	size_t Nbytes = N * sizeof(float);

	hipDeviceProp_t props;
	CHECK(hipDeviceGetProperties(&props, 0/*deviceID*/));
	printf ("info: running on device %s\n", props.name);

	printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
	A_h = (float*)malloc(Nbytes);
	CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
	C_h = (float*)malloc(Nbytes);
	CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
	// Fill with Phi + i
    for (size_t i=0; i<N; i++)
	{
		A_h[i] = 1.618f + i;
	}

	printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
	CHECK(hipMalloc(&A_d, Nbytes));
	CHECK(hipMalloc(&C_d, Nbytes));


	printf ("info: copy Host2Device\n");
    CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));

	const unsigned blocks = 512;
	const unsigned threadsPerBlock = 256;

	printf ("info: launch 'vector_square' kernel\n");
	hipLaunchKernel(HIP_KERNEL_NAME(vector_square), dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);

	printf ("info: copy Device2Host\n");
    CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));

	printf ("info: check result\n");
    for (size_t i=0; i<N; i++)  {
		if (C_h[i] != A_h[i] * A_h[i]) {
			CHECK(hipErrorUnknown);
		}
	}
	printf ("PASSED!\n");
}
Example #18
0
int main(){
    setup();
    int *A, *Ad;
    for(int i=0;i<NUM_SIZE;i++){
        A = (int*)malloc(size[i]);
        valSet(A, 1, size[i]);
        hipMalloc(&Ad, size[i]);
        std::cout<<"Malloc success at size: "<<size[i]<<std::endl;

        for(int j=0;j<NUM_ITER;j++){
            std::cout<<"Iter: "<<j<<std::endl;
            hipMemcpy(Ad, A, size[i], hipMemcpyHostToDevice);
            hipLaunchKernel(Add, dim3(1), dim3(size[i]/sizeof(int)), 0, 0, Ad);
            hipMemcpy(A, Ad, size[i], hipMemcpyDeviceToHost);
        }

        hipDeviceSynchronize();
    }
}
Example #19
0
int main(){
    float *A, *Ad, *B, *Bd, *C, *Cd;
    A = new float[LEN];
    B = new float[LEN];
    C = new float[LEN];
    for(uint32_t i=0;i<LEN;i++){
        A[i] = i*1.0f;
        B[i] = i*1.0f;
        C[i] = i*1.0f;
    }

    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMalloc((void**)&Cd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernel(getSqAbs, dim3(1), dim3(LEN), 0, 0, Ad, Bd, Cd);
    hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
    std::cout<<A[11]<<" "<<B[11]<<" "<<C[11]<<std::endl;
}
Example #20
0
bool run_sincospi() {
    double *A, *Ad, *B, *C, *Bd, *Cd;
    A = new double[N];
    B = new double[N];
    C = new double[N];
    for (int i = 0; i < N; i++) {
        A[i] = 1.0;
    }
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMalloc((void**)&Cd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernelGGL(test_sincospi, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
        if (B[i] - sin(3.14 * 1.0) < 0.1) {
            passed = 1;
        }
    }
    passed = 0;
    for (int i = 0; i < 512; i++) {
        if (C[i] - cos(3.14 * 1.0) < 0.1) {
            passed = 1;
        }
    }

    delete[] A;
    delete[] B;
    delete[] C;
    hipFree(Ad);
    hipFree(Bd);
    hipFree(Cd);

    if (passed == 1) {
        return true;
    }
    assert(passed == 1);
    return false;
}
Example #21
0
int main(int argc, char *argv[])
{

  int Num_Threads_per_Block      = 1024;
  int Num_Blocks_per_Grid        = 1;
  int Num_Warps_per_Block        = Num_Threads_per_Block/64;
  int Num_Warps_per_Grid         = (Num_Threads_per_Block*Num_Blocks_per_Grid)/64;
  
  int * host_any  = ( int*)malloc(Num_Warps_per_Grid*sizeof(int));
  int * host_all  = ( int*)malloc(Num_Warps_per_Grid*sizeof(int));
  int *device_any; 
  int *device_all;
  HIP_ASSERT(hipMalloc((void**)&device_any,Num_Warps_per_Grid*sizeof( int)));
  HIP_ASSERT(hipMalloc((void**)&device_all,Num_Warps_per_Grid*sizeof(int)));
for (int i=0; i<Num_Warps_per_Grid; i++) 
{
	host_any[i] = 0;
	host_all[i] = 0;
}
  HIP_ASSERT(hipMemcpy(device_any, host_any,sizeof(int), hipMemcpyHostToDevice));
  HIP_ASSERT(hipMemcpy(device_all, host_all,sizeof(int), hipMemcpyHostToDevice));

  hipLaunchKernel(warpvote, dim3(Num_Blocks_per_Grid),dim3(Num_Threads_per_Block),0,0, device_any, device_all ,Num_Warps_per_Block);


  HIP_ASSERT(hipMemcpy(host_any, device_any, Num_Warps_per_Grid*sizeof(int), hipMemcpyDeviceToHost));
  HIP_ASSERT(hipMemcpy(host_all, device_all, Num_Warps_per_Grid*sizeof(int), hipMemcpyDeviceToHost));
  for (int i=0; i<Num_Warps_per_Grid; i++) {

    printf("warp no. %d __any = %d \n",i,host_any[i]);
    printf("warp no. %d __all = %d \n",i,host_all[i]);


}


  return EXIT_SUCCESS;

}
Example #22
0
bool run_rhypot() {
    double *A, *Ad, *B, *Bd, *C, *Cd;
    A = new double[N];
    B = new double[N];
    C = new double[N];
    double val = 0.0;
    for (int i = 0; i < N; i++) {
        A[i] = 1.0;
        B[i] = 2.0;
    }
    val = 1 / sqrt(1.0 + 4.0);
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMalloc((void**)&Cd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernelGGL(test_rhypot, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
    hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
        if (C[i] - val < 0.000001) {
            passed = 1;
        }
    }

    delete[] A;
    delete[] B;
    delete[] C;
    hipFree(Ad);
    hipFree(Bd);
    hipFree(Cd);

    if (passed == 1) {
        return true;
    }
    assert(passed == 1);
    return false;
}
int main() {
    setup();
    int *A, *Ad;
    for(int i=0; i<NUM_SIZE; i++) {
        A = (int*)malloc(size[i]);
        valSet(A, 1, size[i]);
        hipMalloc(&Ad, size[i]);
        std::cout<<"Malloc success at size: "<<size[i]<<std::endl;
        for(int j=0; j<NUM_ITER; j++) {
            std::cout<<"\r"<<"Iter: "<<j;
            hipMemcpy(Ad, A, size[i], hipMemcpyHostToDevice);
        }
        std::cout<<std::endl;
        hipDeviceSynchronize();
    }
}
Example #24
0
int main() {
    setup();
    int *A, *Ad;
    for (int i = 0; i < NUM_SIZE; i++) {
        std::cout << size[i] << std::endl;
        A = (int*)malloc(size[i]);
        valSet(A, 1, size[i]);
        hipMalloc(&Ad, size[i]);
        std::cout << "Malloc success at size: " << size[i] << std::endl;
        clock_t start, end;
        start = clock();
        for (int j = 0; j < NUM_ITER; j++) {
            //            std::cout<<"At iter: "<<j<<std::endl;
            hipMemcpy(Ad, A, size[i], hipMemcpyHostToDevice);
        }
        hipDeviceSynchronize();
        end = clock();
        double uS = (double)(end - start) * 1000 / (NUM_ITER * CLOCKS_PER_SEC);
        std::cout << uS << std::endl;
    }
}
void _d2h(mem_manager *self)
{
    hipMemcpy(self->hst_ptr, self->dev_ptr, self->size, hipMemcpyDeviceToHost);
}
void _h2d(mem_manager *self){
    hipMemcpy(self->dev_ptr, self->hst_ptr, self->size, hipMemcpyHostToDevice);
}
int main(){

	hipError_t err;
	float *A, *Ad;

	A = new float[LEN];

	for(int i=0;i<LEN;i++){
		A[i] = 1.0f;
	}

	hipStream_t stream;
	err = hipStreamCreate(&stream);
	check("Creating stream",err);

	err = hipMalloc(&Ad, SIZE);
	check("Allocating Ad memory on device", err);

	err = hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
	check("Doing memory copy from A to Ad", err);

	float mS = 0;
	hipEvent_t start, stop;
	hipEventCreate(&start);
	hipEventCreate(&stop);

	ResultDatabase resultDB[8];

	hipEventRecord(start);
	hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad);
	hipEventRecord(stop);
	hipEventElapsedTime(&mS, start, stop);
	resultDB[0].AddResult(std::string("First Kernel Launch"), "", "uS", mS*1000); 
//	std::cout<<"First Kernel Launch: \t\t"<<mS*1000<<" uS"<<std::endl;
	resultDB[0].DumpSummary(std::cout);
	hipEventRecord(start);
	hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad);
	hipEventRecord(stop);
	hipEventElapsedTime(&mS, start, stop);
	resultDB[1].AddResult(std::string("Second Kernel Launch"), "", "uS", mS*1000); 
//	std::cout<<"Second Kernel Launch: \t\t"<<mS*1000<<" uS"<<std::endl;
	resultDB[1].DumpSummary(std::cout);
	hipEventRecord(start);
	for(int i=0;i<ITER;i++){
		hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad);
	}
	hipDeviceSynchronize();
	hipEventRecord(stop);
	hipEventElapsedTime(&mS, start, stop);
	resultDB[2].AddResult(std::string("NULL Stream Sync dispatch wait"), "", "uS", mS*1000/ITER); 
	resultDB[2].DumpSummary(std::cout);
//	std::cout<<"NULL Stream Sync dispatch wait: \t"<<mS*1000/ITER<<" uS"<<std::endl;
	hipDeviceSynchronize();

	hipEventRecord(start);
	for(int i=0;i<ITER;i++){
		hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad);
	}
	hipEventRecord(stop);
	hipDeviceSynchronize();
	hipEventElapsedTime(&mS, start, stop);
	resultDB[3].AddResult(std::string("NULL Stream Async dispatch wait"), "", "uS", mS*1000/ITER); 
	resultDB[3].DumpSummary(std::cout);
//	std::cout<<"NULL Stream Async dispatch wait: \t"<<mS*1000/ITER<<" uS"<<std::endl;
	hipDeviceSynchronize();

	hipEventRecord(start);
	for(int i=0;i<ITER;i++){
		hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad);
		hipDeviceSynchronize();
	}
	hipEventRecord(stop);
	hipEventElapsedTime(&mS, start, stop);
	resultDB[4].AddResult(std::string("Stream Sync dispatch wait"), "", "uS", mS*1000/ITER); 
	resultDB[4].DumpSummary(std::cout);
//	std::cout<<"Stream Sync dispatch wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl;
	hipDeviceSynchronize();
	hipEventRecord(start);
	for(int i=0;i<ITER;i++){
		hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad);
	}
	hipDeviceSynchronize();
	hipEventRecord(stop);
	hipEventElapsedTime(&mS, start, stop);
	resultDB[5].AddResult(std::string("Stream Async dispatch wait"), "", "uS", mS*1000/ITER); 
//	std::cout<<"Stream Async dispatch wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl;
	resultDB[5].DumpSummary(std::cout);
	hipDeviceSynchronize();

	hipEventRecord(start);
	for(int i=0;i<ITER;i++){
		hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad);
	}
	hipEventRecord(stop);
	hipEventElapsedTime(&mS, start, stop);
	resultDB[6].AddResult(std::string("NULL Stream No Wait"), "", "uS", mS*1000/ITER); 
	resultDB[6].DumpSummary(std::cout);
//	std::cout<<"NULL Stream Dispatch No Wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl;
	hipDeviceSynchronize();

	hipEventRecord(start);
	for(int i=0;i<ITER;i++){
		hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad);
	}
	hipEventRecord(stop);
	hipEventElapsedTime(&mS, start, stop);
	resultDB[7].AddResult(std::string("Stream Dispatch No Wait"), "", "uS", mS*1000/ITER); 
	resultDB[7].DumpSummary(std::cout);
//	std::cout<<"Stream Dispatch No Wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl;
	hipDeviceSynchronize();
}
void memManager::D2H()
{
    hipMemcpy(hstPtr, devPtr, size, hipMemcpyDeviceToHost);
}
Example #29
0
int main(int argc, char **argv) {
  uchar4        *h_rgbaImage, *d_rgbaImage;
  unsigned char *h_greyImage, *d_greyImage;

  std::string input_file;
  std::string output_file;
  std::string reference_file;
  double perPixelError = 0.0;
  double globalError   = 0.0;
  bool useEpsCheck = false;
  switch (argc)
  {
	case 2:
	  input_file = std::string(argv[1]);
	  output_file = "HW1_output.png";
	  reference_file = "HW1_reference.png";
	  break;
	case 3:
	  input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
	  reference_file = "HW1_reference.png";
	  break;
	case 4:
	  input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
	  reference_file = std::string(argv[3]);
	  break;
	case 6:
	  useEpsCheck=true;
	  input_file  = std::string(argv[1]);
	  output_file = std::string(argv[2]);
	  reference_file = std::string(argv[3]);
	  perPixelError = atof(argv[4]);
      globalError   = atof(argv[5]);
	  break;
	default:
      std::cerr << "Usage: ./HW1 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
      exit(1);
  }
  //load the image and give us our input and output pointers
  preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file);

  GpuTimer timer;
  timer.Start();
  //call the students' code
  your_rgba_to_greyscale(h_rgbaImage, d_rgbaImage, d_greyImage, numRows(), numCols());
  timer.Stop();
  hipDeviceSynchronize(); checkCudaErrors(hipGetLastError());

  int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  size_t numPixels = numRows()*numCols();
  checkCudaErrors(hipMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char) * numPixels, hipMemcpyDeviceToHost));

  //check results and output the grey image
  postProcess(output_file, h_greyImage);

  referenceCalculation(h_rgbaImage, h_greyImage, numRows(), numCols());

  postProcess(reference_file, h_greyImage);

  //generateReferenceImage(input_file, reference_file);
  compareImages(reference_file, output_file, useEpsCheck, perPixelError, 
                globalError);

  cleanup();

  return 0;
}
Example #30
0
// IN: nStreams : number of streams to use for the test
// IN :useNullStream - use NULL stream.  Synchronizes everything.
// IN: useSyncMemcpyH2D - use sync memcpy (no overlap) for H2D
// IN: useSyncMemcpyD2H - use sync memcpy (no overlap) for D2H
void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemcpyH2D,
                              bool useSyncMemcpyD2H) {
    size_t Nbytes = N * sizeof(int);
    printf("testing: %s(useNullStream=%d, useSyncMemcpyH2D=%d, useSyncMemcpyD2H=%d)  ", __func__,
           useNullStream, useSyncMemcpyH2D, useSyncMemcpyD2H);
    printf("Nbytes=%zu (%6.1f MB)\n", Nbytes, (double)(Nbytes) / 1024.0 / 1024.0);

    int *A_d, *B_d, *C_d;
    int *A_h, *B_h, *C_h;

    HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, true);


    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);


    hipStream_t* stream = (hipStream_t*)malloc(sizeof(hipStream_t) * nStreams);
    if (useNullStream) {
        nStreams = 1;
        stream[0] = NULL;
    } else {
        for (int i = 0; i < nStreams; ++i) {
            HIPCHECK(hipStreamCreate(&stream[i]));
        }
    }


    size_t workLeft = N;
    size_t workPerStream = N / nStreams;
    for (int i = 0; i < nStreams; ++i) {
        size_t work = (workLeft < workPerStream) ? workLeft : workPerStream;
        size_t workBytes = work * sizeof(int);

        size_t offset = i * workPerStream;
        HIPASSERT(A_d + offset < A_d + Nbytes);
        HIPASSERT(B_d + offset < B_d + Nbytes);
        HIPASSERT(C_d + offset < C_d + Nbytes);
        if (useSyncMemcpyH2D) {
            HIPCHECK(hipMemcpy(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice));
            HIPCHECK(hipMemcpy(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice));
        } else {
            HIPCHECK(hipMemcpyAsync(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice,
                                    stream[i]));
            HIPCHECK(hipMemcpyAsync(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice,
                                    stream[i]));
        };

        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i],
                        &A_d[offset], &B_d[offset], &C_d[offset], work);

        if (useSyncMemcpyD2H) {
            HIPCHECK(hipMemcpy(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost));
        } else {
            HIPCHECK(hipMemcpyAsync(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost,
                                    stream[i]));
        }
    }


    HIPCHECK(hipDeviceSynchronize());


    HipTest::checkVectorADD(A_h, B_h, C_h, N);

    HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, true);

    free(stream);
};