bool run_rnorm(){
double *A, *Ad, *B, *Bd;
A = new double[N];
B = new double[N];
double val = 0.0;
for(int i=0;i<N;i++){
A[i] = 1.0;
B[i] = 0.0;
val += 1.0;
}
val = 1/sqrt(val);
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipLaunchKernel(test_rnorm, dim3(1), dim3(N), 0, 0, Ad, Bd);
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
int passed = 0;
for(int i=0;i<512;i++){
    if(B[0] - val < 0.000001){
        passed = 1;
    }
}
free(A);
if(passed == 1){
    return true;
}
assert(passed == 1);
return false;
}
void run(size_t size, hipStream_t stream1, hipStream_t stream2){
	float *Ah, *Bh, *Cd, *Dd, *Eh;
	float *Ahh, *Bhh, *Cdd, *Ddd, *Ehh;

	HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault));
	HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault));
	HIPCHECK(hipMalloc(&Cd, size));
	HIPCHECK(hipMalloc(&Dd, size));
	HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault));
	HIPCHECK(hipHostMalloc((void**)&Ahh, size, hipHostMallocDefault));
	HIPCHECK(hipHostMalloc((void**)&Bhh, size, hipHostMallocDefault));
	HIPCHECK(hipMalloc(&Cdd, size));
	HIPCHECK(hipMalloc(&Ddd, size));
	HIPCHECK(hipHostMalloc((void**)&Ehh, size, hipHostMallocDefault));

	HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream1));
	HIPCHECK(hipMemcpyAsync(Bhh, Ahh, size, hipMemcpyHostToHost, stream2));
	HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream1));
	HIPCHECK(hipMemcpyAsync(Cdd, Bhh, size, hipMemcpyHostToDevice, stream2));
	hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream1, Cd);
	hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream2, Cdd);
	HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream1));
	HIPCHECK(hipMemcpyAsync(Ddd, Cdd, size, hipMemcpyDeviceToDevice, stream2));
	HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream1));
	HIPCHECK(hipMemcpyAsync(Ehh, Ddd, size, hipMemcpyDeviceToHost, stream2));
	HIPCHECK(hipDeviceSynchronize());
	HIPASSERT(Eh[10] = Ah[10] + 1.0f);
	HIPASSERT(Ehh[10] = Ahh[10] + 1.0f);
}
bool run_rnorm3d(){
double *A, *Ad, *B, *Bd, *C, *Cd, *D, *Dd;
A = new double[N];
B = new double[N];
C = new double[N];
D = new double[N];
double val = 0.0;
for(int i=0;i<N;i++){
A[i] = 1.0;
B[i] = 2.0;
C[i] = 3.0;
}
val = 1/sqrt(1.0 + 4.0 + 9.0);
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMalloc((void**)&Cd, SIZE);
hipMalloc((void**)&Dd, SIZE);
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
hipLaunchKernel(test_rnorm3d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
hipMemcpy(D, Dd, SIZE, hipMemcpyDeviceToHost);
int passed = 0;
for(int i=0;i<512;i++){
    if(D[i] - val < 0.000001){
        passed = 1;
    }
}
free(A);
if(passed == 1){
    return true;
}
assert(passed == 1);
return false;
}
bool run_lround(){
double *A, *Ad;
long int *B, *Bd;
A = new double[N];
B = new long int[N];
for(int i=0;i<N;i++){
A[i] = 1.345;
}
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, N*sizeof(long int));
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipLaunchKernel(test_lround, dim3(1), dim3(N), 0, 0, Ad, Bd);
hipMemcpy(B, Bd, N*sizeof(long int), hipMemcpyDeviceToHost);
int passed = 0;
for(int i=0;i<512;i++){
    long int x = round(A[i]);
    if(B[i] == x){
        passed = 1;
    }
}
free(A);
if(passed == 1){
    return true;
}
assert(passed == 1);
return false;
}
bool run_sincos(){
double *A, *Ad, *B, *C, *Bd, *Cd;
A = new double[N];
B = new double[N];
C = new double[N];
for(int i=0;i<N;i++){
A[i] = 1.0;
}
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMalloc((void**)&Cd, SIZE);
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipLaunchKernel(test_sincos, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
int passed = 0;
for(int i=0;i<512;i++){
    if(B[i] == sin(1.0)){
        passed = 1;
    }
}
passed = 0;
for(int i=0;i<512;i++){
    if(C[i] == cos(1.0)){
        passed = 1;
    }
}
free(A);
if(passed == 1){
    return true;
}
assert(passed == 1);
return false;
}
Пример #6
0
bool run_add() {
  
  constexpr size_t N = 64;
  std::vector<T> host_input(N);
  std::vector<T> host_expected(N);
  for (int i = 0; i < N; ++i) {
    host_input[i] = (T)i;
    host_expected[i] = host_input[i] + host_input[i];
  }

  T* input1;
  hipMalloc(&input1, N * sizeof(T));
  hipMemcpy(input1, host_input.data(), host_input.size()*sizeof(T), hipMemcpyHostToDevice);


  T* input2;
  hipMalloc(&input2, N * sizeof(T));
  hipMemcpy(input2, host_input.data(), host_input.size()*sizeof(T), hipMemcpyHostToDevice);


  constexpr unsigned int blocks = 1;
  constexpr unsigned int threads_per_block = 1;
  hipLaunchKernelGGL(add<T>, dim3(blocks), dim3(threads_per_block), 0, 0, input1, input2, N);

  hipMemcpy(host_input.data(), input1, host_input.size()*sizeof(T), hipMemcpyDeviceToHost);

  bool equal = true;
  for (int i = 0; i < N; i++) {
    equal &= (host_input[i] == host_expected[i]);
  }
  return equal;
}
Пример #7
0
int main()
{
    float *Ad;
    hipMalloc((void**)&Ad, 1024);

    // Test the different hipLaunchParm options:
    hipLaunchKernel(vAdd, size_t(1024), 1, 0, 0, Ad);
    hipLaunchKernel(vAdd, 1024, dim3(1), 0, 0, Ad);
    hipLaunchKernel(vAdd, dim3(1024), 1, 0, 0, Ad);
    hipLaunchKernel(vAdd, dim3(1024), dim3(1), 0, 0, Ad);

    // Test case with hipLaunchKernel inside another macro:
    float e0;
    GPU_PRINT_TIME (hipLaunchKernel(vAdd, dim3(1024), dim3(1), 0, 0, Ad), e0, j);
    GPU_PRINT_TIME (WRAP(hipLaunchKernel(vAdd, dim3(1024), dim3(1), 0, 0, Ad)), e0, j);

#ifdef EXTRA_PARENS_1
    // Don't wrap hipLaunchKernel in extra set of parens:
    GPU_PRINT_TIME ((hipLaunchKernel(vAdd, dim3(1024), dim3(1), 0, 0, Ad)), e0, j);
#endif

    MY_LAUNCH (hipLaunchKernel(vAdd, dim3(1024), dim3(1), 0, 0, Ad), true, "firstCall");

    float *A;
    float e1;
    MY_LAUNCH_WITH_PAREN (hipMalloc(&A, 100), true, "launch2");

#ifdef EXTRA_PARENS_2
    //MY_LAUNCH_WITH_PAREN wraps cmd in () which can cause issues.
    MY_LAUNCH_WITH_PAREN (hipLaunchKernel(vAdd, dim3(1024), dim3(1), 0, 0, Ad), true, "firstCall");
#endif

    passed();
}
Пример #8
0
bool run_rint() {
    double *A, *Ad;
    double *B, *Bd;
    A = new double[N];
    B = new double[N];
    for (int i = 0; i < N; i++) {
        A[i] = 1.345;
    }
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernelGGL(test_rint, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
        double x = round(A[i]);
        if (B[i] == x) {
            passed = 1;
        }
    }

    delete[] A;
    delete[] B;
    hipFree(Ad);
    hipFree(Bd);

    if (passed == 1) {
        return true;
    }
    assert(passed == 1);
    return false;
}
bool run_erfinv(){
double *A, *Ad, *B, *Bd;
A = new double[N];
B = new double[N];
for(int i=0;i<N;i++){
A[i] = -0.6;
B[i] = 0.0;
}
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipLaunchKernel(test_erfinv, dim3(1), dim3(N), 0, 0, Ad, Bd);
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
int passed = 0;
for(int i=0;i<512;i++){
    if(B[i] - A[i] < 0.000001){
        passed = 1;
    }
}
free(A);
if(passed == 1){
    return true;
}
assert(passed == 1);
return false;
}
int main(int argc, char *argv[])
{ int warpSize, pshift;
  hipDeviceProp_t devProp;
  hipGetDeviceProperties(&devProp, 0);
  if(strncmp(devProp.name,"Fiji",1)==0)
{ warpSize =64;
  pshift =6;
}
  else {warpSize =32; pshift=5;}
  int anycount =0;
  int allcount =0;
  int Num_Threads_per_Block      = 1024;
  int Num_Blocks_per_Grid        = 1;
  int Num_Warps_per_Block        = Num_Threads_per_Block/warpSize;
  int Num_Warps_per_Grid         = (Num_Threads_per_Block*Num_Blocks_per_Grid)/warpSize;

  int * host_any  = ( int*)malloc(Num_Warps_per_Grid*sizeof(int));
  int * host_all  = ( int*)malloc(Num_Warps_per_Grid*sizeof(int));
  int *device_any;
  int *device_all;
  HIP_ASSERT(hipMalloc((void**)&device_any,Num_Warps_per_Grid*sizeof( int)));
  HIP_ASSERT(hipMalloc((void**)&device_all,Num_Warps_per_Grid*sizeof(int)));
for (int i=0; i<Num_Warps_per_Grid; i++)
{
	host_any[i] = 0;
	host_all[i] = 0;
}
  HIP_ASSERT(hipMemcpy(device_any, host_any,sizeof(int), hipMemcpyHostToDevice));
  HIP_ASSERT(hipMemcpy(device_all, host_all,sizeof(int), hipMemcpyHostToDevice));

  hipLaunchKernel(warpvote, dim3(Num_Blocks_per_Grid),dim3(Num_Threads_per_Block),0,0, device_any, device_all ,Num_Warps_per_Block,pshift);


  HIP_ASSERT(hipMemcpy(host_any, device_any, Num_Warps_per_Grid*sizeof(int), hipMemcpyDeviceToHost));
  HIP_ASSERT(hipMemcpy(host_all, device_all, Num_Warps_per_Grid*sizeof(int), hipMemcpyDeviceToHost));
  for (int i=0; i<Num_Warps_per_Grid; i++) {

    printf("warp no. %d __any = %d \n",i,host_any[i]);
    printf("warp no. %d __all = %d \n",i,host_all[i]);

    if (host_all[i]!=1) ++allcount;
#if defined (__HIP_PLATFORM_HCC__) &&  !defined ( NVCC_COMPAT )
    if (host_any[i]!=64) ++anycount;
#else
    if (host_any[i]!=1) ++anycount;
#endif
}

#if defined (__HIP_PLATFORM_HCC__) &&  !defined ( NVCC_COMPAT )
    if (anycount == 1 && allcount ==1) printf("PASSED\n"); else printf("FAILED\n");
#else
    if (anycount == 0 && allcount ==1) printf("PASSED\n"); else printf("FAILED\n");
#endif

  return EXIT_SUCCESS;

}
Пример #11
0
int main(){
    float *A, *B, *C;
    hipDeviceptr_t Ad, Bd, Cd;
    A = new float[LEN];
    B = new float[LEN];
    C = new float[LEN];

    for(uint32_t i=0;i<LEN;i++){
        A[i] = i*1.0f;
        B[i] = 1.0f;
        C[i] = 0.0f;
    }

    hipInit(0);
    hipDevice_t device;
    hipCtx_t context;
    hipDeviceGet(&device, 0);
    hipCtxCreate(&context, 0, device);

    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMalloc((void**)&Cd, SIZE);

    hipMemcpyHtoD(Ad, A, SIZE);
    hipMemcpyHtoD(Bd, B, SIZE);
    hipMemcpyHtoD(Cd, C, SIZE);

    hipModule_t Module;
    hipFunction_t Function;
    hipModuleLoad(&Module, fileName);
    hipModuleGetFunction(&Function, Module, kernel_name);

    int n = LEN;
    void * args[4] = {&Ad, &Bd, &Cd, &n};

    hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, args, nullptr);

    hipMemcpyDtoH(C, Cd, SIZE);
    int mismatchCount = 0;
    for(uint32_t i=0;i<LEN;i++){
        if (A[i] + B[i] != C[i]) {
            mismatchCount++;
            std::cout<<"error: mismatch " << A[i]<<" + "<<B[i]<<" != "<<C[i]<<std::endl;
        }
    }

    if (mismatchCount == 0) {
        std::cout << "PASSED!\n";
    } else {
        std::cout << "FAILED!\n";
    };

    hipCtxDestroy(context);
    return 0;
}
Пример #12
0
bool run_rnorm4d() {
    double *A, *Ad, *B, *Bd, *C, *Cd, *D, *Dd, *E, *Ed;
    A = new double[N];
    B = new double[N];
    C = new double[N];
    D = new double[N];
    E = new double[N];
    double val = 0.0;
    for (int i = 0; i < N; i++) {
        A[i] = 1.0;
        B[i] = 2.0;
        C[i] = 3.0;
        D[i] = 4.0;
    }
    val = 1 / sqrt(1.0 + 4.0 + 9.0 + 16.0);
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMalloc((void**)&Cd, SIZE);
    hipMalloc((void**)&Dd, SIZE);
    hipMalloc((void**)&Ed, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Dd, D, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernelGGL(test_rnorm4d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
    hipMemcpy(E, Ed, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
        if (E[i] - val < 0.000001) {
            passed = 1;
        }
    }

    delete[] A;
    delete[] B;
    delete[] C;
    delete[] D;
    delete[] E;
    hipFree(Ad);
    hipFree(Bd);
    hipFree(Cd);
    hipFree(Dd);
    hipFree(Ed);

    if (passed == 1) {
        return true;
    }
    assert(passed == 1);
    return false;
}
Пример #13
0
int main() {
    size_t Nbytes = N * sizeof(int);
    int numDevices = 0;
    int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d;
    int *A_h, *B_h, *C_h;
    hipStream_t s;

    HIPCHECK(hipGetDeviceCount(&numDevices));
    if (numDevices > 1) {
        HIPCHECK(hipSetDevice(0));
        unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
        HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
        HIPCHECK(hipSetDevice(1));
        HIPCHECK(hipMalloc(&X_d, Nbytes));
        HIPCHECK(hipMalloc(&Y_d, Nbytes));
        HIPCHECK(hipMalloc(&Z_d, Nbytes));


        HIPCHECK(hipSetDevice(0));
        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
        HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
        HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
        HipTest::checkVectorADD(A_h, B_h, C_h, N);

        HIPCHECK(hipSetDevice(1));
        HIPCHECK(hipStreamCreate(&s));
        HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes, s));
        HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes, s));

        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
        HIPCHECK(hipMemcpyDtoHAsync(C_h, (hipDeviceptr_t)Z_d, Nbytes, s));
        HIPCHECK(hipStreamSynchronize(s));
        HIPCHECK(hipDeviceSynchronize());

        HipTest::checkVectorADD(A_h, B_h, C_h, N);
        HIPCHECK(hipStreamDestroy(s));
        HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
        HIPCHECK(hipFree(X_d));
        HIPCHECK(hipFree(Y_d));
        HIPCHECK(hipFree(Z_d));
    }

    passed();
}
Пример #14
0
int main(){
    int A=0, *Ad;
    hipMalloc((void**)&Ad, SIZE);
    hipMemcpy(Ad, &A, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernel(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, 0, Ad);
    hipMemcpy(&A, Ad, SIZE, hipMemcpyDeviceToHost);
}
Пример #15
0
int main(int argc, char *argv[])
{ int warpSize, pshift;
  hipDeviceProp_t devProp;
  hipDeviceGetProperties(&devProp, 0);
  if(strncmp(devProp.name,"Fiji",1)==0)  {warpSize =64; pshift =6;}
  else {warpSize =32; pshift =5;}
  unsigned int Num_Threads_per_Block      = 512;
  unsigned int Num_Blocks_per_Grid        = 1;
  unsigned int Num_Warps_per_Block        = Num_Threads_per_Block/warpSize;
  unsigned int Num_Warps_per_Grid         = (Num_Threads_per_Block*Num_Blocks_per_Grid)/warpSize;
  unsigned int* host_ballot = (unsigned int*)malloc(Num_Warps_per_Grid*sizeof(unsigned int));
  unsigned int* device_ballot; 
  HIP_ASSERT(hipMalloc((void**)&device_ballot, Num_Warps_per_Grid*sizeof(unsigned int)));
  int divergent_count =0;
  for (int i=0; i<Num_Warps_per_Grid; i++) host_ballot[i] = 0;

 
  HIP_ASSERT(hipMemcpy(device_ballot, host_ballot, Num_Warps_per_Grid*sizeof(unsigned int), hipMemcpyHostToDevice));

  hipLaunchKernel(gpu_ballot, dim3(Num_Blocks_per_Grid),dim3(Num_Threads_per_Block),0,0, device_ballot,Num_Warps_per_Block,pshift);


  HIP_ASSERT(hipMemcpy(host_ballot, device_ballot, Num_Warps_per_Grid*sizeof(unsigned int), hipMemcpyDeviceToHost));
  for (int i=0; i<Num_Warps_per_Grid; i++) {

     if ((host_ballot[i] == 0)||(host_ballot[i]/warpSize == warpSize)) std::cout << "Warp " << i << " IS convergent- Predicate true for " << host_ballot[i]/warpSize << " threads\n";

     else {std::cout << "Warp " << i << " IS divergent - Predicate true for " << host_ballot[i]/warpSize<< " threads\n";
	  divergent_count++;}
}

if (divergent_count==1) printf("PASSED\n"); else printf("FAILED\n");
  return EXIT_SUCCESS;

}
Пример #16
0
void runTests(int64_t numElements)
{
    size_t sizeBytes = numElements * sizeof(int);

    printf ("\n\ntest: starting sequence with sizeBytes=%zu bytes, %6.2f MB\n", sizeBytes, sizeBytes/1024.0/1024.0);


    int *C_h, *C_d;
    HIPCHECK(hipMalloc(&C_d, sizeBytes));
    HIPCHECK(hipHostMalloc(&C_h, sizeBytes));


    {
        test (0x01, C_d, C_h, numElements,  syncNone,                             true /*expectMismatch*/);
        test (0x02, C_d, C_h, numElements,  syncNullStream,                       false /*expectMismatch*/);
        test (0x04, C_d, C_h, numElements,  syncOtherStream,                      true /*expectMismatch*/);
        test (0x08, C_d, C_h, numElements,  syncDevice,                           false /*expectMismatch*/);

        // Sending a marker to to null stream may synchronize the otherStream
        //  - other created with hipStreamNonBlocking=0 : synchronization, should match
        //  - other created with hipStreamNonBlocking=1 : no synchronization, may mismatch
        test (0x10, C_d, C_h, numElements,  syncMarkerThenOtherStream,            false /*expectMismatch*/);

        // TODO - review why this test seems flaky
        //test (0x20, C_d, C_h, numElements,  syncMarkerThenOtherNonBlockingStream, true /*expectMismatch*/);
    }


    HIPCHECK(hipFree(C_d));
    HIPCHECK(hipHostFree(C_h));
}
Пример #17
0
int main(int argc, char *argv[])
{
	float *A_d, *C_d;
	float *A_h, *C_h;
	size_t N = 1000000;
	size_t Nbytes = N * sizeof(float);

	hipDeviceProp_t props;
	CHECK(hipDeviceGetProperties(&props, 0/*deviceID*/));
	printf ("info: running on device %s\n", props.name);

	printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
	A_h = (float*)malloc(Nbytes);
	CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
	C_h = (float*)malloc(Nbytes);
	CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
	// Fill with Phi + i
    for (size_t i=0; i<N; i++)
	{
		A_h[i] = 1.618f + i;
	}

	printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
	CHECK(hipMalloc(&A_d, Nbytes));
	CHECK(hipMalloc(&C_d, Nbytes));


	printf ("info: copy Host2Device\n");
    CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));

	const unsigned blocks = 512;
	const unsigned threadsPerBlock = 256;

	printf ("info: launch 'vector_square' kernel\n");
	hipLaunchKernel(HIP_KERNEL_NAME(vector_square), dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);

	printf ("info: copy Device2Host\n");
    CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));

	printf ("info: check result\n");
    for (size_t i=0; i<N; i++)  {
		if (C_h[i] != A_h[i] * A_h[i]) {
			CHECK(hipErrorUnknown);
		}
	}
	printf ("PASSED!\n");
}
int main(){
    float *Ad, *A;
    hipHostMalloc((void**)&A, size);
    hipMalloc((void**)&Ad, size);
    assert(hipSuccess == hipMemcpy(Ad, A, size, hipMemcpyHostToDevice));
    assert(hipSuccess == hipMemcpy(A, Ad, size, hipMemcpyDeviceToHost));
    passed();
}
Пример #19
0
void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies,
                             bool syncBetweenCopies) {
    size_t Nbytes = numElements * sizeof(T);
    size_t eachCopyElements = numElements / numCopies;
    size_t eachCopyBytes = eachCopyElements * sizeof(T);

    printf(
        "------------------------------------------------------------------------------------------"
        "-----\n");
    printf(
        "testing: %s  Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
        __func__, Nbytes, (double)(Nbytes) / 1024.0 / 1024.0, numCopies, eachCopyElements,
        eachCopyBytes);

    T* A_d;
    T *A_h1, *A_h2;

    HIPCHECK(hipHostMalloc((void**)&A_h1, Nbytes, hipHostMallocDefault));
    HIPCHECK(hipHostMalloc((void**)&A_h2, Nbytes, hipHostMallocDefault));
    HIPCHECK(hipMalloc(&A_d, Nbytes));

    for (int i = 0; i < numElements; i++) {
        A_h1[i] = 3.14f + static_cast<T>(i);
    }


    // stream=0; // fixme TODO


    for (int i = 0; i < numCopies; i++) {
        HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes);
        HIPCHECK(hipMemcpyAsync(&A_d[i * eachCopyElements], &A_h1[i * eachCopyElements],
                                eachCopyBytes, hipMemcpyHostToDevice, stream));
    }

    if (syncBetweenCopies) {
        HIPCHECK(hipDeviceSynchronize());
    }

    for (int i = 0; i < numCopies; i++) {
        HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes);
        HIPCHECK(hipMemcpyAsync(&A_h2[i * eachCopyElements], &A_d[i * eachCopyElements],
                                eachCopyBytes, hipMemcpyDeviceToHost, stream));
    }

    HIPCHECK(hipDeviceSynchronize());


    // Verify we copied back all the data correctly:
    for (int i = 0; i < numElements; i++) {
        HIPASSERT(A_h1[i] == A_h2[i]);
    }


    HIPCHECK(hipHostFree(A_h1));
    HIPCHECK(hipHostFree(A_h2));
    HIPCHECK(hipFree(A_d));
}
Пример #20
0
int main(){
    int A=0, *Ad;
    hipMalloc((void**)&Ad, SIZE);
    hipMemcpy(Ad, &A, SIZE, hipMemcpyHostToDevice);
    dim3 dimGrid, dimBlock;
    dimGrid.x = 1, dimGrid.y =1, dimGrid.z = 1;
    dimBlock.x = 1, dimBlock.y = 1, dimGrid.z = 1;
    hipLaunchKernel(HIP_KERNEL_NAME(Iter), dimGrid, dimBlock, 0, 0, Ad);
    hipMemcpy(&A, Ad, SIZE, hipMemcpyDeviceToHost);
}
Пример #21
0
int main() {
    float *A, *Ad;
    HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocDefault));
    HIPCHECK(hipMalloc((void**)&Ad, SIZE));
    hipStream_t stream;
    HIPCHECK(hipStreamCreate(&stream));
    for (int i = 0; i < SIZE; i++) {
        HIPCHECK(hipMemcpyAsync(Ad, A, SIZE, hipMemcpyHostToDevice, stream));
        HIPCHECK(hipDeviceSynchronize());
    }
}
Пример #22
0
int main(){
    float *A, *Ad, *B, *Bd, *C, *Cd;
    A = new float[LEN];
    B = new float[LEN];
    C = new float[LEN];
    for(uint32_t i=0;i<LEN;i++){
        A[i] = i*1.0f;
        B[i] = i*1.0f;
        C[i] = i*1.0f;
    }

    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMalloc((void**)&Cd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernel(getSqAbs, dim3(1), dim3(LEN), 0, 0, Ad, Bd, Cd);
    hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
    std::cout<<A[11]<<" "<<B[11]<<" "<<C[11]<<std::endl;
}
Пример #23
0
bool run_sincospi() {
    double *A, *Ad, *B, *C, *Bd, *Cd;
    A = new double[N];
    B = new double[N];
    C = new double[N];
    for (int i = 0; i < N; i++) {
        A[i] = 1.0;
    }
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMalloc((void**)&Cd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernelGGL(test_sincospi, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
        if (B[i] - sin(3.14 * 1.0) < 0.1) {
            passed = 1;
        }
    }
    passed = 0;
    for (int i = 0; i < 512; i++) {
        if (C[i] - cos(3.14 * 1.0) < 0.1) {
            passed = 1;
        }
    }

    delete[] A;
    delete[] B;
    delete[] C;
    hipFree(Ad);
    hipFree(Bd);
    hipFree(Cd);

    if (passed == 1) {
        return true;
    }
    assert(passed == 1);
    return false;
}
void run1(size_t size, hipStream_t stream){
	float *Ah, *Bh, *Cd, *Dd, *Eh;

	HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault));
	HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault));
	HIPCHECK(hipMalloc(&Cd, size));
	HIPCHECK(hipMalloc(&Dd, size));
	HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault));

	for(int i=0;i<N;i++){
		Ah[i] = 1.0f;
	}

	HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream));
	HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream));
	hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd);
	HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream));
	HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream));
	HIPCHECK(hipDeviceSynchronize());
	HIPASSERT(Eh[10] == Ah[10] + 1.0f);
}
Пример #25
0
int main(int argc, char *argv[])
{

  int Num_Threads_per_Block      = 1024;
  int Num_Blocks_per_Grid        = 1;
  int Num_Warps_per_Block        = Num_Threads_per_Block/64;
  int Num_Warps_per_Grid         = (Num_Threads_per_Block*Num_Blocks_per_Grid)/64;
  
  int * host_any  = ( int*)malloc(Num_Warps_per_Grid*sizeof(int));
  int * host_all  = ( int*)malloc(Num_Warps_per_Grid*sizeof(int));
  int *device_any; 
  int *device_all;
  HIP_ASSERT(hipMalloc((void**)&device_any,Num_Warps_per_Grid*sizeof( int)));
  HIP_ASSERT(hipMalloc((void**)&device_all,Num_Warps_per_Grid*sizeof(int)));
for (int i=0; i<Num_Warps_per_Grid; i++) 
{
	host_any[i] = 0;
	host_all[i] = 0;
}
  HIP_ASSERT(hipMemcpy(device_any, host_any,sizeof(int), hipMemcpyHostToDevice));
  HIP_ASSERT(hipMemcpy(device_all, host_all,sizeof(int), hipMemcpyHostToDevice));

  hipLaunchKernel(warpvote, dim3(Num_Blocks_per_Grid),dim3(Num_Threads_per_Block),0,0, device_any, device_all ,Num_Warps_per_Block);


  HIP_ASSERT(hipMemcpy(host_any, device_any, Num_Warps_per_Grid*sizeof(int), hipMemcpyDeviceToHost));
  HIP_ASSERT(hipMemcpy(host_all, device_all, Num_Warps_per_Grid*sizeof(int), hipMemcpyDeviceToHost));
  for (int i=0; i<Num_Warps_per_Grid; i++) {

    printf("warp no. %d __any = %d \n",i,host_any[i]);
    printf("warp no. %d __all = %d \n",i,host_all[i]);


}


  return EXIT_SUCCESS;

}
Пример #26
0
bool run_rhypot() {
    double *A, *Ad, *B, *Bd, *C, *Cd;
    A = new double[N];
    B = new double[N];
    C = new double[N];
    double val = 0.0;
    for (int i = 0; i < N; i++) {
        A[i] = 1.0;
        B[i] = 2.0;
    }
    val = 1 / sqrt(1.0 + 4.0);
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMalloc((void**)&Cd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernelGGL(test_rhypot, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
    hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
        if (C[i] - val < 0.000001) {
            passed = 1;
        }
    }

    delete[] A;
    delete[] B;
    delete[] C;
    hipFree(Ad);
    hipFree(Bd);
    hipFree(Cd);

    if (passed == 1) {
        return true;
    }
    assert(passed == 1);
    return false;
}
Пример #27
0
void runTest(int argc, char **argv)
{
    hipDeviceProp_t deviceProp;
    deviceProp.major = 0;
    deviceProp.minor = 0;
    int dev = 0;

    hipDeviceGetProperties(&deviceProp, dev);

    // Statistics about the GPU device
    printf("> GPU device has %d Multi-Processors, "
           "SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

    int version = (deviceProp.major * 0x10 + deviceProp.minor);

    unsigned int numThreads = 256;
    unsigned int numBlocks = 64;
    unsigned int numData = 11;
    unsigned int memSize = sizeof(int) * numData;

    //allocate mem for the result on host side
    int *hOData = (int *) malloc(memSize);

    //initialize the memory
    for (unsigned int i = 0; i < numData; i++)
        hOData[i] = 0;

    //To make the AND and XOR tests generate something other than 0...
    hOData[8] = hOData[10] = 0xff;

    // allocate device memory for result
    int *dOData;
    hipMalloc((void **) &dOData, memSize);
    // copy host memory to device to initialize to zero
    hipMemcpy(dOData, hOData, memSize,hipMemcpyHostToDevice);

    // execute the kernel
    hipLaunchKernel(testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData);

    //Copy result from device to host
    hipMemcpy(hOData,dOData, memSize,hipMemcpyDeviceToHost);

    // Compute reference solution
    testResult = computeGold(hOData, numThreads * numBlocks);

    // Cleanup memory
    free(hOData);
    hipFree(dOData);
}
int main() {
    setup();
    int *A, *Ad;
    for(int i=0; i<NUM_SIZE; i++) {
        A = (int*)malloc(size[i]);
        valSet(A, 1, size[i]);
        hipMalloc(&Ad, size[i]);
        std::cout<<"Malloc success at size: "<<size[i]<<std::endl;
        for(int j=0; j<NUM_ITER; j++) {
            std::cout<<"\r"<<"Iter: "<<j;
            hipMemcpy(Ad, A, size[i], hipMemcpyHostToDevice);
        }
        std::cout<<std::endl;
        hipDeviceSynchronize();
    }
}
Пример #29
0
int main(){
    setup();
    int *A, *Ad;
    for(int i=0;i<NUM_SIZE;i++){
        A = (int*)malloc(size[i]);
        valSet(A, 1, size[i]);
        hipMalloc(&Ad, size[i]);
        std::cout<<"Malloc success at size: "<<size[i]<<std::endl;

        for(int j=0;j<NUM_ITER;j++){
            std::cout<<"Iter: "<<j<<std::endl;
            hipMemcpy(Ad, A, size[i], hipMemcpyHostToDevice);
            hipLaunchKernel(Add, dim3(1), dim3(size[i]/sizeof(int)), 0, 0, Ad);
            hipMemcpy(A, Ad, size[i], hipMemcpyDeviceToHost);
        }

        hipDeviceSynchronize();
    }
}
Пример #30
0
int main() {
    setup();
    int *A, *Ad;
    for (int i = 0; i < NUM_SIZE; i++) {
        std::cout << size[i] << std::endl;
        A = (int*)malloc(size[i]);
        valSet(A, 1, size[i]);
        hipMalloc(&Ad, size[i]);
        std::cout << "Malloc success at size: " << size[i] << std::endl;
        clock_t start, end;
        start = clock();
        for (int j = 0; j < NUM_ITER; j++) {
            //            std::cout<<"At iter: "<<j<<std::endl;
            hipMemcpy(Ad, A, size[i], hipMemcpyHostToDevice);
        }
        hipDeviceSynchronize();
        end = clock();
        double uS = (double)(end - start) * 1000 / (NUM_ITER * CLOCKS_PER_SEC);
        std::cout << uS << std::endl;
    }
}