Beispiel #1
0
/*
   Runs two thread kernels over the PETSc thread communicator.

   Builds one rank entry and one scalar entry per thread (ranks[i] = i,
   values[i] = i) and passes both arrays to kernel_func1 and then
   kernel_func2 through PetscThreadCommRunKernel().

   Returns 0 on success, otherwise the PETSc error code of the first
   failing call.
*/
int main(int argc,char **argv)
{
  PetscErrorCode ierr;
  PetscInt       nthreads,i;
  PetscInt       *ranks;
  PetscScalar    *values;

  /* A failed start-up is reported with a plain return; the CHKERRQ
     convention is only used once PETSc has been initialized. */
  ierr = PetscInitialize(&argc,&argv,(char *)0,help);
  if (ierr) return ierr;

  ierr = PetscThreadCommView(PETSC_COMM_WORLD,0);CHKERRQ(ierr);
  ierr = PetscThreadCommGetNThreads(PETSC_COMM_WORLD,&nthreads);CHKERRQ(ierr);

  /* One slot per thread in each argument array. */
  ierr = PetscMalloc(nthreads*sizeof(PetscInt),&ranks);CHKERRQ(ierr);
  ierr = PetscMalloc(nthreads*sizeof(PetscScalar),&values);CHKERRQ(ierr);

  for (i=0; i<nthreads; i++) {
    ranks[i]  = i;
    values[i] = i;
  }

  ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)kernel_func1,2,ranks,values);CHKERRQ(ierr);
  ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)kernel_func2,2,ranks,values);CHKERRQ(ierr);

  /* Synchronize before releasing the arrays that were handed to the
     kernels (presumably the kernels may still be running until the
     barrier returns -- confirm against the thread-comm implementation). */
  ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
  ierr = PetscFree(ranks);CHKERRQ(ierr);
  ierr = PetscFree(values);CHKERRQ(ierr);

  /* Propagate a shutdown failure through the exit status instead of dropping it. */
  ierr = PetscFinalize();
  return ierr;
}
Beispiel #2
0
/*
   STREAM-style memory bandwidth benchmark driver.

   Prints the problem size, initializes the work arrays a, b and c, runs
   the Copy, Scale, Add and Triad loops NTIMES times each, and prints the
   rate in MB/s derived from the best (minimum) time of every loop; the
   first iteration is excluded from the statistics.  When WITH_PTHREADS is
   set the loops are executed as kernels on the PETSc thread communicator,
   otherwise they run inline.

   Returns 0 on success, a PETSc error code if a PETSc call fails, or 1 if
   the work arrays cannot be allocated.
*/
int main(int argc,char *argv[])
{
  PetscErrorCode ierr;
  int            quantum, checktick();
  int            BytesPerWord;
  int            j, k;
  double         scalar=3.0, t, times[4][NTIMES];

  /* A failed start-up is reported with a plain return; the CHKERRQ
     convention is only used once PETSc has been initialized. */
  ierr = PetscInitialize(&argc,&argv,0,help);
  if (ierr) return ierr;

  /* --- SETUP --- determine precision and check timing --- */

  BytesPerWord = (int)sizeof(double);
  printf("This system uses %d bytes per DOUBLE PRECISION word.\n",BytesPerWord);

  printf(HLINE);
#if defined(NO_LONG_LONG)
  printf("Array size = %d, Offset = %d\n", N, OFFSET);
#else
  printf("Array size = %llu, Offset = %d\n", (unsigned long long) N, OFFSET);
#endif

  printf("Total memory required = %.1f MB.\n",(3.0 * BytesPerWord) * ((double)N / 1048576.0));
  printf("Each test is run %d times, but only\n", NTIMES);
  printf("the *best* time for each is used.\n");

  printf(HLINE);

#if !STATIC_ALLOC
  a = malloc((N+OFFSET)*sizeof(double));
  b = malloc((N+OFFSET)*sizeof(double));
  c = malloc((N+OFFSET)*sizeof(double));
  if (!a || !b || !c) {
    /* Without this check a large N would surface as a NULL dereference
       in the initialization loop below. */
    fprintf(stderr,"Failed to allocate the STREAM work arrays\n");
    free(a);
    free(b);
    free(c);
    PetscFinalize();
    return 1;
  }
#endif

#if WITH_PTHREADS
  ierr = PetscThreadCommGetNThreads(PETSC_COMM_WORLD,&nworkThreads);CHKERRQ(ierr);
  ierr = PetscMalloc((nworkThreads+1)*sizeof(PetscInt),&trstarts);CHKERRQ(ierr);
  {
    /* Split the N+OFFSET entries into contiguous per-thread ranges:
       thread j owns [trstarts[j], trstarts[j+1]).  Every thread gets Q
       entries and the first R threads get one extra. */
    PetscInt Q    = (N+OFFSET)/nworkThreads;
    PetscInt R    = (N+OFFSET) - Q*nworkThreads;
    PetscInt nloc;

    trstarts[0] = 0;
    for (j=0; j < nworkThreads; j++) {
      nloc          = (j < R) ? Q+1 : Q;
      trstarts[j+1] = trstarts[j]+nloc;
    }
  }
  ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Initialize_Kernel,1,&scalar);CHKERRQ(ierr);
  ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#else
  for (j=0; j<N; j++) {
    a[j] = 1.0;
    b[j] = 2.0;
    c[j] = 0.0;
  }
#endif

  /* Clock granularity in microseconds, clamped to at least one.  The
     value is only needed by the diagnostic report, which is disabled. */
  quantum = checktick();
  if (quantum < 1) quantum = 1;

  /* Preliminary pass that doubles a.  Its timing is no longer printed,
     but the pass itself must stay because it changes the contents of a
     (presumably checkSTREAMresults() accounts for it -- confirm). */
  t = mysecond();

#if WITH_PTHREADS
  ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_2A_Kernel,0);CHKERRQ(ierr);
  ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#else
  for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j];
#endif
  t = 1.0E6 * (mysecond() - t);

  /* Both values feed only the disabled diagnostics. */
  (void)quantum;
  (void)t;

  /*  --- MAIN LOOP --- repeat test cases NTIMES times --- */

  for (k=0; k<NTIMES; k++) {
    /* Copy: c = a */
    times[0][k] = mysecond();
#if WITH_PTHREADS
    ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Copy_Kernel,0);CHKERRQ(ierr);
    ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#else
    for (j=0; j<N; j++) c[j] = a[j];
#endif
    times[0][k] = mysecond() - times[0][k];

    /* Scale: b = scalar*c */
    times[1][k] = mysecond();
#if WITH_PTHREADS
    ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Scale_Kernel,1,&scalar);CHKERRQ(ierr);
    ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#else
    for (j=0; j<N; j++) b[j] = scalar*c[j];
#endif
    times[1][k] = mysecond() - times[1][k];

    /* Add: c = a + b */
    times[2][k] = mysecond();
#if WITH_PTHREADS
    ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Add_Kernel,0);CHKERRQ(ierr);
    ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#else
    for (j=0; j<N; j++) c[j] = a[j]+b[j];
#endif
    times[2][k] = mysecond() - times[2][k];

    /* Triad: a = b + scalar*c */
    times[3][k] = mysecond();
#if WITH_PTHREADS
    ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Triad_Kernel,1,&scalar);CHKERRQ(ierr);
    ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#else
    for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j];
#endif
    times[3][k] = mysecond() - times[3][k];
  }

  /*  --- SUMMARY --- */

  /* Iteration 0 is skipped when gathering the statistics. */
  for (k=1; k<NTIMES; k++) {
    for (j=0; j<4; j++) {
      avgtime[j] = avgtime[j] + times[j][k];
      mintime[j] = MIN(mintime[j], times[j][k]);
      maxtime[j] = MAX(maxtime[j], times[j][k]);
    }
  }

  printf("Function      Rate (MB/s) \n");
  for (j=0; j<4; j++) {
    /* The average is still maintained although only the best rate is printed. */
    avgtime[j] = avgtime[j]/(double)(NTIMES-1);

    printf("%s%11.4f  \n", label[j], 1.0E-06 * bytes[j]/mintime[j]);
  }

#if WITH_PTHREADS
  ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#endif

  /* --- Check Results --- */
  checkSTREAMresults();

  /* Release everything this function allocated, after the last reader. */
#if WITH_PTHREADS
  ierr = PetscFree(trstarts);CHKERRQ(ierr);
#endif
#if !STATIC_ALLOC
  free(a);
  free(b);
  free(c);
#endif

  /* Propagate a shutdown failure through the exit status instead of dropping it. */
  ierr = PetscFinalize();
  return ierr;
}
Beispiel #3
0
int main(int argc,char **argv)
{
    PetscErrorCode ierr;
    PetscInt       i,j,k,N=100,**counters,tsize;

    PetscInitialize(&argc,&argv,(char *)0,help);

    ierr = PetscThreadCommView(PETSC_COMM_WORLD,PETSC_VIEWER_STDOUT_WORLD);
    CHKERRQ(ierr);
    ierr = PetscOptionsGetInt(PETSC_NULL,"-N",&N,PETSC_NULL);
    CHKERRQ(ierr);

    ierr = PetscThreadCommGetNThreads(PETSC_COMM_WORLD,&tsize);
    CHKERRQ(ierr);
    ierr = PetscMalloc(tsize*sizeof(*counters),&counters);
    CHKERRQ(ierr);
    ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)CounterInit_kernel,1,counters);
    CHKERRQ(ierr);

    for (i=0; i<10; i++) {
        PetscReal t0,t1;
        ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);
        CHKERRQ(ierr);
        ierr = PetscGetTime(&t0);
        CHKERRQ(ierr);
        for (j=0; j<N; j++) {
            /*      ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)CounterIncrement_kernel,1,counters);CHKERRQ(ierr); */
            ierr = PetscThreadCommRunKernel1(PETSC_COMM_WORLD,(PetscThreadKernel)CounterIncrement_kernel,counters);
            CHKERRQ(ierr);
        }
        ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);
        CHKERRQ(ierr);
        ierr = PetscGetTime(&t1);
        CHKERRQ(ierr);
        ierr = PetscPrintf(PETSC_COMM_WORLD,"Time per kernel: %g us\n",1e6*(t1-t0)/N);
        CHKERRQ(ierr);
    }

    for (i=0; i<10; i++) {
        PetscReal t0,t1;
        ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);
        CHKERRQ(ierr);
        ierr = PetscGetTime(&t0);
        CHKERRQ(ierr);
        for (j=0; j<N; j++) {
            #pragma omp parallel num_threads(tsize)
            {
                PetscInt trank = omp_get_thread_num();
                CounterIncrement_kernel(trank,counters);
            }
        }
        ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);
        CHKERRQ(ierr);
        ierr = PetscGetTime(&t1);
        CHKERRQ(ierr);
        ierr = PetscPrintf(PETSC_COMM_WORLD,"OpenMP inline time per kernel: %g us\n",1e6*(t1-t0)/N);
        CHKERRQ(ierr);
    }

    for (i=0; i<10; i++) {
        PetscReal t0,t1;
        ierr = PetscGetTime(&t0);
        CHKERRQ(ierr);
        for (j=0; j<N; j++) {
            CounterIncrement_kernel(0,counters);
        }
        ierr = PetscGetTime(&t1);
        CHKERRQ(ierr);
        ierr = PetscPrintf(PETSC_COMM_WORLD,"Serial inline time per single kernel: %g us\n",1e6*(t1-t0)/N);
        CHKERRQ(ierr);
    }

    for (i=0; i<10; i++) {
        PetscReal t0,t1;
        ierr = PetscGetTime(&t0);
        CHKERRQ(ierr);
        for (j=0; j<N; j++) {
            for (k=0; k<tsize; k++) CounterIncrement_kernel(k,counters);
        }
        ierr = PetscGetTime(&t1);
        CHKERRQ(ierr);
        ierr = PetscPrintf(PETSC_COMM_WORLD,"Serial inline time per kernel: %g us\n",1e6*(t1-t0)/N);
        CHKERRQ(ierr);
    }

    ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)CounterFree_kernel,1,counters);
    CHKERRQ(ierr);
    ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);
    CHKERRQ(ierr);
    ierr = PetscFree(counters);
    CHKERRQ(ierr);
    PetscFinalize();
    return 0;
}