int main(int argc,char **argv) { PetscErrorCode ierr; PetscInt nthreads,i; PetscInt *ranks; PetscScalar *values; PetscInitialize(&argc,&argv,(char *)0,help); ierr = PetscThreadCommView(PETSC_COMM_WORLD,0);CHKERRQ(ierr); ierr = PetscThreadCommGetNThreads(PETSC_COMM_WORLD,&nthreads);CHKERRQ(ierr); ierr = PetscMalloc(nthreads*sizeof(PetscInt),&ranks);CHKERRQ(ierr); ierr = PetscMalloc(nthreads*sizeof(PetscScalar),&values);CHKERRQ(ierr); for(i=0;i < nthreads;i++) { ranks[i] = i; values[i] = i; } ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)kernel_func1,2,ranks,values);CHKERRQ(ierr); ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)kernel_func2,2,ranks,values);CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); ierr = PetscFree(ranks);CHKERRQ(ierr); ierr = PetscFree(values);CHKERRQ(ierr); PetscFinalize(); return 0; }
/*
   PetscStackCreate - Runs PetscStackCreate_kernel through the PETSC_COMM_SELF thread
   communicator and then waits on that communicator's barrier.

   Returns 0 on success; a failing call is reported and returned through CHKERRQ().

   Note: the routine ends with a plain "return 0" and does not use the
   PetscFunctionBegin/PetscFunctionReturn pair - presumably because the stack those
   macros operate on is what is being set up here; confirm before changing.
*/
PetscErrorCode PetscStackCreate(void)
{
  PetscErrorCode ierr;

  ierr = PetscThreadCommRunKernel0(PETSC_COMM_SELF,(PetscThreadKernel)PetscStackCreate_kernel);
  CHKERRQ(ierr);

  ierr = PetscThreadCommBarrier(PETSC_COMM_SELF);
  CHKERRQ(ierr);

  return 0;
}
/* PetscFunctionBegin; so that make rule checkbadPetscFunctionBegin works */
/*
   PetscStackDestroy - Runs PetscStackDestroy_kernel through the PETSC_COMM_SELF thread
   communicator, waits on that communicator's barrier, and then releases the
   thread-local storage associated with petscstack.

   Returns 0 on success; a failing call is reported and returned through CHKERRQ().
*/
PetscErrorCode PetscStackDestroy(void)
{
  PetscErrorCode ierr;

  ierr = PetscThreadCommRunKernel0(PETSC_COMM_SELF,(PetscThreadKernel)PetscStackDestroy_kernel);
  CHKERRQ(ierr);

  ierr = PetscThreadCommBarrier(PETSC_COMM_SELF);
  CHKERRQ(ierr);

  /* Deletes pthread_key if it was used */
  PetscThreadLocalDestroy(petscstack);

  return 0;
}
int main(int argc,char *argv[]) { PetscErrorCode ierr; int quantum, checktick(); int BytesPerWord; int j, k; double scalar=3.0, t, times[4][NTIMES]; PetscInitialize(&argc,&argv,0,help); /* --- SETUP --- determine precision and check timing --- */ /*printf(HLINE); printf("STREAM version $Revision: 5.9 $\n"); printf(HLINE); */ BytesPerWord = sizeof(double); printf("This system uses %d bytes per DOUBLE PRECISION word.\n",BytesPerWord); printf(HLINE); #if defined(NO_LONG_LONG) printf("Array size = %d, Offset = %d\n", N, OFFSET); #else printf("Array size = %llu, Offset = %d\n", (unsigned long long) N, OFFSET); #endif printf("Total memory required = %.1f MB.\n",(3.0 * BytesPerWord) * ((double)N / 1048576.0)); printf("Each test is run %d times, but only\n", NTIMES); printf("the *best* time for each is used.\n"); printf(HLINE); #if !STATIC_ALLOC a = malloc((N+OFFSET)*sizeof(double)); b = malloc((N+OFFSET)*sizeof(double)); c = malloc((N+OFFSET)*sizeof(double)); #endif #if WITH_PTHREADS ierr = PetscThreadCommGetNThreads(PETSC_COMM_WORLD,&nworkThreads);CHKERRQ(ierr); ierr = PetscMalloc((nworkThreads+1)*sizeof(PetscInt),&trstarts);CHKERRQ(ierr); PetscInt Q,R,nloc; PetscBool S; Q = (N+OFFSET)/nworkThreads; R = (N+OFFSET) - Q*nworkThreads; trstarts[0] = 0; for (j=0; j < nworkThreads; j++) { S = (PetscBool)(j < R); nloc = S ? Q+1 : Q; trstarts[j+1] = trstarts[j]+nloc; } ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Initialize_Kernel,1,&scalar);CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); # else for (j=0; j<N; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } #endif /*printf(HLINE);*/ /* Get initial value for system clock. 
*/ if ((quantum = checktick()) >= 1) ; /* printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); */ else quantum = 1; /* printf("Your clock granularity appears to be less than one microsecond.\n"); */ t = mysecond(); #if WITH_PTHREADS ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_2A_Kernel,0);CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); #else for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; #endif t = 1.0E6 * (mysecond() - t); /* printf("Each test below will take on the order of %d microseconds.\n", (int)t); printf(" (= %d clock ticks)\n", (int) (t/quantum)); printf("Increase the size of the arrays if this shows that\n"); printf("you are not getting at least 20 clock ticks per test.\n"); printf(HLINE); */ /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ for (k=0; k<NTIMES; k++) { times[0][k] = mysecond(); #if WITH_PTHREADS ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Copy_Kernel,0);CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); #else for (j=0; j<N; j++) c[j] = a[j]; #endif times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); #if WITH_PTHREADS ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Scale_Kernel,1,&scalar);CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); #else for (j=0; j<N; j++) b[j] = scalar*c[j]; #endif times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); #if WITH_PTHREADS ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Add_Kernel,0);CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); #else for (j=0; j<N; j++) c[j] = a[j]+b[j]; #endif times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); #if WITH_PTHREADS ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Triad_Kernel,1,&scalar);CHKERRQ(ierr); ierr = 
PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); #else for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; #endif times[3][k] = mysecond() - times[3][k]; } /* --- SUMMARY --- */ for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } printf("Function Rate (MB/s) \n"); for (j=0; j<4; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); printf("%s%11.4f \n", label[j], 1.0E-06 * bytes[j]/mintime[j]); } /* printf(HLINE);*/ #if WITH_PTHREADS ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); #endif /* --- Check Results --- */ checkSTREAMresults(); /* printf(HLINE);*/ PetscFinalize(); return 0; }
int main(int argc,char **argv) { PetscErrorCode ierr; PetscScalar dot=0.0,v; Vec x,y; PetscInt N=8; PetscScalar one=1.0,two=2.0,alpha=2.0; PetscInitialize(&argc,&argv,(char *)0,help); #if defined(PETSC_THREADCOMM_ACTIVE) ierr = PetscThreadCommView(PETSC_COMM_WORLD,PETSC_VIEWER_STDOUT_WORLD); CHKERRQ(ierr); #endif ierr = PetscOptionsGetInt(PETSC_NULL,"-N",&N,PETSC_NULL); CHKERRQ(ierr); ierr = VecCreate(PETSC_COMM_WORLD,&x); CHKERRQ(ierr); ierr = VecSetSizes(x,PETSC_DECIDE,N); CHKERRQ(ierr); ierr = VecSetFromOptions(x); CHKERRQ(ierr); ierr = VecSet(x,one); CHKERRQ(ierr); ierr = PetscPrintf(PETSC_COMM_WORLD,"x = %lf\n",one); CHKERRQ(ierr); ierr = VecCreate(PETSC_COMM_WORLD,&y); CHKERRQ(ierr); ierr = VecSetSizes(y,PETSC_DECIDE,N); CHKERRQ(ierr); ierr = VecSetFromOptions(y); CHKERRQ(ierr); ierr = VecSet(y,two); CHKERRQ(ierr); ierr = PetscPrintf(PETSC_COMM_WORLD,"y = %lf\n",two); CHKERRQ(ierr); ierr = VecAXPY(y,alpha,x); CHKERRQ(ierr); v = two+alpha*one; ierr = PetscPrintf(PETSC_COMM_WORLD,"x+%lfy = %lf\n",alpha,v); CHKERRQ(ierr); ierr = VecDot(x,y,&dot); CHKERRQ(ierr); #if defined(PETSC_THREADCOMM_ACTIVE) ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD); CHKERRQ(ierr); #endif ierr = PetscPrintf(PETSC_COMM_WORLD,"Dot product %d*(%lf*%lf) is %lf\n",N,one,v,dot); CHKERRQ(ierr); ierr = VecDestroy(&x); CHKERRQ(ierr); ierr = VecDestroy(&y); CHKERRQ(ierr); PetscFinalize(); return 0; }
int main(int argc,char **argv) { PetscErrorCode ierr; PetscInt i,j,k,N=100,**counters,tsize; PetscInitialize(&argc,&argv,(char *)0,help); ierr = PetscThreadCommView(PETSC_COMM_WORLD,PETSC_VIEWER_STDOUT_WORLD); CHKERRQ(ierr); ierr = PetscOptionsGetInt(PETSC_NULL,"-N",&N,PETSC_NULL); CHKERRQ(ierr); ierr = PetscThreadCommGetNThreads(PETSC_COMM_WORLD,&tsize); CHKERRQ(ierr); ierr = PetscMalloc(tsize*sizeof(*counters),&counters); CHKERRQ(ierr); ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)CounterInit_kernel,1,counters); CHKERRQ(ierr); for (i=0; i<10; i++) { PetscReal t0,t1; ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD); CHKERRQ(ierr); ierr = PetscGetTime(&t0); CHKERRQ(ierr); for (j=0; j<N; j++) { /* ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)CounterIncrement_kernel,1,counters);CHKERRQ(ierr); */ ierr = PetscThreadCommRunKernel1(PETSC_COMM_WORLD,(PetscThreadKernel)CounterIncrement_kernel,counters); CHKERRQ(ierr); } ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD); CHKERRQ(ierr); ierr = PetscGetTime(&t1); CHKERRQ(ierr); ierr = PetscPrintf(PETSC_COMM_WORLD,"Time per kernel: %g us\n",1e6*(t1-t0)/N); CHKERRQ(ierr); } for (i=0; i<10; i++) { PetscReal t0,t1; ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD); CHKERRQ(ierr); ierr = PetscGetTime(&t0); CHKERRQ(ierr); for (j=0; j<N; j++) { #pragma omp parallel num_threads(tsize) { PetscInt trank = omp_get_thread_num(); CounterIncrement_kernel(trank,counters); } } ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD); CHKERRQ(ierr); ierr = PetscGetTime(&t1); CHKERRQ(ierr); ierr = PetscPrintf(PETSC_COMM_WORLD,"OpenMP inline time per kernel: %g us\n",1e6*(t1-t0)/N); CHKERRQ(ierr); } for (i=0; i<10; i++) { PetscReal t0,t1; ierr = PetscGetTime(&t0); CHKERRQ(ierr); for (j=0; j<N; j++) { CounterIncrement_kernel(0,counters); } ierr = PetscGetTime(&t1); CHKERRQ(ierr); ierr = PetscPrintf(PETSC_COMM_WORLD,"Serial inline time per single kernel: %g us\n",1e6*(t1-t0)/N); CHKERRQ(ierr); } for (i=0; i<10; 
i++) { PetscReal t0,t1; ierr = PetscGetTime(&t0); CHKERRQ(ierr); for (j=0; j<N; j++) { for (k=0; k<tsize; k++) CounterIncrement_kernel(k,counters); } ierr = PetscGetTime(&t1); CHKERRQ(ierr); ierr = PetscPrintf(PETSC_COMM_WORLD,"Serial inline time per kernel: %g us\n",1e6*(t1-t0)/N); CHKERRQ(ierr); } ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)CounterFree_kernel,1,counters); CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD); CHKERRQ(ierr); ierr = PetscFree(counters); CHKERRQ(ierr); PetscFinalize(); return 0; }