/** * PPU program entry point. */ int main(int argc, char** argv) { /* Get global memory pointer */ fixedgrid_t* const G = &G_GLOBAL; /* Iterators */ uint32_t k, iter; /* Start wall clock timer */ timer_start(&G->metrics.wallclock); /* Check dimensions */ if(NX % BLOCK_X != 0) { fprintf(stderr, "NX must be a multiple of %d\n", BLOCK_X); exit(1); } if(NY % BLOCK_Y != 0) { fprintf(stderr, "NY must be a multiple of %d\n", BLOCK_Y); exit(1); } if(NZ % BLOCK_Z != 0) { fprintf(stderr, "NZ must be a multiple of %d\n", BLOCK_Z); exit(1); } /* Initialize the model parameters */ init_model(G); /* Add emissions */ process_emissions(G); /* Print startup banner */ print_start_banner(G); /* Store initial concentration */ printf("Writing initial concentration data... "); write_conc(G, 0, 0); printf("done.\n"); printf("\n!!!!FIXME: Report # FPEs\n"); /* BEGIN CALCULATIONS */ for(iter=1, G->time = G->tstart; G->time < G->tend; G->time += G->dt, ++iter) { start_saprc99(G); for(k=0; k<NLOOKAT; k++) { // Copy concentration data to device CU_SAFE_CALL(cuMemcpyHtoD(G->dev_conc, &G->conc(0, 0, 0, MONITOR[k]), NX*NY*NZ*sizeof(real_t))); discretize_all_x(G, G->dt*0.5); discretize_all_y(G, G->dt*0.5); discretize_all_z(G, G->dt); discretize_all_y(G, G->dt*0.5); discretize_all_x(G, G->dt*0.5); // Copy updated concentrations back to host CU_SAFE_CALL(cuMemcpyDtoH((void*)&G->conc(0, 0, 0, MONITOR[k]), G->dev_conc_out, NX*NY*NZ*sizeof(real_t))); } update_model(G); #if WRITE_EACH_ITER == 1 write_conc(G, iter, 0); #endif printf(" After iteration %02d: Model time = %07.2f sec.\n", iter, iter*G->dt); } /* END CALCULATIONS */ /* Store concentration */ #if WRITE_EACH_ITER != 1 write_conc(G, iter-1, 0); #endif /* Show final time */ printf("\nFinal time: %f seconds.\n", (iter-1)*G->dt); timer_stop(&G->metrics.wallclock); /* Write metrics to CSV file */ write_metrics_as_csv(G, "NVidia CUDA"); /* Cleanup and exit */ CU_SAFE_CALL(cuMemFree(G->dev_conc)); CU_SAFE_CALL(cuMemFree(G->dev_wind)); 
CU_SAFE_CALL(cuMemFree(G->dev_diff)); CU_SAFE_CALL(cuMemFree(G->dev_buff)); CU_SAFE_CALL(cuMemFree(G->dev_conc_out)); CU_SAFE_CALL_NO_SYNC(cuCtxDetach(cu_context_global)); return 0; }
/**
 * Initializes the model.
 *
 * Sets up the metrics and the simulated time frame, fills the concentration,
 * wind, diffusion and temperature fields with their initial values, and then
 * initializes the CUDA driver interface and allocates the device buffers.
 *
 * @param G Model state to initialize.
 */
void init_model(fixedgrid_t* G)
{
    /* Grid indices along x, y and z */
    uint32_t i, j, k;

    /* Size in bytes of one full-grid device buffer */
    const size_t grid_bytes = NZ*NY*NX*sizeof(real_t);

    /* Metrics */
    metrics_init(&G->metrics, "PPE");

    /* Time frame */
    /* FIXME: year is ignored */
    G->tstart = day2sec(START_DOY) + hour2sec(START_HOUR) + minute2sec(START_MIN);
    G->tend   = day2sec(END_DOY) + hour2sec(END_HOUR) + minute2sec(END_MIN);
    G->dt     = STEP_SIZE;
    G->time   = G->tstart;

    /* Chemistry and concentration data */
    printf("Loading chemistry and concentration data... ");
    timer_start(&G->metrics.array_init);
#if DO_CHEMISTRY == 1
    {
        /* One initial value per species, filled in by saprc99_Initialize */
        real_t chemBuff[NSPEC];
        uint32_t spec;

        saprc99_Initialize(chemBuff);

        /* Every grid cell of a species starts at that species' value */
        for(spec = 0; spec < NSPEC; spec++)
            for(k = 0; k < NZ; k++)
                for(j = 0; j < NY; j++)
                    for(i = 0; i < NX; i++)
                        G->conc(i, j, k, spec) = chemBuff[spec];
    }
#else
    /* Without chemistry only the ind_O3 species is given an initial value */
    for(k = 0; k < NZ; k++)
        for(j = 0; j < NY; j++)
            for(i = 0; i < NX; i++)
                G->conc(i, j, k, ind_O3) = O3_INIT;
#endif
    timer_stop(&G->metrics.array_init);
    printf("done.\n");

    /* Wind field */
    printf("Loading wind field data... ");
    timer_start(&G->metrics.array_init);
    for(k = 0; k < NZ; k++)
        for(j = 0; j < NY; j++)
            for(i = 0; i < NX; i++)
            {
                G->wind_u(i, j, k) = WIND_U_INIT;
                G->wind_v(i, j, k) = WIND_V_INIT;
                G->wind_w(i, j, k) = WIND_W_INIT;
            }
    timer_stop(&G->metrics.array_init);
    printf("done.\n");

    /* Diffusion field */
    printf("Loading diffusion field data... ");
    timer_start(&G->metrics.array_init);
    for(k = 0; k < NZ; k++)
        for(j = 0; j < NY; j++)
            for(i = 0; i < NX; i++)
            {
                G->diff_h(i, j, k) = DIFF_H_INIT;
                G->diff_v(i, j, k) = DIFF_V_INIT;
            }
    timer_stop(&G->metrics.array_init);
    printf("done.\n");

    /* Temperature field */
    printf("Loading temperature field data... ");
    timer_start(&G->metrics.array_init);
    for(k = 0; k < NZ; k++)
        for(j = 0; j < NY; j++)
            for(i = 0; i < NX; i++)
                G->temp(i, j, k) = TEMP_INIT;
    timer_stop(&G->metrics.array_init);
    printf("done.\n");

    /* CUDA kernel and device memory */
    printf("Initializing CUDA driver interface... ");
    //FIXME: Start a timer here
    CU_SAFE_CALL(init_cuda_driver("data/discretize.cubin"));

    /* Five device buffers, each the size of one full grid of real_t */
    CU_SAFE_CALL(cuMemAlloc(&G->dev_conc, grid_bytes));
    CU_SAFE_CALL(cuMemAlloc(&G->dev_wind, grid_bytes));
    CU_SAFE_CALL(cuMemAlloc(&G->dev_diff, grid_bytes));
    CU_SAFE_CALL(cuMemAlloc(&G->dev_buff, grid_bytes));
    CU_SAFE_CALL(cuMemAlloc(&G->dev_conc_out, grid_bytes));

    init_discretization_kernel(G);
    //FIXME: Stop a timer here
    printf("done.\n");
}
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// void runTest(int argc, char** argv) { CUcontext cuContext; // initialize CUDA CUfunction pk = NULL; const char cubin_name [] = "pass_kernel.cubin"; const char kernel_name [] = "pass_kernel"; CU_SAFE_CALL(initCuda(cuContext, argv[0], &pk, argc, argv, cubin_name, kernel_name)); printf("initCuda-returned CUfunction:\n"); // cuParamSetx, x=i f v // http://visionexperts.blogspot.com/2010/07/cuda-parameter-alignment.html - check alignment #define ALIGN_UP(offset, alignment) \ (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1) size_t offset = 0; // input integers // CU paramset i. for(int i = 0 ; i < NUM_ARG ; i++) { int align = __alignof(int); ALIGN_UP(offset, align); cuParamSeti(pk, offset, i); printf ("offset %d = %d\n", i, offset); offset += sizeof(int); } // return array for updated inputs int size_int = sizeof(int); int size_array = size_int * NUM_ARG; CUdeviceptr d_return_values; cuMemAlloc (&d_return_values, size_array); void* ptr = (void*)(size_t)d_return_values; int align = __alignof(ptr); ALIGN_UP(offset, align); cuParamSetv(pk, offset, &ptr, sizeof(ptr)); printf("return values offset:%d\n", offset); offset += sizeof(ptr); CUdeviceptr d_return_N; cuMemAlloc(&d_return_N, size_int); void* ptrN = (void*)(size_t)d_return_N; int alignN = __alignof(ptrN); ALIGN_UP(offset, alignN); cuParamSetv(pk, offset, &ptrN, sizeof(ptr)); printf("return int offset:%d\n", offset); offset += sizeof(ptrN); // Calling kernel int BLOCK_SIZE_X = NUM_ARG; int BLOCK_SIZE_Y = 1; int BLOCK_SIZE_Z = 1; int GRID_SIZE = 1; cutilDrvSafeCallNoSync(cuFuncSetBlockShape(pk, BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z)); printf("paramsetsize:%d\n", offset); CU_SAFE_CALL(cuParamSetSize(pk, offset)); CU_SAFE_CALL(cuLaunchGrid(pk, GRID_SIZE, GRID_SIZE)); int* h_return_values = (int*)malloc(NUM_ARG * 
sizeof(int)); CU_SAFE_CALL(cuMemcpyDtoH((void*)h_return_values, d_return_values, size_array)); CU_SAFE_CALL(cuMemFree(d_return_values)); for(int i=0;i<NUM_ARG;i++) printf("%dth value = %d\n", i, h_return_values[i]); free(h_return_values); int* h_return_N = (int*)malloc(sizeof(int)); CU_SAFE_CALL(cuMemcpyDtoH((void*)h_return_N, d_return_N, size_int)); CU_SAFE_CALL(cuMemFree(d_return_N)); printf("%d sizeof array\n", *h_return_N); if(cuContext !=NULL) cuCtxDetach(cuContext); }