/*
 * OpenCL host stub for the "save_soln" kernel.
 *
 * Sets the launch configuration and dynamic local-memory size, binds the
 * kernel arguments, enqueues the kernel on cqCommandQueue and blocks until
 * completion.  Timing (and, under PROFILE, event-profiling) data is
 * accumulated into OP_kernels[0].
 *
 * Fixes relative to the previous version:
 *  - the PROFILE block referenced `ciErrNum`/`ceEvent`, which are not the
 *    locals declared in this function; it now uses `errorCode` and the
 *    `event` returned by clEnqueueNDRangeKernel, so profiling reads the
 *    event of THIS launch;
 *  - the event is now released (clReleaseEvent), avoiding a per-call leak;
 *  - removed a duplicated MAX(...) statement that recomputed the same value.
 */
void save_soln_host(const char *userSubroutine, op_set set, op_arg opDat1, op_arg opDat2)
{
  size_t blocksPerGrid;
  size_t threadsPerBlock;
  size_t totalThreadNumber;
  size_t dynamicSharedMemorySize;
  cl_int errorCode;
  cl_event event;
  cl_kernel kernelPointer;
  int sharedMemoryOffset;
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  op_timers(&cpu_t1, &wall_t1);

  blocksPerGrid = 200;
  threadsPerBlock = threadsPerBlockSize_save_soln;
  totalThreadNumber = threadsPerBlock * blocksPerGrid;

  /* per-work-item staging space: both dats are 4 floats wide */
  dynamicSharedMemorySize = 0;
  dynamicSharedMemorySize = MAX(dynamicSharedMemorySize, sizeof(float) * 4);
  sharedMemoryOffset = dynamicSharedMemorySize * OP_WARPSIZE;
  dynamicSharedMemorySize = dynamicSharedMemorySize * threadsPerBlock;

  kernelPointer = getKernel("save_soln_kernel");

  /* OR the status codes together; any failure makes the result != CL_SUCCESS */
  errorCode = clSetKernelArg(kernelPointer, 0, sizeof(cl_mem), &opDat1.data_d);
  errorCode = errorCode | clSetKernelArg(kernelPointer, 1, sizeof(cl_mem), &opDat2.data_d);
  errorCode = errorCode | clSetKernelArg(kernelPointer, 2, sizeof(int), &sharedMemoryOffset);
  errorCode = errorCode | clSetKernelArg(kernelPointer, 3, sizeof(int), &set->size);
  /* argument 4: dynamically sized local-memory buffer (arg_value == NULL) */
  errorCode = errorCode | clSetKernelArg(kernelPointer, 4, dynamicSharedMemorySize, NULL);
  assert_m(errorCode == CL_SUCCESS, "Error setting OpenCL kernel arguments save_soln");

  errorCode = clEnqueueNDRangeKernel(cqCommandQueue, kernelPointer, 1, NULL,
                                     &totalThreadNumber, &threadsPerBlock, 0, NULL, &event);
  assert_m(errorCode == CL_SUCCESS, "Error executing OpenCL kernel save_soln");

  errorCode = clFinish(cqCommandQueue);
  assert_m(errorCode == CL_SUCCESS, "Error completing device command queue");

#ifdef PROFILE
  {
    /* read the timestamps of this launch's event (ns, per OpenCL spec) */
    unsigned long tqueue, tsubmit, tstart, tend;
    errorCode = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, sizeof(tqueue), &tqueue, NULL);
    errorCode |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_SUBMIT, sizeof(tsubmit), &tsubmit, NULL);
    errorCode |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(tstart), &tstart, NULL);
    errorCode |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(tend), &tend, NULL);
    assert_m(errorCode == CL_SUCCESS, "error getting profiling info");
    OP_kernels[0].queue_time += (tsubmit - tqueue);
    OP_kernels[0].wait_time += (tstart - tsubmit);
    OP_kernels[0].execution_time += (tend - tstart);
  }
#endif

  clReleaseEvent(event);

  // update kernel record
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name = userSubroutine;
  OP_kernels[0].count += 1;
  OP_kernels[0].time += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * opDat1.size;
  OP_kernels[0].transfer += (float)set->size * opDat2.size;
}
/*
 * OpenMP host stub for the direct (no indirection) "save_soln" loop.
 *
 * Splits the set [0, set->size) evenly across the available threads and
 * invokes the generated worker op_x86_save_soln on each contiguous slice.
 * Timing and transfer statistics are accumulated into OP_kernels[0].
 */
void op_par_loop_save_soln(char const *name, op_set set,
                           op_arg arg0,
                           op_arg arg1 )
{
  if (OP_diags > 2)
    printf(" kernel routine w/o indirection: save_soln \n");

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // number of worker threads (1 when not built with OpenMP)
#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // each thread processes its own contiguous chunk of the set
  #pragma omp parallel for
  for (int tid = 0; tid < nthreads; tid++) {
    int lo = (set->size * tid) / nthreads;
    int hi = (set->size * (tid + 1)) / nthreads;
    op_x86_save_soln((float *)arg0.data,
                     (float *)arg1.data,
                     lo, hi);
  }

  // update kernel record
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name = name;
  OP_kernels[0].count += 1;
  OP_kernels[0].time += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * arg0.size;
  OP_kernels[0].transfer += (float)set->size * arg1.size;
}
/*
 * Airfoil benchmark driver (HDF5 variant).
 *
 * Reads the mesh sets, mappings, datasets and flow constants from
 * new_grid.h5, registers them with OP2, partitions the mesh, then runs
 * 1000 time-marching iterations of the save_soln / adt_calc / res_calc /
 * bres_calc / update kernel sequence, printing the residual every 100
 * iterations.  Finally fetches the solution back and reports timings.
 *
 * NOTE(review): gam, gm1, cfl, eps, mach, alpha and qinf are file-scope
 * globals declared elsewhere in this project — not visible here.
 */
int main(int argc, char **argv) {
  // OP initialisation
  op_init(argc, argv, 2);

  int niter;
  double rms;

  // timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  // set constants and initialise flow field and residual
  op_printf("initialising flow field \n");

  char file[] = "new_grid.h5";

  // declare sets, pointers, datasets and global constants (all from HDF5)
  op_set nodes = op_decl_set_hdf5(file, "nodes");
  op_set edges = op_decl_set_hdf5(file, "edges");
  op_set bedges = op_decl_set_hdf5(file, "bedges");
  op_set cells = op_decl_set_hdf5(file, "cells");

  op_map pedge = op_decl_map_hdf5(edges, nodes, 2, file, "pedge");
  op_map pecell = op_decl_map_hdf5(edges, cells, 2, file, "pecell");
  op_map pbedge = op_decl_map_hdf5(bedges, nodes, 2, file, "pbedge");
  op_map pbecell = op_decl_map_hdf5(bedges, cells, 1, file, "pbecell");
  op_map pcell = op_decl_map_hdf5(cells, nodes, 4, file, "pcell");
  op_map m_test = op_decl_map_hdf5(cells, nodes, 4, file, "m_test");
  if (m_test == NULL) printf("m_test not found\n");

  op_dat p_bound = op_decl_dat_hdf5(bedges, 1, "int", file, "p_bound");
  op_dat p_x = op_decl_dat_hdf5(nodes, 2, "double", file, "p_x");
  op_dat p_q = op_decl_dat_hdf5(cells, 4, "double", file, "p_q");
  op_dat p_qold = op_decl_dat_hdf5(cells, 4, "double", file, "p_qold");
  op_dat p_adt = op_decl_dat_hdf5(cells, 1, "double", file, "p_adt");
  op_dat p_res = op_decl_dat_hdf5(cells, 4, "double", file, "p_res");
  op_dat p_test = op_decl_dat_hdf5(cells, 4, "double", file, "p_test");
  if (p_test == NULL) printf("p_test not found\n");

  // read the flow constants from the HDF5 file into the globals
  op_get_const_hdf5("gam", 1, "double", (char *)&gam, "new_grid.h5");
  op_get_const_hdf5("gm1", 1, "double", (char *)&gm1, "new_grid.h5");
  op_get_const_hdf5("cfl", 1, "double", (char *)&cfl, "new_grid.h5");
  op_get_const_hdf5("eps", 1, "double", (char *)&eps, "new_grid.h5");
  op_get_const_hdf5("mach", 1, "double", (char *)&mach, "new_grid.h5");
  op_get_const_hdf5("alpha", 1, "double", (char *)&alpha, "new_grid.h5");
  op_get_const_hdf5("qinf", 4, "double", (char *)&qinf, "new_grid.h5");

  // register the constants with OP2
  op_decl_const2("gam", 1, "double", &gam);
  op_decl_const2("gm1", 1, "double", &gm1);
  op_decl_const2("cfl", 1, "double", &cfl);
  op_decl_const2("eps", 1, "double", &eps);
  op_decl_const2("mach", 1, "double", &mach);
  op_decl_const2("alpha", 1, "double", &alpha);
  op_decl_const2("qinf", 4, "double", qinf);

  op_diagnostic_output();

  // write back original data just to check the file was read correctly;
  // run h5diff between new_grid_out.h5 and new_grid.h5 to compare the two files
  op_dump_to_hdf5("new_grid_out.h5");
  op_write_const_hdf5("gam", 1, "double", (char *)&gam, "new_grid_out.h5");
  op_write_const_hdf5("gm1", 1, "double", (char *)&gm1, "new_grid_out.h5");
  op_write_const_hdf5("cfl", 1, "double", (char *)&cfl, "new_grid_out.h5");
  op_write_const_hdf5("eps", 1, "double", (char *)&eps, "new_grid_out.h5");
  op_write_const_hdf5("mach", 1, "double", (char *)&mach, "new_grid_out.h5");
  op_write_const_hdf5("alpha", 1, "double", (char *)&alpha, "new_grid_out.h5");
  op_write_const_hdf5("qinf", 4, "double", (char *)qinf, "new_grid_out.h5");

  // trigger partitioning and halo creation routines
  op_partition("PTSCOTCH", "KWAY", edges, pecell, p_x);
  //op_partition("PARMETIS", "KWAY", edges, pecell, p_x);

  // global cell count, used to normalise the residual below
  int g_ncell = op_get_size(cells);

  // initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  // main time-marching loop
  niter = 1000;

  for (int iter = 1; iter <= niter; iter++) {

    // save old flow solution
    op_par_loop_save_soln("save_soln", cells,
        op_arg_dat(p_q, -1, OP_ID, 4, "double", OP_READ),
        op_arg_dat(p_qold, -1, OP_ID, 4, "double", OP_WRITE));

    // predictor/corrector update loop
    for (int k = 0; k < 2; k++) {

      // calculate area/timstep
      op_par_loop_adt_calc("adt_calc", cells,
          op_arg_dat(p_x, 0, pcell, 2, "double", OP_READ),
          op_arg_dat(p_x, 1, pcell, 2, "double", OP_READ),
          op_arg_dat(p_x, 2, pcell, 2, "double", OP_READ),
          op_arg_dat(p_x, 3, pcell, 2, "double", OP_READ),
          op_arg_dat(p_q, -1, OP_ID, 4, "double", OP_READ),
          op_arg_dat(p_adt, -1, OP_ID, 1, "double", OP_WRITE));

      // calculate flux residual
      op_par_loop_res_calc("res_calc", edges,
          op_arg_dat(p_x, 0, pedge, 2, "double", OP_READ),
          op_arg_dat(p_x, 1, pedge, 2, "double", OP_READ),
          op_arg_dat(p_q, 0, pecell, 4, "double", OP_READ),
          op_arg_dat(p_q, 1, pecell, 4, "double", OP_READ),
          op_arg_dat(p_adt, 0, pecell, 1, "double", OP_READ),
          op_arg_dat(p_adt, 1, pecell, 1, "double", OP_READ),
          op_arg_dat(p_res, 0, pecell, 4, "double", OP_INC),
          op_arg_dat(p_res, 1, pecell, 4, "double", OP_INC));

      op_par_loop_bres_calc("bres_calc", bedges,
          op_arg_dat(p_x, 0, pbedge, 2, "double", OP_READ),
          op_arg_dat(p_x, 1, pbedge, 2, "double", OP_READ),
          op_arg_dat(p_q, 0, pbecell, 4, "double", OP_READ),
          op_arg_dat(p_adt, 0, pbecell, 1, "double", OP_READ),
          op_arg_dat(p_res, 0, pbecell, 4, "double", OP_INC),
          op_arg_dat(p_bound, -1, OP_ID, 1, "int", OP_READ));

      // update flow field
      rms = 0.0;

      op_par_loop_update("update", cells,
          op_arg_dat(p_qold, -1, OP_ID, 4, "double", OP_READ),
          op_arg_dat(p_q, -1, OP_ID, 4, "double", OP_WRITE),
          op_arg_dat(p_res, -1, OP_ID, 4, "double", OP_RW),
          op_arg_dat(p_adt, -1, OP_ID, 1, "double", OP_READ),
          op_arg_gbl(&rms, 1, "double", OP_INC));
    }

    // print iteration history (every 100 iterations)
    rms = sqrt(rms / (double)g_ncell);
    if (iter % 100 == 0)
      op_printf(" %d %10.5e \n", iter, rms);
  }

  op_timers(&cpu_t2, &wall_t2);

  // write the indicated segment of the op_dat's data to a memory block in the
  // order it was originally arranged (i.e. before partitioning and reordering)
  double *q = (double *)op_malloc(sizeof(double) * op_get_size(cells) * 4);
  op_fetch_data_idx(p_q, q, 0, op_get_size(cells) - 1);
  free(q);

  // write the op_dat's data to an hdf5 file in the order it was originally
  // arranged (i.e. before partitioning and reordering)
  op_fetch_data_hdf5_file(p_q, "file_name.h5");

  //printf("Root process = %d\n",op_is_root());

  //output the result dat array to files
  //op_dump_to_hdf5("new_grid_out.h5"); //writes data as it is held on each process (under MPI)

  //compress using
  // ~/hdf5/bin/h5repack -f GZIP=9 new_grid.h5 new_grid_pack.h5

  op_timing_output();
  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
  op_exit();
}
/*
 * Airfoil benchmark driver (plain-file variant, double precision, using
 * temporary op_dats).
 *
 * Reads the mesh from ./new_grid.dat, initialises the flow field from
 * free-stream conditions, then runs 1000 time-marching iterations.  p_res,
 * p_adt and p_qold are declared as TEMPORARY op_dats inside the loop body
 * and freed at the end of each iteration.  For the 720000-cell reference
 * mesh the residual is validated against a known value every 1000
 * iterations.
 *
 * NOTE(review): gam, gm1, cfl, eps and qinf are file-scope globals
 * declared elsewhere in this project — not visible here.
 */
int main(int argc, char **argv) {
  // OP initialisation
  op_init(argc, argv, 2);

  int *becell, *ecell, *bound, *bedge, *edge, *cell;
  double *x, *q, *qold, *adt, *res;

  int nnode, ncell, nedge, nbedge, niter;
  double rms;

  // timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  // read in grid
  op_printf("reading in grid \n");

  FILE *fp;
  if ((fp = fopen("./new_grid.dat", "r")) == NULL) {
    op_printf("can't open file new_grid.dat\n");
    exit(-1);
  }

  // header: node, cell, edge and boundary-edge counts
  if (fscanf(fp, "%d %d %d %d \n", &nnode, &ncell, &nedge, &nbedge) != 4) {
    op_printf("error reading from new_grid.dat\n");
    exit(-1);
  }

  cell = (int *)malloc(4 * ncell * sizeof(int));
  edge = (int *)malloc(2 * nedge * sizeof(int));
  ecell = (int *)malloc(2 * nedge * sizeof(int));
  bedge = (int *)malloc(2 * nbedge * sizeof(int));
  becell = (int *)malloc(nbedge * sizeof(int));
  bound = (int *)malloc(nbedge * sizeof(int));

  x = (double *)malloc(2 * nnode * sizeof(double));
  q = (double *)malloc(4 * ncell * sizeof(double));
  qold = (double *)malloc(4 * ncell * sizeof(double));
  res = (double *)malloc(4 * ncell * sizeof(double));
  adt = (double *)malloc(ncell * sizeof(double));

  // node coordinates
  for (int n = 0; n < nnode; n++) {
    if (fscanf(fp, "%lf %lf \n", &x[2 * n], &x[2 * n + 1]) != 2) {
      op_printf("error reading from new_grid.dat\n");
      exit(-1);
    }
  }

  // cell -> node connectivity
  for (int n = 0; n < ncell; n++) {
    if (fscanf(fp, "%d %d %d %d \n", &cell[4 * n], &cell[4 * n + 1],
               &cell[4 * n + 2], &cell[4 * n + 3]) != 4) {
      op_printf("error reading from new_grid.dat\n");
      exit(-1);
    }
  }

  // edge -> node and edge -> cell connectivity
  for (int n = 0; n < nedge; n++) {
    if (fscanf(fp, "%d %d %d %d \n", &edge[2 * n], &edge[2 * n + 1],
               &ecell[2 * n], &ecell[2 * n + 1]) != 4) {
      op_printf("error reading from new_grid.dat\n");
      exit(-1);
    }
  }

  // boundary edge -> node / cell connectivity and boundary condition flags
  for (int n = 0; n < nbedge; n++) {
    if (fscanf(fp, "%d %d %d %d \n", &bedge[2 * n], &bedge[2 * n + 1],
               &becell[n], &bound[n]) != 4) {
      op_printf("error reading from new_grid.dat\n");
      exit(-1);
    }
  }

  fclose(fp);

  // set constants and initialise flow field and residual
  op_printf("initialising flow field \n");

  gam = 1.4f;
  gm1 = gam - 1.0f;
  cfl = 0.9f;
  eps = 0.05f;

  double mach = 0.4f;
  double alpha = 3.0f * atan(1.0f) / 45.0f;
  double p = 1.0f;
  double r = 1.0f;
  double u = sqrt(gam * p / r) * mach;
  double e = p / (r * gm1) + 0.5f * u * u;

  // free-stream state vector
  qinf[0] = r;
  qinf[1] = r * u;
  qinf[2] = 0.0f;
  qinf[3] = r * e;

  // start every cell at free-stream conditions with zero residual
  for (int n = 0; n < ncell; n++) {
    for (int m = 0; m < 4; m++) {
      q[4 * n + m] = qinf[m];
      res[4 * n + m] = 0.0f;
    }
  }

  // declare sets, pointers, datasets and global constants
  op_set nodes = op_decl_set(nnode, "nodes");
  op_set edges = op_decl_set(nedge, "edges");
  op_set bedges = op_decl_set(nbedge, "bedges");
  op_set cells = op_decl_set(ncell, "cells");

  op_map pedge = op_decl_map(edges, nodes, 2, edge, "pedge");
  op_map pecell = op_decl_map(edges, cells, 2, ecell, "pecell");
  op_map pbedge = op_decl_map(bedges, nodes, 2, bedge, "pbedge");
  op_map pbecell = op_decl_map(bedges, cells, 1, becell, "pbecell");
  op_map pcell = op_decl_map(cells, nodes, 4, cell, "pcell");

  op_dat p_bound = op_decl_dat(bedges, 1, "int", bound, "p_bound");
  op_dat p_x = op_decl_dat(nodes, 2, "double", x, "p_x");
  op_dat p_q = op_decl_dat(cells, 4, "double", q, "p_q");
  //op_dat p_qold = op_decl_dat(cells ,4,"double",qold ,"p_qold");
  //op_dat p_adt = op_decl_dat(cells ,1,"double",adt ,"p_adt");
  //op_dat p_res = op_decl_dat(cells ,4,"double",res ,"p_res");

  // p_res, p_adt and p_qold are declared as temporary op_dats during
  // the execution of the time-marching loop (see below)

  op_decl_const2("gam", 1, "double", &gam);
  op_decl_const2("gm1", 1, "double", &gm1);
  op_decl_const2("cfl", 1, "double", &cfl);
  op_decl_const2("eps", 1, "double", &eps);
  op_decl_const2("mach", 1, "double", &mach);
  op_decl_const2("alpha", 1, "double", &alpha);
  op_decl_const2("qinf", 4, "double", qinf);

  op_diagnostic_output();

  // global cell count, used to normalise the residual and for validation
  double g_ncell = op_get_size(cells);

  // initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  // main time-marching loop
  niter = 1000;

  for (int iter = 1; iter <= niter; iter++) {

    // declare the per-iteration temporary dats (NULL data pointer lets
    // OP2 allocate; p_qold reuses the user buffer qold)
    double *tmp_elem = NULL;
    op_dat p_res = op_decl_dat_temp(cells, 4, "double", tmp_elem, "p_res");
    op_dat p_adt = op_decl_dat_temp(cells, 1, "double", tmp_elem, "p_adt");
    op_dat p_qold = op_decl_dat_temp(cells, 4, "double", qold, "p_qold");

    // save old flow solution
    op_par_loop_save_soln("save_soln", cells,
        op_arg_dat(p_q, -1, OP_ID, 4, "double", OP_READ),
        op_arg_dat(p_qold, -1, OP_ID, 4, "double", OP_WRITE));

    // predictor/corrector update loop
    for (int k = 0; k < 2; k++) {

      // calculate area/timstep
      op_par_loop_adt_calc("adt_calc", cells,
          op_arg_dat(p_x, 0, pcell, 2, "double", OP_READ),
          op_arg_dat(p_x, 1, pcell, 2, "double", OP_READ),
          op_arg_dat(p_x, 2, pcell, 2, "double", OP_READ),
          op_arg_dat(p_x, 3, pcell, 2, "double", OP_READ),
          op_arg_dat(p_q, -1, OP_ID, 4, "double", OP_READ),
          op_arg_dat(p_adt, -1, OP_ID, 1, "double", OP_WRITE));

      // calculate flux residual
      op_par_loop_res_calc("res_calc", edges,
          op_arg_dat(p_x, 0, pedge, 2, "double", OP_READ),
          op_arg_dat(p_x, 1, pedge, 2, "double", OP_READ),
          op_arg_dat(p_q, 0, pecell, 4, "double", OP_READ),
          op_arg_dat(p_q, 1, pecell, 4, "double", OP_READ),
          op_arg_dat(p_adt, 0, pecell, 1, "double", OP_READ),
          op_arg_dat(p_adt, 1, pecell, 1, "double", OP_READ),
          op_arg_dat(p_res, 0, pecell, 4, "double", OP_INC),
          op_arg_dat(p_res, 1, pecell, 4, "double", OP_INC));

      op_par_loop_bres_calc("bres_calc", bedges,
          op_arg_dat(p_x, 0, pbedge, 2, "double", OP_READ),
          op_arg_dat(p_x, 1, pbedge, 2, "double", OP_READ),
          op_arg_dat(p_q, 0, pbecell, 4, "double", OP_READ),
          op_arg_dat(p_adt, 0, pbecell, 1, "double", OP_READ),
          op_arg_dat(p_res, 0, pbecell, 4, "double", OP_INC),
          op_arg_dat(p_bound, -1, OP_ID, 1, "int", OP_READ));

      // update flow field
      rms = 0.0;

      op_par_loop_update("update", cells,
          op_arg_dat(p_qold, -1, OP_ID, 4, "double", OP_READ),
          op_arg_dat(p_q, -1, OP_ID, 4, "double", OP_WRITE),
          op_arg_dat(p_res, -1, OP_ID, 4, "double", OP_RW),
          op_arg_dat(p_adt, -1, OP_ID, 1, "double", OP_READ),
          op_arg_gbl(&rms, 1, "double", OP_INC));
    }

    // print iteration history
    rms = sqrt(rms / (double)g_ncell);
    if (iter % 100 == 0)
      op_printf(" %d %10.5e \n", iter, rms);

    // default mesh — validation against the known reference residual
    if (iter % 1000 == 0 && g_ncell == 720000) {
      //op_printf(" %d %3.16f \n",iter,rms);
      double diff = fabs((100.0 * (rms / 0.0001060114637578)) - 100.0);
      op_printf("\n\nTest problem with %d cells is within %3.15E %% of the expected solution\n", 720000, diff);
      if (diff < 0.00001) {
        op_printf("This test is considered PASSED\n");
      } else {
        op_printf("This test is considered FAILED\n");
      }
    }

    // release the per-iteration temporary dats
    if (op_free_dat_temp(p_res) < 0)
      op_printf("Error: temporary op_dat %s cannot be removed\n", p_res->name);
    if (op_free_dat_temp(p_adt) < 0)
      op_printf("Error: temporary op_dat %s cannot be removed\n", p_adt->name);
    if (op_free_dat_temp(p_qold) < 0)
      op_printf("Error: temporary op_dat %s cannot be removed\n", p_qold->name);
  }

  op_timers(&cpu_t2, &wall_t2);
  op_timing_output();
  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
  op_exit();

  free(cell);
  free(edge);
  free(ecell);
  free(bedge);
  free(becell);
  free(bound);
  free(x);
  free(q);
  free(qold);
  free(res);
  free(adt);
}
/*
 * Airfoil benchmark driver (plain-file variant, single precision).
 *
 * Reads the mesh from ./new_grid.dat, initialises the flow field from
 * free-stream conditions, declares all sets/maps/dats as float data, and
 * runs 1000 time-marching iterations of the save_soln / adt_calc /
 * res_calc / bres_calc / update kernel sequence.
 *
 * NOTE(review): gam, gm1, cfl, eps and qinf are file-scope globals
 * declared elsewhere in this project (float in this variant) — not
 * visible here.
 */
int main(int argc, char **argv) {
  // OP initialisation
  op_init(argc, argv, 2);

  int *becell, *ecell, *bound, *bedge, *edge, *cell;
  float *x, *q, *qold, *adt, *res;

  int nnode, ncell, nedge, nbedge, niter;
  float rms;

  // timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  // read in grid
  op_printf("reading in grid \n");

  FILE *fp;
  if ((fp = fopen("./new_grid.dat", "r")) == NULL) {
    op_printf("can't open file new_grid.dat\n");
    exit(-1);
  }

  // header: node, cell, edge and boundary-edge counts
  if (fscanf(fp, "%d %d %d %d \n", &nnode, &ncell, &nedge, &nbedge) != 4) {
    op_printf("error reading from new_grid.dat\n");
    exit(-1);
  }

  cell = (int *)malloc(4 * ncell * sizeof(int));
  edge = (int *)malloc(2 * nedge * sizeof(int));
  ecell = (int *)malloc(2 * nedge * sizeof(int));
  bedge = (int *)malloc(2 * nbedge * sizeof(int));
  becell = (int *)malloc(nbedge * sizeof(int));
  bound = (int *)malloc(nbedge * sizeof(int));

  x = (float *)malloc(2 * nnode * sizeof(float));
  q = (float *)malloc(4 * ncell * sizeof(float));
  qold = (float *)malloc(4 * ncell * sizeof(float));
  res = (float *)malloc(4 * ncell * sizeof(float));
  adt = (float *)malloc(ncell * sizeof(float));

  // node coordinates
  for (int n = 0; n < nnode; n++) {
    if (fscanf(fp, "%f %f \n", &x[2 * n], &x[2 * n + 1]) != 2) {
      op_printf("error reading from new_grid.dat\n");
      exit(-1);
    }
  }

  // cell -> node connectivity
  for (int n = 0; n < ncell; n++) {
    if (fscanf(fp, "%d %d %d %d \n", &cell[4 * n], &cell[4 * n + 1],
               &cell[4 * n + 2], &cell[4 * n + 3]) != 4) {
      op_printf("error reading from new_grid.dat\n");
      exit(-1);
    }
  }

  // edge -> node and edge -> cell connectivity
  for (int n = 0; n < nedge; n++) {
    if (fscanf(fp, "%d %d %d %d \n", &edge[2 * n], &edge[2 * n + 1],
               &ecell[2 * n], &ecell[2 * n + 1]) != 4) {
      op_printf("error reading from new_grid.dat\n");
      exit(-1);
    }
  }

  // boundary edge -> node / cell connectivity and boundary condition flags
  for (int n = 0; n < nbedge; n++) {
    if (fscanf(fp, "%d %d %d %d \n", &bedge[2 * n], &bedge[2 * n + 1],
               &becell[n], &bound[n]) != 4) {
      op_printf("error reading from new_grid.dat\n");
      exit(-1);
    }
  }

  fclose(fp);

  // set constants and initialise flow field and residual
  op_printf("initialising flow field \n");

  gam = 1.4f;
  gm1 = gam - 1.0f;
  cfl = 0.9f;
  eps = 0.05f;

  float mach = 0.4f;
  float alpha = 3.0f * atan(1.0f) / 45.0f;
  float p = 1.0f;
  float r = 1.0f;
  float u = sqrt(gam * p / r) * mach;
  float e = p / (r * gm1) + 0.5f * u * u;

  // free-stream state vector
  qinf[0] = r;
  qinf[1] = r * u;
  qinf[2] = 0.0f;
  qinf[3] = r * e;

  // start every cell at free-stream conditions with zero residual
  for (int n = 0; n < ncell; n++) {
    for (int m = 0; m < 4; m++) {
      q[4 * n + m] = qinf[m];
      res[4 * n + m] = 0.0f;
    }
  }

  // declare sets, pointers, datasets and global constants
  op_set nodes = op_decl_set(nnode, "nodes");
  op_set edges = op_decl_set(nedge, "edges");
  op_set bedges = op_decl_set(nbedge, "bedges");
  op_set cells = op_decl_set(ncell, "cells");

  op_map pedge = op_decl_map(edges, nodes, 2, edge, "pedge");
  op_map pecell = op_decl_map(edges, cells, 2, ecell, "pecell");
  op_map pbedge = op_decl_map(bedges, nodes, 2, bedge, "pbedge");
  op_map pbecell = op_decl_map(bedges, cells, 1, becell, "pbecell");
  op_map pcell = op_decl_map(cells, nodes, 4, cell, "pcell");

  op_dat p_bound = op_decl_dat(bedges, 1, "int", bound, "p_bound");
  op_dat p_x = op_decl_dat(nodes, 2, "float", x, "p_x");
  op_dat p_q = op_decl_dat(cells, 4, "float", q, "p_q");
  op_dat p_qold = op_decl_dat(cells, 4, "float", qold, "p_qold");
  op_dat p_adt = op_decl_dat(cells, 1, "float", adt, "p_adt");
  op_dat p_res = op_decl_dat(cells, 4, "float", res, "p_res");

  op_decl_const2("gam", 1, "float", &gam);
  op_decl_const2("gm1", 1, "float", &gm1);
  op_decl_const2("cfl", 1, "float", &cfl);
  op_decl_const2("eps", 1, "float", &eps);
  op_decl_const2("mach", 1, "float", &mach);
  op_decl_const2("alpha", 1, "float", &alpha);
  op_decl_const2("qinf", 4, "float", qinf);

  op_diagnostic_output();

  // initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  // main time-marching loop
  niter = 1000;

  for (int iter = 1; iter <= niter; iter++) {

    // save old flow solution
    op_par_loop_save_soln("save_soln", cells,
        op_arg_dat(p_q, -1, OP_ID, 4, "float", OP_READ),
        op_arg_dat(p_qold, -1, OP_ID, 4, "float", OP_WRITE));

    // predictor/corrector update loop
    for (int k = 0; k < 2; k++) {

      // calculate area/timstep
      op_par_loop_adt_calc("adt_calc", cells,
          op_arg_dat(p_x, 0, pcell, 2, "float", OP_READ),
          op_arg_dat(p_x, 1, pcell, 2, "float", OP_READ),
          op_arg_dat(p_x, 2, pcell, 2, "float", OP_READ),
          op_arg_dat(p_x, 3, pcell, 2, "float", OP_READ),
          op_arg_dat(p_q, -1, OP_ID, 4, "float", OP_READ),
          op_arg_dat(p_adt, -1, OP_ID, 1, "float", OP_WRITE));

      // calculate flux residual
      op_par_loop_res_calc("res_calc", edges,
          op_arg_dat(p_x, 0, pedge, 2, "float", OP_READ),
          op_arg_dat(p_x, 1, pedge, 2, "float", OP_READ),
          op_arg_dat(p_q, 0, pecell, 4, "float", OP_READ),
          op_arg_dat(p_q, 1, pecell, 4, "float", OP_READ),
          op_arg_dat(p_adt, 0, pecell, 1, "float", OP_READ),
          op_arg_dat(p_adt, 1, pecell, 1, "float", OP_READ),
          op_arg_dat(p_res, 0, pecell, 4, "float", OP_INC),
          op_arg_dat(p_res, 1, pecell, 4, "float", OP_INC));

      op_par_loop_bres_calc("bres_calc", bedges,
          op_arg_dat(p_x, 0, pbedge, 2, "float", OP_READ),
          op_arg_dat(p_x, 1, pbedge, 2, "float", OP_READ),
          op_arg_dat(p_q, 0, pbecell, 4, "float", OP_READ),
          op_arg_dat(p_adt, 0, pbecell, 1, "float", OP_READ),
          op_arg_dat(p_res, 0, pbecell, 4, "float", OP_INC),
          op_arg_dat(p_bound, -1, OP_ID, 1, "int", OP_READ));

      // update flow field
      rms = 0.0;

      op_par_loop_update("update", cells,
          op_arg_dat(p_qold, -1, OP_ID, 4, "float", OP_READ),
          op_arg_dat(p_q, -1, OP_ID, 4, "float", OP_WRITE),
          op_arg_dat(p_res, -1, OP_ID, 4, "float", OP_RW),
          op_arg_dat(p_adt, -1, OP_ID, 1, "float", OP_READ),
          op_arg_gbl(&rms, 1, "float", OP_INC));
    }

    // print iteration history (normalised by the global cell count)
    rms = sqrt(rms / (float) op_get_size(cells));
    if (iter % 100 == 0)
      op_printf(" %d %10.5e \n", iter, rms);
  }

  op_timers(&cpu_t2, &wall_t2);
  op_timing_output();
  op_printf("Max total runtime = \n%f\n", wall_t2 - wall_t1);
  op_exit();

  free(cell);
  free(edge);
  free(ecell);
  free(bedge);
  free(becell);
  free(bound);
  free(x);
  free(q);
  free(qold);
  free(res);
  free(adt);
}
/*
 * Reduction test driver.
 *
 * Reads the airfoil mesh from ./new_grid.dat, declares only the edges and
 * cells sets (plus the edge->cell map and the p_res dat), then checks OP2
 * global reductions:
 *   - an INDIRECT loop over edges counts the edges via an OP_INC global;
 *   - a DIRECT loop over cells counts the cells via an OP_INC global.
 * Each count is compared against the value read from the mesh header and
 * PASSED/FAILED is reported.
 */
int main(int argc, char **argv) {
  // OP initialisation
  op_init(argc, argv, 2);

  int *becell, *ecell, *bound, *bedge, *edge, *cell;
  double *x, *q, *qold, *adt, *res;

  int nnode, ncell, nedge, nbedge;

  // timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  // read in airfoil grid
  op_printf("reading in data \n");

  FILE *fp;
  if ((fp = fopen("./new_grid.dat", "r")) == NULL) {
    op_printf("can't open file new_grid.dat\n");
    exit(-1);
  }

  // header: node, cell, edge and boundary-edge counts
  if (fscanf(fp, "%d %d %d %d \n", &nnode, &ncell, &nedge, &nbedge) != 4) {
    op_printf("error reading from new_grid.dat\n");
    exit(-1);
  }

  cell = (int *)malloc(4 * ncell * sizeof(int));
  edge = (int *)malloc(2 * nedge * sizeof(int));
  ecell = (int *)malloc(2 * nedge * sizeof(int));
  bedge = (int *)malloc(2 * nbedge * sizeof(int));
  becell = (int *)malloc(nbedge * sizeof(int));
  bound = (int *)malloc(nbedge * sizeof(int));

  x = (double *)malloc(2 * nnode * sizeof(double));
  q = (double *)malloc(4 * ncell * sizeof(double));
  qold = (double *)malloc(4 * ncell * sizeof(double));
  res = (double *)malloc(4 * ncell * sizeof(double));
  adt = (double *)malloc(ncell * sizeof(double));

  // node coordinates
  for (int n = 0; n < nnode; n++) {
    if (fscanf(fp, "%lf %lf \n", &x[2 * n], &x[2 * n + 1]) != 2) {
      op_printf("error reading from new_grid.dat\n");
      exit(-1);
    }
  }

  // cell -> node connectivity
  for (int n = 0; n < ncell; n++) {
    if (fscanf(fp, "%d %d %d %d \n", &cell[4 * n], &cell[4 * n + 1],
               &cell[4 * n + 2], &cell[4 * n + 3]) != 4) {
      op_printf("error reading from new_grid.dat\n");
      exit(-1);
    }
  }

  // edge -> node and edge -> cell connectivity
  for (int n = 0; n < nedge; n++) {
    if (fscanf(fp, "%d %d %d %d \n", &edge[2 * n], &edge[2 * n + 1],
               &ecell[2 * n], &ecell[2 * n + 1]) != 4) {
      op_printf("error reading from new_grid.dat\n");
      exit(-1);
    }
  }

  // boundary edge -> node / cell connectivity and boundary condition flags
  for (int n = 0; n < nbedge; n++) {
    if (fscanf(fp, "%d %d %d %d \n", &bedge[2 * n], &bedge[2 * n + 1],
               &becell[n], &bound[n]) != 4) {
      op_printf("error reading from new_grid.dat\n");
      exit(-1);
    }
  }

  fclose(fp);

  // declare sets, pointers, datasets (only those the reductions need)
  op_set edges = op_decl_set(nedge, "edges");
  op_set cells = op_decl_set(ncell, "cells");

  op_map pecell = op_decl_map(edges, cells, 2, ecell, "pecell");

  op_dat p_res = op_decl_dat(cells, 4, "double", res, "p_res");

  int count;

  op_diagnostic_output();

  // initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  // indirect reduction: every edge increments the global counter once
  count = 0;
  op_par_loop_res_calc("res_calc", edges,
      op_arg_dat(p_res, 0, pecell, 4, "double", OP_INC),
      op_arg_gbl(&count, 1, "int", OP_INC));
  op_printf("number of edges:: %d should be: %d \n", count, nedge);
  if (count != nedge)
    op_printf("indirect reduction FAILED\n");
  else
    op_printf("indirect reduction PASSED\n");

  // direct reduction: every cell increments the global counter once
  count = 0;
  op_par_loop_update("update", cells,
      op_arg_dat(p_res, -1, OP_ID, 4, "double", OP_RW),
      op_arg_gbl(&count, 1, "int", OP_INC));
  op_printf("number of cells: %d should be: %d \n", count, ncell);
  if (count != ncell)
    op_printf("direct reduction FAILED\n");
  else
    op_printf("direct reduction PASSED\n");

  op_timers(&cpu_t2, &wall_t2);
  op_timing_output();
  op_exit();

  free(cell);
  free(edge);
  free(ecell);
  free(bedge);
  free(becell);
  free(bound);
  free(x);
  free(q);
  free(qold);
  free(res);
  free(adt);
}
/*
 * Jacobi iteration driver (MPI variant).
 *
 * The root rank builds the global (NN-1)x(NN-1) 5-point matrix, r.h.s. and
 * solution arrays; the global mesh is then scattered across all ranks, the
 * local pieces are registered with OP2, the mesh is partitioned, and NITER
 * Jacobi sweeps (res + update loops) are performed, printing the max/rms
 * of the solution each iteration.  Results are gathered and written to
 * out_grid.dat.
 *
 * NOTE(review): `my_rank == MPI_ROOT` — MPI_ROOT is a special MPI constant
 * for inter-communicator collectives, normally a negative value, so this
 * comparison would never match a real rank unless the project redefines
 * MPI_ROOT (e.g. to 0).  Verify against the project headers.
 * NOTE(review): alpha and the NN/NITER macros are declared elsewhere in
 * this project — not visible here.
 */
int main(int argc, char **argv) {
  // OP initialisation
  op_init(argc, argv, 2);

  // MPI for user I/O
  int my_rank;
  int comm_size;
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &comm_size);

  // timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  int *pp;
  double *A, *r, *u, *du;
  int nnode, nedge;

  /**------------------------BEGIN I/O and PARTITIONING ---------------------**/

  int g_nnode, g_nedge, g_n, g_e;

  g_nnode = (NN - 1) * (NN - 1);
  g_nedge = (NN - 1) * (NN - 1) + 4 * (NN - 1) * (NN - 2);

  int *g_pp = 0;
  double *g_A = 0, *g_r = 0, *g_u = 0, *g_du = 0;

  op_printf("Global number of nodes, edges = %d, %d\n", g_nnode, g_nedge);

  // only the root rank builds the global arrays
  if (my_rank == MPI_ROOT) {
    g_pp = (int *)malloc(sizeof(int) * 2 * g_nedge);
    g_A = (double *)malloc(sizeof(double) * g_nedge);
    g_r = (double *)malloc(sizeof(double) * g_nnode);
    g_u = (double *)malloc(sizeof(double) * g_nnode);
    g_du = (double *)malloc(sizeof(double) * g_nnode);

    // create matrix and r.h.s., and set coordinates needed for
    // renumbering / partitioning
    g_e = 0;
    for (int i = 1; i < NN; i++) {
      for (int j = 1; j < NN; j++) {
        g_n = i - 1 + (j - 1) * (NN - 1);
        g_r[g_n] = 0.0f;
        g_u[g_n] = 0.0f;
        g_du[g_n] = 0.0f;

        // diagonal entry (self-edge)
        g_pp[2 * g_e] = g_n;
        g_pp[2 * g_e + 1] = g_n;
        g_A[g_e] = -1.0f;
        g_e++;

        // the four neighbours; boundary neighbours contribute to the r.h.s.
        for (int pass = 0; pass < 4; pass++) {
          int i2 = i;
          int j2 = j;
          if (pass == 0) i2 += -1;
          if (pass == 1) i2 += 1;
          if (pass == 2) j2 += -1;
          if (pass == 3) j2 += 1;

          if ((i2 == 0) || (i2 == NN) || (j2 == 0) || (j2 == NN)) {
            g_r[g_n] += 0.25f;
          } else {
            g_pp[2 * g_e] = g_n;
            g_pp[2 * g_e + 1] = i2 - 1 + (j2 - 1) * (NN - 1);
            g_A[g_e] = 0.25f;
            g_e++;
          }
        }
      }
    }
  }

  /* Compute local sizes */
  nnode = compute_local_size(g_nnode, comm_size, my_rank);
  nedge = compute_local_size(g_nedge, comm_size, my_rank);
  op_printf("Number of nodes, edges on process %d = %d, %d\n", my_rank, nnode, nedge);

  /* Allocate memory to hold local sets, mapping tables and data */
  pp = (int *)malloc(2 * sizeof(int) * nedge);
  A = (double *)malloc(nedge * sizeof(double));
  r = (double *)malloc(nnode * sizeof(double));
  u = (double *)malloc(nnode * sizeof(double));
  du = (double *)malloc(nnode * sizeof(double));

  /* scatter sets, mappings and data on sets */
  scatter_int_array(g_pp, pp, comm_size, g_nedge, nedge, 2);
  scatter_double_array(g_A, A, comm_size, g_nedge, nedge, 1);
  scatter_double_array(g_r, r, comm_size, g_nnode, nnode, 1);
  scatter_double_array(g_u, u, comm_size, g_nnode, nnode, 1);
  scatter_double_array(g_du, du, comm_size, g_nnode, nnode, 1);

  /* Free the global arrays on the root rank once scattered to all processes */
  if (my_rank == MPI_ROOT) {
    free(g_pp);
    free(g_A);
    free(g_r);
    free(g_u);
    free(g_du);
  }

  /**------------------------END I/O and PARTITIONING ---------------------**/

  // declare sets, pointers, and datasets
  op_set nodes = op_decl_set(nnode, "nodes");
  op_set edges = op_decl_set(nedge, "edges");

  op_map ppedge = op_decl_map(edges, nodes, 2, pp, "ppedge");

  op_dat p_A = op_decl_dat(edges, 1, "double", A, "p_A");
  op_dat p_r = op_decl_dat(nodes, 1, "double", r, "p_r");
  op_dat p_u = op_decl_dat(nodes, 1, "double", u, "p_u");
  op_dat p_du = op_decl_dat(nodes, 1, "double", du, "p_du");

  alpha = 1.0f;
  op_decl_const(1, "double", &alpha);

  op_diagnostic_output();

  // trigger partitioning and halo creation routines
  op_partition("PTSCOTCH", "KWAY", NULL, NULL, NULL);

  // initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  // main iteration loop
  double u_sum, u_max, beta = 1.0f;

  for (int iter = 0; iter < NITER; iter++) {
    // du += beta * A * u (indirect, via the edge->node map)
    op_par_loop(res, "res", edges,
        op_arg_dat(p_A, -1, OP_ID, 1, "double", OP_READ),
        op_arg_dat(p_u, 1, ppedge, 1, "double", OP_READ),
        op_arg_dat(p_du, 0, ppedge, 1, "double", OP_INC),
        op_arg_gbl(&beta, 1, "double", OP_READ));

    u_sum = 0.0f;
    u_max = 0.0f;
    // apply the update and reduce the solution sum / max
    op_par_loop(update, "update", nodes,
        op_arg_dat(p_r, -1, OP_ID, 1, "double", OP_READ),
        op_arg_dat(p_du, -1, OP_ID, 1, "double", OP_RW),
        op_arg_dat(p_u, -1, OP_ID, 1, "double", OP_INC),
        op_arg_gbl(&u_sum, 1, "double", OP_INC),
        op_arg_gbl(&u_max, 1, "double", OP_MAX));
    op_printf("\n u max/rms = %f %f \n\n", u_max, sqrt(u_sum / g_nnode));
  }

  op_timers(&cpu_t2, &wall_t2);

  // get results data array (gathered to the user's ordering)
  op_dat temp = op_mpi_get_data(p_u);

  // output the result dat array to files
  print_dat_tofile(temp, "out_grid.dat"); //ASCI
  //print_dat_tobinfile(temp, "out_grid.bin"); //Binary

  // print each mpi process's timing info for each kernel
  op_timing_output();

  // print total time for NITER iterations
  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
  op_exit();
}
/*
 * OpenMP host stub for the indirect "adt_calc" loop.
 *
 * Arguments 0..3 access p_x indirectly through the cell->node map (ninds=1
 * distinct indirectly-accessed dat); arguments 4 and 5 are direct.  Halo
 * exchanges are performed first, then an execution plan (colouring +
 * blocking) is obtained from op_plan_get and each colour's blocks are run
 * in parallel via op_x86_adt_calc.  Timing/transfer statistics go to
 * OP_kernels[1].
 *
 * NOTE(review): the halo-exchange loop stores every exchange result into
 * sent[0] rather than sent[i]; harmless as written (it is tested
 * immediately), but the sent[6] array suggests sent[i] was intended —
 * confirm against other generated stubs.
 */
void op_par_loop_adt_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5 ){

  int nargs = 6;
  op_arg args[6] = {arg0, arg1, arg2, arg3, arg4, arg5};
  int ninds = 1;
  // inds[i]: index of the indirect dat used by argument i, -1 = direct
  int inds[6] = {0, 0, 0, 0, -1, -1};
  int sent[6] = {0, 0, 0, 0, 0, 0};

  if (ninds > 0) //indirect loop
  {
    // exchange (and wait for) halos of every dat argument
    for (int i = 0; i < nargs; i++) {
      if (args[i].argtype == OP_ARG_DAT) {
        if (OP_diags == 1) reset_halo(args[i]);
        sent[0] = exchange_halo(args[i]);
        if (sent[0] == 1) wait_all(args[i]);
      }
    }
  }

  if (OP_diags > 2) {
    printf(" kernel routine with indirection: adt_calc \n");
  }

  // get plan (block size may be overridden per-kernel at compile time)
  #ifdef OP_PART_SIZE_1
    int part_size = OP_PART_SIZE_1;
  #else
    int part_size = OP_part_size;
  #endif

  op_plan *Plan = op_plan_get(name, set, part_size, nargs, args, ninds, inds);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // set number of threads
  #ifdef _OPENMP
    int nthreads = omp_get_max_threads( );
  #else
    int nthreads = 1;
  #endif

  // execute plan: colours run sequentially, blocks within a colour in parallel
  int block_offset = 0;

  for (int col = 0; col < Plan->ncolors; col++) {
    int nblocks = Plan->ncolblk[col];

    #pragma omp parallel for
    for (int blockIdx = 0; blockIdx < nblocks; blockIdx++)
      op_x86_adt_calc(blockIdx,
                      (double *)arg0.data,
                      Plan->ind_maps[0],
                      Plan->loc_maps[0],
                      Plan->loc_maps[1],
                      Plan->loc_maps[2],
                      Plan->loc_maps[3],
                      (double *)arg4.data,
                      (double *)arg5.data,
                      Plan->ind_sizes,
                      Plan->ind_offs,
                      block_offset,
                      Plan->blkmap,
                      Plan->offset,
                      Plan->nelems,
                      Plan->nthrcol,
                      Plan->thrcol);

    block_offset += nblocks;
  }

  // set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
  for (int i = 0; i < nargs; i++)
    if (args[i].argtype == OP_ARG_DAT)
      set_dirtybit(args[i]);

  // perform any global operations
  // - NONE

  // update kernel record
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(1);
  OP_kernels[1].name = name;
  OP_kernels[1].count += 1;
  OP_kernels[1].time += wall_t2 - wall_t1;
  OP_kernels[1].transfer += Plan->transfer;
  OP_kernels[1].transfer2 += Plan->transfer2;
}
// Jacobi-iteration driver: builds a 5-point stencil system on an
// (NN-1) x (NN-1) interior grid, declares it to OP2, iterates NITER
// sweeps of res/update loops, then prints and checks the solution.
int main(int argc, char **argv) {
  // OP initialisation
  op_init(argc, argv, 5);

  // timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  int nnode, nedge, n, e;
  nnode = (NN - 1) * (NN - 1);
  // each interior node has a self-edge plus up to 4 neighbour edges;
  // boundary-adjacent neighbours are folded into the r.h.s. instead
  nedge = (NN - 1) * (NN - 1) + 4 * (NN - 1) * (NN - 2);

  int *pp = (int *)malloc(sizeof(int) * 2 * nedge);    // edge -> (row, col) node pairs
  double *A = (double *)malloc(sizeof(double) * nedge); // matrix coefficients, one per edge
  double *r = (double *)malloc(sizeof(double) * nnode); // right-hand side
  double *u = (double *)malloc(sizeof(double) * nnode); // solution iterate
  double *du = (double *)malloc(sizeof(double) * nnode); // per-sweep increment

  // create matrix and r.h.s., and set coordinates needed for renumbering /
  // partitioning
  e = 0;
  for (int i = 1; i < NN; i++) {
    for (int j = 1; j < NN; j++) {
      n = i - 1 + (j - 1) * (NN - 1);
      r[n] = 0.0f;
      u[n] = 0.0f;
      du[n] = 0.0f;
      // diagonal entry
      pp[2 * e] = n;
      pp[2 * e + 1] = n;
      A[e] = -1.0f;
      e++;
      // four neighbours: W, E, S, N
      for (int pass = 0; pass < 4; pass++) {
        int i2 = i;
        int j2 = j;
        if (pass == 0)
          i2 += -1;
        if (pass == 1)
          i2 += 1;
        if (pass == 2)
          j2 += -1;
        if (pass == 3)
          j2 += 1;
        if ((i2 == 0) || (i2 == NN) || (j2 == 0) || (j2 == NN)) {
          // neighbour is on the boundary: move its contribution to r.h.s.
          r[n] += 0.25f;
        } else {
          pp[2 * e] = n;
          pp[2 * e + 1] = i2 - 1 + (j2 - 1) * (NN - 1);
          A[e] = 0.25f;
          e++;
        }
      }
    }
  }

  // declare sets, pointers, and datasets
  op_set nodes = op_decl_set(nnode, "nodes");
  op_set edges = op_decl_set(nedge, "edges");

  op_map ppedge = op_decl_map(edges, nodes, 2, pp, "ppedge");

  op_dat p_A = op_decl_dat(edges, 1, "double", A, "p_A");
  op_dat p_r = op_decl_dat(nodes, 1, "double", r, "p_r");
  op_dat p_u = op_decl_dat(nodes, 1, "double", u, "p_u");
  op_dat p_du = op_decl_dat(nodes, 1, "double", du, "p_du");

  alpha = 1.0f;
  op_decl_const2("alpha", 1, "double", &alpha);

  op_diagnostic_output();

  // initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  // main iteration loop
  double u_sum, u_max, beta = 1.0f;
  for (int iter = 0; iter < NITER; iter++) {
    // du = beta * A * u (indirect sparse mat-vec over edges)
    op_par_loop_res("res", edges,
                    op_arg_dat(p_A, -1, OP_ID, 1, "double", OP_READ),
                    op_arg_dat(p_u, 1, ppedge, 1, "double", OP_READ),
                    op_arg_dat(p_du, 0, ppedge, 1, "double", OP_INC),
                    op_arg_gbl(&beta, 1, "double", OP_READ));

    u_sum = 0.0f;
    u_max = 0.0f;
    // u += du + r; accumulate sum-of-squares and max for monitoring
    op_par_loop_update("update", nodes,
                       op_arg_dat(p_r, -1, OP_ID, 1, "double", OP_READ),
                       op_arg_dat(p_du, -1, OP_ID, 1, "double", OP_RW),
                       op_arg_dat(p_u, -1, OP_ID, 1, "double", OP_INC),
                       op_arg_gbl(&u_sum, 1, "double", OP_INC),
                       op_arg_gbl(&u_max, 1, "double", OP_MAX));

    op_printf("\n u max/rms = %f %f \n\n", u_max, sqrt(u_sum / nnode));
  }

  op_timers(&cpu_t2, &wall_t2);

  // print out results (only pass 0 — the solution u — is printed;
  // passes 1/2 for du and r are present but unreachable with pass < 1)
  op_printf("\n Results after %d iterations:\n\n", NITER);

  op_fetch_data(p_u, u);

  for (int pass = 0; pass < 1; pass++) {
    for (int j = NN - 1; j > 0; j--) {
      for (int i = 1; i < NN; i++) {
        if (pass == 0)
          op_printf(" %7.4f", u[i - 1 + (j - 1) * (NN - 1)]);
        else if (pass == 1)
          op_printf(" %7.4f", du[i - 1 + (j - 1) * (NN - 1)]);
        else if (pass == 2)
          op_printf(" %7.4f", r[i - 1 + (j - 1) * (NN - 1)]);
      }
      op_printf("\n");
    }
    op_printf("\n");
  }

  op_timing_output();

  // print total time for niter interations
  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);

  int result = check_result<double>(u, NN, TOLERANCE);
  op_exit();

  free(pp);
  free(A);
  free(u);
  free(du);
  free(r);

  return result;
}
// OpenMP host stub for the direct kernel "update".
//
// Splits the set contiguously across the available threads, runs the
// generated worker op_x86_update on each slice with a private slot of
// the reduction scratch array, then folds the per-thread partials into
// the OP_INC global argument arg4 and finishes the OP2 bookkeeping.
void op_par_loop_update(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4 ){

  int nargs = 5;
  op_arg args[5] = {arg0,arg1,arg2,arg3,arg4};

  // host-side view of the global reduction variable
  double *arg4h = (double *)arg4.data;

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection: update \n");
  }

  // start per-kernel timing
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // determine thread count
#ifdef _OPENMP
  int num_threads = omp_get_max_threads( );
#else
  int num_threads = 1;
#endif

  // scratch for the global reduction: one 64-double-wide slot per
  // thread (padding avoids false sharing); slot t starts at t*64
  double arg4_l[1+64*64];
  for (int t=0; t<num_threads; t++)
    for (int dim=0; dim<1; dim++)
      arg4_l[dim+t*64] = ZERO_double;

  // run the worker over each thread's contiguous slice of the set
#pragma omp parallel for
  for (int t=0; t<num_threads; t++) {
    int first = (set->size* t     )/num_threads;
    int last  = (set->size*(t + 1))/num_threads;
    op_x86_update( (double *) arg0.data,
                   (double *) arg1.data,
                   (double *) arg2.data,
                   (double *) arg3.data,
                   arg4_l + t*64,
                   first, last );
  }

  // fold per-thread partial sums into the user's reduction variable
  for (int t=0; t<num_threads; t++)
    for (int dim=0; dim<1; dim++)
      arg4h[dim] += arg4_l[dim+t*64];

  // mark dat arguments dirty (OP_INC / OP_WRITE / OP_RW accesses)
  for (int a = 0; a < nargs; a++)
    if (args[a].argtype == OP_ARG_DAT)
      set_dirtybit(args[a]);

  // complete the global reductions across MPI ranks
  for (int a = 0; a < nargs; a++)
    if (args[a].argtype == OP_ARG_GBL)
      global_reduce(&args[a]);

  // record timing and data movement in kernel slot 4
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(4);
  OP_kernels[4].name      = name;
  OP_kernels[4].count    += 1;
  OP_kernels[4].time     += wall_t2 - wall_t1;
  OP_kernels[4].transfer += (double)set->size * arg0.size;
  OP_kernels[4].transfer += (double)set->size * arg1.size;
  OP_kernels[4].transfer += (double)set->size * arg2.size * 2.0f;
  OP_kernels[4].transfer += (double)set->size * arg3.size;
}
// OpenCL host stub for the indirect kernel "bres_calc".
//
// Builds a coloured execution plan for the six dat arguments (four of
// them indirect), uploads the constants gm1/qinf/eps to device buffers,
// then launches one NDRange per plan colour, passing the plan's
// indirection and block tables as kernel arguments.
//
// NOTE(review): opDat2 participates in the plan (it shares indirection
// slot 0 with opDat1) but is not passed to the kernel — presumably the
// generated kernel reuses opDat1's data; confirm against the generator.
//
// Fixes over the previous revision:
//  - op_timing_realloc(0) reallocated the wrong slot while the record
//    below writes OP_kernels[1]; now reallocs slot 1.
//  - the measured wall time (cpu_t2/wall_t2) was never accumulated;
//    OP_kernels[1].time is now updated like the other host stubs.
void bres_calc_host(const char *userSubroutine,op_set set,op_arg opDat1,op_arg opDat2,op_arg opDat3,op_arg opDat4,op_arg opDat5,op_arg opDat6)
{
  size_t blocksPerGrid;
  size_t threadsPerBlock;
  size_t totalThreadNumber;
  size_t dynamicSharedMemorySize;
  cl_int errorCode;
  cl_event event;
  cl_kernel kernelPointer;
  int i3;
  op_arg opDatArray[6];
  int indirectionDescriptorArray[6];
  op_plan *planRet;
  int blockOffset;

  opDatArray[0] = opDat1;
  opDatArray[1] = opDat2;
  opDatArray[2] = opDat3;
  opDatArray[3] = opDat4;
  opDatArray[4] = opDat5;
  opDatArray[5] = opDat6;

  // indirection slot per argument; -1 marks a direct access
  indirectionDescriptorArray[0] = 0;
  indirectionDescriptorArray[1] = 0;
  indirectionDescriptorArray[2] = 1;
  indirectionDescriptorArray[3] = 2;
  indirectionDescriptorArray[4] = 3;
  indirectionDescriptorArray[5] = -1;

  planRet = op_plan_get(userSubroutine,set,setPartitionSize_bres_calc,6,opDatArray,4,indirectionDescriptorArray);

  // upload the global constants used by the kernel
  cl_mem gm1_d;
  gm1_d = op_allocate_constant(&gm1,sizeof(float ));
  cl_mem qinf_d;
  qinf_d = op_allocate_constant(&qinf,4 * sizeof(float));
  cl_mem eps_d;
  eps_d = op_allocate_constant(&eps,sizeof(float ));

  blockOffset = 0;

  double cpu_t1;
  double cpu_t2;
  double wall_t1;
  op_timers(&cpu_t1, &wall_t1);
  double wall_t2;

  // one launch per colour: blocks of the same colour are independent
  for (i3 = 0; i3 < planRet -> ncolors; ++i3) {
    blocksPerGrid = planRet -> ncolblk[i3];
    dynamicSharedMemorySize = planRet -> nshared;
    threadsPerBlock = threadsPerBlockSize_bres_calc;
    totalThreadNumber = threadsPerBlock * blocksPerGrid;

    kernelPointer = getKernel("bres_calc_kernel");

    errorCode = clSetKernelArg(kernelPointer,0,sizeof(cl_mem ),&opDat1.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,1,sizeof(cl_mem ),&opDat3.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,2,sizeof(cl_mem ),&opDat4.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,3,sizeof(cl_mem ),&opDat5.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,4,sizeof(cl_mem ),&opDat6.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,5,sizeof(cl_mem ),&planRet -> ind_maps[0]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,6,sizeof(cl_mem ),&planRet -> ind_maps[1]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,7,sizeof(cl_mem ),&planRet -> ind_maps[2]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,8,sizeof(cl_mem ),&planRet -> ind_maps[3]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,9,sizeof(cl_mem ),&planRet -> loc_maps[0]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,10,sizeof(cl_mem ),&planRet -> loc_maps[1]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,11,sizeof(cl_mem ),&planRet -> loc_maps[2]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,12,sizeof(cl_mem ),&planRet -> loc_maps[3]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,13,sizeof(cl_mem ),&planRet -> loc_maps[4]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,14,sizeof(cl_mem ),&planRet -> ind_sizes);
    errorCode = errorCode | clSetKernelArg(kernelPointer,15,sizeof(cl_mem ),&planRet -> ind_offs);
    errorCode = errorCode | clSetKernelArg(kernelPointer,16,sizeof(cl_mem ),&planRet -> blkmap);
    errorCode = errorCode | clSetKernelArg(kernelPointer,17,sizeof(cl_mem ),&planRet -> offset);
    errorCode = errorCode | clSetKernelArg(kernelPointer,18,sizeof(cl_mem ),&planRet -> nelems);
    errorCode = errorCode | clSetKernelArg(kernelPointer,19,sizeof(cl_mem ),&planRet -> nthrcol);
    errorCode = errorCode | clSetKernelArg(kernelPointer,20,sizeof(cl_mem ),&planRet -> thrcol);
    errorCode = errorCode | clSetKernelArg(kernelPointer,21,sizeof(int ),&blockOffset);
    // arg 22: dynamically-sized local (shared) memory — size with NULL value
    errorCode = errorCode | clSetKernelArg(kernelPointer,22,dynamicSharedMemorySize,NULL);
    errorCode = errorCode | clSetKernelArg(kernelPointer,23,sizeof(cl_mem ),&gm1_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,24,sizeof(cl_mem ),&qinf_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,25,sizeof(cl_mem ),&eps_d);
    assert_m(errorCode == CL_SUCCESS,"Error setting OpenCL kernel arguments");

    errorCode = clEnqueueNDRangeKernel(cqCommandQueue,kernelPointer,1,NULL,&totalThreadNumber,&threadsPerBlock,0,NULL,&event);
    assert_m(errorCode == CL_SUCCESS,"Error executing OpenCL kernel");

    errorCode = clFinish(cqCommandQueue);
    assert_m(errorCode == CL_SUCCESS,"Error completing device command queue");

    blockOffset += blocksPerGrid;
  }

  // update kernel record (slot 1 belongs to bres_calc)
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(1);
  OP_kernels[1].name = userSubroutine;
  OP_kernels[1].count = OP_kernels[1].count + 1;
  OP_kernels[1].time += wall_t2 - wall_t1;
}
//#define AUTO_BLOCK_SIZE void op_par_loop_save_soln(char const *name, op_set set, op_arg arg0, op_arg arg1 ){ cl_int ciErrNum; cl_event ceEvent; if (OP_diags>2) { printf(" kernel routine w/o indirection: save_soln \n"); } // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timers(&cpu_t1, &wall_t1); // set CUDA execution parameters #ifdef AUTO_BLOCK_SIZE const size_t nthread = 1024; #else #ifdef OP_BLOCK_SIZE_0 const size_t nthread = OP_BLOCK_SIZE_0; #else // int nthread = OP_block_size; const size_t nthread = 128; #endif #endif const size_t nblocks = 200; const size_t n_tot_thread = nblocks * nthread; // work out shared memory requirements per element int nshared = 0; nshared = MAX(nshared,sizeof(float)*4); nshared = MAX(nshared,sizeof(float)*4); // execute plan int offset_s = nshared*OP_WARPSIZE; nshared = nshared*nthread; cl_kernel hKernel = getKernel( "op_cuda_save_soln" ); //nshared *= 4; //offset_s *= 4; int i = 0; ciErrNum = clSetKernelArg( hKernel, i++, sizeof(cl_mem), &(arg0.data_d) ); ciErrNum |= clSetKernelArg( hKernel, i++, sizeof(cl_mem), &(arg1.data_d) ); ciErrNum |= clSetKernelArg( hKernel, i++, sizeof(int), &offset_s ); ciErrNum |= clSetKernelArg( hKernel, i++, sizeof(int), &set->size ); ciErrNum |= clSetKernelArg( hKernel, i++, nshared, NULL ); assert_m( ciErrNum == CL_SUCCESS, "error setting kernel arguments" ); #ifdef AUTO_BLOCK_SIZE ciErrNum = clEnqueueNDRangeKernel( cqCommandQueue, hKernel, 1, NULL, &n_tot_thread, NULL, 0, NULL, &ceEvent ); #else ciErrNum = clEnqueueNDRangeKernel( cqCommandQueue, hKernel, 1, NULL, &n_tot_thread, &nthread, 0, NULL, &ceEvent ); #endif assert_m( ciErrNum == CL_SUCCESS, "error executing kernel" ); #ifndef ASYNC ciErrNum = clFinish( cqCommandQueue ); assert_m( ciErrNum == CL_SUCCESS, "error completing device commands" ); #ifdef PROFILE unsigned long tqueue, tsubmit, tstart, tend, telapsed; ciErrNum = clGetEventProfilingInfo( ceEvent, CL_PROFILING_COMMAND_QUEUED, sizeof(tqueue), &tqueue, NULL ); 
ciErrNum |= clGetEventProfilingInfo( ceEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(tsubmit), &tsubmit, NULL ); ciErrNum |= clGetEventProfilingInfo( ceEvent, CL_PROFILING_COMMAND_START, sizeof(tstart), &tstart, NULL ); ciErrNum |= clGetEventProfilingInfo( ceEvent, CL_PROFILING_COMMAND_END, sizeof(tend), &tend, NULL ); assert_m( ciErrNum == CL_SUCCESS, "error getting profiling info" ); OP_kernels[0].queue_time += (tsubmit - tqueue); OP_kernels[0].wait_time += (tstart - tsubmit); OP_kernels[0].execution_time += (tend - tstart); //printf("%20lu\n%20lu\n%20lu\n%20lu\n\n", tqueue, tsubmit, tstart, tend); //printf("queue: %8.4f\nwait:%8.4f\nexec: %8.4f\n\n", OP_kernels[0].queue_time * 1.0e-9, OP_kernels[0].wait_time * 1.0e-9, OP_kernels[0].execution_time * 1.0e-9 ); #endif // update kernel record op_timers(&cpu_t2, &wall_t2); op_timing_realloc(0); OP_kernels[0].name = name; OP_kernels[0].count += 1; OP_kernels[0].time += wall_t2 - wall_t1; OP_kernels[0].transfer += (float)set->size * arg0.size; OP_kernels[0].transfer += (float)set->size * arg1.size; #endif }
// OpenMP host stub for the direct kernel "update" (float variant).
//
// Partitions the set contiguously over the available threads and runs
// the generated worker op_x86_update on each slice. arg3 is an OP_INC
// global (per-thread partials start at ZERO_float and are summed);
// arg4 is an OP_MAX global (partials seeded from the current value and
// combined with MAX).
void op_par_loop_update(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4 ){

  // host-side views of the two global reduction variables
  float *arg3h = (float *)arg3.data;
  float *arg4h = (float *)arg4.data;

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection: update \n");
  }

  // start per-kernel timing
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // determine thread count
#ifdef _OPENMP
  int num_threads = omp_get_max_threads( );
#else
  int num_threads = 1;
#endif

  // reduction scratch: one padded 64-float slot per thread
  float arg3_l[1+64*64];
  float arg4_l[1+64*64];
  for (int t=0; t<num_threads; t++) {
    for (int dim=0; dim<1; dim++) {
      arg3_l[dim+t*64] = ZERO_float;   // sum reduction starts at zero
      arg4_l[dim+t*64] = arg4h[dim];   // max reduction seeded from current value
    }
  }

  // run the worker over each thread's contiguous slice of the set
#pragma omp parallel for
  for (int t=0; t<num_threads; t++) {
    int first = (set->size* t     )/num_threads;
    int last  = (set->size*(t + 1))/num_threads;
    op_x86_update( (float *) arg0.data,
                   (float *) arg1.data,
                   (float *) arg2.data,
                   arg3_l + t*64,
                   arg4_l + t*64,
                   first, last );
  }

  // combine per-thread partials: sum for arg3, max for arg4
  for (int t=0; t<num_threads; t++)
    for (int dim=0; dim<1; dim++)
      arg3h[dim] += arg3_l[dim+t*64];

  for (int t=0; t<num_threads; t++)
    for (int dim=0; dim<1; dim++)
      arg4h[dim] = MAX(arg4h[dim],arg4_l[dim+t*64]);

  // record timing and data movement in kernel slot 1
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(1);
  OP_kernels[1].name      = name;
  OP_kernels[1].count    += 1;
  OP_kernels[1].time     += wall_t2 - wall_t1;
  OP_kernels[1].transfer += (float)set->size * arg0.size;
  OP_kernels[1].transfer += (float)set->size * arg1.size * 2.0f;
  OP_kernels[1].transfer += (float)set->size * arg2.size * 2.0f;
}
// Airfoil driver (HDF5 input, double precision): reads the mesh and
// declares OP2 sets/maps/dats, sets the flow constants, then runs 1000
// predictor/corrector time-marching iterations of the save_soln /
// adt_calc / res_calc / bres_calc / update kernel pipeline.
int main(int argc, char **argv)
{
  // OP initialisation
  op_init(argc,argv,2);

  int niter;
  double rms;

  //timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  // set constants and initialise flow field and residual
  op_printf("initialising flow field \n");

  gam = 1.4f;
  gm1 = gam - 1.0f;
  cfl = 0.9f;
  eps = 0.05f;

  double mach  = 0.4f;
  double alpha = 3.0f*atan(1.0f)/45.0f;
  double p     = 1.0f;
  double r     = 1.0f;
  double u     = sqrt(gam*p/r)*mach;
  double e     = p/(r*gm1) + 0.5f*u*u;

  // free-stream state vector: density, x-momentum, y-momentum, energy
  qinf[0] = r;
  qinf[1] = r*u;
  qinf[2] = 0.0f;
  qinf[3] = r*e;

  char file[] = "new_grid.h5";//"new_grid-26mil.h5";

  // declare sets, pointers, datasets and global constants
  op_set nodes  = op_decl_set_hdf5(file, "nodes");
  op_set edges  = op_decl_set_hdf5(file, "edges");
  op_set bedges = op_decl_set_hdf5(file, "bedges");
  op_set cells  = op_decl_set_hdf5(file, "cells");

  op_map pedge   = op_decl_map_hdf5(edges, nodes, 2, file, "pedge");
  op_map pecell  = op_decl_map_hdf5(edges, cells,2, file, "pecell");
  op_map pbedge  = op_decl_map_hdf5(bedges,nodes,2, file, "pbedge");
  op_map pbecell = op_decl_map_hdf5(bedges,cells,1, file, "pbecell");
  op_map pcell   = op_decl_map_hdf5(cells, nodes,4, file, "pcell");

  op_dat p_bound = op_decl_dat_hdf5(bedges,1,"int"  ,file,"p_bound");
  op_dat p_x     = op_decl_dat_hdf5(nodes ,2,"double",file,"p_x");
  op_dat p_q     = op_decl_dat_hdf5(cells ,4,"double",file,"p_q");
  op_dat p_qold  = op_decl_dat_hdf5(cells ,4,"double",file,"p_qold");
  op_dat p_adt   = op_decl_dat_hdf5(cells ,1,"double",file,"p_adt");
  op_dat p_res   = op_decl_dat_hdf5(cells ,4,"double",file,"p_res");

  op_decl_const(1,"double",&gam  );
  op_decl_const(1,"double",&gm1  );
  op_decl_const(1,"double",&cfl  );
  op_decl_const(1,"double",&eps  );
  op_decl_const(1,"double",&mach );
  op_decl_const(1,"double",&alpha);
  op_decl_const(4,"double",qinf  );

  op_diagnostic_output();

  int g_ncell = op_get_size(cells);

  //initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  // main time-marching loop
  niter = 1000;
  for(int iter=1; iter<=niter; iter++) {

    // save old flow solution
    op_par_loop(save_soln,"save_soln", cells,
                op_arg_dat(p_q,   -1,OP_ID, 4,"double",OP_READ ),
                op_arg_dat(p_qold,-1,OP_ID, 4,"double",OP_WRITE));

    // predictor/corrector update loop
    for(int k=0; k<2; k++) {

      // calculate area/timstep
      op_par_loop(adt_calc,"adt_calc",cells,
                  op_arg_dat(p_x,   0,pcell, 2,"double",OP_READ ),
                  op_arg_dat(p_x,   1,pcell, 2,"double",OP_READ ),
                  op_arg_dat(p_x,   2,pcell, 2,"double",OP_READ ),
                  op_arg_dat(p_x,   3,pcell, 2,"double",OP_READ ),
                  op_arg_dat(p_q,  -1,OP_ID, 4,"double",OP_READ ),
                  op_arg_dat(p_adt,-1,OP_ID, 1,"double",OP_WRITE));

      // calculate flux residual
      op_par_loop(res_calc,"res_calc",edges,
                  op_arg_dat(p_x,    0,pedge, 2,"double",OP_READ),
                  op_arg_dat(p_x,    1,pedge, 2,"double",OP_READ),
                  op_arg_dat(p_q,    0,pecell,4,"double",OP_READ),
                  op_arg_dat(p_q,    1,pecell,4,"double",OP_READ),
                  op_arg_dat(p_adt,  0,pecell,1,"double",OP_READ),
                  op_arg_dat(p_adt,  1,pecell,1,"double",OP_READ),
                  op_arg_dat(p_res,  0,pecell,4,"double",OP_INC ),
                  op_arg_dat(p_res,  1,pecell,4,"double",OP_INC ));

      // boundary-edge contribution to the residual
      op_par_loop(bres_calc,"bres_calc",bedges,
                  op_arg_dat(p_x,     0,pbedge, 2,"double",OP_READ),
                  op_arg_dat(p_x,     1,pbedge, 2,"double",OP_READ),
                  op_arg_dat(p_q,     0,pbecell,4,"double",OP_READ),
                  op_arg_dat(p_adt,   0,pbecell,1,"double",OP_READ),
                  op_arg_dat(p_res,   0,pbecell,4,"double",OP_INC ),
                  op_arg_dat(p_bound,-1,OP_ID  ,1,"int",   OP_READ));

      // update flow field; rms accumulates the squared residual
      rms = 0.0;

      op_par_loop(update,"update",cells,
                  op_arg_dat(p_qold,-1,OP_ID, 4,"double",OP_READ ),
                  op_arg_dat(p_q,   -1,OP_ID, 4,"double",OP_WRITE),
                  op_arg_dat(p_res, -1,OP_ID, 4,"double",OP_RW   ),
                  op_arg_dat(p_adt, -1,OP_ID, 1,"double",OP_READ ),
                  op_arg_gbl(&rms,1,"double",OP_INC));
    }

    // print iteration history (rms from the corrector pass)
    rms = sqrt(rms/(double)g_ncell);
    if (iter%100 == 0)
      op_printf(" %d  %10.5e \n",iter,rms);
  }

  op_timers(&cpu_t2, &wall_t2);
  op_timing_output();
  op_printf("Max total runtime = \n%f\n",wall_t2-wall_t1);
  op_exit();
}
// MPI reduction test driver: the root rank reads the airfoil grid from
// "new_grid.dat", the arrays are scattered across ranks, and two
// op_par_loop reductions (one indirect over edges, one direct over
// cells) are checked against the known global edge/cell counts.
int main(int argc, char **argv) {
  // OP initialisation
  op_init(argc, argv, 2);

  // MPI for user I/O
  int my_rank;
  int comm_size;
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &comm_size);

  // timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  int *becell, *ecell, *bound, *bedge, *edge, *cell;
  double *x, *q, *qold, *adt, *res;

  int nnode, ncell, nedge, nbedge, niter;

  /**------------------------BEGIN I/O and PARTITIONING -------------------**/

  op_timers(&cpu_t1, &wall_t1);

  /* read in grid from disk on root processor */
  FILE *fp;

  if ((fp = fopen("new_grid.dat", "r")) == NULL) {
    op_printf("can't open file new_grid.dat\n");
    exit(-1);
  }

  int g_nnode, g_ncell, g_nedge, g_nbedge;

  check_scan(
      fscanf(fp, "%d %d %d %d \n", &g_nnode, &g_ncell, &g_nedge, &g_nbedge), 4);

  int *g_becell = 0, *g_ecell = 0, *g_bound = 0, *g_bedge = 0, *g_edge = 0,
      *g_cell = 0;
  double *g_x = 0, *g_q = 0, *g_qold = 0, *g_adt = 0, *g_res = 0;

  op_printf("reading in grid \n");
  op_printf("Global number of nodes, cells, edges, bedges = %d, %d, %d, %d\n",
            g_nnode, g_ncell, g_nedge, g_nbedge);

  // only the root rank allocates and reads the full global arrays;
  // the other ranks receive their shares via the scatter calls below
  if (my_rank == MPI_ROOT) {
    g_cell = (int *)malloc(4 * g_ncell * sizeof(int));
    g_edge = (int *)malloc(2 * g_nedge * sizeof(int));
    g_ecell = (int *)malloc(2 * g_nedge * sizeof(int));
    g_bedge = (int *)malloc(2 * g_nbedge * sizeof(int));
    g_becell = (int *)malloc(g_nbedge * sizeof(int));
    g_bound = (int *)malloc(g_nbedge * sizeof(int));

    g_x = (double *)malloc(2 * g_nnode * sizeof(double));
    g_q = (double *)malloc(4 * g_ncell * sizeof(double));
    g_qold = (double *)malloc(4 * g_ncell * sizeof(double));
    g_res = (double *)malloc(4 * g_ncell * sizeof(double));
    g_adt = (double *)malloc(g_ncell * sizeof(double));

    for (int n = 0; n < g_nnode; n++) {
      check_scan(fscanf(fp, "%lf %lf \n", &g_x[2 * n], &g_x[2 * n + 1]), 2);
    }

    for (int n = 0; n < g_ncell; n++) {
      check_scan(fscanf(fp, "%d %d %d %d \n", &g_cell[4 * n],
                        &g_cell[4 * n + 1], &g_cell[4 * n + 2],
                        &g_cell[4 * n + 3]),
                 4);
    }

    for (int n = 0; n < g_nedge; n++) {
      check_scan(fscanf(fp, "%d %d %d %d \n", &g_edge[2 * n],
                        &g_edge[2 * n + 1], &g_ecell[2 * n],
                        &g_ecell[2 * n + 1]),
                 4);
    }

    for (int n = 0; n < g_nbedge; n++) {
      check_scan(fscanf(fp, "%d %d %d %d \n", &g_bedge[2 * n],
                        &g_bedge[2 * n + 1], &g_becell[n], &g_bound[n]),
                 4);
    }

    // initialise flow field and residual
  }

  fclose(fp);

  // each rank's share of every set
  nnode = compute_local_size(g_nnode, comm_size, my_rank);
  ncell = compute_local_size(g_ncell, comm_size, my_rank);
  nedge = compute_local_size(g_nedge, comm_size, my_rank);
  nbedge = compute_local_size(g_nbedge, comm_size, my_rank);

  op_printf(
      "Number of nodes, cells, edges, bedges on process %d = %d, %d, %d, %d\n",
      my_rank, nnode, ncell, nedge, nbedge);

  /*Allocate memory to hold local sets, mapping tables and data*/
  cell = (int *)malloc(4 * ncell * sizeof(int));
  edge = (int *)malloc(2 * nedge * sizeof(int));
  ecell = (int *)malloc(2 * nedge * sizeof(int));
  bedge = (int *)malloc(2 * nbedge * sizeof(int));
  becell = (int *)malloc(nbedge * sizeof(int));
  bound = (int *)malloc(nbedge * sizeof(int));

  x = (double *)malloc(2 * nnode * sizeof(double));
  q = (double *)malloc(4 * ncell * sizeof(double));
  qold = (double *)malloc(4 * ncell * sizeof(double));
  res = (double *)malloc(4 * ncell * sizeof(double));
  adt = (double *)malloc(ncell * sizeof(double));

  /* scatter sets, mappings and data on sets*/
  scatter_int_array(g_cell, cell, comm_size, g_ncell, ncell, 4);
  scatter_int_array(g_edge, edge, comm_size, g_nedge, nedge, 2);
  scatter_int_array(g_ecell, ecell, comm_size, g_nedge, nedge, 2);
  scatter_int_array(g_bedge, bedge, comm_size, g_nbedge, nbedge, 2);
  scatter_int_array(g_becell, becell, comm_size, g_nbedge, nbedge, 1);
  scatter_int_array(g_bound, bound, comm_size, g_nbedge, nbedge, 1);

  scatter_double_array(g_x, x, comm_size, g_nnode, nnode, 2);
  scatter_double_array(g_q, q, comm_size, g_ncell, ncell, 4);
  scatter_double_array(g_qold, qold, comm_size, g_ncell, ncell, 4);
  scatter_double_array(g_res, res, comm_size, g_ncell, ncell, 4);
  scatter_double_array(g_adt, adt, comm_size, g_ncell, ncell, 1);

  /*Freeing memory allocated to global arrays on rank 0
    after scattering to all processes*/
  if (my_rank == MPI_ROOT) {
    free(g_cell);
    free(g_edge);
    free(g_ecell);
    free(g_bedge);
    free(g_becell);
    free(g_bound);
    free(g_x);
    free(g_q);
    free(g_qold);
    free(g_adt);
    free(g_res);
  }

  op_timers(&cpu_t2, &wall_t2);
  op_printf("Max total file read time = %f\n", wall_t2 - wall_t1);

  /**------------------------END I/O and PARTITIONING -----------------------**/

  // only the sets/maps/dats needed by the two reduction loops are declared
  op_set edges = op_decl_set(nedge, "edges");
  op_set cells = op_decl_set(ncell, "cells");

  op_map pecell = op_decl_map(edges, cells, 2, ecell, "pecell");

  op_dat p_res = op_decl_dat(cells, 4, "double", res, "p_res");

  int count;

  // trigger partitioning and halo creation routines
  op_partition("PTSCOTCH", "KWAY", cells, pecell, NULL);

  op_diagnostic_output();

  // initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  // indirect reduction: the kernel increments count once per edge,
  // so the reduced value must equal the global edge count
  count = 0;
  op_par_loop_res_calc("res_calc", edges,
                       op_arg_dat(p_res, 0, pecell, 4, "double", OP_INC),
                       op_arg_gbl(&count, 1, "int", OP_INC));

  op_printf("number of edges:: %d should be: %d \n", count, g_nedge);
  if (count != g_nedge)
    op_printf("indirect reduction FAILED\n");
  else
    op_printf("indirect reduction PASSED\n");

  // direct reduction: one increment per cell
  count = 0;
  op_par_loop_update("update", cells,
                     op_arg_dat(p_res, -1, OP_ID, 4, "double", OP_RW),
                     op_arg_gbl(&count, 1, "int", OP_INC));

  op_printf("number of cells: %d should be: %d \n", count, g_ncell);
  if (count != g_ncell)
    op_printf("direct reduction FAILED\n");
  else
    op_printf("direct reduction PASSED\n");

  op_timers(&cpu_t2, &wall_t2);
  op_timing_output();
  op_exit();

  free(cell);
  free(edge);
  free(ecell);
  free(bedge);
  free(becell);
  free(bound);
  free(x);
  free(q);
  free(qold);
  free(res);
  free(adt);
}
// Airfoil driver (HDF5 input, single precision): reads the mesh and
// the flow constants from "new_grid.h5", partitions, then runs 1000
// predictor/corrector iterations of the generated op_par_loop_* host
// stubs, with a pass/fail validation check against a reference rms
// for the default 720000-cell mesh.
int main(int argc, char **argv) {
  // OP initialisation
  op_init(argc, argv, 2);

  int niter;
  float rms;

  // timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  // set constants and initialise flow field and residual
  op_printf("initialising flow field \n");

  char file[] = "new_grid.h5";

  // declare sets, pointers, datasets and global constants
  op_set nodes = op_decl_set_hdf5(file, "nodes");
  op_set edges = op_decl_set_hdf5(file, "edges");
  op_set bedges = op_decl_set_hdf5(file, "bedges");
  op_set cells = op_decl_set_hdf5(file, "cells");

  op_map pedge = op_decl_map_hdf5(edges, nodes, 2, file, "pedge");
  op_map pecell = op_decl_map_hdf5(edges, cells, 2, file, "pecell");
  op_map pbedge = op_decl_map_hdf5(bedges, nodes, 2, file, "pbedge");
  op_map pbecell = op_decl_map_hdf5(bedges, cells, 1, file, "pbecell");
  op_map pcell = op_decl_map_hdf5(cells, nodes, 4, file, "pcell");

  op_dat p_bound = op_decl_dat_hdf5(bedges, 1, "int", file, "p_bound");
  op_dat p_x = op_decl_dat_hdf5(nodes, 2, "float", file, "p_x");
  op_dat p_q = op_decl_dat_hdf5(cells, 4, "float", file, "p_q");
  op_dat p_qold = op_decl_dat_hdf5(cells, 4, "float", file, "p_qold");
  op_dat p_adt = op_decl_dat_hdf5(cells, 1, "float", file, "p_adt");
  op_dat p_res = op_decl_dat_hdf5(cells, 4, "float", file, "p_res");

  // read the flow constants stored in the HDF5 file ...
  op_get_const_hdf5("gam", 1, "float", (char *)&gam, "new_grid.h5");
  op_get_const_hdf5("gm1", 1, "float", (char *)&gm1, "new_grid.h5");
  op_get_const_hdf5("cfl", 1, "float", (char *)&cfl, "new_grid.h5");
  op_get_const_hdf5("eps", 1, "float", (char *)&eps, "new_grid.h5");
  op_get_const_hdf5("mach", 1, "float", (char *)&mach, "new_grid.h5");
  op_get_const_hdf5("alpha", 1, "float", (char *)&alpha, "new_grid.h5");
  op_get_const_hdf5("qinf", 4, "float", (char *)&qinf, "new_grid.h5");

  // ... and declare them to the OP2 runtime
  op_decl_const2("gam", 1, "float", &gam);
  op_decl_const2("gm1", 1, "float", &gm1);
  op_decl_const2("cfl", 1, "float", &cfl);
  op_decl_const2("eps", 1, "float", &eps);
  op_decl_const2("mach", 1, "float", &mach);
  op_decl_const2("alpha", 1, "float", &alpha);
  op_decl_const2("qinf", 4, "float", qinf);

  if (op_is_root())
    op_diagnostic_output();

  // trigger partitioning and halo creation routines
  op_partition("PTSCOTCH", "KWAY", edges, pecell, p_x);
  // op_partition("PARMETIS", "KWAY", edges, pecell, p_x);

  int g_ncell = op_get_size(cells);

  // initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  // main time-marching loop
  niter = 1000;

  for (int iter = 1; iter <= niter; iter++) {

    // save old flow solution
    op_par_loop_save_soln("save_soln", cells,
                          op_arg_dat(p_q, -1, OP_ID, 4, "float", OP_READ),
                          op_arg_dat(p_qold, -1, OP_ID, 4, "float", OP_WRITE));

    // predictor/corrector update loop
    for (int k = 0; k < 2; k++) {

      // calculate area/timstep
      op_par_loop_adt_calc("adt_calc", cells,
                           op_arg_dat(p_x, 0, pcell, 2, "float", OP_READ),
                           op_arg_dat(p_x, 1, pcell, 2, "float", OP_READ),
                           op_arg_dat(p_x, 2, pcell, 2, "float", OP_READ),
                           op_arg_dat(p_x, 3, pcell, 2, "float", OP_READ),
                           op_arg_dat(p_q, -1, OP_ID, 4, "float", OP_READ),
                           op_arg_dat(p_adt, -1, OP_ID, 1, "float", OP_WRITE));

      // calculate flux residual
      op_par_loop_res_calc("res_calc", edges,
                           op_arg_dat(p_x, 0, pedge, 2, "float", OP_READ),
                           op_arg_dat(p_x, 1, pedge, 2, "float", OP_READ),
                           op_arg_dat(p_q, 0, pecell, 4, "float", OP_READ),
                           op_arg_dat(p_q, 1, pecell, 4, "float", OP_READ),
                           op_arg_dat(p_adt, 0, pecell, 1, "float", OP_READ),
                           op_arg_dat(p_adt, 1, pecell, 1, "float", OP_READ),
                           op_arg_dat(p_res, 0, pecell, 4, "float", OP_INC),
                           op_arg_dat(p_res, 1, pecell, 4, "float", OP_INC));

      // boundary-edge contribution to the residual
      op_par_loop_bres_calc("bres_calc", bedges,
                            op_arg_dat(p_x, 0, pbedge, 2, "float", OP_READ),
                            op_arg_dat(p_x, 1, pbedge, 2, "float", OP_READ),
                            op_arg_dat(p_q, 0, pbecell, 4, "float", OP_READ),
                            op_arg_dat(p_adt, 0, pbecell, 1, "float", OP_READ),
                            op_arg_dat(p_res, 0, pbecell, 4, "float", OP_INC),
                            op_arg_dat(p_bound, -1, OP_ID, 1, "int", OP_READ));

      // update flow field; rms accumulates the squared residual
      rms = 0.0;

      op_par_loop_update("update", cells,
                         op_arg_dat(p_qold, -1, OP_ID, 4, "float", OP_READ),
                         op_arg_dat(p_q, -1, OP_ID, 4, "float", OP_WRITE),
                         op_arg_dat(p_res, -1, OP_ID, 4, "float", OP_RW),
                         op_arg_dat(p_adt, -1, OP_ID, 1, "float", OP_READ),
                         op_arg_gbl(&rms, 1, "float", OP_INC));
    }

    // print iteration history (rms from the corrector pass)
    rms = sqrtf(rms / (float)g_ncell);
    if (iter % 100 == 0)
      op_printf(" %d  %10.5e \n", iter, rms);
    if (iter % 1000 == 0 &&
        g_ncell == 720000) { // default mesh -- for validation testing
      op_printf(" %d  %3.16f \n", iter, rms);
      float diff = fabsf((100.0 * (rms / 0.000105987)) - 100.0);
      op_printf("\n\nTest problem with %d cells is within %3.15E %% of the "
                "expected solution\n",
                720000, diff);
      if (diff < 0.1) {
        op_printf("This test is considered PASSED\n");
      } else {
        op_printf("This test is considered FAILED\n");
      }
    }
  }

  op_timers(&cpu_t2, &wall_t2);
  op_timing_output();
  op_printf("Max total runtime = %f\n", wall_t2 - wall_t1);
  op_exit();
}
int main(int argc, char **argv) { // OP initialisation op_init(argc,argv,2); int *bnode, *cell; double *xm;//, *q; int nnode,ncell,nbnodes,niter; double rms = 1; // set constants and initialise flow field and residual op_printf("initialising flow field \n"); double gam = 1.4; gm1 = gam - 1.0; gm1i = 1.0/gm1; wtg1[0] = 0.5; wtg1[1] = 0.5; xi1[0] = 0.211324865405187; xi1[1] = 0.788675134594813; Ng1[0] = 0.788675134594813; Ng1[1] = 0.211324865405187; Ng1[2] = 0.211324865405187; Ng1[3] = 0.788675134594813; Ng1_xi[0] = -1; Ng1_xi[1] = -1; Ng1_xi[2] = 1; Ng1_xi[3] = 1; wtg2[0] = 0.25; wtg2[1] = 0.25; wtg2[2] = 0.25; wtg2[3] = 0.25; Ng2[0] = 0.622008467928146; Ng2[1] = 0.166666666666667; Ng2[2] = 0.166666666666667; Ng2[3] = 0.044658198738520; Ng2[4] = 0.166666666666667; Ng2[5] = 0.622008467928146; Ng2[6] = 0.044658198738520; Ng2[7] = 0.166666666666667; Ng2[8] = 0.166666666666667; Ng2[9] = 0.044658198738520; Ng2[10] = 0.622008467928146; Ng2[11] = 0.166666666666667; Ng2[12] = 0.044658198738520; Ng2[13] = 0.166666666666667; Ng2[14] = 0.166666666666667; Ng2[15] = 0.622008467928146; Ng2_xi[0] = -0.788675134594813; Ng2_xi[1] = 0.788675134594813; Ng2_xi[2] = -0.211324865405187;Ng2_xi[3] = 0.211324865405187; Ng2_xi[4] = -0.788675134594813; Ng2_xi[5] = 0.788675134594813; Ng2_xi[6] = -0.211324865405187; Ng2_xi[7] = 0.211324865405187; Ng2_xi[8] = -0.211324865405187; Ng2_xi[9] = 0.211324865405187; Ng2_xi[10] = -0.788675134594813; Ng2_xi[11] = 0.788675134594813; Ng2_xi[12] = -0.211324865405187; Ng2_xi[13] = 0.211324865405187; Ng2_xi[14] = -0.788675134594813; Ng2_xi[15] = 0.788675134594813; Ng2_xi[16] = -0.788675134594813; Ng2_xi[17] = -0.211324865405187; Ng2_xi[18] = 0.788675134594813; Ng2_xi[19] = 0.211324865405187; Ng2_xi[20] = -0.211324865405187; Ng2_xi[21] = -0.788675134594813; Ng2_xi[22] = 0.211324865405187; Ng2_xi[23] = 0.788675134594813; Ng2_xi[24] = -0.788675134594813; Ng2_xi[25] = -0.211324865405187; Ng2_xi[26] = 0.788675134594813; Ng2_xi[27] = 0.211324865405187; Ng2_xi[28] = 
-0.211324865405187; Ng2_xi[29] = -0.788675134594813; Ng2_xi[30] = 0.211324865405187; Ng2_xi[31] = 0.788675134594813; minf = 0.1; m2 = minf*minf; freq = 1; kappa = 1; nmode = 0; mfan = 1.0; char file[] = "FE_grid.h5"; // declare sets, pointers, datasets and global constants op_set nodes = op_decl_set_hdf5(file, "nodes"); op_set bnodes = op_decl_set_hdf5(file, "bedges"); op_set cells = op_decl_set_hdf5(file, "cells"); op_map pbnodes = op_decl_map_hdf5(bnodes,nodes,1,file, "pbedge"); op_map pcell = op_decl_map_hdf5(cells, nodes,4,file, "pcell"); op_dat p_xm = op_decl_dat_hdf5(nodes ,2,"double", file, "p_x"); op_dat p_phim = op_decl_dat_hdf5(nodes, 1, "double", file, "p_phim"); op_dat p_resm = op_decl_dat_hdf5(nodes, 1, "double", file, "p_resm"); op_dat p_K = op_decl_dat_hdf5(cells, 16, "double:soa",file, "p_K"); op_dat p_V = op_decl_dat_hdf5(nodes, 1, "double", file, "p_V"); op_dat p_P = op_decl_dat_hdf5(nodes, 1, "double", file, "p_P"); op_dat p_U = op_decl_dat_hdf5(nodes, 1, "double", file, "p_U"); op_decl_const2("gam",1,"double",&gam ); op_decl_const2("gm1",1,"double",&gm1 ); op_decl_const2("gm1i",1,"double",&gm1i ); op_decl_const2("m2",1,"double",&m2 ); op_decl_const2("wtg1",2,"double",wtg1 ); op_decl_const2("xi1",2,"double",xi1 ); op_decl_const2("Ng1",4,"double",Ng1 ); op_decl_const2("Ng1_xi",4,"double",Ng1_xi ); op_decl_const2("wtg2",4,"double",wtg2 ); op_decl_const2("Ng2",16,"double",Ng2 ); op_decl_const2("Ng2_xi",32,"double",Ng2_xi ); op_decl_const2("minf",1,"double",&minf ); op_decl_const2("freq",1,"double",&freq ); op_decl_const2("kappa",1,"double",&kappa ); op_decl_const2("nmode",1,"double",&nmode ); op_decl_const2("mfan",1,"double",&mfan ); op_diagnostic_output(); op_partition("PTSCOTCH", "KWAY", cells, pcell, p_xm); op_printf("nodes: %d cells: %d bnodes: %d\n", nodes->size, cells->size, bnodes->size); nnode = op_get_size(nodes); ncell = op_get_size(cells); nbnodes = op_get_size(bnodes); double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timers(&cpu_t1, 
&wall_t1); // main time-marching loop niter = 20; for(int iter=1; iter<=niter; iter++) { op_par_loop_res_calc("res_calc",cells, op_arg_dat(p_xm,-4,pcell,2,"double",OP_READ), op_arg_dat(p_phim,-4,pcell,1,"double",OP_READ), op_arg_dat(p_K,-1,OP_ID,16,"double:soa",OP_WRITE), op_arg_dat(p_resm,-4,pcell,1,"double",OP_INC)); op_par_loop_dirichlet("dirichlet",bnodes, op_arg_dat(p_resm,0,pbnodes,1,"double",OP_WRITE)); double c1 = 0; double c2 = 0; double c3 = 0; double alpha = 0; double beta = 0; //c1 = R'*R; op_par_loop_init_cg("init_cg",nodes, op_arg_dat(p_resm,-1,OP_ID,1,"double",OP_READ), op_arg_gbl(&c1,1,"double",OP_INC), op_arg_dat(p_U,-1,OP_ID,1,"double",OP_WRITE), op_arg_dat(p_V,-1,OP_ID,1,"double",OP_WRITE), op_arg_dat(p_P,-1,OP_ID,1,"double",OP_WRITE)); //set up stopping conditions double res0 = sqrt(c1); double res = res0; int iter = 0; int maxiter = 200; while (res > 0.1*res0 && iter < maxiter) { //V = Stiffness*P op_par_loop_spMV("spMV",cells, op_arg_dat(p_V,-4,pcell,1,"double",OP_INC), op_arg_dat(p_K,-1,OP_ID,16,"double:soa",OP_READ), op_arg_dat(p_P,-4,pcell,1,"double",OP_READ)); op_par_loop_dirichlet("dirichlet",bnodes, op_arg_dat(p_V,0,pbnodes,1,"double",OP_WRITE)); c2 = 0; //c2 = P'*V; op_par_loop_dotPV("dotPV",nodes, op_arg_dat(p_P,-1,OP_ID,1,"double",OP_READ), op_arg_dat(p_V,-1,OP_ID,1,"double",OP_READ), op_arg_gbl(&c2,1,"double",OP_INC)); alpha = c1/c2; //U = U + alpha*P; //resm = resm-alpha*V; op_par_loop_updateUR("updateUR",nodes, op_arg_dat(p_U,-1,OP_ID,1,"double",OP_INC), op_arg_dat(p_resm,-1,OP_ID,1,"double",OP_INC), op_arg_dat(p_P,-1,OP_ID,1,"double",OP_READ), op_arg_dat(p_V,-1,OP_ID,1,"double",OP_RW), op_arg_gbl(&alpha,1,"double",OP_READ)); c3 = 0; //c3 = resm'*resm; op_par_loop_dotR("dotR",nodes, op_arg_dat(p_resm,-1,OP_ID,1,"double",OP_READ), op_arg_gbl(&c3,1,"double",OP_INC)); beta = c3/c1; //P = beta*P+resm; op_par_loop_updateP("updateP",nodes, op_arg_dat(p_resm,-1,OP_ID,1,"double",OP_READ), op_arg_dat(p_P,-1,OP_ID,1,"double",OP_RW), 
op_arg_gbl(&beta,1,"double",OP_READ)); c1 = c3; res = sqrt(c1); iter++; } rms = 0; //phim = phim - Stiffness\Load; op_par_loop_update("update",nodes, op_arg_dat(p_phim,-1,OP_ID,1,"double",OP_RW), op_arg_dat(p_resm,-1,OP_ID,1,"double",OP_WRITE), op_arg_dat(p_U,-1,OP_ID,1,"double",OP_READ), op_arg_gbl(&rms,1,"double",OP_INC)); op_printf("rms = %10.5e iter: %d\n", sqrt(rms)/sqrt(nnode), iter); } op_timing_output(); op_timers(&cpu_t2, &wall_t2); op_printf("Max total runtime = %f\n",wall_t2-wall_t1); op_exit(); }
/*
 * Airfoil driver (HDF5 input, op_par_loop macro API, vector map-args).
 * Reads mesh/constants from new_grid.h5, writes them back to
 * new_grid_out.h5 for an h5diff sanity check, partitions, then runs 1000
 * time-marching iterations of save_soln / adt_calc / res_calc /
 * bres_calc / update, printing the rms residual every 100 steps.
 */
int main(int argc, char **argv) {
  // OP initialisation
  op_init(argc,argv,2);

  int niter;
  double rms;

  //timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  // set constants and initialise flow field and residual
  op_printf("initialising flow field \n");

  char file[] = "new_grid.h5";

  // declare sets, pointers, datasets and global constants
  op_set nodes  = op_decl_set_hdf5(file, "nodes");
  op_set edges  = op_decl_set_hdf5(file, "edges");
  op_set bedges = op_decl_set_hdf5(file, "bedges");
  op_set cells  = op_decl_set_hdf5(file, "cells");

  op_map pedge   = op_decl_map_hdf5(edges, nodes, 2, file, "pedge");
  op_map pecell  = op_decl_map_hdf5(edges, cells,2, file, "pecell");
  op_map pbedge  = op_decl_map_hdf5(bedges,nodes,2, file, "pbedge");
  op_map pbecell = op_decl_map_hdf5(bedges,cells,1, file, "pbecell");
  op_map pcell   = op_decl_map_hdf5(cells, nodes,4, file, "pcell");

  op_dat p_bound = op_decl_dat_hdf5(bedges,1,"int"   ,file,"p_bound");
  op_dat p_x     = op_decl_dat_hdf5(nodes ,2,"double",file,"p_x");
  op_dat p_q     = op_decl_dat_hdf5(cells ,4,"double",file,"p_q");
  op_dat p_qold  = op_decl_dat_hdf5(cells ,4,"double",file,"p_qold");
  op_dat p_adt   = op_decl_dat_hdf5(cells ,1,"double",file,"p_adt");
  op_dat p_res   = op_decl_dat_hdf5(cells ,4,"double",file,"p_res");

  // read the global flow constants stored in the mesh file
  op_get_const_hdf5("gam", 1, "double", (char *)&gam, "new_grid.h5");
  op_get_const_hdf5("gm1", 1, "double", (char *)&gm1, "new_grid.h5");
  op_get_const_hdf5("cfl", 1, "double", (char *)&cfl, "new_grid.h5");
  op_get_const_hdf5("eps", 1, "double", (char *)&eps, "new_grid.h5");
  op_get_const_hdf5("mach", 1, "double", (char *)&mach, "new_grid.h5");
  op_get_const_hdf5("alpha", 1, "double", (char *)&alpha, "new_grid.h5");
  op_get_const_hdf5("qinf", 4, "double", (char *)&qinf, "new_grid.h5");

  op_decl_const(1,"double",&gam  );
  op_decl_const(1,"double",&gm1  );
  op_decl_const(1,"double",&cfl  );
  op_decl_const(1,"double",&eps  );
  op_decl_const(1,"double",&mach );
  op_decl_const(1,"double",&alpha);
  op_decl_const(4,"double",qinf  );

  op_diagnostic_output();

  //write back original data just to compare you read the file correctly
  //do an h5diff between new_grid_out.h5 and new_grid.h5 to
  //compare two hdf5 files
  op_write_hdf5("new_grid_out.h5");

  op_write_const_hdf5("gam",1,"double",(char *)&gam,  "new_grid_out.h5");
  op_write_const_hdf5("gm1",1,"double",(char *)&gm1,  "new_grid_out.h5");
  op_write_const_hdf5("cfl",1,"double",(char *)&cfl,  "new_grid_out.h5");
  op_write_const_hdf5("eps",1,"double",(char *)&eps,  "new_grid_out.h5");
  op_write_const_hdf5("mach",1,"double",(char *)&mach,  "new_grid_out.h5");
  op_write_const_hdf5("alpha",1,"double",(char *)&alpha,  "new_grid_out.h5");
  op_write_const_hdf5("qinf",4,"double",(char *)qinf,  "new_grid_out.h5");

  //trigger partitioning and halo creation routines
  op_partition("PTSCOTCH", "KWAY", edges, pecell, p_x);

  int g_ncell = op_get_size(cells);

  //initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  // main time-marching loop
  niter = 1000;
  for(int iter=1; iter<=niter; iter++) {

    // save old flow solution
    op_par_loop(save_soln,"save_soln", cells,
        op_arg_dat(p_q,   -1,OP_ID, 4,"double",OP_READ ),
        op_arg_dat(p_qold,-1,OP_ID, 4,"double",OP_WRITE));

    // predictor/corrector update loop
    for(int k=0; k<2; k++) {

      // calculate area/timstep
      // (negative map indices are OP2 "vector" args: -4 gathers all 4
      //  pcell entries, -2 both pedge/pecell entries, in one argument)
      op_par_loop(adt_calc,"adt_calc",cells,
          op_arg_dat(p_x,   -4,pcell, 2,"double",OP_READ ),
          op_arg_dat(p_q,   -1,OP_ID, 4,"double",OP_READ ),
          op_arg_dat(p_adt,-1,OP_ID, 1,"double",OP_WRITE));

      // calculate flux residual
      op_par_loop(res_calc,"res_calc",edges,
          op_arg_dat(p_x,    -2,pedge, 2,"double",OP_READ),
          op_arg_dat(p_q,    -2,pecell,4,"double",OP_READ),
          op_arg_dat(p_adt,  -2,pecell,1,"double",OP_READ),
          op_arg_dat(p_res,  -2,pecell,4,"double",OP_INC ));

      op_par_loop(bres_calc,"bres_calc",bedges,
          op_arg_dat(p_x,     -2,pbedge, 2,"double",OP_READ),
          op_arg_dat(p_q,      0,pbecell,4,"double",OP_READ),
          op_arg_dat(p_adt,    0,pbecell,1,"double",OP_READ),
          op_arg_dat(p_res,    0,pbecell,4,"double",OP_INC ),
          op_arg_dat(p_bound,-1,OP_ID  ,1,"int",   OP_READ));

      // update flow field
      rms = 0.0;

      op_par_loop(update,"update",cells,
          op_arg_dat(p_qold,-1,OP_ID, 4,"double",OP_READ ),
          op_arg_dat(p_q,   -1,OP_ID, 4,"double",OP_WRITE),
          op_arg_dat(p_res, -1,OP_ID, 4,"double",OP_RW   ),
          op_arg_dat(p_adt, -1,OP_ID, 1,"double",OP_READ ),
          op_arg_gbl(&rms,1,"double",OP_INC));
    }

    // print iteration history
    rms = sqrt(rms/(double)g_ncell);
    if (iter%100 == 0)
      op_printf(" %d  %10.5e \n",iter,rms);
  }

  op_timers(&cpu_t2, &wall_t2);
  op_timing_output();
  op_printf("Max total runtime = \n%f\n",wall_t2-wall_t1);
  op_exit();
}
//
// main program
//
/*
 * Airfoil driver (MPI + parallel HDF5 I/O).  Reads the mesh collectively
 * from new_grid.h5, partitions with PT-Scotch, builds halos explicitly,
 * then runs the 1000-step time-marching loop; file-read time and total
 * runtime are reduced with MPI_MAX across ranks before printing.
 *
 * NOTE(review): my_rank is compared against MPI_ROOT throughout.  MPI_ROOT
 * is a special sentinel for intercommunicator root arguments, not a rank
 * number — these comparisons look like they were meant to be `my_rank == 0`;
 * confirm against the MPI implementation in use.
 */
int main(int argc, char **argv){

  int my_rank;
  int comm_size;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &comm_size);

  //timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  double time;
  double max_time;

  // NOTE(review): these pointers are declared but never used in this
  // HDF5 variant (left over from the file-scatter variant)
  int    *becell, *ecell,  *bound, *bedge, *edge, *cell;
  double  *x, *q, *qold, *adt, *res;

  int   niter;
  double  rms;

  op_timers(&cpu_t1, &wall_t1);

  // set constants
  if(my_rank == MPI_ROOT )printf("initialising flow field\n");
  gam = 1.4f;
  gm1 = gam - 1.0f;
  cfl = 0.9f;
  eps = 0.05f;

  double mach  = 0.4f;
  double alpha = 3.0f*atan(1.0f)/45.0f;
  double p     = 1.0f;
  double r     = 1.0f;
  double u     = sqrt(gam*p/r)*mach;
  double e     = p/(r*gm1) + 0.5f*u*u;

  // free-stream state vector (rho, rho*u, rho*v, rho*E)
  qinf[0] = r;
  qinf[1] = r*u;
  qinf[2] = 0.0f;
  qinf[3] = r*e;

  // OP initialisation
  op_init(argc,argv,2);

  /**------------------------BEGIN Parallel I/O -------------------**/

  char file[] = "new_grid.h5";//"new_grid-26mil.h5";//"new_grid.h5";

  // declare sets, pointers, datasets and global constants - reading in from file
  op_set nodes  = op_decl_set_hdf5(file, "nodes");
  op_set edges  = op_decl_set_hdf5(file, "edges");
  op_set bedges = op_decl_set_hdf5(file, "bedges");
  op_set cells  = op_decl_set_hdf5(file, "cells");

  op_map pedge   = op_decl_map_hdf5(edges, nodes, 2, file, "pedge");
  op_map pecell  = op_decl_map_hdf5(edges, cells,2, file, "pecell");
  op_map pbedge  = op_decl_map_hdf5(bedges,nodes,2, file, "pbedge");
  op_map pbecell = op_decl_map_hdf5(bedges,cells,1, file, "pbecell");
  op_map pcell   = op_decl_map_hdf5(cells, nodes,4, file, "pcell");

  op_dat p_bound = op_decl_dat_hdf5(bedges,1,"int"   ,file,"p_bound");
  op_dat p_x     = op_decl_dat_hdf5(nodes ,2,"double",file,"p_x");
  op_dat p_q     = op_decl_dat_hdf5(cells ,4,"double",file,"p_q");
  op_dat p_qold  = op_decl_dat_hdf5(cells ,4,"double",file,"p_qold");
  op_dat p_adt   = op_decl_dat_hdf5(cells ,1,"double",file,"p_adt");
  op_dat p_res   = op_decl_dat_hdf5(cells ,4,"double",file,"p_res");

  /**------------------------END Parallel I/O  -----------------------**/

  op_timers(&cpu_t2, &wall_t2);
  time = wall_t2-wall_t1;
  MPI_Reduce(&time,&max_time,1,MPI_DOUBLE, MPI_MAX,MPI_ROOT, MPI_COMM_WORLD);
  if(my_rank==MPI_ROOT)printf("Max total file read time = %f\n",max_time);

  op_decl_const(1,"double",&gam  );
  op_decl_const(1,"double",&gm1  );
  op_decl_const(1,"double",&cfl  );
  op_decl_const(1,"double",&eps  );
  op_decl_const(1,"double",&mach );
  op_decl_const(1,"double",&alpha);
  op_decl_const(4,"double",qinf  );

  op_diagnostic_output();

  //write back original data just to compare you read the file correctly
  //do an h5diff between new_grid_writeback.h5 and new_grid.h5 to
  //compare two hdf5 files
  op_write_hdf5("new_grid_out.h5");

  //partition with ParMetis
  //op_partition_geom(p_x);
  //op_partition_random(cells);
  //op_partition_kway(pecell);
  //op_partition_geomkway(p_x, pcell);

  //partition with PT-Scotch
  op_partition_ptscotch(pecell);

  //create halos
  op_halo_create();

  // global cell count = sum of each rank's owned cells
  int g_ncell = 0;
  int* sizes = (int *)malloc(sizeof(int)*comm_size);
  MPI_Allgather(&cells->size, 1, MPI_INT, sizes, 1, MPI_INT, MPI_COMM_WORLD);
  for(int i = 0; i<comm_size; i++)g_ncell = g_ncell + sizes[i];
  free(sizes);

  //initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  niter = 1000;
  for(int iter=1; iter<=niter; iter++) {

    //save old flow solution
    op_par_loop(save_soln,"save_soln", cells,
        op_arg_dat(p_q,   -1,OP_ID, 4,"double",OP_READ ),
        op_arg_dat(p_qold,-1,OP_ID, 4,"double",OP_WRITE));

    // predictor/corrector update loop
    for(int k=0; k<2; k++) {

      // calculate area/timstep
      op_par_loop(adt_calc,"adt_calc",cells,
          op_arg_dat(p_x,    0,pcell, 2,"double",OP_READ ),
          op_arg_dat(p_x,    1,pcell, 2,"double",OP_READ ),
          op_arg_dat(p_x,    2,pcell, 2,"double",OP_READ ),
          op_arg_dat(p_x,    3,pcell, 2,"double",OP_READ ),
          op_arg_dat(p_q,   -1,OP_ID, 4,"double",OP_READ ),
          op_arg_dat(p_adt,-1,OP_ID, 1,"double",OP_WRITE));

      // calculate flux residual
      op_par_loop(res_calc,"res_calc",edges,
          op_arg_dat(p_x,    0,pedge, 2,"double",OP_READ),
          op_arg_dat(p_x,    1,pedge, 2,"double",OP_READ),
          op_arg_dat(p_q,    0,pecell,4,"double",OP_READ),
          op_arg_dat(p_q,    1,pecell,4,"double",OP_READ),
          op_arg_dat(p_adt,  0,pecell,1,"double",OP_READ),
          op_arg_dat(p_adt,  1,pecell,1,"double",OP_READ),
          op_arg_dat(p_res,  0,pecell,4,"double",OP_INC ),
          op_arg_dat(p_res,  1,pecell,4,"double",OP_INC ));

      op_par_loop(bres_calc,"bres_calc",bedges,
          op_arg_dat(p_x,     0,pbedge, 2,"double",OP_READ),
          op_arg_dat(p_x,     1,pbedge, 2,"double",OP_READ),
          op_arg_dat(p_q,     0,pbecell,4,"double",OP_READ),
          op_arg_dat(p_adt,   0,pbecell,1,"double",OP_READ),
          op_arg_dat(p_res,   0,pbecell,4,"double",OP_INC ),
          op_arg_dat(p_bound,-1,OP_ID  ,1,"int",   OP_READ));

      // update flow field
      rms = 0.0;

      op_par_loop(update,"update",cells,
          op_arg_dat(p_qold,-1,OP_ID, 4,"double",OP_READ ),
          op_arg_dat(p_q,   -1,OP_ID, 4,"double",OP_WRITE),
          op_arg_dat(p_res, -1,OP_ID, 4,"double",OP_RW   ),
          op_arg_dat(p_adt, -1,OP_ID, 1,"double",OP_READ ),
          op_arg_gbl(&rms,1,"double",OP_INC));
    }

    //print iteration history
    if(my_rank==MPI_ROOT)
    {
      rms = sqrt(rms/(double) g_ncell);
      if (iter%100 == 0)
        printf("%d  %10.5e \n",iter,rms);
    }
  }
  op_timers(&cpu_t2, &wall_t2);

  //get results data array
  // NOTE(review): `temp` is fetched but never used afterwards — confirm
  // whether the fetch is needed for its side effect or can be dropped
  op_dat temp = op_mpi_get_data(p_q);

  //output the result dat array to files
  //op_write_hdf5("new_grid_out.h5");

  //compress using
  // ~/hdf5/bin/h5repack -f GZIP=9 new_grid.h5 new_grid_pack.h5

  //free memory allocated to halos
  op_halo_destroy();

  //return all op_dats, op_maps back to original element order
  op_partition_reverse();

  //print each mpi process's timing info for each kernel
  op_mpi_timing_output();

  //print total time for niter interations
  time = wall_t2-wall_t1;
  MPI_Reduce(&time,&max_time,1,MPI_DOUBLE, MPI_MAX,MPI_ROOT, MPI_COMM_WORLD);
  if(my_rank==MPI_ROOT)printf("Max total runtime = %f\n",max_time);

  op_exit();
  MPI_Finalize();   //user mpi finalize
}
/*
 * Host stub for the indirect kernel "res" (OpenMP backend, generated code).
 *
 * Builds (or retrieves) a colouring/partitioning plan for the loop over
 * `set`, then executes the blocks colour by colour: blocks within one
 * colour touch disjoint indirect data, so each colour's blocks run in a
 * single `omp parallel for`.  Timing and data-transfer volumes are
 * accumulated into OP_kernels[0].
 *
 * Arguments: arg0 is accessed directly, arg1/arg2 indirectly (indirection
 * indices 0 and 1 in `inds`), arg3 is passed through as plain float data.
 *
 * Fixes vs. previous revision: removed the unused locals `arg3h`
 * (arg3.data is passed to op_x86_res directly) and `nthreads`
 * (never referenced after being set).
 */
void op_par_loop_res(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3 ){

  int nargs = 4;
  op_arg args[4] = {arg0,arg1,arg2,arg3};

  int ninds = 2;
  // -1: direct access; 0/1: index into the plan's indirect datasets
  int inds[4] = {-1,0,1,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res \n");
  }

  // get plan
  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // execute plan: blocks of the same colour are mutually independent
  int block_offset = 0;
  for (int col=0; col < Plan->ncolors; col++) {
    int nblocks = Plan->ncolblk[col];
#pragma omp parallel for
    for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
      op_x86_res( blockIdx,
                  (float *)arg1.data, Plan->ind_maps[0],
                  (float *)arg2.data, Plan->ind_maps[1],
                  (float *)arg0.data,
                  Plan->loc_maps[1],
                  Plan->loc_maps[2],
                  (float *)arg3.data,
                  Plan->ind_sizes,
                  Plan->ind_offs,
                  block_offset,
                  Plan->blkmap,
                  Plan->offset,
                  Plan->nelems,
                  Plan->nthrcol,
                  Plan->thrcol);
    block_offset += nblocks;
  }

  // update kernel record
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer  += Plan->transfer;
  OP_kernels[0].transfer2 += Plan->transfer2;
}
/*
 * Airfoil coupling demo (MPI).  Splits MPI_COMM_WORLD into sub-groups by
 * a per-rank "type", runs OP2 on one sub-communicator, and every 100
 * iterations exports boundary data to / imports it from a coupling
 * partner, checking round-trip consistency with the `comparethem` loop.
 *
 * NOTE(review): `groups` and `groups2` are malloc'd but never freed, and
 * my_type==1 for every rank here, so group 0 (the coupling side) is empty
 * and op_mpi_init uses mpicomms[1] — presumably the real types come from
 * the configuration file mentioned below; confirm.
 */
int main(int argc, char **argv) {

  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int *groups = (int *)malloc(size * sizeof(int));
  int *groups2 = (int *)malloc(size * sizeof(int));
  int my_type = 1; //This is to be read from a configuration file

  // gather every rank's type, then count groups (max type + 1)
  MPI_Allgather(&my_type, 1, MPI_INT, groups, 1, MPI_INT, MPI_COMM_WORLD);
  int num_groups = 0;
  for (int i = 0; i < size; i++)
    num_groups = num_groups > groups[i] ? num_groups : groups[i];
  num_groups++;

  //The global group
  MPI_Group global_grp;
  MPI_Comm_group(MPI_COMM_WORLD, &global_grp);

  //Create sub-groups and sub-communicators
  MPI_Group mpigroups[num_groups];
  MPI_Comm mpicomms[num_groups];
  int count = 0;
  for (int i = 0; i < num_groups; ++i) {
    count = 0;
    for (int j = 0; j < size; ++j) {
      if (groups[j] == i) {
        groups2[count++] = j;   // ranks belonging to group i
      }
    }
    MPI_Group_incl(global_grp, count, groups2, &mpigroups[i]);
    MPI_Comm_create(MPI_COMM_WORLD, mpigroups[i], &mpicomms[i]);
  }

  //coupling procs: leave groups2/count describing the ranks of group 0
  for (int i = 0; i < 1; ++i) {
    count = 0;
    for (int j = 0; j < size; ++j) {
      if (groups[j] == i) {
        groups2[count++] = j;
      }
    }
  }

  // OP initialisation on this side's sub-communicator
  op_mpi_init(argc,argv,2,MPI_COMM_WORLD, mpicomms[1]);

  int niter;
  double rms;

  //timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  // set constants and initialise flow field and residual
  op_printf("initialising flow field \n");

  char file[] = "new_grid.h5";

  // declare sets, pointers, datasets and global constants
  op_set nodes  = op_decl_set_hdf5(file, "nodes");
  op_set edges  = op_decl_set_hdf5(file, "edges");
  op_set bedges = op_decl_set_hdf5(file, "bedges");
  op_set cells  = op_decl_set_hdf5(file, "cells");

  op_map pedge   = op_decl_map_hdf5(edges, nodes, 2, file, "pedge");
  op_map pecell  = op_decl_map_hdf5(edges, cells,2, file, "pecell");
  op_map pbedge  = op_decl_map_hdf5(bedges,nodes,2, file, "pbedge");
  op_map pbecell = op_decl_map_hdf5(bedges,cells,1, file, "pbecell");
  op_map pcell   = op_decl_map_hdf5(cells, nodes,4, file, "pcell");
  op_map pbndbnd = op_decl_map_hdf5(bedges, bedges,1, file, "pbndbnd");
  op_map m_test  = op_decl_map_hdf5(cells, nodes,4, file, "m_test");
  if (m_test == NULL) printf("m_test not found\n");

  op_dat p_bound = op_decl_dat_hdf5(bedges,1,"int"   ,file,"p_bound");
  op_dat p_x     = op_decl_dat_hdf5(nodes ,2,"double",file,"p_x");
  op_dat p_q     = op_decl_dat_hdf5(cells ,4,"double",file,"p_q");
  op_dat p_qold  = op_decl_dat_hdf5(cells ,4,"double",file,"p_qold");
  op_dat p_adt   = op_decl_dat_hdf5(cells ,1,"double",file,"p_adt");
  op_dat p_res   = op_decl_dat_hdf5(cells ,4,"double",file,"p_res");
  op_dat p_test  = op_decl_dat_hdf5(cells ,4,"double",file,"p_test");
  if (p_test == NULL) printf("p_test not found\n");

  op_get_const_hdf5("gam", 1, "double", (char *)&gam, "new_grid.h5");
  op_get_const_hdf5("gm1", 1, "double", (char *)&gm1, "new_grid.h5");
  op_get_const_hdf5("cfl", 1, "double", (char *)&cfl, "new_grid.h5");
  op_get_const_hdf5("eps", 1, "double", (char *)&eps, "new_grid.h5");
  op_get_const_hdf5("mach", 1, "double", (char *)&mach, "new_grid.h5");
  op_get_const_hdf5("alpha", 1, "double", (char *)&alpha, "new_grid.h5");
  op_get_const_hdf5("qinf", 4, "double", (char *)&qinf, "new_grid.h5");

  op_decl_const(1,"double",&gam  );
  op_decl_const(1,"double",&gm1  );
  op_decl_const(1,"double",&cfl  );
  op_decl_const(1,"double",&eps  );
  op_decl_const(1,"double",&mach );
  op_decl_const(1,"double",&alpha);
  op_decl_const(4,"double",qinf  );

  op_diagnostic_output();

  //write back original data just to compare you read the file correctly
  //do an h5diff between new_grid_out.h5 and new_grid.h5 to
  //compare two hdf5 files
  op_dump_to_hdf5("new_grid_out.h5");

  op_write_const_hdf5("gam",1,"double",(char *)&gam,  "new_grid_out.h5");
  op_write_const_hdf5("gm1",1,"double",(char *)&gm1,  "new_grid_out.h5");
  op_write_const_hdf5("cfl",1,"double",(char *)&cfl,  "new_grid_out.h5");
  op_write_const_hdf5("eps",1,"double",(char *)&eps,  "new_grid_out.h5");
  op_write_const_hdf5("mach",1,"double",(char *)&mach,  "new_grid_out.h5");
  op_write_const_hdf5("alpha",1,"double",(char *)&alpha,  "new_grid_out.h5");
  op_write_const_hdf5("qinf",4,"double",(char *)qinf,  "new_grid_out.h5");

  //trigger partitioning and halo creation routines
  op_partition("PTSCOTCH", "KWAY", edges, pecell, p_x);
  //op_partition("PARMETIS", "KWAY", edges, pecell, p_x);

  int g_ncell = op_get_size(cells);

  //create some temporaries so we can exchange data defined on the boundary
  double *ptr = NULL;
  op_dat center = op_decl_dat_temp(bedges, 3, "double", ptr, "center");
  op_dat pres   = op_decl_dat_temp(bedges, 1, "double", ptr, "pres");
  int *ptr2 = NULL;
  op_dat p_bound2 = op_decl_dat_temp(bedges, 1, "int", ptr2, "p_bound2");
  op_dat center2  = op_decl_dat_temp(bedges, 3, "double", ptr, "center2");
  op_dat pres2    = op_decl_dat_temp(bedges, 1, "double", ptr, "pres2");

  //create import and export handles (count/groups2 describe group 0,
  //the coupling partner's ranks, as set up above)
  op_export_handle handle  = op_export_init(count, groups2, pbndbnd);
  op_import_handle handle2 = op_import_init(count, groups2, center);

  //initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  // main time-marching loop
  niter = 1000;
  for(int iter=1; iter<=niter; iter++) {

    // save old flow solution
    op_par_loop(save_soln,"save_soln", cells,
        op_arg_dat(p_q,   -1,OP_ID, 4,"double",OP_READ ),
        op_arg_dat(p_qold,-1,OP_ID, 4,"double",OP_WRITE));

    // predictor/corrector update loop
    for(int k=0; k<2; k++) {

      // calculate area/timstep
      op_par_loop(adt_calc,"adt_calc",cells,
          op_arg_dat(p_x,    0,pcell, 2,"double",OP_READ ),
          op_arg_dat(p_x,    1,pcell, 2,"double",OP_READ ),
          op_arg_dat(p_x,    2,pcell, 2,"double",OP_READ ),
          op_arg_dat(p_x,    3,pcell, 2,"double",OP_READ ),
          op_arg_dat(p_q,   -1,OP_ID, 4,"double",OP_READ ),
          op_arg_dat(p_adt,-1,OP_ID, 1,"double",OP_WRITE));

      // calculate flux residual
      op_par_loop(res_calc,"res_calc",edges,
          op_arg_dat(p_x,    0,pedge, 2,"double",OP_READ),
          op_arg_dat(p_x,    1,pedge, 2,"double",OP_READ),
          op_arg_dat(p_q,    0,pecell,4,"double",OP_READ),
          op_arg_dat(p_q,    1,pecell,4,"double",OP_READ),
          op_arg_dat(p_adt,  0,pecell,1,"double",OP_READ),
          op_arg_dat(p_adt,  1,pecell,1,"double",OP_READ),
          op_arg_dat(p_res,  0,pecell,4,"double",OP_INC ),
          op_arg_dat(p_res,  1,pecell,4,"double",OP_INC ));

      // boundary residual also writes the coupling exports center/pres
      op_par_loop(bres_calc,"bres_calc",bedges,
          op_arg_dat(p_x,     0,pbedge, 2,"double",OP_READ),
          op_arg_dat(p_x,     1,pbedge, 2,"double",OP_READ),
          op_arg_dat(p_q,     0,pbecell,4,"double",OP_READ),
          op_arg_dat(p_adt,   0,pbecell,1,"double",OP_READ),
          op_arg_dat(p_res,   0,pbecell,4,"double",OP_INC ),
          op_arg_dat(p_bound,-1,OP_ID  ,1,"int",   OP_READ),
          op_arg_dat(center, -1, OP_ID, 3, "double", OP_WRITE),
          op_arg_dat(pres,   -1, OP_ID, 1, "double", OP_WRITE));

      // update flow field
      rms = 0.0;

      op_par_loop(update,"update",cells,
          op_arg_dat(p_qold,-1,OP_ID, 4,"double",OP_READ ),
          op_arg_dat(p_q,   -1,OP_ID, 4,"double",OP_WRITE),
          op_arg_dat(p_res, -1,OP_ID, 4,"double",OP_RW   ),
          op_arg_dat(p_adt, -1,OP_ID, 1,"double",OP_READ ),
          op_arg_gbl(&rms,1,"double",OP_INC));
    }

    // print iteration history
    rms = sqrt(rms/(double)g_ncell);
    if (iter%100 == 0) {
      op_printf(" %d  %10.5e \n",iter,rms);

      //Export data
      op_dat arr[] = {p_bound, center, pres};
      op_export_data(handle, 3, arr);
      //Import data
      op_dat arr2[] = {p_bound2, center2, pres2};
      op_import_data(handle2, 3, arr2);
      //check whether the two are the same
      op_par_loop(comparethem, "comparethem", bedges,
          op_arg_dat(p_bound,-1, OP_ID, 1, "int", OP_READ),
          op_arg_dat(p_bound2,-1, OP_ID, 1, "int", OP_READ),
          op_arg_dat(center,-1, OP_ID, 3, "double", OP_READ),
          op_arg_dat(center2,-1, OP_ID, 3, "double", OP_READ),
          op_arg_dat(pres,-1, OP_ID, 1, "double", OP_READ),
          op_arg_dat(pres2,-1, OP_ID, 1, "double", OP_READ));
    }
  }

  op_timers(&cpu_t2, &wall_t2);

  // exercise the data-fetch APIs (result of the first fetch is discarded)
  double* q = (double *)malloc(sizeof(double)*op_get_size(cells)*4);
  op_fetch_data_hdf5(p_q, q, 0, op_get_size(cells)-1);
  free(q);

  op_fetch_data_hdf5_file(p_q, "file_name.h5");

  //printf("Root process = %d\n",op_is_root());

  //output the result dat array to files
  //op_write_hdf5("new_grid_out.h5");

  //compress using
  // ~/hdf5/bin/h5repack -f GZIP=9 new_grid.h5 new_grid_pack.h5

  op_timing_output();
  op_printf("Max total runtime = \n%f\n",wall_t2-wall_t1);
  op_exit();
}
/*
 * Aero FE potential-flow driver (plain-text I/O + MPI scatter).
 * The root rank reads FE_grid.dat, the arrays are block-scattered to all
 * ranks (compute_local_size / scatter_*_array), then the same outer
 * time-marching + CG solve as the HDF5 variant is run for 20 iterations.
 *
 * NOTE(review): the grid-file header fscanf and fclose are executed by
 * every rank, while the array reads happen only on the root — all ranks
 * must therefore be able to open FE_grid.dat; confirm this is intended.
 */
int main(int argc, char **argv) {
  // OP initialisation
  op_init(argc,argv,2);

  //MPI for user I/O
  int my_rank;
  int comm_size;
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &comm_size);

  //timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  int    *bnode, *cell, *g_bnode, *g_cell;
  double *xm, *g_xm;;

  int    nnode,ncell,nbnodes,niter, g_nnode, g_ncell, g_nbnodes;
  double rms = 1;

  // read in grid
  op_printf("reading in grid \n");

  FILE *fp;
  if ( (fp = fopen("FE_grid.dat","r")) == NULL) {
    op_printf("can't open file FE_grid.dat\n"); exit(-1);
  }

  // global sizes are read on every rank
  if (fscanf(fp,"%d %d %d \n",&g_nnode, &g_ncell, &g_nbnodes) != 3) {
    op_printf("error reading from new_grid.dat\n"); exit(-1);
  }

  if (my_rank == MPI_ROOT) {
    // root reads the full global arrays before scattering
    g_cell  = (int *)    malloc(4*g_ncell*sizeof(int));
    g_bnode = (int *)    malloc(g_nbnodes*sizeof(int));
    g_xm    = (double *) malloc(2*g_nnode*sizeof(double));

    for (int n=0; n<g_nnode; n++) {
      if (fscanf(fp,"%lf %lf \n",&g_xm[2*n], &g_xm[2*n+1]) != 2) {
        op_printf("error reading from new_grid.dat\n"); exit(-1);
      }
    }

    for (int n=0; n<g_ncell; n++) {
      if (fscanf(fp,"%d %d %d %d \n",&g_cell[4*n  ], &g_cell[4*n+1],
                                     &g_cell[4*n+2], &g_cell[4*n+3]) != 4) {
        op_printf("error reading from new_grid.dat\n"); exit(-1);
      }
    }

    for (int n=0; n<g_nbnodes; n++) {
      if (fscanf(fp,"%d \n",&g_bnode[n]) != 1) {
        op_printf("error reading from new_grid.dat\n"); exit(-1);
      }
    }
  }

  fclose(fp);

  // block-distribute the global arrays across ranks
  nnode   = compute_local_size (g_nnode, comm_size, my_rank);
  ncell   = compute_local_size (g_ncell, comm_size, my_rank);
  nbnodes = compute_local_size (g_nbnodes, comm_size, my_rank);

  cell  = (int *)    malloc(4*ncell*sizeof(int));
  bnode = (int *)    malloc(nbnodes*sizeof(int));
  xm    = (double *) malloc(2*nnode*sizeof(double));

  scatter_int_array(g_cell, cell, comm_size, g_ncell,ncell, 4);
  scatter_int_array(g_bnode, bnode, comm_size, g_nbnodes,nbnodes, 1);
  scatter_double_array(g_xm, xm, comm_size, g_nnode,nnode, 2);

  if(my_rank == MPI_ROOT) {
    free(g_cell); free(g_xm); free(g_bnode);
  }

  // set constants and initialise flow field and residual
  op_printf("initialising flow field \n");

  double gam = 1.4;
  gm1 = gam - 1.0;
  gm1i = 1.0/gm1;

  // 1D quadrature weights/points and linear shape functions (+derivatives)
  wtg1[0] = 0.5; wtg1[1] = 0.5;
  xi1[0] = 0.211324865405187; xi1[1] = 0.788675134594813;
  Ng1[0] = 0.788675134594813; Ng1[1] = 0.211324865405187;
  Ng1[2] = 0.211324865405187; Ng1[3] = 0.788675134594813;
  Ng1_xi[0] = -1; Ng1_xi[1] = -1; Ng1_xi[2] = 1; Ng1_xi[3] = 1;

  // 2D tensor-product quadrature weights, shape functions and derivatives
  wtg2[0] = 0.25; wtg2[1] = 0.25; wtg2[2] = 0.25; wtg2[3] = 0.25;
  Ng2[0]  = 0.622008467928146; Ng2[1]  = 0.166666666666667;
  Ng2[2]  = 0.166666666666667; Ng2[3]  = 0.044658198738520;
  Ng2[4]  = 0.166666666666667; Ng2[5]  = 0.622008467928146;
  Ng2[6]  = 0.044658198738520; Ng2[7]  = 0.166666666666667;
  Ng2[8]  = 0.166666666666667; Ng2[9]  = 0.044658198738520;
  Ng2[10] = 0.622008467928146; Ng2[11] = 0.166666666666667;
  Ng2[12] = 0.044658198738520; Ng2[13] = 0.166666666666667;
  Ng2[14] = 0.166666666666667; Ng2[15] = 0.622008467928146;
  Ng2_xi[0]  = -0.788675134594813; Ng2_xi[1]  = 0.788675134594813;
  Ng2_xi[2]  = -0.211324865405187;Ng2_xi[3]  = 0.211324865405187;
  Ng2_xi[4]  = -0.788675134594813; Ng2_xi[5]  = 0.788675134594813;
  Ng2_xi[6]  = -0.211324865405187; Ng2_xi[7]  = 0.211324865405187;
  Ng2_xi[8]  = -0.211324865405187; Ng2_xi[9]  = 0.211324865405187;
  Ng2_xi[10] = -0.788675134594813; Ng2_xi[11] = 0.788675134594813;
  Ng2_xi[12] = -0.211324865405187; Ng2_xi[13] = 0.211324865405187;
  Ng2_xi[14] = -0.788675134594813; Ng2_xi[15] = 0.788675134594813;
  Ng2_xi[16] = -0.788675134594813; Ng2_xi[17] = -0.211324865405187;
  Ng2_xi[18] = 0.788675134594813;  Ng2_xi[19] = 0.211324865405187;
  Ng2_xi[20] = -0.211324865405187; Ng2_xi[21] = -0.788675134594813;
  Ng2_xi[22] = 0.211324865405187;  Ng2_xi[23] = 0.788675134594813;
  Ng2_xi[24] = -0.788675134594813; Ng2_xi[25] = -0.211324865405187;
  Ng2_xi[26] = 0.788675134594813;  Ng2_xi[27] = 0.211324865405187;
  Ng2_xi[28] = -0.211324865405187; Ng2_xi[29] = -0.788675134594813;
  Ng2_xi[30] = 0.211324865405187;  Ng2_xi[31] = 0.788675134594813;

  minf = 0.1;
  m2 = minf*minf;
  freq = 1;
  kappa = 1;
  nmode = 0;
  mfan = 1.0;

  // initial potential: uniform free stream, phim = minf * x
  double *phim = (double *)malloc(nnode*sizeof(double));
  memset(phim,0,nnode*sizeof(double));
  for (int i = 0;i<nnode;i++) {
    phim[i] = minf*xm[2*i];
  }

  double *K = (double *)malloc(4*4*ncell*sizeof(double));
  memset(K,0,4*4*ncell*sizeof(double));
  double *resm = (double *)malloc(nnode*sizeof(double));
  memset(resm,0,nnode*sizeof(double));
  double *V = (double *)malloc(nnode*sizeof(double));
  memset(V,0,nnode*sizeof(double));
  double *P = (double *)malloc(nnode*sizeof(double));
  memset(P,0,nnode*sizeof(double));
  double *U = (double *)malloc(nnode*sizeof(double));
  memset(U,0,nnode*sizeof(double));

  // declare sets, pointers, datasets and global constants
  op_set nodes  = op_decl_set(nnode, "nodes");
  op_set bnodes = op_decl_set(nbnodes, "bedges");
  op_set cells  = op_decl_set(ncell, "cells");

  op_map pbnodes = op_decl_map(bnodes,nodes,1,bnode, "pbedge");
  op_map pcell   = op_decl_map(cells, nodes,4,cell, "pcell");

  op_dat p_xm   = op_decl_dat(nodes ,2,"double",xm ,"p_x");
  op_dat p_phim = op_decl_dat(nodes, 1, "double", phim, "p_phim");
  op_dat p_resm = op_decl_dat(nodes, 1, "double", resm, "p_resm");
  op_dat p_K    = op_decl_dat(cells, 16, "double:soa", K, "p_K");
  op_dat p_V    = op_decl_dat(nodes, 1, "double", V, "p_V");
  op_dat p_P    = op_decl_dat(nodes, 1, "double", P, "p_P");
  op_dat p_U    = op_decl_dat(nodes, 1, "double", U, "p_U");

  op_decl_const(1,"double",&gam  );
  op_decl_const(1,"double",&gm1  );
  op_decl_const(1,"double",&gm1i );
  op_decl_const(1,"double",&m2   );
  op_decl_const(2,"double",wtg1  );
  op_decl_const(2,"double",xi1   );
  op_decl_const(4,"double",Ng1   );
  op_decl_const(4,"double",Ng1_xi );
  op_decl_const(4,"double",wtg2  );
  op_decl_const(16,"double",Ng2  );
  op_decl_const(32,"double",Ng2_xi );
  op_decl_const(1,"double",&minf );
  op_decl_const(1,"double",&freq );
  op_decl_const(1,"double",&kappa );
  op_decl_const(1,"double",&nmode );
  op_decl_const(1,"double",&mfan );

  op_diagnostic_output();

  op_partition("PTSCOTCH", "KWAY", cells, pcell, NULL);

  // main time-marching loop
  niter = 20;

  //initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  for(int iter=1; iter<=niter; iter++) {

    // assemble stiffness matrices and residual
    op_par_loop(res_calc,"res_calc",cells,
        op_arg_dat(p_xm, -4, pcell, 2,"double",OP_READ),
        op_arg_dat(p_phim, -4, pcell, 1,"double",OP_READ),
        op_arg_dat(p_K, -1, OP_ID, 16,"double:soa",OP_WRITE),
        op_arg_dat(p_resm, -4, pcell, 1,"double",OP_INC)
        );

    op_par_loop(dirichlet,"dirichlet",bnodes,
        op_arg_dat(p_resm, 0, pbnodes, 1,"double",OP_WRITE));

    double c1 = 0;
    double c2 = 0;
    double c3 = 0;
    double alpha = 0;
    double beta = 0;

    //c1 = R'*R;
    op_par_loop(init_cg, "init_cg", nodes,
        op_arg_dat(p_resm, -1, OP_ID, 1, "double", OP_READ),
        op_arg_gbl(&c1, 1, "double", OP_INC),
        op_arg_dat(p_U, -1, OP_ID, 1, "double", OP_WRITE),
        op_arg_dat(p_V, -1, OP_ID, 1, "double", OP_WRITE),
        op_arg_dat(p_P, -1, OP_ID, 1, "double", OP_WRITE));

    //set up stopping conditions
    double res0 = sqrt(c1);
    double res = res0;
    int inner_iter = 0;
    int maxiter = 200;
    while (res > 0.1*res0 && inner_iter < maxiter) {
      //V = Stiffness*P
      op_par_loop(spMV, "spMV", cells,
          op_arg_dat(p_V, -4, pcell, 1, "double", OP_INC),
          op_arg_dat(p_K, -1, OP_ID, 16, "double:soa", OP_READ),
          op_arg_dat(p_P, -4, pcell, 1, "double", OP_READ));

      op_par_loop(dirichlet,"dirichlet",bnodes,
          op_arg_dat(p_V, 0, pbnodes, 1,"double",OP_WRITE));

      c2 = 0;

      //c2 = P'*V;
      op_par_loop(dotPV, "dotPV", nodes,
          op_arg_dat(p_P, -1, OP_ID, 1, "double", OP_READ),
          op_arg_dat(p_V, -1, OP_ID, 1, "double", OP_READ),
          op_arg_gbl(&c2, 1, "double", OP_INC));

      alpha = c1/c2;

      //U = U + alpha*P;
      //resm = resm-alpha*V;
      op_par_loop(updateUR, "updateUR", nodes,
          op_arg_dat(p_U, -1, OP_ID, 1, "double", OP_INC),
          op_arg_dat(p_resm, -1, OP_ID, 1, "double", OP_INC),
          op_arg_dat(p_P, -1, OP_ID, 1, "double", OP_READ),
          op_arg_dat(p_V, -1, OP_ID, 1, "double", OP_RW),
          op_arg_gbl(&alpha, 1, "double", OP_READ));

      c3 = 0;

      //c3 = resm'*resm;
      op_par_loop(dotR, "dotR", nodes,
          op_arg_dat(p_resm, -1, OP_ID, 1, "double", OP_READ),
          op_arg_gbl(&c3, 1, "double", OP_INC));

      beta = c3/c1;

      //P = beta*P+resm;
      op_par_loop(updateP, "updateP", nodes,
          op_arg_dat(p_resm, -1, OP_ID, 1, "double", OP_READ),
          op_arg_dat(p_P, -1, OP_ID, 1, "double", OP_RW),
          op_arg_gbl(&beta, 1, "double", OP_READ));

      c1 = c3;
      res = sqrt(c1);
      inner_iter++;
    }

    rms = 0;

    //phim = phim - Stiffness\Load;
    op_par_loop(update, "update", nodes,
        op_arg_dat(p_phim, -1, OP_ID, 1, "double", OP_RW),
        op_arg_dat(p_resm, -1, OP_ID, 1, "double", OP_WRITE),
        op_arg_dat(p_U, -1, OP_ID, 1, "double", OP_READ),
        op_arg_gbl(&rms, 1, "double", OP_INC));

    op_printf("rms = %10.5e iter: %d\n", sqrt(rms)/sqrt(g_nnode), inner_iter);
  }

  op_timers(&cpu_t2, &wall_t2);
  op_timing_output();
  op_printf("Max total runtime = %f\n",wall_t2-wall_t1);
  op_exit();

  // NOTE(review): local arrays are intentionally(?) not freed at exit —
  // the frees below were commented out; confirm before re-enabling
  /*free(cell);
  free(bnode);
  free(xm);
  free(phim);
  free(K);
  free(resm);
  free(V);
  free(P);
  free(U);*/
}
/*
 * Airfoil (OP2, MPI) driver: reads the global grid on the root rank,
 * scatters sets/maps/data across ranks, declares OP2 sets, maps, dats and
 * constants, partitions with PT-Scotch, then runs `niter` time-marching
 * iterations of the predictor/corrector update, printing the residual RMS
 * every 100 iterations.
 */
int main(int argc, char **argv) {
  // OP2 initialisation (diagnostics level 2)
  op_init(argc,argv,2);

  // MPI rank/size for user-level I/O and partitioning of the input file
  int my_rank;
  int comm_size;
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &comm_size);

  // timers (wall-clock is what gets reported)
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  // per-rank (local) mapping tables and data arrays
  int    *becell, *ecell,  *bound, *bedge, *edge, *cell;
  double *x, *q, *qold, *adt, *res;

  // per-rank (local) set sizes and iteration count
  int    nnode,ncell,nedge,nbedge,niter;
  double rms;

  /**------------------------BEGIN I/O and PARTITIONING -------------------**/

  op_timers(&cpu_t1, &wall_t1);

  /* read in grid from disk on root processor
   * NOTE: every rank opens the file and reads the 4-int header (so all
   * ranks know the global sizes); only the root reads the bulk data below. */
  FILE *fp;
  if ( (fp = fopen("new_grid.dat","r")) == NULL) {
    op_printf("can't open file new_grid.dat\n"); exit(-1);
  }

  int   g_nnode,g_ncell,g_nedge,g_nbedge;
  check_scan(fscanf(fp,"%d %d %d %d \n",&g_nnode, &g_ncell, &g_nedge, &g_nbedge), 4);

  // global arrays: allocated and filled on the root rank only; NULL elsewhere
  int    *g_becell = 0, *g_ecell = 0, *g_bound = 0, *g_bedge = 0, *g_edge = 0, *g_cell = 0;
  double *g_x = 0,*g_q = 0, *g_qold = 0, *g_adt = 0, *g_res = 0;

  // set flow constants (gamma, CFL, smoothing) and the far-field state qinf
  op_printf("initialising flow field\n");
  gam = 1.4f;
  gm1 = gam - 1.0f;
  cfl = 0.9f;
  eps = 0.05f;

  double mach  = 0.4f;
  double alpha = 3.0f*atan(1.0f)/45.0f; // 3 degrees in radians (atan(1) = pi/4)
  double p     = 1.0f;
  double r     = 1.0f;
  double u     = sqrt(gam*p/r)*mach;    // free-stream speed from Mach number
  double e     = p/(r*gm1) + 0.5f*u*u;  // total energy per unit mass

  // conservative free-stream state: (rho, rho*u, rho*v, rho*E)
  qinf[0] = r;
  qinf[1] = r*u;
  qinf[2] = 0.0f;
  qinf[3] = r*e;

  op_printf("reading in grid \n");
  op_printf("Global number of nodes, cells, edges, bedges = %d, %d, %d, %d\n"
            ,g_nnode,g_ncell,g_nedge,g_nbedge);

  // NOTE(review): MPI_ROOT is presumably redefined to 0 by the OP2 headers;
  // the standard MPI constant of that name is not a valid rank — confirm.
  if(my_rank == MPI_ROOT) {
    g_cell   = (int *) malloc(4*g_ncell*sizeof(int));
    g_edge   = (int *) malloc(2*g_nedge*sizeof(int));
    g_ecell  = (int *) malloc(2*g_nedge*sizeof(int));
    g_bedge  = (int *) malloc(2*g_nbedge*sizeof(int));
    g_becell = (int *) malloc(  g_nbedge*sizeof(int));
    g_bound  = (int *) malloc(  g_nbedge*sizeof(int));

    g_x      = (double *) malloc(2*g_nnode*sizeof(double));
    g_q      = (double *) malloc(4*g_ncell*sizeof(double));
    g_qold   = (double *) malloc(4*g_ncell*sizeof(double));
    g_res    = (double *) malloc(4*g_ncell*sizeof(double));
    g_adt    = (double *) malloc(  g_ncell*sizeof(double));

    // node coordinates (x,y per node)
    for (int n=0; n<g_nnode; n++){
      check_scan(fscanf(fp,"%lf %lf \n",&g_x[2*n], &g_x[2*n+1]), 2);
    }

    // cell -> node connectivity (4 nodes per quad cell)
    for (int n=0; n<g_ncell; n++) {
      check_scan(fscanf(fp,"%d %d %d %d \n",&g_cell[4*n  ], &g_cell[4*n+1],
                                            &g_cell[4*n+2], &g_cell[4*n+3]), 4);
    }

    // edge -> node and edge -> cell connectivity
    for (int n=0; n<g_nedge; n++) {
      check_scan(fscanf(fp,"%d %d %d %d \n",&g_edge[2*n],&g_edge[2*n+1],
                                            &g_ecell[2*n],&g_ecell[2*n+1]), 4);
    }

    // boundary edge -> node, -> cell, and boundary-condition flag
    for (int n=0; n<g_nbedge; n++) {
      check_scan(fscanf(fp,"%d %d %d %d \n",&g_bedge[2*n],&g_bedge[2*n+1],
                                            &g_becell[n],&g_bound[n]), 4);
    }

    //initialise flow field to free-stream and zero the residual
    for (int n=0; n<g_ncell; n++) {
      for (int m=0; m<4; m++) {
        g_q[4*n+m]   = qinf[m];
        g_res[4*n+m] = 0.0f;
      }
    }
  }

  fclose(fp);

  // local (per-rank) contiguous-block sizes for the initial trivial partition
  nnode  = compute_local_size (g_nnode, comm_size, my_rank);
  ncell  = compute_local_size (g_ncell, comm_size, my_rank);
  nedge  = compute_local_size (g_nedge, comm_size, my_rank);
  nbedge = compute_local_size (g_nbedge, comm_size, my_rank);

  op_printf("Number of nodes, cells, edges, bedges on process %d = %d, %d, %d, %d\n"
            ,my_rank,nnode,ncell,nedge,nbedge);

  /*Allocate memory to hold local sets, mapping tables and data*/
  cell   = (int *) malloc(4*ncell*sizeof(int));
  edge   = (int *) malloc(2*nedge*sizeof(int));
  ecell  = (int *) malloc(2*nedge*sizeof(int));
  bedge  = (int *) malloc(2*nbedge*sizeof(int));
  becell = (int *) malloc(  nbedge*sizeof(int));
  bound  = (int *) malloc(  nbedge*sizeof(int));

  x      = (double *) malloc(2*nnode*sizeof(double));
  q      = (double *) malloc(4*ncell*sizeof(double));
  qold   = (double *) malloc(4*ncell*sizeof(double));
  res    = (double *) malloc(4*ncell*sizeof(double));
  adt    = (double *) malloc(  ncell*sizeof(double));

  /* scatter sets, mappings and data on sets from root to all ranks */
  scatter_int_array(g_cell,   cell,   comm_size, g_ncell, ncell, 4);
  scatter_int_array(g_edge,   edge,   comm_size, g_nedge, nedge, 2);
  scatter_int_array(g_ecell,  ecell,  comm_size, g_nedge, nedge, 2);
  scatter_int_array(g_bedge,  bedge,  comm_size, g_nbedge,nbedge, 2);
  scatter_int_array(g_becell, becell, comm_size, g_nbedge,nbedge, 1);
  scatter_int_array(g_bound,  bound,  comm_size, g_nbedge,nbedge, 1);

  scatter_double_array(g_x,    x,    comm_size, g_nnode,nnode, 2);
  scatter_double_array(g_q,    q,    comm_size, g_ncell,ncell, 4);
  scatter_double_array(g_qold, qold, comm_size, g_ncell,ncell, 4);
  scatter_double_array(g_res,  res,  comm_size, g_ncell,ncell, 4);
  scatter_double_array(g_adt,  adt,  comm_size, g_ncell,ncell, 1);

  /*Freeing memory allocated to gloabal arrays on rank 0
    after scattering to all processes*/
  if(my_rank == MPI_ROOT) {
    free(g_cell);
    free(g_edge);
    free(g_ecell);
    free(g_bedge);
    free(g_becell);
    free(g_bound);
    free(g_x );
    free(g_q);
    free(g_qold);
    free(g_adt);
    free(g_res);
  }

  op_timers(&cpu_t2, &wall_t2);
  op_printf("Max total file read time = %f\n", wall_t2-wall_t1);

  /**------------------------END I/O and PARTITIONING -----------------------**/

  // declare sets, pointers, datasets and global constants

  op_set nodes  = op_decl_set(nnode,  "nodes");
  op_set edges  = op_decl_set(nedge,  "edges");
  op_set bedges = op_decl_set(nbedge, "bedges");
  op_set cells  = op_decl_set(ncell,  "cells");

  op_map pedge   = op_decl_map(edges, nodes,2,edge,  "pedge");
  op_map pecell  = op_decl_map(edges, cells,2,ecell, "pecell");
  op_map pbedge  = op_decl_map(bedges,nodes,2,bedge, "pbedge");
  op_map pbecell = op_decl_map(bedges,cells,1,becell,"pbecell");
  op_map pcell   = op_decl_map(cells, nodes,4,cell,  "pcell");

  op_dat p_bound = op_decl_dat(bedges,1,"int"   ,bound,"p_bound");
  op_dat p_x     = op_decl_dat(nodes ,2,"double",x    ,"p_x");
  op_dat p_q     = op_decl_dat(cells ,4,"double",q    ,"p_q");
  //op_dat p_qold  = op_decl_dat(cells ,4,"double",qold ,"p_qold");
  //op_dat p_adt   = op_decl_dat(cells ,1,"double",adt  ,"p_adt");
  //op_dat p_res   = op_decl_dat(cells ,4,"double",res  ,"p_res");

  // p_res, p_adt and p_qold now declared as a temp op_dats during
  // the execution of the time-marching loop

  op_decl_const2("gam",1,"double",&gam  );
  op_decl_const2("gm1",1,"double",&gm1  );
  op_decl_const2("cfl",1,"double",&cfl  );
  op_decl_const2("eps",1,"double",&eps  );
  op_decl_const2("mach",1,"double",&mach );
  op_decl_const2("alpha",1,"double",&alpha);
  op_decl_const2("qinf",4,"double",qinf  );

  op_diagnostic_output();

  //trigger partitioning and halo creation routines
  op_partition("PTSCOTCH", "KWAY", cells, pecell, p_x);

  //initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  niter = 1000;
  for(int iter=1; iter<=niter; iter++) {

    // temp dats are (re)declared each iteration and freed at its end;
    // p_res/p_adt get fresh (unnamed) storage, p_qold wraps the local qold
    double* tmp_elem = NULL;
    op_dat p_res  = op_decl_dat_temp(cells ,4,"double",tmp_elem,"p_res");
    op_dat p_adt  = op_decl_dat_temp(cells ,1,"double",tmp_elem,"p_adt");
    op_dat p_qold = op_decl_dat_temp(cells ,4,"double",qold ,"p_qold");

    //save old flow solution
    op_par_loop_save_soln("save_soln",cells,
        op_arg_dat(p_q,-1,OP_ID,4,"double",OP_READ),
        op_arg_dat(p_qold,-1,OP_ID,4,"double",OP_WRITE));

    // predictor/corrector update loop
    for(int k=0; k<2; k++) {

      // calculate local area/timestep for each cell
      op_par_loop_adt_calc("adt_calc",cells,
          op_arg_dat(p_x,0,pcell,2,"double",OP_READ),
          op_arg_dat(p_x,1,pcell,2,"double",OP_READ),
          op_arg_dat(p_x,2,pcell,2,"double",OP_READ),
          op_arg_dat(p_x,3,pcell,2,"double",OP_READ),
          op_arg_dat(p_q,-1,OP_ID,4,"double",OP_READ),
          op_arg_dat(p_adt,-1,OP_ID,1,"double",OP_WRITE));

      // calculate flux residual over interior edges
      op_par_loop_res_calc("res_calc",edges,
          op_arg_dat(p_x,0,pedge,2,"double",OP_READ),
          op_arg_dat(p_x,1,pedge,2,"double",OP_READ),
          op_arg_dat(p_q,0,pecell,4,"double",OP_READ),
          op_arg_dat(p_q,1,pecell,4,"double",OP_READ),
          op_arg_dat(p_adt,0,pecell,1,"double",OP_READ),
          op_arg_dat(p_adt,1,pecell,1,"double",OP_READ),
          op_arg_dat(p_res,0,pecell,4,"double",OP_INC),
          op_arg_dat(p_res,1,pecell,4,"double",OP_INC));

      // boundary-edge contribution to the residual
      op_par_loop_bres_calc("bres_calc",bedges,
          op_arg_dat(p_x,0,pbedge,2,"double",OP_READ),
          op_arg_dat(p_x,1,pbedge,2,"double",OP_READ),
          op_arg_dat(p_q,0,pbecell,4,"double",OP_READ),
          op_arg_dat(p_adt,0,pbecell,1,"double",OP_READ),
          op_arg_dat(p_res,0,pbecell,4,"double",OP_INC),
          op_arg_dat(p_bound,-1,OP_ID,1,"int",OP_READ));

      // update flow field; rms accumulates the squared update globally
      rms = 0.0;

      op_par_loop_update("update",cells,
          op_arg_dat(p_qold,-1,OP_ID,4,"double",OP_READ),
          op_arg_dat(p_q,-1,OP_ID,4,"double",OP_WRITE),
          op_arg_dat(p_res,-1,OP_ID,4,"double",OP_RW),
          op_arg_dat(p_adt,-1,OP_ID,1,"double",OP_READ),
          op_arg_gbl(&rms,1,"double",OP_INC));
    }

    //print iteration history (normalised by the GLOBAL cell count)
    rms = sqrt(rms/(double) g_ncell);
    if (iter%100 == 0)
      op_printf("%d %10.5e \n",iter,rms);

    // release this iteration's temporaries
    if (op_free_dat_temp(p_res) < 0)
      op_printf("Error: temporary op_dat %s cannot be removed\n",p_res->name);
    if (op_free_dat_temp(p_adt) < 0)
      op_printf("Error: temporary op_dat %s cannot be removed\n",p_adt->name);
    if (op_free_dat_temp(p_qold) < 0)
      op_printf("Error: temporary op_dat %s cannot be removed\n",p_qold->name);
  }

  op_timers(&cpu_t2, &wall_t2);
  op_timing_output();

  //print total time for niter interations
  op_printf("Max total runtime = %f\n",wall_t2-wall_t1);
  op_exit();

  free(cell);
  free(edge);
  free(ecell);
  free(bedge);
  free(becell);
  free(bound);
  free(x);
  free(q);
  free(qold);
  free(res);
  free(adt);
}
void op_write_hdf5(char const * file_name) { printf("Writing to %s\n",file_name); //declare timers double cpu_t1, cpu_t2, wall_t1, wall_t2; double time; double max_time; op_timers(&cpu_t1, &wall_t1); //timer start for hdf5 file write //create new communicator int my_rank, comm_size; MPI_Comm_dup(MPI_COMM_WORLD, &OP_MPI_HDF5_WORLD); MPI_Comm_rank(OP_MPI_HDF5_WORLD, &my_rank); MPI_Comm_size(OP_MPI_HDF5_WORLD, &comm_size); //MPI variables MPI_Info info = MPI_INFO_NULL; //HDF5 APIs definitions hid_t file_id; //file identifier hid_t plist_id; //property list identifier hid_t dset_id = 0; //dataset identifier hid_t dataspace; //data space identifier hid_t memspace; //memory space identifier hsize_t dimsf[2]; // dataset dimensions hsize_t count[2]; //hyperslab selection parameters hsize_t offset[2]; //Set up file access property list with parallel I/O access plist_id = H5Pcreate(H5P_FILE_ACCESS); H5Pset_fapl_mpio(plist_id, OP_MPI_HDF5_WORLD, info); //Create a new file collectively and release property list identifier. file_id = H5Fcreate(file_name, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); H5Pclose(plist_id); /*loop over all the op_sets and write them to file*/ for(int s=0; s<OP_set_index; s++) { op_set set=OP_set_list[s]; //Create the dataspace for the dataset. hsize_t dimsf_set[] = {1}; dataspace = H5Screate_simple(1, dimsf_set, NULL); //Create the dataset with default properties and close dataspace. dset_id = H5Dcreate(file_id, set->name, H5T_NATIVE_INT, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); //Create property list for collective dataset write. 
plist_id = H5Pcreate(H5P_DATASET_XFER); H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); int size = 0; int* sizes = (int *)xmalloc(sizeof(int)*comm_size); MPI_Allgather(&set->size, 1, MPI_INT, sizes, 1, MPI_INT, OP_MPI_HDF5_WORLD); for(int i = 0; i<comm_size; i++)size = size + sizes[i]; //write data H5Dwrite(dset_id, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, plist_id, &size); H5Sclose(dataspace); H5Pclose(plist_id); H5Dclose(dset_id); } /*loop over all the op_maps and write them to file*/ for(int m=0; m<OP_map_index; m++) { op_map map=OP_map_list[m]; //find total size of map int* sizes = (int *)xmalloc(sizeof(int)*comm_size); int g_size = 0; MPI_Allgather(&map->from->size, 1, MPI_INT, sizes, 1, MPI_INT, OP_MPI_HDF5_WORLD); for(int i = 0; i<comm_size; i++)g_size = g_size + sizes[i]; //Create the dataspace for the dataset. dimsf[0] = g_size; dimsf[1] = map->dim; dataspace = H5Screate_simple(2, dimsf, NULL); //Create the dataset with default properties and close dataspace. if(sizeof(map->map[0]) == sizeof(int)) dset_id = H5Dcreate(file_id, map->name, H5T_NATIVE_INT, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); else if(sizeof(map->map[0]) == sizeof(long)) dset_id = H5Dcreate(file_id, map->name, H5T_NATIVE_LONG, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); else if(sizeof(map->map[0]) == sizeof(long long)) dset_id = H5Dcreate(file_id, map->name, H5T_NATIVE_LLONG, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); H5Sclose(dataspace); //Each process defines dataset in memory and writes it to a hyperslab //in the file. int disp = 0; for(int i = 0; i<my_rank; i++)disp = disp + sizes[i]; count[0] = map->from->size; count[1] = dimsf[1]; offset[0] = disp; offset[1] = 0; memspace = H5Screate_simple(2, count, NULL); //Select hyperslab in the file. dataspace = H5Dget_space(dset_id); H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, offset, NULL, count, NULL); //Create property list for collective dataset write. 
plist_id = H5Pcreate(H5P_DATASET_XFER); H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); //write data if(sizeof(map->map[0]) == sizeof(int)) H5Dwrite(dset_id, H5T_NATIVE_INT, memspace, dataspace, plist_id, map->map); else if(sizeof(map->map[0]) == sizeof(long)) H5Dwrite(dset_id, H5T_NATIVE_LONG, memspace, dataspace, plist_id, map->map); else if(sizeof(map->map[0]) == sizeof(long long)) H5Dwrite(dset_id, H5T_NATIVE_LLONG, memspace, dataspace, plist_id, map->map); H5Pclose(plist_id); H5Sclose(memspace); H5Sclose(dataspace); H5Dclose(dset_id); free(sizes); /*attach attributes to map*/ //open existing data set dset_id = H5Dopen(file_id, map->name, H5P_DEFAULT); //create the data space for the attribute hsize_t dims = 1; dataspace = H5Screate_simple(1, &dims, NULL); //Create an int attribute - size hid_t attribute = H5Acreate(dset_id, "size", H5T_NATIVE_INT, dataspace, H5P_DEFAULT, H5P_DEFAULT); //Write the attribute data. H5Awrite(attribute, H5T_NATIVE_INT, &g_size); //Close the attribute. H5Aclose(attribute); //Create an int attribute - dimension attribute = H5Acreate(dset_id, "dim", H5T_NATIVE_INT, dataspace, H5P_DEFAULT, H5P_DEFAULT); //Write the attribute data. H5Awrite(attribute, H5T_NATIVE_INT, &map->dim); //Close the attribute. H5Aclose(attribute); H5Sclose(dataspace); //Create an string attribute - type dataspace= H5Screate(H5S_SCALAR); hid_t atype = H5Tcopy(H5T_C_S1); H5Tset_size(atype, 10); attribute = H5Acreate(dset_id, "type", atype, dataspace, H5P_DEFAULT, H5P_DEFAULT); if(sizeof(map->map[0]) == sizeof(int)) H5Awrite(attribute, atype, "int"); if(sizeof(map->map[0]) == sizeof(long)) H5Awrite(attribute, atype, "long"); if(sizeof(map->map[0]) == sizeof(long long)) H5Awrite(attribute, atype, "long long"); H5Aclose(attribute); //Close the dataspace H5Sclose(dataspace); //Close to the dataset. 
H5Dclose(dset_id); } /*loop over all the op_dats and write them to file*/ for(int d=0; d<OP_dat_index; d++) { op_dat dat=OP_dat_list[d]; //find total size of map int* sizes = (int *)xmalloc(sizeof(int)*comm_size); int g_size = 0; MPI_Allgather(&dat->set->size, 1, MPI_INT, sizes, 1, MPI_INT, OP_MPI_HDF5_WORLD); for(int i = 0; i<comm_size; i++)g_size = g_size + sizes[i]; //Create the dataspace for the dataset. dimsf[0] = g_size; dimsf[1] = dat->dim; dataspace = H5Screate_simple(2, dimsf, NULL); //Create the dataset with default properties and close dataspace. if(strcmp(dat->type,"double")==0) dset_id = H5Dcreate(file_id, dat->name, H5T_NATIVE_DOUBLE, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); else if(strcmp(dat->type,"float")==0) dset_id = H5Dcreate(file_id, dat->name, H5T_NATIVE_FLOAT, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); else if(strcmp(dat->type,"int")==0) dset_id = H5Dcreate(file_id, dat->name, H5T_NATIVE_INT, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); else printf("Unknown type\n"); H5Sclose(dataspace); //Each process defines dataset in memory and writes it to a hyperslab //in the file. int disp = 0; for(int i = 0; i<my_rank; i++)disp = disp + sizes[i]; count[0] = dat->set->size; count[1] = dimsf[1]; offset[0] = disp; offset[1] = 0; memspace = H5Screate_simple(2, count, NULL); //Select hyperslab in the file. dataspace = H5Dget_space(dset_id); H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, offset, NULL, count, NULL); //Create property list for collective dataset write. 
plist_id = H5Pcreate(H5P_DATASET_XFER); H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); //write data if(strcmp(dat->type,"double") == 0) H5Dwrite(dset_id, H5T_NATIVE_DOUBLE, memspace, dataspace, plist_id, dat->data); else if(strcmp(dat->type,"float") == 0) H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, dataspace, plist_id, dat->data); else if(strcmp(dat->type,"int") == 0) H5Dwrite(dset_id, H5T_NATIVE_INT, memspace, dataspace, plist_id, dat->data); else printf("Unknown type\n"); H5Pclose(plist_id); H5Sclose(memspace); H5Sclose(dataspace); H5Dclose(dset_id); free(sizes); /*attach attributes to dat*/ //open existing data set dset_id = H5Dopen(file_id, dat->name, H5P_DEFAULT); //create the data space for the attribute hsize_t dims = 1; dataspace = H5Screate_simple(1, &dims, NULL); //Create an int attribute - size hid_t attribute = H5Acreate(dset_id, "size", H5T_NATIVE_INT, dataspace, H5P_DEFAULT, H5P_DEFAULT); //Write the attribute data. H5Awrite(attribute, H5T_NATIVE_INT, &dat->size); //Close the attribute. H5Aclose(attribute); //Create an int attribute - dimension attribute = H5Acreate(dset_id, "dim", H5T_NATIVE_INT, dataspace, H5P_DEFAULT, H5P_DEFAULT); //Write the attribute data. H5Awrite(attribute, H5T_NATIVE_INT, &dat->dim); H5Aclose(attribute); H5Sclose(dataspace); //Create an string attribute - type dataspace= H5Screate(H5S_SCALAR); hid_t atype = H5Tcopy(H5T_C_S1); H5Tset_size(atype, 10); attribute = H5Acreate(dset_id, "type", atype, dataspace, H5P_DEFAULT, H5P_DEFAULT); H5Awrite(attribute, atype, dat->type); H5Aclose(attribute); //Close the dataspace. 
H5Sclose(dataspace); H5Dclose(dset_id); } H5Fclose(file_id); op_timers(&cpu_t2, &wall_t2); //timer stop for hdf5 file write //compute import/export lists creation time time = wall_t2-wall_t1; MPI_Reduce(&time, &max_time, 1, MPI_DOUBLE, MPI_MAX, MPI_ROOT, OP_MPI_HDF5_WORLD); //print performance results if(my_rank == MPI_ROOT) { printf("Max hdf5 file write time = %lf\n\n",max_time); } MPI_Comm_free(&OP_MPI_HDF5_WORLD); }