int PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
                                const int first_gpu, const int last_gpu,
                                const int gpu_mode, const double p_split,
                                const int nthreads, const int t_per_atom) {
  _nthreads=nthreads;
#ifdef _OPENMP
  omp_set_num_threads(nthreads);
#endif
  _threads_per_atom=t_per_atom;
  _threads_per_charge=t_per_atom;

  if (_device_init)
    return 0;
  _device_init=true;
  _comm_world=world;
  _comm_replica=replica;
  _first_device=first_gpu;
  _last_device=last_gpu;
  _gpu_mode=gpu_mode;
  _particle_split=p_split;

  // Get the rank/size within the world
  MPI_Comm_rank(_comm_world,&_world_me);
  MPI_Comm_size(_comm_world,&_world_size);
  // Get the rank/size within the replica
  MPI_Comm_rank(_comm_replica,&_replica_me);
  MPI_Comm_size(_comm_replica,&_replica_size);

  // Get the names of all nodes
  int name_length;
  char node_name[MPI_MAX_PROCESSOR_NAME];
  char node_names[MPI_MAX_PROCESSOR_NAME*_world_size];
  MPI_Get_processor_name(node_name,&name_length);
  MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names,
                MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world);
  std::string node_string=std::string(node_name);

  // Get the number of procs per node
  std::map<std::string,int> name_map;
  std::map<std::string,int>::iterator np;
  for (int i=0; i<_world_size; i++) {
    std::string i_string=std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]);
    np=name_map.find(i_string);
    if (np==name_map.end())
      name_map[i_string]=1;
    else
      np->second++;
  }
  int procs_per_node=name_map.begin()->second;

  // Assign a unique id to each node
  int split_num=0, split_id=0;
  for (np=name_map.begin(); np!=name_map.end(); ++np) {
    if (np->first==node_string)
      split_id=split_num;
    split_num++;
  }

  // Set up a per node communicator and find rank within
  MPI_Comm node_comm;
  MPI_Comm_split(_comm_world, split_id, 0, &node_comm);
  int node_rank;
  MPI_Comm_rank(node_comm,&node_rank);

  // Set the device ID
  _procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/
                                       (last_gpu-first_gpu+1)));
  int my_gpu=node_rank/_procs_per_gpu+first_gpu;

  // Time on the device only if 1 proc per gpu
  _time_device=true;
  if (_procs_per_gpu>1)
    _time_device=false;

  // Set up a per device communicator
  MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
  MPI_Comm_rank(_comm_gpu,&_gpu_rank);

  gpu=new UCL_Device();
  if (my_gpu>=gpu->num_devices())
    return -2;
  gpu->set(my_gpu);

  _long_range_precompute=0;

  int flag=compile_kernels();
  return flag;
}
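/* A minimal sketch (editorial addition, not part of the library above) of the
   rank-to-GPU assignment performed by init_device(): the ranks on a node are
   grouped into contiguous blocks of procs_per_gpu, so with 8 ranks per node
   and GPUs 0..1, node ranks 0-3 share GPU 0 and node ranks 4-7 share GPU 1.
   The helper name map_rank_to_gpu is hypothetical. */
#include <cmath>

static int map_rank_to_gpu(int node_rank, int procs_per_node,
                           int first_gpu, int last_gpu) {
  int ngpus = last_gpu - first_gpu + 1;
  // Same ceiling division as in init_device()
  int procs_per_gpu =
      static_cast<int>(std::ceil(static_cast<double>(procs_per_node) / ngpus));
  return node_rank / procs_per_gpu + first_gpu;  // block assignment of ranks
}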
int main(int argc, char **argv) {
  // Common miniMD settings
  In in;
  in.datafile = NULL;
  int me = 0;                  // local MPI rank
  int nprocs = 1;              // number of MPI ranks
  int num_threads = 32;        // number of threads per block
  int num_steps = -1;          // number of timesteps (if -1 use value from lj.in)
  int system_size = -1;        // size of the system (if -1 use value from lj.in)
  int check_safeexchange = 0;  // if 1 complain if atom moves further than 1 subdomain length between exchanges
  int do_safeexchange = 0;     // if 1 use safe exchange mode (allows exchange over multiple subdomains)
  int use_sse = 0;             // setting for SSE variant of miniMD only
  int screen_yaml = 0;         // print yaml output to screen also
  int yaml_output = 0;         // print yaml output
  int halfneigh = 0;           // 1: use half neighborlist; 0: use full neighborlist;
                               // -1: use original miniMD version half neighborlist force
  char* input_file = NULL;
  int ghost_newton = 0;
  int skip_gpu = 999;
  int neighbor_size = -1;

  // OpenCL specific
  int platform = 0;
  int device = 0;
  int subdevice = -1;
  int ppn = 2;
  int use_tex = 0;
  int threads_per_atom = 1;
  int map_device = 0;

  // First pass over the arguments: device selection options that must be
  // known before the OpenCL wrapper is initialized
  for(int i = 0; i < argc; i++) {
    if((strcmp(argv[i], "-i") == 0) || (strcmp(argv[i], "--input_file") == 0)) {
      input_file = argv[++i];
      continue;
    }

    if((strcmp(argv[i], "-p") == 0) || (strcmp(argv[i], "--platform") == 0)) {platform = atoi(argv[++i]); continue;}
    if((strcmp(argv[i], "-d") == 0) || (strcmp(argv[i], "--device") == 0)) {device = atoi(argv[++i]); continue;}
    if((strcmp(argv[i], "-sd") == 0) || (strcmp(argv[i], "--subdevice") == 0)) {subdevice = atoi(argv[++i]); continue;}
    // note: this branch runs before MPI_Init, so me is still 0 on every rank here
    if((strcmp(argv[i], "-sd_map") == 0) || (strcmp(argv[i], "--subdevice_mapping") == 0)) {subdevice = 1 - me % ppn; continue;}
    if((strcmp(argv[i], "-ng") == 0) || (strcmp(argv[i], "--num_gpus") == 0)) {ppn = atoi(argv[++i]); continue;}
    if((strcmp(argv[i], "-dm") == 0) || (strcmp(argv[i], "--device_map") == 0)) {map_device = 1; continue;}
  }

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  if(map_device) {
    device = me % ppn;
    if(device >= skip_gpu) device++;
  }

  OpenCLWrapper* opencl = new OpenCLWrapper;

  if(me == 0) printf("# Platforms: %i\n", opencl->num_platforms);

  printf("# Proc: %i using device %i\n", me, device);
  opencl->Init(argc, argv, device, device + 1, NULL, platform, subdevice);

  int error = 0;

  if(input_file == NULL)
    error = input(in, "in.lj.miniMD");
  else
    error = input(in, input_file);

  if(error) {
    MPI_Finalize();
    exit(0);
  }

  // Second pass: simulation options
  for(int i = 0; i < argc; i++) {
    if((strcmp(argv[i], "-t") == 0) || (strcmp(argv[i], "--num_threads") == 0)) {num_threads = atoi(argv[++i]); continue;}
    if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {num_steps = atoi(argv[++i]); continue;}
    if((strcmp(argv[i], "-s") == 0) || (strcmp(argv[i], "--size") == 0)) {system_size = atoi(argv[++i]); continue;}
    if((strcmp(argv[i], "--half_neigh") == 0)) {halfneigh = atoi(argv[++i]); continue;}
    if((strcmp(argv[i], "-sse") == 0)) {use_sse = atoi(argv[++i]); continue;}
    if((strcmp(argv[i], "--check_exchange") == 0)) {check_safeexchange = 1; continue;}
    if((strcmp(argv[i], "-o") == 0) || (strcmp(argv[i], "--yaml_output") == 0)) {yaml_output = atoi(argv[++i]); continue;}
    if((strcmp(argv[i], "--yaml_screen") == 0)) {screen_yaml = 1; continue;}

    if((strcmp(argv[i], "-f") == 0) || (strcmp(argv[i], "--data_file") == 0)) {
      if(in.datafile == NULL) in.datafile = new char[1000];
      strcpy(in.datafile, argv[++i]);
      continue;
    }

    if((strcmp(argv[i], "-u") == 0) || (strcmp(argv[i], "--units") == 0)) {
      in.units = strcmp(argv[++i], "metal") == 0 ? 1 : 0;
      continue;
    }

    if((strcmp(argv[i], "-p") == 0) || (strcmp(argv[i], "--force") == 0)) {
      in.forcetype = strcmp(argv[++i], "eam") == 0 ? FORCEEAM : FORCELJ;
      continue;
    }

    if((strcmp(argv[i], "-gn") == 0) || (strcmp(argv[i], "--ghost_newton") == 0)) {
      ghost_newton = atoi(argv[++i]);
      continue;
    }

    if((strcmp(argv[i], "--skip_gpu") == 0)) {
      skip_gpu = atoi(argv[++i]);
      continue;
    }

    if((strcmp(argv[i], "-b") == 0) || (strcmp(argv[i], "--neigh_bins") == 0)) {
      neighbor_size = atoi(argv[++i]);
      continue;
    }

    if((strcmp(argv[i], "-tex") == 0) || (strcmp(argv[i], "--texture") == 0)) {use_tex = atoi(argv[++i]); continue;}
    if((strcmp(argv[i], "-tpa") == 0)) {threads_per_atom = atoi(argv[++i]); continue;}

    if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
      printf("\n---------------------------------------------------------\n");
      printf("-------------" VARIANT_STRING "------------\n");
      printf("---------------------------------------------------------\n\n");

      printf("miniMD is a simple, parallel molecular dynamics (MD) code,\n"
             "which is part of the Mantevo project at Sandia National\n"
             "Laboratories ( http://www.mantevo.org ).\n"
             "The original authors of the MPI-based miniMD are Steve Plimpton ([email protected])\n"
             "and Paul Crozier ([email protected]), with current versions\n"
             "written by Christian Trott ([email protected]).\n\n");
      printf("Commandline Options:\n");
      printf("\n Execution configuration:\n");
      printf("\t-t / --num_threads <threads>: set number of threads per block (default 32)\n");
      printf("\t--half_neigh <int>: use half neighborlists (default 0)\n"
             "\t 0: full neighborlist\n"
             "\t 1: half neighborlist (not supported in OpenCL variant)\n"
             "\t -1: original miniMD half neighborlist force\n"
             "\t (not supported in OpenCL variant)\n");
      printf("\t-d / --device <int>: select device (default 0)\n");
      printf("\t-dm / --device_map: map devices to MPI ranks\n");
      printf("\t-ng / --num_gpus <int>: give number of GPUs per node (used in conjunction with -dm\n"
             "\t to determine device id: 'id=mpi_rank%%ng') (default 2)\n");
      printf("\t--skip_gpu <int>: skip the specified gpu when assigning devices to MPI ranks\n"
             "\t used in conjunction with -dm (but must come first in arg list)\n");
      printf("\t-p / --platform <int>: select platform (default 0)\n");
      printf("\t-sse <sse_version>: use explicit sse intrinsics (use miniMD-SSE variant)\n");
      printf("\t-gn / --ghost_newton <int>: set usage of Newton's third law for ghost atoms\n"
             "\t (only applicable with half neighborlists)\n");
      printf("\n Simulation setup:\n");
      printf("\t-i / --input_file <string>: set input file to be used (default: in.lj.miniMD)\n");
      printf("\t-n / --nsteps <nsteps>: set number of timesteps for simulation\n");
      printf("\t-s / --size <size>: set linear dimension of system box and neighbor bins\n");
      printf("\t-b / --neigh_bins <int>: set linear dimension of neighbor bin grid\n");
      printf("\t-u / --units <string>: set units (lj or metal), see LAMMPS documentation\n");
      printf("\t-p / --force <string>: set interaction model (lj or eam)\n");
      printf("\t-f / --data_file <string>: read configuration from LAMMPS data file\n");
      printf("\n Miscellaneous:\n");
      printf("\t--check_exchange: check whether atoms moved further than subdomain width\n");
      printf("\t--safe_exchange: perform exchange communication with all MPI processes\n"
             "\t within rcut_neighbor (outer force cutoff)\n");
      printf("\t--yaml_output <int>: level of yaml output (default 0)\n");
      printf("\t--yaml_screen: write yaml output also to screen\n");
      printf("\t-tex / --texture <int>: use texture cache in force kernel (default 0)\n");
      printf("\t-h / --help: display this help message\n\n");
      printf("---------------------------------------------------------\n\n");
      exit(0);
    }
  }

  Atom atom;
  Force force;
  Neighbor neighbor;
  Integrate integrate;
  Thermo thermo;
  Comm comm;
  Timer timer;
  ThreadData threads;

  if(in.forcetype == FORCEEAM) {
    printf("ERROR: " VARIANT_STRING " does not yet support EAM simulations. Exiting.\n");
    MPI_Finalize();
    exit(0);
  }

  if(ghost_newton != 0) {
    if(me == 0) printf("ERROR: -ghost_newton %i is not supported in " VARIANT_STRING ". Exiting.\n", ghost_newton);
    MPI_Finalize();
    exit(0);
  }

  if(halfneigh != 0) {
    if(me == 0) printf("ERROR: -half_neigh %i is not supported in " VARIANT_STRING ". Exiting.\n", halfneigh);
    MPI_Finalize();
    exit(0);
  }

  if(use_tex != 0) {
    if(me == 0) printf("ERROR: -tex %i is currently broken. Exiting.\n", use_tex);
    MPI_Finalize();
    exit(0);
  }

  if(use_sse) {
#ifndef VARIANT_SSE
    if(me == 0) printf("ERROR: Trying to run with -sse with miniMD reference version. Use SSE variant instead. Exiting.\n");
    MPI_Finalize();
    exit(0);
#endif
  }

  threads.mpi_me = me;
  threads.mpi_num_threads = nprocs;
  threads.omp_me = 0;
  threads.omp_num_threads = num_threads;

  atom.threads = &threads;
  comm.threads = &threads;
  force.threads = &threads;
  integrate.threads = &threads;
  neighbor.threads = &threads;
  thermo.threads = &threads;

  opencl->blockdim = num_threads;
  atom.threads_per_atom = threads_per_atom;
  atom.use_tex = use_tex;
  comm.do_safeexchange = do_safeexchange;
  force.use_sse = use_sse;
  neighbor.halfneigh = halfneigh;

  compile_kernels(opencl);

  integrate.opencl = opencl;
  force.opencl = opencl;
  neighbor.opencl = opencl;
  atom.opencl = opencl;
  comm.opencl = opencl;

  if(num_steps > 0) in.ntimes = num_steps;

  if(system_size > 0) {
    in.nx = system_size;
    in.ny = system_size;
    in.nz = system_size;
  }

  if(neighbor_size > 0) {
    neighbor.nbinx = neighbor_size;
    neighbor.nbiny = neighbor_size;
    neighbor.nbinz = neighbor_size;
  }

  if(neighbor_size < 0 && in.datafile == NULL) {
    MMD_float neighscale = 5.0 / 6.0;
    neighbor.nbinx = neighscale * in.nx;
    neighbor.nbiny = neighscale * in.ny;
    neighbor.nbinz = neighscale * in.nz;  // was in.ny, an apparent copy-paste bug
  }

  if(neighbor_size < 0 && in.datafile) neighbor.nbinx = -1;

  if(neighbor.nbinx == 0) neighbor.nbinx = 1;
  if(neighbor.nbiny == 0) neighbor.nbiny = 1;
  if(neighbor.nbinz == 0) neighbor.nbinz = 1;

  integrate.ntimes = in.ntimes;
  integrate.dt = in.dt;
  neighbor.every = in.neigh_every;
  neighbor.cutneigh = in.neigh_cut;
  force.cutforce = in.force_cut;
  thermo.nstat = in.thermo_nstat;

  if(me == 0) printf("# Create System:\n");

  if(in.datafile) {
    read_lammps_data(atom, comm, neighbor, integrate, thermo, in.datafile, in.units);
    MMD_float volume = atom.box.xprd * atom.box.yprd * atom.box.zprd;
    in.rho = 1.0 * atom.natoms / volume;
    force.setup();
  } else {
    create_box(atom, in.nx, in.ny, in.nz, in.rho);
    comm.setup(neighbor.cutneigh, atom);
    neighbor.setup(atom);
    integrate.setup();
    force.setup();
    create_atoms(atom, in.nx, in.ny, in.nz, in.rho);
    thermo.setup(in.rho, integrate, atom, in.units);
    create_velocity(in.t_request, atom, thermo);
  }

  if(me == 0) printf("# Done .... \n");

  if(me == 0) {
    fprintf(stdout, "# " VARIANT_STRING " output ...\n");
    fprintf(stdout, "# Systemparameters: \n");
    fprintf(stdout, "\t# MPI processes: %i\n", neighbor.threads->mpi_num_threads);
    fprintf(stdout, "\t# OpenMP threads: %i\n", neighbor.threads->omp_num_threads);
    fprintf(stdout, "\t# Inputfile: %s\n", input_file == 0 ? "in.lj.miniMD" : input_file);
    fprintf(stdout, "\t# Datafile: %s\n", in.datafile ? in.datafile : "None");
    fprintf(stdout, "\t# ForceStyle: %s\n", in.forcetype == FORCELJ ? "LJ" : "EAM");
    fprintf(stdout, "\t# Units: %s\n", in.units == 0 ? "LJ" : "METAL");
    fprintf(stdout, "\t# Atoms: %i\n", atom.natoms);
    fprintf(stdout, "\t# System size: %2.2lf %2.2lf %2.2lf (unit cells: %i %i %i)\n",
            atom.box.xprd, atom.box.yprd, atom.box.zprd, in.nx, in.ny, in.nz);
    fprintf(stdout, "\t# Density: %lf\n", in.rho);
    fprintf(stdout, "\t# Force cutoff: %lf\n", force.cutforce);
    fprintf(stdout, "\t# Neigh cutoff: %lf\n", neighbor.cutneigh);
    fprintf(stdout, "\t# Half neighborlists: %i\n", neighbor.halfneigh);
    fprintf(stdout, "\t# Neighbor bins: %i %i %i\n", neighbor.nbinx, neighbor.nbiny, neighbor.nbinz);
    fprintf(stdout, "\t# Neighbor frequency: %i\n", neighbor.every);
    fprintf(stdout, "\t# Timestep size: %lf\n", integrate.dt);
    fprintf(stdout, "\t# Thermo frequency: %i\n", thermo.nstat);
    fprintf(stdout, "\t# Ghost Newton: %i\n", ghost_newton);
    fprintf(stdout, "\t# Use SSE intrinsics: %i\n", force.use_sse);
    fprintf(stdout, "\t# Do safe exchange: %i\n", comm.do_safeexchange);
    fprintf(stdout, "\t# Size of float: %i\n\n", (int) sizeof(MMD_float));  // cast: sizeof yields size_t, not int
  }

  comm.exchange(atom);
  comm.borders(atom);

  atom.d_x->upload();
  atom.d_v->upload();
  //atom.d_vold->upload();

  neighbor.build(atom);

  if(me == 0) printf("# Starting dynamics ...\n");
  if(me == 0) printf("# Timestep T U P Time\n");

  thermo.compute(0, atom, neighbor, force, timer, comm);
  force.compute(atom, neighbor, comm.me);

  timer.barrier_start(TIME_TOTAL);
  integrate.run(atom, force, neighbor, comm, thermo, timer);
  timer.barrier_stop(TIME_TOTAL);

  int natoms;
  MPI_Allreduce(&atom.nlocal, &natoms, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

  thermo.compute(-1, atom, neighbor, force, timer, comm);

  if(me == 0) {
    double time_other = timer.array[TIME_TOTAL] - timer.array[TIME_FORCE] -
                        timer.array[TIME_NEIGH] - timer.array[TIME_COMM];
    printf("\n\n");
    printf("# Performance Summary:\n");
    printf("# MPI_proc OMP_threads nsteps natoms t_total t_force t_neigh t_comm t_other performance perf/thread grep_string t_extra\n");
    printf("%i %i %i %i %lf %lf %lf %lf %lf %lf %lf PERF_SUMMARY %lf\n\n\n",
           nprocs, num_threads, integrate.ntimes, natoms,
           timer.array[TIME_TOTAL], timer.array[TIME_FORCE], timer.array[TIME_NEIGH],
           timer.array[TIME_COMM], time_other,
           1.0 * natoms * integrate.ntimes / timer.array[TIME_TOTAL],
           1.0 * natoms * integrate.ntimes / timer.array[TIME_TOTAL] / nprocs / num_threads,
           timer.array[TIME_TEST]);
  }

  if(yaml_output)
    output(in, atom, force, neighbor, comm, thermo, integrate, timer, screen_yaml);

  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();

  delete opencl;
  return 0;
}
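/* Usage sketch (editorial note; the binary name and launcher are assumptions,
   the flags are the ones parsed in main() above): one MPI rank per device,
   with -dm/--device_map deriving the device id from the rank, e.g.
       mpirun -np 4 ./miniMD_opencl -dm -ng 2 -s 40 -n 100
   In the performance summary, "performance" is natoms*nsteps/t_total, i.e.
   atom-steps processed per second, and "perf/thread" divides that by
   nprocs*num_threads. */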