int launch_turbine(MPI_Comm comm, char* cmd, int argc, char** argv)
{
  int status = 0;
  int i;
  char** argvc = (char**) malloc((argc+1) * sizeof(char*));
  for (i = 0; i < argc; i++)
    argvc[i] = argv[i];
  argvc[argc] = NULL;

  MPI_Info info;
  MPI_Info_create(&info);
  MPI_Info_set(info, "launcher", "turbine");

  MPIX_Comm_launch(cmd, argvc, info, 0, comm, &status);

  MPI_Info_free(&info);
  free(argvc);
  if (comm != MPI_COMM_SELF)
    MPI_Comm_free(&comm);
  return status;
}
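/*
 * Usage sketch (not from the source): launch_turbine() frees any communicator
 * other than MPI_COMM_SELF, so a caller would normally hand it a duplicate.
 * The executable name and argument below are illustrative assumptions.
 */
int run_turbine_job(void)
{
    MPI_Comm workers;
    MPI_Comm_dup(MPI_COMM_WORLD, &workers);   /* launch_turbine() will free this */
    char *args[] = { "workflow.tic" };        /* hypothetical program argument */
    return launch_turbine(workers, "turbine-exec", 1, args);
}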
/* Parse the file of hints.  The format is zero or more lines of
 * "<key> <value>\n".  A '#' in column zero marks a comment and the line is
 * ignored.  We do our best to ignore badly formed lines as well.
 *
 * The caller provides an 'info' object.  Each key-value pair found by the
 * parser is added to the info object.  Any keys already set are left alone,
 * on the assumption that the caller knows best.
 *
 * Because MPI-IO hints are optional, we can get away with limited error
 * reporting. */
static int file_to_info(int fd, MPI_Info info)
{
    char *buffer, *token, *key, *val, *garbage;
    char *pos1, *pos2;
    int flag, ret;
    char dummy;
    struct stat statbuf;

    /* assumption: config files will be small (less than 1 MB) */
    fstat(fd, &statbuf);

    /* add 1 to size to make room for NUL termination */
    buffer = (char *) calloc(statbuf.st_size + 1, sizeof(char));
    if (buffer == NULL)
        return -1;

    ret = read(fd, buffer, statbuf.st_size);
    if (ret < 0) {
        free(buffer);
        return -1;
    }

    token = strtok_r(buffer, "\n", &pos1);
    if (token == NULL) {
        /* empty file: nothing to parse */
        free(buffer);
        return 0;
    }
    do {
        if ((key = strtok_r(token, " \t", &pos2)) == NULL)
            /* malformed line: found no items */
            continue;
        if (token[0] == '#')
            /* ignore '#'-delimited comments */
            continue;
        if ((val = strtok_r(NULL, " \t", &pos2)) == NULL)
            /* malformed line: found key without value */
            continue;
        if ((garbage = strtok_r(NULL, " \t", &pos2)) != NULL)
            /* malformed line: more than two items */
            continue;
#ifdef SYSHINT_DEBUG
        printf("found: key=%s val=%s\n", key, val);
#endif
        /* We don't actually care what the value is, only whether the key
         * exists: if it does, we leave it alone. */
        MPI_Info_get(info, key, 0, &dummy, &flag);
        if (flag == 1)
            continue;
        MPI_Info_set(info, key, val);
    } while ((token = strtok_r(NULL, "\n", &pos1)) != NULL);

    free(buffer);
    return 0;
}
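/*
 * Illustrative caller (assumed, not part of the parser above): read hints
 * from a file whose contents look like
 *
 *     # tuned for a striped filesystem
 *     striping_factor 4
 *     romio_cb_read enable
 *
 * and pass the resulting info object to MPI_File_open().  The path and the
 * helper name are hypothetical; requires <fcntl.h> and <unistd.h>.
 */
#include <fcntl.h>
#include <unistd.h>

static MPI_Info hints_from_file(const char *path)
{
    MPI_Info info;
    MPI_Info_create(&info);
    int fd = open(path, O_RDONLY);
    if (fd >= 0) {
        file_to_info(fd, info);   /* keys the caller set beforehand are preserved */
        close(fd);
    }
    return info;                  /* caller hands this to MPI_File_open() */
}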
void oshmpi_allock(MPI_Comm comm)
{
  MPI_Info lock_info = MPI_INFO_NULL;
  MPI_Info_create(&lock_info);
  /* We define the sheap size to be symmetric and assume it for the global
   * static data. */
  MPI_Info_set(lock_info, "same_size", "true");
  MPI_Win_allocate(4 * sizeof(int), sizeof(int), lock_info, comm,
                   &oshmpi_lock_base, &oshmpi_lock_win);
  oshmpi_lock_base[NEXT_DISP] = -1;
  oshmpi_lock_base[PREV_DISP] = -1;
  oshmpi_lock_base[TAIL_DISP] = -1;
  oshmpi_lock_base[LOCK_DISP] = -1;
  MPI_Win_lock_all(TAIL, oshmpi_lock_win);
  MPI_Info_free(&lock_info);
  return;
}
long * allocate_memory(int me, MPI_Win * win)
{
  long * msg_buffer;
  long * win_base;   /* base */
  MPI_Info info;

  MPI_Info_create(&info);
  MPI_Info_set(info, "same_size", "true");

  MPI_Alloc_mem((MAX_MSG_SZ * ITERS_LARGE) * sizeof(long), MPI_INFO_NULL,
                &msg_buffer);
  MPI_Win_allocate((MAX_MSG_SZ * ITERS_LARGE) * sizeof(long), sizeof(long),
                   info, MPI_COMM_WORLD, &win_base, win);
  MPI_Win_lock_all(MPI_MODE_NOCHECK, *win);

  MPI_Info_free(&info);

  /* fail if either allocation did not succeed */
  if (NULL == msg_buffer || MPI_BOTTOM == win_base) {
    fprintf(stderr, "Failed to allocate window (pe: %d)\n", me);
    exit(EXIT_FAILURE);
  }
  return msg_buffer;
}
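/*
 * Minimal usage sketch (assumed driver, not part of the benchmark source):
 * the window returned above is already locked with MPI_Win_lock_all, so a
 * one-sided put only needs a flush to complete.  'peer' and 'nelems' are
 * illustrative parameters.
 */
static void put_once(long *msg_buffer, MPI_Win win, int peer, int nelems)
{
    MPI_Put(msg_buffer, nelems, MPI_LONG, peer,
            0 /* target displacement */, nelems, MPI_LONG, win);
    MPI_Win_flush(peer, win);   /* complete the put at the target */
}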
int MpiCommunicator::init( int minId, long thecomm_ )
{
    VT_FUNC_I( "MpiCommunicator::init" );

    assert( sizeof(thecomm_) >= sizeof(MPI_Comm) );
    MPI_Comm thecomm = (MPI_Comm)thecomm_;

    // turn wait mode on for intel mpi if possible
    // this should greatly improve performance for intel mpi
    PAL_SetEnvVar( "I_MPI_WAIT_MODE", "enable", 0 );

    int flag;
    MPI_Initialized( &flag );
    if ( ! flag ) {
        int p;
        //!! FIXME passing NULL ptr breaks mvapich1 mpi implementation
        MPI_Init_thread( 0, NULL, MPI_THREAD_MULTIPLE, &p );
        if ( p != MPI_THREAD_MULTIPLE ) {
            // can't use Speaker yet, need Channels to be inited
            std::cerr << "[CnC] Warning: not MPI_THREAD_MULTIPLE ("
                      << MPI_THREAD_MULTIPLE << "), but " << p << std::endl;
        }
    } else if ( thecomm == 0 ) {
        CNC_ABORT( "Process has already been initialized" );
    }

    MPI_Comm myComm = MPI_COMM_WORLD;
    int rank;
    MPI_Comm parentComm = MPI_COMM_NULL; // initialized so the spawn check below is well defined
    if ( thecomm == 0 ) {
        MPI_Comm_get_parent( &parentComm );
    } else {
        m_customComm = true;
        m_exit0CallOk = false;
        myComm = thecomm;
    }
    MPI_Comm_rank( myComm, &rank );

    // father of all checks if he's requested to spawn processes:
    if ( rank == 0 && parentComm == MPI_COMM_NULL ) {
        // Ok, let's spawn the clients.
        // I need some information for the startup.
        // 1. Name of the executable (default is the current exe)
        const char * _tmp = getenv( "CNC_MPI_SPAWN" );
        if ( _tmp ) {
            int nClientsToSpawn = atol( _tmp );
            _tmp = getenv( "CNC_MPI_EXECUTABLE" );
            std::string clientExe( _tmp ? _tmp : "" );
            if ( clientExe.empty() ) clientExe = PAL_GetProgname();
            CNC_ASSERT( ! clientExe.empty() );
            // 3. Special setting for MPI_Info: hosts
            const char * clientHost = getenv( "CNC_MPI_HOSTS" );

            // Prepare MPI_Info object:
            MPI_Info clientInfo = MPI_INFO_NULL;
            if ( clientHost ) {
                MPI_Info_create( &clientInfo );
                MPI_Info_set( clientInfo, const_cast< char * >( "host" ),
                              const_cast< char * >( clientHost ) );
                // can't use Speaker yet, need Channels to be inited
                std::cerr << "[CnC " << rank << "] Set MPI_Info_set( \"host\", \""
                          << clientHost << "\" )\n";
            }

            // Now spawn the client processes:
            // can't use Speaker yet, need Channels to be inited
            std::cerr << "[CnC " << rank << "] Spawning " << nClientsToSpawn
                      << " MPI processes" << std::endl;
            int * errCodes = new int[nClientsToSpawn];
            MPI_Comm interComm;
            int err = MPI_Comm_spawn( const_cast< char * >( clientExe.c_str() ),
                                      MPI_ARGV_NULL, nClientsToSpawn, clientInfo,
                                      0, MPI_COMM_WORLD, &interComm, errCodes );
            delete [] errCodes;
            if ( err ) {
                // can't use Speaker yet, need Channels to be inited
                std::cerr << "[CnC " << rank << "] Error in MPI_Comm_spawn. Skipping process spawning";
            } else {
                MPI_Intercomm_merge( interComm, 0, &myComm );
            }
        }
        // else {
        //     No process spawning.
        //     MPI-1 situation: all clients to be started by mpiexec
        //     myComm = MPI_COMM_WORLD;
        // }
    }

    if ( thecomm == 0 && parentComm != MPI_COMM_NULL ) {
        // I am a child. Build intra-comm to the parent.
        MPI_Intercomm_merge( parentComm, 1, &myComm );
    }
    MPI_Comm_rank( myComm, &rank );

    CNC_ASSERT( m_channel == NULL );
    MpiChannelInterface * myChannel = new MpiChannelInterface( use_crc(), myComm );
    m_channel = myChannel;

    int size;
    MPI_Comm_size( myComm, &size );

    // Are we on the host or on the remote side?
    if ( rank == 0 ) {
        if ( size <= 1 ) {
            Speaker oss( std::cerr );
            oss << "Warning: no clients available. Forgot to set CNC_MPI_SPAWN?";
        }
        // ==> HOST startup:
        // This initializes the mpi environment in myChannel.
        MpiHostInitializer hostInitializer( *myChannel );
        hostInitializer.init_mpi_comm( myComm );
    } else {
        // ==> CLIENT startup:
        // This initializes the mpi environment in myChannel.
        MpiClientInitializer clientInitializer( *myChannel );
        clientInitializer.init_mpi_comm( myComm );
    }

    {
        Speaker oss( std::cerr );
        oss << "MPI initialization complete (rank " << rank << ").";
    }

    // MPI_Barrier( myComm );

    // Now the mpi specific setup is finished.
    // Do the generic initialization stuff.
    GenericCommunicator::init( minId );

    return 0;
}
int main (int argc, char *argv[]){ char *x, *y, *z, *xbuf, *hbuf, *chrNames[MAXNBCHR]; int fd; off_t hsiz; struct stat st; MPI_File mpi_filed; MPI_File mpi_file_split_comm; MPI_Offset fileSize, unmapped_start, discordant_start; int num_proc, rank; int res, nbchr, i, paired, write_sam; int ierr, errorcode = MPI_ERR_OTHER; char *file_name, *output_dir; char *header; unsigned int headerSize; unsigned char threshold; size_t input_file_size; size_t unmappedSize = 0; size_t discordantSize = 0; size_t *readNumberByChr = NULL, *localReadNumberByChr = NULL; Read **reads; double time_count; double time_count1; int g_rank, g_size; MPI_Comm split_comm; //used to split communication when jobs have no reads to sort int split_rank, split_size; //after split communication we update the rank and the size double tic, toc; int compression_level; size_t fsiz, lsiz, loff; const char *sort_name; MPI_Info finfo; /* Set default values */ compression_level = 3; parse_mode = MODE_OFFSET; sort_name = "coordinate"; paired = 0; threshold = 0; write_sam = 0; /* Check command line */ while ((i = getopt(argc, argv, "c:hnpq:")) != -1) { switch(i) { case 'c': /* Compression level */ compression_level = atoi(optarg); break; case 'h': /* Usage display */ usage(basename(*argv)); return 0; case 'n': parse_mode = MODE_NAME; sort_name = "queryname"; break; case 'p': /* Paired reads */ paired = 1; break; case 'q': /* Quality threshold */ threshold = atoi(optarg); break; default: usage(basename(*argv)); return 1; } } if (argc - optind != 2) { usage(basename(*argv)); return 1; } file_name = argv[optind]; output_dir = argv[optind+1]; /* Check arguments */ res = access(file_name, F_OK|R_OK); if (res == -1) err(1, "%s", file_name); res = access(output_dir, F_OK|W_OK); if (res == -1) err(1, "%s", output_dir); /* MPI inits */ res = MPI_Init(&argc, &argv); assert(res == MPI_SUCCESS); res = MPI_Comm_rank(MPI_COMM_WORLD, &rank); assert(res == MPI_SUCCESS); res = MPI_Comm_size(MPI_COMM_WORLD, &num_proc); assert(res == MPI_SUCCESS); g_rank = rank; g_size = num_proc; /* Small summary */ if (rank == 0) { fprintf(stderr, "Number of processes : %d\n", num_proc); fprintf(stderr, "Reads' quality threshold : %d\n", threshold); fprintf(stderr, "Compression Level is : %d\n", compression_level); fprintf(stderr, "SAM file to read : %s\n", file_name); fprintf(stderr, "Output directory : %s\n", output_dir); } /* Process input file */ fd = open(file_name, O_RDONLY, 0666); assert(fd != -1); assert(fstat(fd, &st) != -1); xbuf = mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_FILE|MAP_PRIVATE, fd, 0); assert(xbuf != MAP_FAILED); /* Parse SAM header */ memset(chrNames, 0, sizeof(chrNames)); x = xbuf; nbchr = 0; while (*x == '@') { y = strchr(x, '\n'); z = x; x = y + 1; if (strncmp(z, "@SQ", 3) != 0) continue; /* Save reference names */ y = strstr(z, "SN:"); assert(y != NULL); z = y + 3; while (*z && !isspace((unsigned char)*z)) z++; chrNames[nbchr++] = strndup(y + 3, z - y - 3); assert(nbchr < MAXNBCHR - 2); } chrNames[nbchr++] = strdup(UNMAPPED); chrNames[nbchr++] = strdup(DISCORDANT); hsiz = x - xbuf; hbuf = strndup(xbuf, hsiz); if (rank == 0) { fprintf(stderr, "The size of the file is %zu bytes\n", (size_t)st.st_size); fprintf(stderr, "Header has %d+2 references\n", nbchr - 2); } asprintf(&header, "@HD\tVN:1.0\tSO:%s\n%s", sort_name, hbuf); free(hbuf); assert(munmap(xbuf, (size_t)st.st_size) != -1); assert(close(fd) != -1); //task FIRST FINE TUNING FINFO FOR READING OPERATIONS MPI_Info_create(&finfo); /* * In this part you shall adjust the striping factor 
and unit according * to the underlying filesystem. * Harmless for other file system. * */ MPI_Info_set(finfo,"striping_factor", STRIPING_FACTOR); MPI_Info_set(finfo,"striping_unit", STRIPING_UNIT); //2G striping MPI_Info_set(finfo,"ind_rd_buffer_size", STRIPING_UNIT); //2gb buffer MPI_Info_set(finfo,"romio_ds_read",DATA_SIEVING_READ); /* * for collective reading and writing * should be adapted too and tested according to the file system * Harmless for other file system. */ MPI_Info_set(finfo,"nb_proc", NB_PROC); MPI_Info_set(finfo,"cb_nodes", CB_NODES); MPI_Info_set(finfo,"cb_block_size", CB_BLOCK_SIZE); MPI_Info_set(finfo,"cb_buffer_size", CB_BUFFER_SIZE); //we open the input file ierr = MPI_File_open(MPI_COMM_WORLD, file_name, MPI_MODE_RDONLY , finfo, &mpi_filed); //assert(in != -1); if (ierr){ if (rank == 0) fprintf(stderr, "%s: Failed to open file in process 0 %s\n", argv[0], argv[1]); MPI_Abort(MPI_COMM_WORLD, errorcode); exit(2); } ierr = MPI_File_get_size(mpi_filed, &fileSize); assert(ierr == MPI_SUCCESS); input_file_size = (long long)fileSize; /* Get chunk offset and size */ fsiz = input_file_size; lsiz = fsiz / num_proc; loff = rank * lsiz; tic = MPI_Wtime(); headerSize = unmappedSize = discordantSize = strlen(header); //We place file offset of each process to the begining of one read's line size_t *goff =(size_t*)calloc((size_t)(num_proc+1), sizeof(size_t)); init_goff(mpi_filed,hsiz,input_file_size,num_proc,rank,goff); //We calculate the size to read for each process lsiz = goff[rank+1]-goff[rank]; //NOW WE WILL PARSE size_t j=0; size_t poffset = goff[rank]; //Current offset in file sam //nbchr because we add the discordant reads in the structure reads = (Read**)malloc((nbchr)*sizeof(Read));//We allocate a linked list of struct for each Chromosome (last chr = unmapped reads) readNumberByChr = (size_t*)malloc((nbchr)*sizeof(size_t));//Array with the number of reads found in each chromosome localReadNumberByChr = (size_t*)malloc((nbchr)*sizeof(size_t));//Array with the number of reads found in each chromosome Read ** anchor = (Read**)malloc((nbchr)*sizeof(Read));//Pointer on the first read of each chromosome //Init first read for(i = 0; i < (nbchr); i++){ reads[i] = malloc(sizeof(Read)); reads[i]->coord = 0; anchor[i] = reads[i]; readNumberByChr[i]=0; } toc = MPI_Wtime(); char *local_data_tmp = malloc(1024*1024); char *local_data =(char*)malloc(((goff[rank+1]-poffset)+1)*sizeof(char)); size_t size_tmp= goff[rank+1]-poffset; local_data[goff[rank+1]-poffset] = 0; char *q=local_data; //We read the file sam and parse while(poffset < goff[rank+1]){ size_t size_to_read = 0; if( (goff[rank+1]-poffset) < DEFAULT_INBUF_SIZE ){ size_to_read = goff[rank+1]-poffset; } else{ size_to_read = DEFAULT_INBUF_SIZE; } // we load the buffer //hold temporary size of SAM //due to limitation in MPI_File_read_at local_data_tmp =(char*)realloc(local_data_tmp, (size_to_read+1)*sizeof(char)); local_data_tmp[size_to_read]=0; // Original reading part is before 18/09/2015 MPI_File_read_at(mpi_filed, (MPI_Offset)poffset, local_data_tmp, size_to_read, MPI_CHAR, MPI_STATUS_IGNORE); size_t local_offset=0; assert(strlen(local_data_tmp) == size_to_read); //we look where is the last line read for updating next poffset size_t offset_last_line = size_to_read-1; size_t extra_char=0; while(local_data_tmp[offset_last_line] != '\n'){ offset_last_line -- ; extra_char++; } local_data_tmp[size_to_read - extra_char]=0; size_t local_data_tmp_sz = strlen(local_data_tmp); //If it s the last line of file, we place a last '\n' for 
the function tokenizer if(rank == num_proc-1 && ((poffset+size_to_read) == goff[num_proc])){ local_data_tmp[offset_last_line]='\n'; } //Now we parse Read in local_data parser_paired(local_data_tmp, rank, poffset, threshold, nbchr, &readNumberByChr, chrNames, &reads); //now we copy local_data_tmp in local_data char *p = local_data_tmp; int pos =0; while (*p && (pos < local_data_tmp_sz)) {*q=*p;p++;q++;pos++;} //we go to the next line poffset+=(offset_last_line+1); local_offset+=(offset_last_line+1); } assert(size_tmp == strlen(local_data)); fprintf(stderr, "%d (%.2lf)::::: *** FINISH PARSING FILE ***\n", rank, MPI_Wtime()-toc); if (local_data_tmp) free(local_data_tmp); malloc_trim(0); MPI_Barrier(MPI_COMM_WORLD); //We set attribute next of the last read and go back to first read of each chromosome for(i = 0; i < nbchr; i++){ reads[i]->next = NULL; reads[i] = anchor[i]; } free(anchor); //We count how many reads we found size_t nb_reads_total =0,nb_reads_global =0; for(j=0;j<nbchr;j++){ nb_reads_total+=readNumberByChr[j]; } MPI_Allreduce(&nb_reads_total, &nb_reads_global, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD); /* * We care for unmapped and discordants reads */ int s = 0; for (s = 1; s < 3; s++){ MPI_File mpi_file_split_comm2; double time_count; size_t total_reads = 0; MPI_Allreduce(&readNumberByChr[nbchr-s], &total_reads , 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD); if ((rank == 0) && (s == 1)) fprintf(stderr, "rank %d :::: total read to sort for unmapped = %zu \n", rank, total_reads); if ((rank == 0) && (s == 2)) fprintf(stderr, "rank %d :::: total read to sort for discordant = %zu \n", rank, total_reads); MPI_Barrier(MPI_COMM_WORLD); if (total_reads == 0){ // nothing to sort for unmapped // maybe write an empty bam file } else{ int i1,i2; size_t *localReadsNum_rank0 = (size_t *)malloc(num_proc*sizeof(size_t)); localReadsNum_rank0[0] = 0; int file_pointer_to_free = 0; int split_comm_to_free = 0; //we build a vector with rank job int val_tmp1 = 0; int val_tmp2 = 0; int chosen_rank = 0; // the color tells in what communicator the rank pertain // color = 0 will be the new communicator color // otherwise the color is 1 int *color_vec_to_send = (int *)malloc(num_proc*sizeof(int)); // the key value tell the order in the new communicator int *key_vec_to_send = (int *)malloc(num_proc*sizeof(int)); //rank 0 gather the vector MPI_Allgather(&readNumberByChr[nbchr-s] , 1, MPI_LONG_LONG_INT, localReadsNum_rank0 , 1, MPI_LONG_LONG_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); if (rank == 0){ //we must chose the first rank with reads to sort i1=0; while (localReadsNum_rank0[i1] == 0){ chosen_rank++; i1++; } } //we broadcast the chosen rank //task: replace the broadcast with a sendrecieve MPI_Bcast( &chosen_rank, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); //we must chose which rank is going to split the communication if (((rank == chosen_rank) || rank == 0) && (chosen_rank != 0)){ //the rank 0 will recieve the key_vec_to_send and colorvec_to_send //first we exchange the size o if (rank == chosen_rank){ header=(char *)malloc((headerSize + 1)*sizeof(char)); MPI_Recv(header, headerSize + 1, MPI_CHAR, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } if (rank == 0){ MPI_Send(header, headerSize + 1, MPI_CHAR, chosen_rank, 0, MPI_COMM_WORLD); } } else { //we do nothing here } if (rank == chosen_rank) { int counter = 0; //we compute the number of 0 in the localReadsNum_vec for(i1 = 0; i1 < num_proc; i1++){ if (localReadsNum_rank0[i1] == 0) { counter++; } } // if no jobs without reads 
we do nothing if ( counter == 0 ){ // nothing to do we associate split_comm with split_comm = MPI_COMM_WORLD; for (i2 = 0; i2 < num_proc; i2++) { if (localReadsNum_rank0[i2] == 0) { color_vec_to_send[i2] = 1; key_vec_to_send[i2] = val_tmp2; val_tmp2++; } else { color_vec_to_send[i2] = 0; key_vec_to_send[i2] = val_tmp1; val_tmp1++; } } } else{ // now we compute the color according to // the number of reads to sort for(i2 = 0; i2 < num_proc; i2++){ if (localReadsNum_rank0[i2] == 0){ color_vec_to_send[i2] = 1; key_vec_to_send[i2] = val_tmp2; val_tmp2++; } else{ color_vec_to_send[i2] = 0; key_vec_to_send[i2] = val_tmp1; val_tmp1++; } } // end for loop }// end if }// end if (rank == chosen_rank) MPI_Barrier(MPI_COMM_WORLD); // we scatter the key and color vector // we create key and color variable for each job int local_color = 0; int local_key = 0; // we scatter the color and key MPI_Scatter( color_vec_to_send, 1, MPI_INT, &local_color, 1, MPI_INT, chosen_rank, MPI_COMM_WORLD); MPI_Scatter( key_vec_to_send, 1, MPI_INT, &local_key, 1, MPI_INT, chosen_rank, MPI_COMM_WORLD); // we create a communicator // we group all communicator // with color of zero if (local_color == 0){ MPI_Comm_split( MPI_COMM_WORLD, local_color, local_key, &split_comm); ierr = MPI_File_open(split_comm, file_name, MPI_MODE_RDONLY , finfo, &mpi_file_split_comm2); //we ask to liberate file pointer file_pointer_to_free = 1; //we ask to liberate the split_comm split_comm_to_free = 1; } else{ MPI_Comm_split( MPI_COMM_WORLD, MPI_UNDEFINED, local_key, &split_comm); mpi_file_split_comm2 = mpi_filed; } //now we change the rank in the reads structure if (local_color == 0){ MPI_Comm_rank(split_comm, &split_rank); MPI_Comm_size(split_comm, &split_size); g_rank = split_rank; g_size = split_size; reads[nbchr-s] = reads[nbchr-s]->next; localReadNumberByChr[nbchr-s] = readNumberByChr[nbchr-s]; if (s == 2){ unmapped_start = startOffset(g_rank, g_size, unmappedSize, headerSize, nbchr-s, localReadNumberByChr[nbchr-s], split_comm ); if(!unmapped_start){ fprintf(stderr, "No header was defined for unmapped. 
\n Shutting down.\n"); MPI_Finalize(); return 0; } time_count = MPI_Wtime(); writeSam_discordant_and_unmapped( split_rank, output_dir, header, localReadNumberByChr[nbchr-s], chrNames[nbchr-s], reads[nbchr-s], split_size, split_comm, file_name, mpi_file_split_comm2, finfo, compression_level, local_data, goff[rank], write_sam); if (split_rank == chosen_rank){ fprintf(stderr, "rank %d :::::[MPISORT] Time to write chromosom %s , %f seconds \n\n\n", split_rank, chrNames[nbchr-s], MPI_Wtime() - time_count); } } else{ discordant_start = startOffset(g_rank, g_size, discordantSize, headerSize, nbchr-s, localReadNumberByChr[nbchr-s], split_comm); if(!discordant_start){ fprintf(stderr, "No header was defined for discordant.\n Shutting down.\n"); MPI_Finalize(); return 0; } time_count = MPI_Wtime(); writeSam_discordant_and_unmapped( g_rank, output_dir, header, localReadNumberByChr[nbchr-s], chrNames[nbchr-s], reads[nbchr-s], g_size, split_comm, file_name, mpi_file_split_comm2, finfo, compression_level, local_data, goff[rank], write_sam ); if (split_rank == chosen_rank){ fprintf(stderr, "rank %d :::::[MPISORT] Time to write chromosom %s , %f seconds \n\n\n", split_rank, chrNames[nbchr-s], MPI_Wtime() - time_count); } } while( reads[nbchr-s]->next != NULL){ Read *tmp_chr = reads[nbchr-s]; reads[nbchr-s] = reads[nbchr-s]->next; free(tmp_chr); } free(localReadsNum_rank0); } else{ // we do nothing } //we put a barrier before freeing pointers MPI_Barrier(MPI_COMM_WORLD); //we free the file pointer if (file_pointer_to_free) MPI_File_close(&mpi_file_split_comm2); //we free the split_comm if (split_comm_to_free) MPI_Comm_free(&split_comm); split_comm_to_free = 0; file_pointer_to_free = 0; free(color_vec_to_send); free(key_vec_to_send); } } //end for (s=1; s < 3; s++){ /* * We write the mapped reads in a file named chrX.bam * We loop by chromosoms. */ MPI_Barrier(MPI_COMM_WORLD); for(i = 0; i < (nbchr-2); i++){ /* * First Part of the algorithm * * In this part we elected a rank which is the first rank * to have reads to sort. * * Once elected a rank, we plit the communicator according to * wether the rank has reads to sort for this chromosom. * * The new communicator is COMM_WORLD. 
* * If all jobs have reads to sort no need to split the communicator and then * COMM_WORLD = MPI_COMM_WORLD * */ int i1,i2; size_t localReadsNum_rank0[num_proc]; localReadsNum_rank0[0]=0; int file_pointer_to_free = 0; int split_comm_to_free = 0; //we build a vector with rank job int val_tmp1 = 0; int val_tmp2 = 0; int chosen_rank = 0; //needed to tell what rank is going to compute the color and key int chosen_split_rank= 0; //the rank that collect data once the communication splitted normally this rank is 0 // the color tells in what communicator the rank pertain // color = 0 will be the new communicator color // otherwise the color is 1 // the key value tell the order in the new communicator int *color_vec_to_send = malloc(num_proc * sizeof(int)); int *key_vec_to_send = malloc(num_proc * sizeof(int)); // first we test if the there's reads to sort // rank 0 recieve the sum of all the reads count size_t total_reads_by_chr = 0; MPI_Allreduce(&readNumberByChr[i], &total_reads_by_chr, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD); //fprintf(stderr, "rank %d :::: readNumberByChr[i] = %zu \n", rank, readNumberByChr[i]); //fprintf(stderr, "rank %d :::: total_reads_by_chr = %zu \n", rank, total_reads_by_chr); if (total_reads_by_chr == 0) continue; //pass to next chromosome //rank 0 gather the vector MPI_Allgather(&readNumberByChr[i] , 1, MPI_LONG_LONG_INT, localReadsNum_rank0 , 1, MPI_LONG_LONG_INT, MPI_COMM_WORLD); if (rank == 0){ //the rank 0 chose the first rank with reads to sort i1=0; while ((localReadsNum_rank0[i1] == 0) && (i1 < num_proc)){ chosen_rank++; i1++; } fprintf(stderr, "rank %d :::: Elected rank = %d \n", rank, chosen_rank); } //we broadcast the chosen rank //task: replace the broadcast with a sendrecieve MPI_Bcast( &chosen_rank, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); if (((rank == chosen_rank) || rank == 0) && (chosen_rank != 0)){ //first we exchange the size o if (rank == chosen_rank){ header = malloc((headerSize + 1)*sizeof(char)); header[headerSize] = '\0'; MPI_Recv(header, headerSize + 1, MPI_CHAR, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } if (rank == 0){ MPI_Send(header, headerSize + 1, MPI_CHAR, chosen_rank, 0, MPI_COMM_WORLD); } } else { //we do nothing here } MPI_Barrier(MPI_COMM_WORLD); if (rank == chosen_rank) { int counter = 0; //we compute the number of 0 in the localReadsNum_vec for(i1 = 0; i1 < num_proc; i1++){ if (localReadsNum_rank0[i1] == 0) { counter++; } } // if no jobs without reads we do nothing if ( counter == 0 ){ // nothing to do we associate split_comm with fprintf(stderr, "rank %d ::::[MPISORT] we don't split the rank \n", rank); split_comm = MPI_COMM_WORLD; for (i2 = 0; i2 < num_proc; i2++) { if (localReadsNum_rank0[i2] == 0) { color_vec_to_send[i2] = 1; key_vec_to_send[i2] = val_tmp2; val_tmp2++; } else { color_vec_to_send[i2] = 0; key_vec_to_send[i2] = val_tmp1; val_tmp1++; } } } else{ // now we compute the color according to // the number of reads to sort fprintf(stderr, "rank %d ::::[MPISORT] we split the rank \n", rank); for(i2 = 0; i2 < num_proc; i2++){ if (localReadsNum_rank0[i2] == 0){ color_vec_to_send[i2] = 1; key_vec_to_send[i2] = val_tmp2; val_tmp2++; } else{ color_vec_to_send[i2] = 0; key_vec_to_send[i2] = val_tmp1; val_tmp1++; } } // end for loop }// end if }// end if (rank == plit_rank) MPI_Barrier(MPI_COMM_WORLD); //we create key and color variable for each job int local_color = 0; int local_key = 0; // rank 0 scatter the color and the key vector MPI_Scatter( color_vec_to_send, 1, MPI_INT, &local_color, 1, 
MPI_INT, chosen_rank, MPI_COMM_WORLD); MPI_Scatter( key_vec_to_send, 1, MPI_INT, &local_key, 1, MPI_INT, chosen_rank, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); // now we create a communicator // we group all communicator // with color of zero if (local_color == 0){ MPI_Comm_split( MPI_COMM_WORLD, local_color, local_key, &split_comm); ierr = MPI_File_open(split_comm, file_name, MPI_MODE_RDONLY, finfo, &mpi_file_split_comm); //we ask to liberate file pointer file_pointer_to_free = 1; //we ask to liberate the split_comm split_comm_to_free = 1; } else{ MPI_Comm_split( MPI_COMM_WORLD, MPI_UNDEFINED, local_key, &split_comm); mpi_file_split_comm = mpi_filed; } //now we change the rank in the reads structure if (local_color == 0){ MPI_Comm_rank(split_comm, &split_rank); MPI_Comm_size(split_comm, &split_size); //we update g_rank g_rank = split_rank; g_size = split_size; } else{ g_rank = split_rank; g_size = split_size = num_proc; } localReadNumberByChr[i] = readNumberByChr[i]; MPI_Barrier(MPI_COMM_WORLD); if ((local_color == 0) && (i < (nbchr - 2))) { /* * Second part of the algorithm * * First we load coordinates, offset sources, and read size in vector * * Then we sort the coordinates of the reads * with a bitonic sorter * * Then according to the reads coordinates we reoder the offset sources, and size * this is done thanks to the index of the sorting. * * Afterward we compute the offsets of the reads in * the destination file. * * Finally we dispatch the information to all ranks * in the communicator for the next step. */ //we do a local merge sort if(reads[i] && reads[i]->next && reads[i]->next->next){ mergeSort(reads[i], readNumberByChr[i]); } size_t local_readNum = localReadNumberByChr[i]; reads[i] = reads[i]->next; //first we compute the dimension of the parabitonic sort // dimension is the number of processors where we // perform the bitonic sort // int dimensions = (int)(log2(num_processes)); // find next ( must be greater) power, and go one back int dimensions = 1; while (dimensions <= split_size) dimensions <<= 1; dimensions >>= 1; // we get the maximum number of reads among // all the workers /* * Here we split the programm in 2 cases * * 1) The first case de split_size is a power of 2 (the best case) * this case is the simpliest we don't have extra communication to dispatch the read * envenly between the jobs * * 2) The split_size is not a power of 2 (the worst case) * well in this case we shall dispatch the jobs between jobs evenly. 
* */ if (split_rank == chosen_split_rank){ fprintf(stderr, "Rank %d :::::[MPISORT] Dimensions for bitonic = %d \n", split_rank, dimensions); fprintf(stderr, "Rank %d :::::[MPISORT] Split size = %d \n", split_rank, split_size); } //we test the computed dimension if (dimensions == split_size ){ size_t max_num_read = 0; MPI_Allreduce(&localReadNumberByChr[i], &max_num_read, 1, MPI_LONG_LONG_INT, MPI_MAX, split_comm); // if the dimension == split_size MPI_Barrier(split_comm); size_t first_local_readNum = local_readNum; /* * Vector creation and allocation fprintf(stderr, "split rank %d :::::[MPISORT] max_num_read = %zu \n", split_rank, max_num_read); */ local_readNum = max_num_read; time_count = MPI_Wtime(); size_t *local_reads_coordinates_unsorted = calloc(local_readNum, sizeof(size_t)); size_t *local_reads_coordinates_sorted = calloc(local_readNum, sizeof(size_t)); size_t *local_offset_source_unsorted = calloc(local_readNum, sizeof(size_t)); size_t *local_offset_source_sorted = calloc(local_readNum, sizeof(size_t)); int *local_dest_rank_sorted = calloc(local_readNum, sizeof(int)); int *local_reads_sizes_unsorted = calloc(local_readNum, sizeof(int)); int *local_reads_sizes_sorted = calloc(local_readNum, sizeof(int)); int *local_source_rank_unsorted = calloc(local_readNum, sizeof(int)); int *local_source_rank_sorted = calloc(local_readNum, sizeof(int)); if (split_rank == chosen_split_rank) fprintf(stderr, "rank %d :::::[MPISORT][MALLOC 1] time spent = %f s\n", split_rank, MPI_Wtime() - time_count); local_reads_coordinates_unsorted[0] = 0; local_reads_coordinates_sorted[0] = 0; local_dest_rank_sorted[0] = 0; local_reads_sizes_unsorted[0] = 0; local_reads_sizes_sorted[0] = 0; local_source_rank_unsorted[0] = 0; local_source_rank_sorted[0] = 0; local_offset_source_unsorted[0] = 0; local_offset_source_sorted[0] = 0; //those vectors are the same that local_..._sorted but without zero padding size_t *local_reads_coordinates_sorted_trimmed = NULL; int *local_dest_rank_sorted_trimmed = NULL; int *local_reads_sizes_sorted_trimmed = NULL; size_t *local_offset_source_sorted_trimmed = NULL; size_t *local_offset_dest_sorted_trimmed = NULL; int *local_source_rank_sorted_trimmed = NULL; //vectors used in the bruck just after the parabitonic sort size_t *local_reads_coordinates_sorted_trimmed_for_bruck = NULL; int *local_dest_rank_sorted_trimmed_for_bruck = NULL; int *local_reads_sizes_sorted_trimmed_for_bruck = NULL; size_t *local_offset_source_sorted_trimmed_for_bruck = NULL; size_t *local_offset_dest_sorted_trimmed_for_bruck = NULL; int *local_source_rank_sorted_trimmed_for_bruck = NULL; //task Init offset and size for source - free chr // from mpiSort_utils.c get_coordinates_and_offset_source_and_size_and_free_reads( split_rank, local_source_rank_unsorted, local_reads_coordinates_unsorted, local_offset_source_unsorted, local_reads_sizes_unsorted, reads[i], first_local_readNum ); //init indices for qksort size_t *coord_index = (size_t*)malloc(local_readNum*sizeof(size_t)); for(j = 0; j < local_readNum; j++){ coord_index[j] = j; } //To start we sort locally the reads coordinates. //this is to facilitate the bitonic sorting //if the local coordinates to sort are to big we could get rid of //this step. 
time_count = MPI_Wtime(); base_arr2 = local_reads_coordinates_unsorted; qksort(coord_index, local_readNum, sizeof(size_t), 0, local_readNum - 1, compare_size_t); if (split_rank == chosen_split_rank) fprintf(stderr, "rank %d :::::[MPISORT][LOCAL SORT] time spent = %f s\n", split_rank, MPI_Wtime() - time_count); //We index data for(j = 0; j < local_readNum; j++){ local_reads_coordinates_sorted[j] = local_reads_coordinates_unsorted[coord_index[j]]; local_source_rank_sorted[j] = local_source_rank_unsorted[coord_index[j]]; local_reads_sizes_sorted[j] = local_reads_sizes_unsorted[coord_index[j]]; local_offset_source_sorted[j] = local_offset_source_unsorted[coord_index[j]]; local_dest_rank_sorted[j] = rank; //will be updated after sorting the coordinates } /* * FOR DEBUG * for(j = 0; j < local_readNum - 1; j++){ assert( local_reads_coordinates_sorted[j] < local_reads_coordinates_sorted[j+1]); } */ free(coord_index); //ok free(local_source_rank_unsorted); //ok free(local_reads_coordinates_unsorted); //ok free(local_reads_sizes_unsorted); //ok free(local_offset_source_unsorted); //ok // we need the total number of reads. size_t total_num_read = 0; MPI_Allreduce(&localReadNumberByChr[i], &total_num_read, 1, MPI_LONG_LONG_INT, MPI_SUM, split_comm); /* * * In this section the number of bitonic dimension * is equal to the split size. * * In this case there are less communication in preparation * of the sorting. * * We use the parabitonic version 2. */ //we calll the bitonic time_count = MPI_Wtime(); ParallelBitonicSort2( split_comm, split_rank, dimensions, local_reads_coordinates_sorted, local_reads_sizes_sorted, local_source_rank_sorted, local_offset_source_sorted, local_dest_rank_sorted, max_num_read ); if (split_rank == chosen_split_rank) fprintf(stderr, "rank %d :::::[MPISORT][BITONIC 2] time spent = %f s\n", split_rank, MPI_Wtime() - time_count); size_t k1; size_t tmp2 = 0; for (k1 = 1; k1 < max_num_read; k1++){ assert(local_reads_coordinates_sorted[k1-1] <= local_reads_coordinates_sorted[k1]); local_dest_rank_sorted[k1]= split_rank; } /* for (k1 = 0; k1 < max_num_read; k1++){ fprintf(stderr, "rank %d :::::[MPISORT][BITONIC 2] local_reads_coordinates_sorted[%zu]= %zu s\n", split_rank, k1, local_reads_coordinates_sorted[k1]); fprintf(stderr, "rank %d :::::[MPISORT][BITONIC 2] local_source_rank_sorted[%zu]= %d s\n", split_rank, k1, local_source_rank_sorted[k1]); } */ size_t *local_offset_dest_sorted = malloc(max_num_read*sizeof(size_t)); size_t last_local_offset = 0; // We compute the local_dest_offsets_sorted size_t local_total_offset = 0; for (k1 = 0; k1 < max_num_read; k1++){ local_offset_dest_sorted[k1] = local_reads_sizes_sorted[k1]; local_total_offset += local_reads_sizes_sorted[k1]; } //we make the cumulative sum of all offsets for (k1 = 1; k1 < max_num_read; k1++){ local_offset_dest_sorted[k1] = local_offset_dest_sorted[k1 - 1] + local_offset_dest_sorted[k1]; } //we exchange the last destination offset last_local_offset = local_offset_dest_sorted[max_num_read-1]; //number of block to send int blocksize = 1; MPI_Offset *y = calloc(split_size, sizeof(MPI_Offset)); MPI_Offset *y2 = calloc(split_size + 1, sizeof(MPI_Offset)); //we wait all processors MPI_Gather(&last_local_offset, 1, MPI_LONG_LONG_INT, y, 1, MPI_LONG_LONG_INT, 0, split_comm); if (split_rank ==0){ for (k1 = 1; k1 < (split_size + 1); k1++) { y2[k1] = y[k1-1]; } } if (split_rank ==0){ for (k1 = 1; k1 < (split_size +1); k1++) { y2[k1] = y2[k1-1] + y2[k1]; } } size_t offset_to_add = 0; MPI_Scatter(y2, 1, MPI_LONG_LONG_INT, 
&offset_to_add, 1, MPI_LONG_LONG_INT, 0, split_comm); free(y); free(y2); //we add offset of the previous rank for (k1 = 0; k1 < max_num_read; k1++){ if (local_reads_sizes_sorted[k1] != 0) local_offset_dest_sorted[k1] += offset_to_add; else local_offset_dest_sorted[k1] = 0; } /* for (k1 = 0; k1 < max_num_read; k1++){ fprintf(stderr, "\n"); fprintf(stderr, "rank %d :::::[MPISORT][BITONIC 2] local_reads_coordinates_sorted[%zu]= %zu s\n", split_rank, k1, local_reads_coordinates_sorted[k1]); fprintf(stderr, "rank %d :::::[MPISORT][BITONIC 2] local_source_rank_sorted[%zu]= %d s\n", split_rank, k1, local_source_rank_sorted[k1]); fprintf(stderr, "rank %d :::::[MPISORT][BITONIC 2] local_offset_dest_sorted[%zu]= %d s\n", split_rank, k1, local_offset_dest_sorted[k1]); fprintf(stderr, "\n"); } */ /* * we update destination rank according to * original number of reads read. * */ //we compute the new rank dest according to max_num_read size_t previous_num_reads_per_job[dimensions]; //we create a vector of size split_size with previous reads per job MPI_Allgather(&first_local_readNum , 1, MPI_LONG_LONG_INT, previous_num_reads_per_job , 1, MPI_LONG_LONG_INT, split_comm); // we compute the position of of the read in the first // reference without the zero padding of bitonic size_t pos_ref0 = 0; //we need the number of zeros we add for the padding size_t N0 = max_num_read*dimensions - total_num_read; int new_rank = 0; int previous_rank = 0; // we compute the new rank for // the reads sorted by offset destination size_t h = 0; pos_ref0 = max_num_read*split_rank - N0; for(j = 0; j < max_num_read; j++) { if ( local_reads_sizes_sorted[j] != 0){ int new_rank = chosen_split_rank; pos_ref0 = (max_num_read*split_rank +j) - N0; if (pos_ref0 >= 0) { size_t tmp2 = 0; for (h = 0; h < dimensions; h++){ tmp2 += previous_num_reads_per_job[h]; if ( pos_ref0 < tmp2) { new_rank = h; break; } } previous_rank = local_dest_rank_sorted[j]; local_dest_rank_sorted[j] = new_rank; } } } MPI_Barrier(split_comm); size_t offset = 0; size_t numItems = 0; size_t num_read_for_bruck = 0; int *p = local_reads_sizes_sorted; if (p[0] != 0) {offset = 0;}; if (p[max_num_read -1] == 0){offset = max_num_read;} else {while ((*p == 0) && (offset < max_num_read )){ offset++; p++;}} /* * REMOVE ZERO PADDING BEFORE BRUCK * */ time_count = MPI_Wtime(); if (offset > 0){ // we remove zeros in the vector we have 2 cases // the first offset < max_num_read // and the entire vector is null if ( offset < max_num_read ){ numItems = max_num_read - offset; local_reads_coordinates_sorted_trimmed_for_bruck = malloc(numItems * sizeof(size_t)); local_offset_source_sorted_trimmed_for_bruck = malloc(numItems * sizeof(size_t)); local_offset_dest_sorted_trimmed_for_bruck = malloc(numItems * sizeof(size_t)); local_reads_sizes_sorted_trimmed_for_bruck = malloc(numItems * sizeof(int)); local_dest_rank_sorted_trimmed_for_bruck = malloc(numItems * sizeof(int)); local_source_rank_sorted_trimmed_for_bruck = malloc(numItems * sizeof(int)); size_t y=0; for (y = 0; y < numItems; y++){ local_reads_coordinates_sorted_trimmed_for_bruck[y] = local_reads_coordinates_sorted[y+offset]; local_offset_source_sorted_trimmed_for_bruck[y] = local_offset_source_sorted[y+offset]; local_offset_dest_sorted_trimmed_for_bruck[y] = local_offset_dest_sorted[y+offset]; local_reads_sizes_sorted_trimmed_for_bruck[y] = local_reads_sizes_sorted[y+offset]; local_dest_rank_sorted_trimmed_for_bruck[y] = local_dest_rank_sorted[y+offset]; local_source_rank_sorted_trimmed_for_bruck[y] = 
local_source_rank_sorted[y+offset]; } num_read_for_bruck = numItems; /* * * FOR DEBUG * for(y = 0; y < num_read_for_bruck; y++){ assert( local_reads_sizes_sorted_trimmed_for_bruck[y] != 0 ); assert( local_source_rank_sorted_trimmed_for_bruck[y] < dimensions); assert( local_dest_rank_sorted_trimmed_for_bruck[y] < dimensions); assert( local_offset_source_sorted_trimmed_for_bruck[y] != 0); assert( local_offset_dest_sorted_trimmed_for_bruck[y] != 0); assert( local_reads_coordinates_sorted_trimmed_for_bruck[y] != 0); } */ } else{ numItems = 0; local_reads_coordinates_sorted_trimmed_for_bruck = malloc(numItems * sizeof(size_t)); local_offset_source_sorted_trimmed_for_bruck = malloc(numItems * sizeof(size_t)); local_offset_dest_sorted_trimmed_for_bruck = malloc(numItems * sizeof(size_t)); local_reads_sizes_sorted_trimmed_for_bruck = malloc(numItems * sizeof(int)); local_dest_rank_sorted_trimmed_for_bruck = malloc(numItems * sizeof(int)); local_source_rank_sorted_trimmed_for_bruck = malloc(numItems * sizeof(int)); num_read_for_bruck = 0; } } else { numItems = local_readNum; local_reads_coordinates_sorted_trimmed_for_bruck = malloc(local_readNum * sizeof(size_t)); local_offset_source_sorted_trimmed_for_bruck = malloc(local_readNum * sizeof(size_t)); local_offset_dest_sorted_trimmed_for_bruck = malloc(local_readNum * sizeof(size_t)); local_reads_sizes_sorted_trimmed_for_bruck = malloc(local_readNum * sizeof(int)); local_dest_rank_sorted_trimmed_for_bruck = malloc(local_readNum * sizeof(int)); local_source_rank_sorted_trimmed_for_bruck = malloc(local_readNum * sizeof(int)); size_t y=0; for (y = 0; y < local_readNum; y++){ local_reads_coordinates_sorted_trimmed_for_bruck[y] = local_reads_coordinates_sorted[y]; local_offset_source_sorted_trimmed_for_bruck[y] = local_offset_source_sorted[y]; local_offset_dest_sorted_trimmed_for_bruck[y] = local_offset_dest_sorted[y]; local_reads_sizes_sorted_trimmed_for_bruck[y] = local_reads_sizes_sorted[y]; local_dest_rank_sorted_trimmed_for_bruck[y] = local_dest_rank_sorted[y]; local_source_rank_sorted_trimmed_for_bruck[y] = local_source_rank_sorted[y]; } num_read_for_bruck = numItems; /* * * FOR DEBUG * for(y = 0; y < num_read_for_bruck; y++){ assert( local_reads_sizes_sorted_trimmed_for_bruck[y] != 0 ); assert( local_source_rank_sorted_trimmed_for_bruck[y] < dimensions); assert( local_dest_rank_sorted_trimmed_for_bruck[y] < dimensions); assert( local_offset_source_sorted_trimmed_for_bruck[y] != 0); assert( local_offset_dest_sorted_trimmed_for_bruck[y] != 0); assert( local_reads_coordinates_sorted_trimmed_for_bruck[y] != 0); } */ } free(local_reads_coordinates_sorted); free(local_offset_source_sorted); free(local_offset_dest_sorted); free(local_reads_sizes_sorted); free(local_dest_rank_sorted); free(local_source_rank_sorted); if (split_rank == chosen_split_rank) fprintf(stderr, "rank %d :::::[MPISORT][TRIMMING] time spent = %f s\n", split_rank, MPI_Wtime() - time_count); /* * We do a Bruck on rank of origin reading */ size_t m=0; int num_proc = dimensions; size_t *number_of_reads_by_procs = calloc( dimensions, sizeof(size_t)); //fprintf(stderr, "rank %d :::::[MPISORT] num_read_for_bruck = %zu \n", split_rank, num_read_for_bruck); for(m = 0; m < num_read_for_bruck; m++){ //assert(new_pbs_orig_rank_off_phase1[m] < dimensions); //assert(new_pbs_dest_rank_phase1[m] < dimensions); number_of_reads_by_procs[local_source_rank_sorted_trimmed_for_bruck[m]]++; } int *local_source_rank_sorted_trimmed_for_bruckv2 = malloc( num_read_for_bruck * sizeof(int)); for(m = 0; m < 
num_read_for_bruck; m++){ local_source_rank_sorted_trimmed_for_bruckv2[m] = local_source_rank_sorted_trimmed_for_bruck[m]; } size_t count6 = 0; for(m = 0; m < dimensions; m++){ count6 += number_of_reads_by_procs[m]; } assert( count6 == num_read_for_bruck ); MPI_Barrier(split_comm); size_t **reads_coordinates = malloc(sizeof(size_t *) * dimensions); size_t **local_source_offsets = malloc(sizeof(size_t *) * dimensions); size_t **dest_offsets = malloc(sizeof(size_t *) * dimensions); int **read_size = malloc(sizeof(int *) * dimensions); int **dest_rank = malloc(sizeof(int *) * dimensions); int **source_rank = malloc(sizeof(int *) * dimensions); /* * We send in order * * local_offset_source_sorted_trimmed_for_bruck * local_dest_rank_sorted_trimmed_for_bruck * local_reads_coordinates_sorted_trimmed_for_bruck * local_reads_sizes_sorted_trimmed_for_bruck * */ COMM_WORLD = split_comm; time_count = MPI_Wtime(); bruckWrite3(split_rank, dimensions, count6, number_of_reads_by_procs, local_source_rank_sorted_trimmed_for_bruckv2, local_offset_source_sorted_trimmed_for_bruck, //offset sources &local_source_offsets, local_dest_rank_sorted_trimmed_for_bruck, //destination rank &dest_rank, local_reads_coordinates_sorted_trimmed_for_bruck, //reads coordinates &reads_coordinates, local_reads_sizes_sorted_trimmed_for_bruck, //read size &read_size, local_source_rank_sorted_trimmed_for_bruck, //source rank &source_rank, local_offset_dest_sorted_trimmed_for_bruck, &dest_offsets ); if (split_rank == chosen_split_rank) fprintf(stderr, "rank %d :::::[MPISORT][BRUCK 3] time spent = %f s\n", split_rank, MPI_Wtime() - time_count); time_count = MPI_Wtime(); free(local_reads_coordinates_sorted_trimmed_for_bruck); free(local_dest_rank_sorted_trimmed_for_bruck); free(local_reads_sizes_sorted_trimmed_for_bruck); free(local_offset_source_sorted_trimmed_for_bruck); free(local_offset_dest_sorted_trimmed_for_bruck); free(local_source_rank_sorted_trimmed_for_bruck); free(local_source_rank_sorted_trimmed_for_bruckv2); local_reads_coordinates_sorted_trimmed = malloc(first_local_readNum * sizeof(size_t)); local_offset_source_sorted_trimmed = malloc(first_local_readNum * sizeof(size_t)); local_offset_dest_sorted_trimmed = malloc(first_local_readNum * sizeof(size_t)); local_dest_rank_sorted_trimmed = malloc(first_local_readNum * sizeof(int)); local_source_rank_sorted_trimmed = malloc(first_local_readNum * sizeof(int)); local_reads_sizes_sorted_trimmed = malloc(first_local_readNum * sizeof(int)); if (split_rank == chosen_split_rank) fprintf(stderr, "rank %d :::::[MPISORT][FREE + MALLOC] time spent = %f s\n", split_rank, MPI_Wtime() - time_count); /* * GET DATA AFTER BRUCK * */ j=0; size_t k = 0; for(m = 0; m < num_proc; m++) { for(k = 0; k < number_of_reads_by_procs[m]; k++) { local_offset_dest_sorted_trimmed[k + j] = dest_offsets[m][k]; local_dest_rank_sorted_trimmed[k + j] = dest_rank[m][k]; local_reads_sizes_sorted_trimmed[k + j] = read_size[m][k]; local_offset_source_sorted_trimmed[k + j] = local_source_offsets[m][k]; local_reads_coordinates_sorted_trimmed[k + j] = reads_coordinates[m][k]; local_source_rank_sorted_trimmed[k + j] = source_rank[m][k]; } free(dest_offsets[m]); free(dest_rank[m]); free(read_size[m]); free(local_source_offsets[m]); free(reads_coordinates[m]); free(source_rank[m]); j += number_of_reads_by_procs[m]; } free(number_of_reads_by_procs); if (dest_rank != NULL) free(dest_rank); if (read_size != NULL) free(read_size); if (local_source_offsets != NULL) free(local_source_offsets); if (reads_coordinates != NULL) 
free(reads_coordinates); if (source_rank != NULL) free(source_rank); if (dest_offsets != NULL) free(dest_offsets); local_readNum = first_local_readNum; /* * * FOR DEBUG * for ( j = 0; j < local_readNum; j++){ assert ( local_reads_coordinates_sorted_trimmed[j] != 0 ); assert ( local_offset_source_sorted_trimmed[j] != 0 ); assert ( local_offset_dest_sorted_trimmed[j] != 0 ); assert ( local_reads_sizes_sorted_trimmed != 0 ); assert ( local_dest_rank_sorted_trimmed[j] < split_size ); assert ( local_source_rank_sorted_trimmed[j] < split_size ); } */ free(local_reads_coordinates_sorted_trimmed); if (split_rank == chosen_split_rank) fprintf(stderr, "rank %d :::::[MPISORT] we call write SAM \n", split_rank); malloc_trim(0); time_count = MPI_Wtime(); writeSam( split_rank, output_dir, header, local_readNum, total_reads_by_chr, chrNames[i], reads[i], split_size, split_comm, chosen_split_rank, file_name, mpi_file_split_comm, finfo, compression_level, local_offset_dest_sorted_trimmed, local_offset_source_sorted_trimmed, local_reads_sizes_sorted_trimmed, local_dest_rank_sorted_trimmed, local_source_rank_sorted_trimmed, local_data, goff[rank], first_local_readNum ); if (split_rank == chosen_split_rank){ fprintf(stderr, "rank %d :::::[MPISORT][WRITESAM] chromosom %s ::: %f seconds\n\n\n", split_rank, chrNames[i], MPI_Wtime() - time_count); } } else{ /* * We are in the case the number of cpu is * not a power of 2 * * */ parallel_sort_any_dim( dimensions, //dimension for parabitonic local_readNum, split_rank, split_size, reads, i, //chromosom number chosen_split_rank, split_comm, localReadNumberByChr, local_data, file_name, output_dir, finfo, compression_level, total_reads_by_chr, goff[rank], headerSize, header, chrNames[i], mpi_file_split_comm ); } //end if dimensions < split_rank } //if ((local_color == 0) && (i < (nbchr - 2))) //in the splitted dimension else{ //we do nothing here } //we put a barrier before freeing pointers MPI_Barrier(MPI_COMM_WORLD); //we free the file pointer if (file_pointer_to_free) MPI_File_close(&mpi_file_split_comm); //we free the split_comm if (split_comm_to_free){ MPI_Comm_free(&split_comm); } free(color_vec_to_send); free(key_vec_to_send); }// end loop upon chromosoms (line 665)
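/*
 * The chromosome loop above repeatedly elects a rank with reads and splits
 * MPI_COMM_WORLD so that only ranks holding reads open the file collectively.
 * A condensed sketch of that color/key pattern (names here are illustrative,
 * not from the source):
 */
static void split_by_work(size_t my_read_count, MPI_Comm *split_comm)
{
    int key;
    MPI_Comm_rank(MPI_COMM_WORLD, &key);            /* keep world rank order */
    int color = (my_read_count > 0) ? 0 : MPI_UNDEFINED;
    MPI_Comm_split(MPI_COMM_WORLD, color, key, split_comm);
    /* ranks that passed MPI_UNDEFINED get *split_comm == MPI_COMM_NULL */
}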
int main(int argc, char *argv[]) { int i, j, k, length, my_rank, left, right, size, test_value, mid; double start, finish, transfer_time; float snd_buf_left[max_length], snd_buf_right[max_length]; float *rcv_buf_left, *rcv_buf_right; float *rcv_buf_left_neighbor, *rcv_buf_right_neighbor; MPI_Win win_rcv_buf_left, win_rcv_buf_right; MPI_Info info_noncontig; MPI_Aint buf_size; int disp_unit; /* Naming conventions */ /* Processes: */ /* my_rank-1 my_rank my_rank+1 */ /* "left neighbor" "myself" "right neighbor" */ /* ... rcv_buf_right <--- snd_buf_left snd_buf_right ---> rcv_buf_left ... */ /* ... snd_buf_right ---> rcv_buf_left rcv_buf_right <--- snd_buf_left ... */ /* | | */ /* halo-communication halo-communication */ MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); MPI_Comm_size(MPI_COMM_WORLD, &size); right = (my_rank+1) % size; left = (my_rank-1+size) % size; MPI_Info_create(&info_noncontig); MPI_Info_set(info_noncontig, "alloc_shared_noncontig", "true"); MPI_Win_allocate_shared((MPI_Aint)(max_length*sizeof(float)), sizeof(float), info_noncontig, MPI_COMM_WORLD, &rcv_buf_left, &win_rcv_buf_left ); MPI_Win_allocate_shared((MPI_Aint)(max_length*sizeof(float)), sizeof(float), info_noncontig, MPI_COMM_WORLD, &rcv_buf_right, &win_rcv_buf_right); /*... shared memory access to the rcv_buf_left, of the RIGHT neighbor process */ MPI_Win_shared_query(win_rcv_buf_left, right, &buf_size, &disp_unit, &rcv_buf_left_neighbor ); /*... shared memory access to the rcv_buf_right, of the LEFT neighbor process */ MPI_Win_shared_query(win_rcv_buf_right, left, &buf_size, &disp_unit, &rcv_buf_right_neighbor); if (my_rank == 0) printf(" message size transfertime duplex bandwidth per process and neighbor\n"); length = start_length; for (j = 1; j <= number_package_sizes; j++) { for (i = 0; i <= number_of_messages; i++) { if(i==1) start = MPI_Wtime(); test_value = j*1000000 + i*10000 + my_rank*10 ; mid = (length-1)/number_of_messages*i; snd_buf_left[0]=test_value+1 ; snd_buf_left[mid]=test_value+2 ; snd_buf_left[length-1]=test_value+3; snd_buf_right[0]=test_value+6 ; snd_buf_right[mid]=test_value+7 ; snd_buf_right[length-1]=test_value+8; /* MPI_Win_fence(MPI_MODE_NOSTORE + MPI_MODE_NOPRECEDE, win_rcv_buf_left ); */ /* MPI_Win_fence(MPI_MODE_NOSTORE + MPI_MODE_NOPRECEDE, win_rcv_buf_right); */ /* ... instead of above, work-around for a bug with shared memory windows in some libraries: */ MPI_Win_fence(MPI_MODE_NOSTORE, win_rcv_buf_left ); MPI_Win_fence(MPI_MODE_NOSTORE, win_rcv_buf_right); /* MPI_Put(snd_buf_left, length, MPI_FLOAT, left, (MPI_Aint)0, length, MPI_FLOAT, win_rcv_buf_right); */ /* MPI_Put(snd_buf_right, length, MPI_FLOAT, right, (MPI_Aint)0, length, MPI_FLOAT, win_rcv_buf_left ); */ /* ... is substited by: */ for(k=0; k<length; k++) rcv_buf_right_neighbor[k] = snd_buf_left [k]; for(k=0; k<length; k++) rcv_buf_left_neighbor [k] = snd_buf_right[k]; /* MPI_Win_fence(MPI_MODE_NOSTORE + MPI_MODE_NOPUT + MPI_MODE_NOSUCCEED, win_rcv_buf_left ); */ /* MPI_Win_fence(MPI_MODE_NOSTORE + MPI_MODE_NOPUT + MPI_MODE_NOSUCCEED, win_rcv_buf_right); */ /* ... instead of above, work-around for a bug with shared memory windows in some libraries: */ MPI_Win_fence(MPI_MODE_NOSTORE + MPI_MODE_NOPUT, win_rcv_buf_left ); MPI_Win_fence(MPI_MODE_NOSTORE + MPI_MODE_NOPUT, win_rcv_buf_right); /* ...snd_buf_... is used to store the values that were stored in snd_buf_... 
in the neighbor process */ test_value = j*1000000 + i*10000 + left*10 ; mid = (length-1)/number_of_messages*i; snd_buf_right[0]=test_value+6 ; snd_buf_right[mid]=test_value+7 ; snd_buf_right[length-1]=test_value+8; test_value = j*1000000 + i*10000 + right*10 ; mid = (length-1)/number_of_messages*i; snd_buf_left[0]=test_value+1 ; snd_buf_left[mid]=test_value+2 ; snd_buf_left[length-1]=test_value+3; if ((rcv_buf_left[0] != snd_buf_right[0]) || (rcv_buf_left[mid] != snd_buf_right[mid]) || (rcv_buf_left[length-1] != snd_buf_right[length-1])) { printf("%d: j=%d, i=%d --> snd_buf_right[0,%d,%d]=(%f,%f,%f)\n", my_rank, j, i, mid, length-1, snd_buf_right[0], snd_buf_right[mid], snd_buf_right[length-1]); printf("%d: is not identical to rcv_buf_left[0,%d,%d]=(%f,%f,%f)\n", my_rank, mid, length-1, rcv_buf_left[0], rcv_buf_left[mid], rcv_buf_left[length-1]); } if ((rcv_buf_right[0] != snd_buf_left[0]) || (rcv_buf_right[mid] != snd_buf_left[mid]) || (rcv_buf_right[length-1] != snd_buf_left[length-1])) { printf("%d: j=%d, i=%d --> snd_buf_left[0,%d,%d]=(%f,%f,%f)\n", my_rank, j, i, mid, length-1, snd_buf_left[0], snd_buf_left[mid], snd_buf_left[length-1]); printf("%d: is not identical to rcv_buf_right[0,%d,%d]=(%f,%f,%f)\n", my_rank, mid, length-1, rcv_buf_right[0], rcv_buf_right[mid], rcv_buf_right[length-1]); } } finish = MPI_Wtime(); if (my_rank == 0) { transfer_time = (finish - start) / number_of_messages; printf("%10i bytes %12.3f usec %13.3f MB/s\n", length*(int)sizeof(float), transfer_time*1e6, 1.0e-6*2*length*sizeof(float) / transfer_time); } length = length * length_factor; } MPI_Win_free(&win_rcv_buf_left ); MPI_Win_free(&win_rcv_buf_right); MPI_Finalize(); }
int main(int argc, char** argv) { char filename[256]; int i, j, rank, nprocs, err, nerrs=0, expected; int ncid, cmode, varid[2], dimid[2], req[4], st[4], *buf; int *buf0, *buf1, *buf2; size_t len; MPI_Offset start[2], count[2]; MPI_Info info; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* this program is intended to run on one process */ if (rank) goto fn_exit; /* get command-line arguments */ if (argc > 2) { if (!rank) printf("Usage: %s [filename]\n",argv[0]); MPI_Finalize(); return 1; } if (argc == 2) snprintf(filename, 256, "%s", argv[1]); else strcpy(filename, "testfile.nc"); if (rank == 0) { char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); sprintf(cmd_str, "*** TESTING C %s for writing interleaved fileviews ", basename(argv[0])); printf("%-66s ------ ", cmd_str); free(cmd_str); } MPI_Info_create(&info); MPI_Info_set(info, "romio_cb_write", "disable"); MPI_Info_set(info, "ind_wr_buffer_size", "8"); /* these 2 hints are required to cause a core dump if r1758 fix is not * presented */ /* create a new file for writing ----------------------------------------*/ cmode = NC_CLOBBER | NC_64BIT_DATA; err = ncmpi_create(MPI_COMM_SELF, filename, cmode, info, &ncid); CHECK_ERR MPI_Info_free(&info); /* define dimensions Y and X */ err = ncmpi_def_dim(ncid, "Y", NY, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "X", NX, &dimid[1]); CHECK_ERR /* define 2D variables of integer type */ err = ncmpi_def_var(ncid, "var0", NC_INT, 2, dimid, &varid[0]); CHECK_ERR err = ncmpi_def_var(ncid, "var1", NC_INT, 2, dimid, &varid[1]); CHECK_ERR /* enable fill mode */ err = ncmpi_set_fill(ncid, NC_FILL, NULL); CHECK_ERR /* do not forget to exit define mode */ err = ncmpi_enddef(ncid); CHECK_ERR /* now we are in data mode */ buf = (int*) malloc(NY*NX * sizeof(int)); /* fill the entire variable var0 with -1s */ for (i=0; i<NY*NX; i++) buf[i] = -1; err = ncmpi_put_var_int_all(ncid, varid[0], buf); CHECK_ERR /* write 8 x 2 elements so this only interleaves the next two * iput requests */ start[0] = 0; start[1] = 3; count[0] = 8; count[1] = 2; len = (size_t)(count[0] * count[1]); buf0 = (int*) malloc(len * sizeof(int)); for (i=0; i<len; i++) buf0[i] = 50+i; err = ncmpi_iput_vara_int(ncid, varid[0], start, count, buf0, &req[0]); CHECK_ERR /* write 1 x 3 elements */ start[0] = 1; start[1] = 8; count[0] = 1; count[1] = 5; len = (size_t)(count[0] * count[1]); buf1 = (int*) malloc(len * sizeof(int)); for (i=0; i<len; i++) buf1[i] = 60+i; err = ncmpi_iput_vara_int(ncid, varid[0], start, count, buf1, &req[1]); CHECK_ERR /* write 1 x 3 elements */ start[0] = 3; start[1] = 7; count[0] = 1; count[1] = 5; len = (size_t)(count[0] * count[1]); buf2 = (int*) malloc(len * sizeof(int)); for (i=0; i<len; i++) buf2[i] = 70+i; err = ncmpi_iput_vara_int(ncid, varid[0], start, count, buf2, &req[2]); CHECK_ERR err = ncmpi_wait_all(ncid, 3, req, st); CHECK_ERR free(buf0); free(buf1); free(buf2); /* fill the entire variable var1 with -1s */ for (i=0; i<NY*NX; i++) buf[i] = -1; err = ncmpi_put_var_int_all(ncid, varid[1], buf); CHECK_ERR /* write 8 x 2 elements so this only interleaves the next two iput * requests */ start[0] = 0; start[1] = 3; count[0] = 8; count[1] = 2; len = (size_t)(count[0] * count[1]); buf0 = (int*) malloc(len * sizeof(int)); for (i=0; i<count[0]*count[1]; i++) buf0[i] = 50+i; err = ncmpi_iput_vara_int(ncid, varid[1], start, count, buf0, &req[0]); CHECK_ERR /* rearrange buffer contents, as buf is 2D */ for (i=0; i<5; i++) buf[i] = 10 + i; for (i=5; i<10; i++) 
buf[i] = 10 + i + 5; for (i=10; i<15; i++) buf[i] = 10 + i + 10; start[0] = 6; start[1] = 7; count[0] = 3; count[1] = 5; err = ncmpi_iput_vara_int(ncid, varid[1], start, count, buf, &req[1]); CHECK_ERR for (i=15; i<20; i++) buf[i] = 10 + i - 10; for (i=20; i<25; i++) buf[i] = 10 + i - 5; start[0] = 6; start[1] = 12; count[0] = 2; count[1] = 5; err = ncmpi_iput_vara_int(ncid, varid[1], start, count, buf+15, &req[2]); CHECK_ERR for (i=25; i<30; i++) buf[i] = 10 + i; start[0] = 8; start[1] = 12; count[0] = 1; count[1] = 5; err = ncmpi_iput_vara_int(ncid, varid[1], start, count, buf+25, &req[3]); CHECK_ERR err = ncmpi_wait_all(ncid, 4, req, st); CHECK_ERR /* check if write buffer contents have been altered */ for (i=0; i<16; i++) CHECK_CONTENTS(buf0, 50 + i) for (i=0; i<5; i++) CHECK_CONTENTS(buf, 10 + i) for (i=5; i<10; i++) CHECK_CONTENTS(buf, 10 + i + 5) for (i=10; i<15; i++) CHECK_CONTENTS(buf, 10 + i + 10) for (i=15; i<20; i++) CHECK_CONTENTS(buf, 10 + i - 10) for (i=20; i<25; i++) CHECK_CONTENTS(buf, 10 + i - 5) for (i=25; i<30; i++) CHECK_CONTENTS(buf, 10 + i) err = ncmpi_close(ncid); CHECK_ERR free(buf0); /* open the same file and read back for validate */ err = ncmpi_open(MPI_COMM_SELF, filename, NC_NOWRITE, MPI_INFO_NULL, &ncid); CHECK_ERR err = ncmpi_inq_varid(ncid, "var0", &varid[0]); CHECK_ERR err = ncmpi_inq_varid(ncid, "var1", &varid[1]); CHECK_ERR /* read the entire array */ for (i=0; i<NY*NX; i++) buf[i] = -1; err = ncmpi_get_var_int_all(ncid, varid[0], buf); CHECK_ERR /* check if the contents of buf are expected */ expected = 50; for (j=0; j<8; j++) { for (i=3; i<5; i++) { if (buf[j*NX+i] != expected) { printf("%d: Unexpected read buf[%d][%d]=%d, should be %d\n", rank, j, i, buf[j*NX+i], expected); nerrs++; } expected++; } } expected = 60; j = 1; for (i=8; i<13; i++) { if (buf[j*NX+i] != expected) { printf("%d: Unexpected read buf[%d][%d]=%d, should be %d\n", rank, j, i, buf[j*NX+i], expected); nerrs++; } expected++; } expected = 70; j = 3; for (i=7; i<12; i++) { if (buf[j*NX+i] != expected) { printf("%d: Unexpected read buf[%d][%d]=%d, should be %d\n", rank, j, i, buf[j*NX+i], expected); nerrs++; } expected++; } /* initialize the contents of the array to a different value */ for (i=0; i<NY*NX; i++) buf[i] = -1; /* read the entire array */ err = ncmpi_get_var_int_all(ncid, varid[1], buf); CHECK_ERR /* check if the contents of buf are expected */ expected = 10; for (j=6; j<9; j++) { for (i=7; i<17; i++) { if (buf[j*NX+i] != expected) { printf("%d: Unexpected read buf[%d]=%d, should be %d\n", rank, i, buf[j*NX+i], expected); nerrs++; } expected++; } } expected = 50; for (j=0; j<8; j++) { for (i=3; i<5; i++) { if (buf[j*NX+i] != expected) { printf("%d: Unexpected read buf[%d][%d]=%d, should be %d\n", rank, j, i, buf[j*NX+i], expected); nerrs++; } expected++; } } err = ncmpi_close(ncid); CHECK_ERR free(buf); /* check if PnetCDF freed all internal malloc */ MPI_Offset malloc_size; err = ncmpi_inq_malloc_size(&malloc_size); if (err == NC_NOERR && malloc_size > 0) { printf("heap memory allocated by PnetCDF internally has %lld bytes yet to be freed\n", malloc_size); ncmpi_inq_malloc_list(); } fn_exit: MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); else printf(PASS_STR); } MPI_Finalize(); return (nerrs > 0); }
int test_file(char *filename, int mynod, int nprocs, char * cb_hosts, const char *msg, int verbose) { MPI_Datatype typevec, newtype, t[3]; int *buf, i, b[3], errcode, errors=0; MPI_File fh; MPI_Aint d[3]; MPI_Status status; int SIZE = (STARTING_SIZE/nprocs)*nprocs; MPI_Info info; if (mynod==0 && verbose) fprintf(stderr, "%s\n", msg); buf = (int *) malloc(SIZE*sizeof(int)); if (buf == NULL) { perror("test_file"); MPI_Abort(MPI_COMM_WORLD, -1); } if (cb_hosts != NULL ) { MPI_Info_create(&info); MPI_Info_set(info, "cb_config_list", cb_hosts); } else { info = MPI_INFO_NULL; } MPI_Type_vector(SIZE/nprocs, 1, nprocs, MPI_INT, &typevec); b[0] = b[1] = b[2] = 1; d[0] = 0; d[1] = mynod*sizeof(int); d[2] = SIZE*sizeof(int); t[0] = MPI_LB; t[1] = typevec; t[2] = MPI_UB; MPI_Type_struct(3, b, d, t, &newtype); MPI_Type_commit(&newtype); MPI_Type_free(&typevec); if (!mynod) { if(verbose) fprintf(stderr, "\ntesting noncontiguous in memory, noncontiguous in file using collective I/O\n"); MPI_File_delete(filename, info); } MPI_Barrier(MPI_COMM_WORLD); errcode = MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_CREATE | MPI_MODE_RDWR, info, &fh); if (errcode != MPI_SUCCESS) { handle_error(errcode, "MPI_File_open"); } MPI_File_set_view(fh, 0, MPI_INT, newtype, "native", info); for (i=0; i<SIZE; i++) buf[i] = SEEDER(mynod,i,SIZE); errcode = MPI_File_write_all(fh, buf, 1, newtype, &status); if (errcode != MPI_SUCCESS) { handle_error(errcode, "nc mem - nc file: MPI_File_write_all"); } MPI_Barrier(MPI_COMM_WORLD); for (i=0; i<SIZE; i++) buf[i] = -1; errcode = MPI_File_read_at_all(fh, 0, buf, 1, newtype, &status); if (errcode != MPI_SUCCESS) { handle_error(errcode, "nc mem - nc file: MPI_File_read_at_all"); } /* the verification for N compute nodes is tricky. Say we have 3 * processors. * process 0 sees: 0 -1 -1 3 -1 -1 ... * process 1 sees: -1 34 -1 -1 37 -1 ... * process 2 sees: -1 -1 68 -1 -1 71 ... */ /* verify those leading -1s exist if they should */ for (i=0; i<mynod; i++ ) { if ( buf[i] != -1 ) { if(verbose) fprintf(stderr, "Process %d: buf is %d, should be -1\n", mynod, buf[i]); errors++; } } /* now the modulo games are hairy. processor 0 sees real data in the 0th, * 3rd, 6th... elements of the buffer (assuming nprocs==3 ). 
proc 1 sees * the data in 1st, 4th, 7th..., and proc 2 sees it in 2nd, 5th, 8th */ for(/* 'i' set in above loop */; i<SIZE; i++) { if ( ((i-mynod)%nprocs) && buf[i] != -1) { if(verbose) fprintf(stderr, "Process %d: buf %d is %d, should be -1\n", mynod, i, buf[i]); errors++; } if ( !((i-mynod)%nprocs) && buf[i] != SEEDER(mynod,i,SIZE) ) { if(verbose) fprintf(stderr, "Process %d: buf %d is %d, should be %d\n", mynod, i, buf[i], SEEDER(mynod,i,SIZE)); errors++; } } MPI_File_close(&fh); MPI_Barrier(MPI_COMM_WORLD); if (!mynod) { if(verbose) fprintf(stderr, "\ntesting noncontiguous in memory, contiguous in file using collective I/O\n"); MPI_File_delete(filename, info); } MPI_Barrier(MPI_COMM_WORLD); MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_CREATE | MPI_MODE_RDWR, info, &fh); for (i=0; i<SIZE; i++) buf[i] = SEEDER(mynod,i,SIZE); errcode = MPI_File_write_at_all(fh, mynod*(SIZE/nprocs)*sizeof(int), buf, 1, newtype, &status); if (errcode != MPI_SUCCESS) handle_error(errcode, "nc mem - c file: MPI_File_write_at_all"); MPI_Barrier(MPI_COMM_WORLD); for (i=0; i<SIZE; i++) buf[i] = -1; errcode = MPI_File_read_at_all(fh, mynod*(SIZE/nprocs)*sizeof(int), buf, 1, newtype, &status); if (errcode != MPI_SUCCESS) handle_error(errcode, "nc mem - c file: MPI_File_read_at_all"); /* just like as above */ for (i=0; i<mynod; i++ ) { if ( buf[i] != -1 ) { if(verbose) fprintf(stderr, "Process %d: buf is %d, should be -1\n", mynod, buf[i]); errors++; } } for(/* i set in above loop */; i<SIZE; i++) { if ( ((i-mynod)%nprocs) && buf[i] != -1) { if(verbose) fprintf(stderr, "Process %d: buf %d is %d, should be -1\n", mynod, i, buf[i]); errors++; } if ( !((i-mynod)%nprocs) && buf[i] != SEEDER(mynod,i,SIZE)) { if(verbose) fprintf(stderr, "Process %d: buf %d is %d, should be %d\n", mynod, i, buf[i], SEEDER(mynod,i,SIZE) ); errors++; } } MPI_File_close(&fh); MPI_Barrier(MPI_COMM_WORLD); if (!mynod) { if(verbose) fprintf(stderr, "\ntesting contiguous in memory, noncontiguous in file using collective I/O\n"); MPI_File_delete(filename, info); } MPI_Barrier(MPI_COMM_WORLD); MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_CREATE | MPI_MODE_RDWR, info, &fh); MPI_File_set_view(fh, 0, MPI_INT, newtype, "native", info); for (i=0; i<SIZE; i++) buf[i] = SEEDER(mynod, i, SIZE); errcode = MPI_File_write_all(fh, buf, SIZE, MPI_INT, &status); if (errcode != MPI_SUCCESS) handle_error(errcode, "c mem - nc file: MPI_File_write_all"); MPI_Barrier(MPI_COMM_WORLD); for (i=0; i<SIZE; i++) buf[i] = -1; errcode = MPI_File_read_at_all(fh, 0, buf, SIZE, MPI_INT, &status); if (errcode != MPI_SUCCESS) handle_error(errcode, "c mem - nc file: MPI_File_read_at_all"); /* same crazy checking */ for (i=0; i<SIZE; i++) { if (buf[i] != SEEDER(mynod, i, SIZE)) { if(verbose) fprintf(stderr, "Process %d: buf %d is %d, should be %d\n", mynod, i, buf[i], SEEDER(mynod, i, SIZE)); errors++; } } MPI_File_close(&fh); MPI_Type_free(&newtype); free(buf); if (info != MPI_INFO_NULL) MPI_Info_free(&info); return errors; }
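test_file() builds its file type with MPI_LB, MPI_UB, and MPI_Type_struct, all of which were removed in MPI-3. A sketch of an equivalent construction with MPI_Type_create_struct and MPI_Type_create_resized, under the same SIZE/nprocs decomposition (not taken from the original test):

/* Sketch: MPI-3 replacement for the MPI_LB/MPI_UB + MPI_Type_struct pattern
 * used above.  The vector is placed at this rank's byte offset, then the type
 * is resized so its lower bound is 0 and its extent spans the full row of
 * SIZE ints, matching the old LB/UB markers. */
MPI_Datatype typevec, placed, newtype;
MPI_Aint disp = (MPI_Aint)(mynod * sizeof(int));
int blocklen = 1;

MPI_Type_vector(SIZE/nprocs, 1, nprocs, MPI_INT, &typevec);
MPI_Type_create_struct(1, &blocklen, &disp, &typevec, &placed);
MPI_Type_create_resized(placed, 0, (MPI_Aint)(SIZE * sizeof(int)), &newtype);
MPI_Type_commit(&newtype);
MPI_Type_free(&typevec);
MPI_Type_free(&placed);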
void do_collective_read()
{
    MPI_Info info;
    MPI_Datatype contig;
    MPI_Comm sub_read_comm;
    MPI_File fh;
    char coll_path[PATH_LEN];
    int sub_comm_size, sub_rank, sub_comm_color;
    MPI_Offset disp;
    int rc;
    int *buf;

    ptimes[0].start = MPI_Wtime();
    ptimes[1].start = MPI_Wtime();
    sub_comm_color = get_sub_collective_io_comm(&sub_read_comm);

    /* Construct a datatype for distributing the input data across all
     * processes. */
    MPI_Type_contiguous(data_size / sizeof(int), MPI_INT, &contig);
    MPI_Type_commit(&contig);

    /* Set the stripe_count and stripe_size, that is, the striping_factor
     * and striping_unit. Both keys and values for MPI_Info_set must be
     * ASCII strings. */
    MPI_Info_create(&info);
    // MPI_Info_set(info, "striping_factor", striping_factor);
    // MPI_Info_set(info, "striping_unit", striping_unit);
    MPI_Info_set(info, "romio_cb_read", "enable");
    // MPI_Info_set(info, "romio_cb_read", "disable");

    /* Get path to the target file of the communicator */
    MPI_Comm_size(sub_read_comm, &sub_comm_size);
    get_coll_io_path(coll_path, sub_comm_color);

    /* Delete the output file if it exists so that striping can be set
     * on the output file. */
    // rc = MPI_File_delete(coll_path, info);

    /* Create read data */
    MPI_Comm_rank(sub_read_comm, &sub_rank);
    buf = create_io_data(-1);
    ptimes[1].end = MPI_Wtime();
    MPI_Barrier(MPI_COMM_WORLD);

    /* Open the file */
    ptimes[2].start = MPI_Wtime();
    rc = MPI_File_open(sub_read_comm, coll_path, MPI_MODE_RDONLY, info, &fh);
    if (rc != MPI_SUCCESS) {
        gio_err("MPI_File_open failed: %s (%s:%s:%d)", coll_path, __FILE__, __func__, __LINE__);
    }
    ptimes[2].end = MPI_Wtime();

    /* Set the file view for the input file. In this example, we use the
     * same contiguous datatype as we used for reading the data into local
     * memory. A better example would be to read out just part of the data,
     * say 4 contiguous elements followed by a gap of 4 elements, repeated. */
    ptimes[3].start = MPI_Wtime();
#ifdef GIO_LARGE_FILE
    int i;
    rc = MPI_SUCCESS;
    for (i = 0; i < sub_rank; i++) {
        rc = MPI_File_seek(fh, data_size, MPI_SEEK_CUR);
    }
#else
    disp = (MPI_Offset)sub_rank * data_size;
    rc = MPI_File_set_view(fh, disp, contig, contig, "native", info);
#endif
    if (rc != MPI_SUCCESS) {
        gio_err("MPI_File_set_view failed (%s:%s:%d)", __FILE__, __func__, __LINE__);
    }
    ptimes[3].end = MPI_Wtime();

    /* MPI collective read */
    ptimes[4].start = MPI_Wtime();
    rc = MPI_File_read_all(fh, buf, 1, contig, MPI_STATUS_IGNORE);
    if (rc != MPI_SUCCESS) {
        gio_err("MPI_File_read_all failed (%s:%s:%d)", __FILE__, __func__, __LINE__);
    }
    ptimes[4].end = MPI_Wtime();
    validate_io_data(buf, sub_rank);

    /* Free data */
    free_io_data(buf);

    /* Close files and release the info object and datatype created above */
    ptimes[5].start = MPI_Wtime();
    MPI_File_close(&fh);
    ptimes[5].end = MPI_Wtime();
    MPI_Info_free(&info);
    MPI_Type_free(&contig);
    ptimes[0].end = MPI_Wtime();
    print_results();
    return;
}
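The commented-out striping hints above are numeric parameters, but MPI_Info_set accepts only strings; a short sketch of formatting them before setting (the stripe values here are illustrative, not taken from the benchmark):

/* Sketch: striping hints must be passed to MPI_Info_set as strings, so
 * numeric values are formatted first.  Example values only. */
MPI_Info info;
char val[32];
int stripe_count = 8;          /* illustrative */
int stripe_size  = 1048576;    /* 1 MiB, illustrative */

MPI_Info_create(&info);
snprintf(val, sizeof(val), "%d", stripe_count);
MPI_Info_set(info, "striping_factor", val);
snprintf(val, sizeof(val), "%d", stripe_size);
MPI_Info_set(info, "striping_unit", val);
/* pass 'info' to MPI_File_open / MPI_File_delete, then MPI_Info_free(&info) */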
void ADIOI_PIOFS_Open(ADIO_File fd, int *error_code) { int amode, perm, old_mask, err; piofs_fstat_t piofs_fstat; char *value; #ifndef PRINT_ERR_MSG static char myname[] = "ADIOI_PIOFS_OPEN"; #endif if (fd->perm == ADIO_PERM_NULL) { old_mask = umask(022); umask(old_mask); perm = old_mask ^ 0666; } else perm = fd->perm; amode = 0; if (fd->access_mode & ADIO_CREATE) amode = amode | O_CREAT; if (fd->access_mode & ADIO_RDONLY) amode = amode | O_RDONLY; if (fd->access_mode & ADIO_WRONLY) amode = amode | O_WRONLY; if (fd->access_mode & ADIO_RDWR) amode = amode | O_RDWR; if (fd->access_mode & ADIO_EXCL) amode = amode | O_EXCL; #ifdef PROFILE MPE_Log_event(1, 0, "start open"); #endif fd->fd_sys = open(fd->filename, amode, perm); #ifdef PROFILE MPE_Log_event(2, 0, "end open"); #endif llseek(fd->fd_sys, 0, SEEK_SET); /* required to initiate use of 64-bit offset */ if (fd->fd_sys != -1) { value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); /* get file striping information and set it in info */ err = piofsioctl(fd->fd_sys, PIOFS_FSTAT, &piofs_fstat); if (!err) { sprintf(value, "%d", piofs_fstat.st_bsu); MPI_Info_set(fd->info, "striping_unit", value); sprintf(value, "%d", piofs_fstat.st_cells); MPI_Info_set(fd->info, "striping_factor", value); sprintf(value, "%d", piofs_fstat.st_base_node); MPI_Info_set(fd->info, "start_iodevice", value); } ADIOI_Free(value); if (fd->access_mode & ADIO_APPEND) fd->fp_ind = fd->fp_sys_posn = llseek(fd->fd_sys, 0, SEEK_END); } #ifdef PRINT_ERR_MSG *error_code = (fd->fd_sys == -1) ? MPI_ERR_UNKNOWN : MPI_SUCCESS; #else if (fd->fd_sys == -1) { *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, myname, "I/O Error", "%s", strerror(errno)); ADIOI_Error(ADIO_FILE_NULL, *error_code, myname); } else *error_code = MPI_SUCCESS; #endif }
/* This test spawns two child jobs and has them open a port and connect to * each other. * The two children repeatedly connect, accept, and disconnect from each other. */ int main(int argc, char *argv[]) { int error; int rank, size; int numprocs = 3; char *argv1[2] = { (char *) "connector", NULL }; char *argv2[2] = { (char *) "acceptor", NULL }; MPI_Comm comm_connector, comm_acceptor, comm_parent, comm; char port[MPI_MAX_PORT_NAME] = { 0 }; MPI_Status status; MPI_Info spawn_path = MPI_INFO_NULL; int i, num_loops = 100; int data; int verbose = 0; int can_spawn, errs = 0; if (getenv("MPITEST_VERBOSE")) { verbose = 1; } IF_VERBOSE(("init.\n")); error = MPI_Init(&argc, &argv); check_error(error, "MPI_Init"); errs += MTestSpawnPossible(&can_spawn); if (!can_spawn) { if (errs) printf(" Found %d errors\n", errs); else printf(" No Errors\n"); fflush(stdout); } else { IF_VERBOSE(("size.\n")); error = MPI_Comm_size(MPI_COMM_WORLD, &size); check_error(error, "MPI_Comm_size"); IF_VERBOSE(("rank.\n")); error = MPI_Comm_rank(MPI_COMM_WORLD, &rank); check_error(error, "MPI_Comm_rank"); if (argc == 1) { /* Make sure that the current directory is in the path. * Not all implementations may honor or understand this, but * it is highly recommended as it gives users a clean way * to specify the location of the executable without * specifying a particular directory format (e.g., this * should work with both Windows and Unix implementations) */ MPI_Info_create(&spawn_path); MPI_Info_set(spawn_path, (char *) "path", (char *) "."); IF_VERBOSE(("spawn connector.\n")); error = MPI_Comm_spawn((char *) "disconnect_reconnect2", argv1, numprocs, spawn_path, 0, MPI_COMM_WORLD, &comm_connector, MPI_ERRCODES_IGNORE); check_error(error, "MPI_Comm_spawn"); IF_VERBOSE(("spawn acceptor.\n")); error = MPI_Comm_spawn((char *) "disconnect_reconnect2", argv2, numprocs, spawn_path, 0, MPI_COMM_WORLD, &comm_acceptor, MPI_ERRCODES_IGNORE); check_error(error, "MPI_Comm_spawn"); MPI_Info_free(&spawn_path); if (rank == 0) { IF_VERBOSE(("recv port.\n")); error = MPI_Recv(port, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, comm_acceptor, &status); check_error(error, "MPI_Recv"); IF_VERBOSE(("send port.\n")); error = MPI_Send(port, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, comm_connector); check_error(error, "MPI_Send"); } IF_VERBOSE(("barrier acceptor.\n")); error = MPI_Barrier(comm_acceptor); check_error(error, "MPI_Barrier"); IF_VERBOSE(("barrier connector.\n")); error = MPI_Barrier(comm_connector); check_error(error, "MPI_Barrier"); error = MPI_Comm_free(&comm_acceptor); check_error(error, "MPI_Comm_free"); error = MPI_Comm_free(&comm_connector); check_error(error, "MPI_Comm_free"); if (rank == 0) { printf(" No Errors\n"); fflush(stdout); } } else if ((argc == 2) && (strcmp(argv[1], "acceptor") == 0)) { IF_VERBOSE(("get_parent.\n")); error = MPI_Comm_get_parent(&comm_parent); check_error(error, "MPI_Comm_get_parent"); if (comm_parent == MPI_COMM_NULL) { printf("acceptor's parent is NULL.\n"); fflush(stdout); MPI_Abort(MPI_COMM_WORLD, -1); } if (rank == 0) { IF_VERBOSE(("open_port.\n")); error = MPI_Open_port(MPI_INFO_NULL, port); check_error(error, "MPI_Open_port"); IF_VERBOSE(("0: opened port: <%s>\n", port)); IF_VERBOSE(("send.\n")); error = MPI_Send(port, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, comm_parent); check_error(error, "MPI_Send"); } for (i = 0; i < num_loops; i++) { IF_VERBOSE(("accept.\n")); error = MPI_Comm_accept(port, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &comm); check_error(error, "MPI_Comm_accept"); if (rank == 0) { data = i; error = 
MPI_Send(&data, 1, MPI_INT, 0, 0, comm); check_error(error, "MPI_Send"); error = MPI_Recv(&data, 1, MPI_INT, 0, 0, comm, &status); check_error(error, "MPI_Recv"); if (data != i) { printf("expected %d but received %d\n", i, data); fflush(stdout); MPI_Abort(MPI_COMM_WORLD, 1); } } IF_VERBOSE(("disconnect.\n")); error = MPI_Comm_disconnect(&comm); check_error(error, "MPI_Comm_disconnect"); } if (rank == 0) { IF_VERBOSE(("close_port.\n")); error = MPI_Close_port(port); check_error(error, "MPI_Close_port"); } IF_VERBOSE(("barrier.\n")); error = MPI_Barrier(comm_parent); check_error(error, "MPI_Barrier"); MPI_Comm_free(&comm_parent); } else if ((argc == 2) && (strcmp(argv[1], "connector") == 0)) { IF_VERBOSE(("get_parent.\n")); error = MPI_Comm_get_parent(&comm_parent); check_error(error, "MPI_Comm_get_parent"); if (comm_parent == MPI_COMM_NULL) { printf("acceptor's parent is NULL.\n"); fflush(stdout); MPI_Abort(MPI_COMM_WORLD, -1); } if (rank == 0) { IF_VERBOSE(("recv.\n")); error = MPI_Recv(port, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, comm_parent, &status); check_error(error, "MPI_Recv"); IF_VERBOSE(("1: received port: <%s>\n", port)); } for (i = 0; i < num_loops; i++) { IF_VERBOSE(("connect.\n")); error = MPI_Comm_connect(port, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &comm); check_error(error, "MPI_Comm_connect"); if (rank == 0) { data = -1; error = MPI_Recv(&data, 1, MPI_INT, 0, 0, comm, &status); check_error(error, "MPI_Recv"); if (data != i) { printf("expected %d but received %d\n", i, data); fflush(stdout); MPI_Abort(MPI_COMM_WORLD, 1); } error = MPI_Send(&data, 1, MPI_INT, 0, 0, comm); check_error(error, "MPI_Send"); } IF_VERBOSE(("disconnect.\n")); error = MPI_Comm_disconnect(&comm); check_error(error, "MPI_Comm_disconnect"); } IF_VERBOSE(("barrier.\n")); error = MPI_Barrier(comm_parent); check_error(error, "MPI_Barrier"); MPI_Comm_free(&comm_parent); } else { printf("invalid command line.\n"); fflush(stdout); { int ii; for (ii = 0; ii < argc; ii++) { printf("argv[%d] = <%s>\n", ii, argv[ii]); } } fflush(stdout); MPI_Abort(MPI_COMM_WORLD, -2); } } MPI_Finalize(); return MTestReturnValue(errs); }
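In this test the port string travels over the parent intercommunicator with MPI_Send/MPI_Recv; an alternative the test does not use is the MPI name service. A sketch of that pattern (the service name "spawn_example" is made up for illustration):

/* Sketch: exchange the port via MPI_Publish_name/MPI_Lookup_name instead of
 * an explicit Send/Recv of the port string. */
char port[MPI_MAX_PORT_NAME];
MPI_Comm comm;

/* acceptor side */
MPI_Open_port(MPI_INFO_NULL, port);
MPI_Publish_name("spawn_example", MPI_INFO_NULL, port);
MPI_Comm_accept(port, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &comm);
/* ... communicate, then tear down ... */
MPI_Comm_disconnect(&comm);
MPI_Unpublish_name("spawn_example", MPI_INFO_NULL, port);
MPI_Close_port(port);

/* connector side */
MPI_Lookup_name("spawn_example", MPI_INFO_NULL, port);
MPI_Comm_connect(port, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &comm);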
int main(int argc, char ** argv) { int nprocs, mynod, errcode; options my_options = {NULL, 0, 0}; MPI_File fh; MPI_Status status; MPI_Info info; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &mynod); parse_args(argc, argv, mynod, &my_options); if (my_options.do_aggregation) { MPI_Info_create(&info); MPI_Info_set(info, "romio_no_indep_rw", "true"); MPI_Info_set(info, "cb_config_list", "leela.mcs.anl.gov:1"); } else { info = MPI_INFO_NULL; } /* create the file w/o EXCL: this must not fail */ errcode = MPI_File_open(MPI_COMM_WORLD, my_options.fname, MPI_MODE_CREATE|MPI_MODE_RDWR, info, &fh); if (errcode != MPI_SUCCESS) { handle_error(errcode, "MPI_File_open"); } errcode = MPI_File_close(&fh); if (errcode != MPI_SUCCESS) { handle_error(errcode, "MPI_File_close"); } /* now try to open w/ CREAT|EXCL: this must fail */ errcode = MPI_File_open(MPI_COMM_WORLD, my_options.fname, MPI_MODE_CREATE|MPI_MODE_EXCL|MPI_MODE_RDWR, info, &fh); if (errcode == MPI_SUCCESS) { handle_error(errcode, "MPI_File_open: expected an error: got"); } /* ignore the error: File_delete is not aggregator-aware */ MPI_File_delete(my_options.fname, info); /* this must succeed: the file no longer exists */ errcode = MPI_File_open(MPI_COMM_WORLD, my_options.fname, MPI_MODE_CREATE|MPI_MODE_EXCL|MPI_MODE_RDWR, info, &fh); if (errcode != MPI_SUCCESS) { handle_error(errcode, "MPI_File_open"); } errcode = MPI_File_close(&fh); if (errcode != MPI_SUCCESS) { handle_error(errcode, "MPI_File_close"); } if (mynod == 0) { printf(" No Errors\n"); } MPI_Finalize(); return 0; }
int main(int argc, char **argv) { int i, len, nkeys, flag, mynod, default_striping_factor, nprocs; MPI_File fh; MPI_Info info, info_used; char *filename, key[MPI_MAX_INFO_KEY], value[MPI_MAX_INFO_VAL]; MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &mynod); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* process 0 takes the file name as a command-line argument and broadcasts it to other processes */ if (!mynod) { i = 1; while ((i < argc) && strcmp("-fname", *argv)) { i++; argv++; } if (i >= argc) { printf("\n*# Usage: file_info -fname filename\n\n"); MPI_Abort(MPI_COMM_WORLD, 1); } argv++; len = strlen(*argv); filename = (char *) malloc(len+1); strcpy(filename, *argv); MPI_Bcast(&len, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(filename, len+1, MPI_CHAR, 0, MPI_COMM_WORLD); } else { MPI_Bcast(&len, 1, MPI_INT, 0, MPI_COMM_WORLD); filename = (char *) malloc(len+1); MPI_Bcast(filename, len+1, MPI_CHAR, 0, MPI_COMM_WORLD); } /* open the file with MPI_INFO_NULL */ MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_CREATE | MPI_MODE_RDWR, MPI_INFO_NULL, &fh); /* check the default values set by ROMIO */ MPI_File_get_info(fh, &info_used); MPI_Info_get_nkeys(info_used, &nkeys); for (i=0; i<nkeys; i++) { MPI_Info_get_nthkey(info_used, i, key); MPI_Info_get(info_used, key, MPI_MAX_INFO_VAL-1, value, &flag); if (!mynod) printf("Process %d, Default: key = %s, value = %s\n", mynod, key, value); if (!strcmp("striping_factor", key)) default_striping_factor = atoi(value); } MPI_File_close(&fh); /* delete the file */ if (!mynod) MPI_File_delete(filename, MPI_INFO_NULL); MPI_Barrier(MPI_COMM_WORLD); /* set new info values. */ MPI_Info_create(&info); /* The following four hints are accepted on all machines. They can be specified at file-open time or later (any number of times). */ /* buffer size for collective I/O */ MPI_Info_set(info, "cb_buffer_size", "8388608"); /* number of processes that actually perform I/O in collective I/O */ sprintf(value, "%d", nprocs/2); MPI_Info_set(info, "cb_nodes", value); /* buffer size for data sieving in independent reads */ MPI_Info_set(info, "ind_rd_buffer_size", "2097152"); /* buffer size for data sieving in independent writes */ MPI_Info_set(info, "ind_wr_buffer_size", "1048576"); /* The following three hints related to file striping are accepted only on Intel PFS and IBM PIOFS file systems and are ignored elsewhere. They can be specified only at file-creation time; if specified later they will be ignored. */ /* number of I/O devices across which the file will be striped. accepted only if 0 < value < default_striping_factor; ignored otherwise */ sprintf(value, "%d", default_striping_factor-1); MPI_Info_set(info, "striping_factor", value); /* the striping unit in bytes */ MPI_Info_set(info, "striping_unit", "131072"); /* the I/O device number from which to start striping the file. accepted only if 0 <= value < default_striping_factor; ignored otherwise */ sprintf(value, "%d", default_striping_factor-2); MPI_Info_set(info, "start_iodevice", value); /* The following hint about PFS server buffering is accepted only on Intel PFS. It can be specified anytime. 
*/ MPI_Info_set(info, "pfs_svr_buf", "true"); /* open the file and set new info */ MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_CREATE | MPI_MODE_RDWR, info, &fh); /* check the values set */ MPI_File_get_info(fh, &info_used); MPI_Info_get_nkeys(info_used, &nkeys); if (!mynod) printf("\n New values\n\n"); for (i=0; i<nkeys; i++) { MPI_Info_get_nthkey(info_used, i, key); MPI_Info_get(info_used, key, MPI_MAX_INFO_VAL-1, value, &flag); if (!mynod) printf("Process %d, key = %s, value = %s\n", mynod, key, value); } MPI_File_close(&fh); free(filename); MPI_Info_free(&info_used); MPI_Info_free(&info); MPI_Finalize(); return 0; }
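As the striping-hint comments above note, an implementation is free to ignore any hint it does not understand or cannot honor, so the only reliable check is to read the key back from the open file handle, as this program does for all keys. A small helper sketch for querying a single key:

/* Sketch: return whether 'key' appears in the effective info of an open file,
 * and copy its value if so. */
static int hint_value(MPI_File fh, const char *key, char *value, int len)
{
    MPI_Info info_used;
    int flag = 0;
    MPI_File_get_info(fh, &info_used);
    MPI_Info_get(info_used, key, len - 1, value, &flag);
    MPI_Info_free(&info_used);
    return flag;   /* 1 if the key is present in the effective info */
}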
/* The test write a NP * NP matrix M, NP is the number of process: put_vara: Process N write N copy of it's rank to row N ([N, 0...WIDTH]) using different APIs on different variable final result should be: 0 0 0 0 ... 1 1 1 1 ... 2 2 2 2 ... . . . */ int simpletest(char* fname, int enable_log) { int buffer[MAXPROCESSES]; MPI_Offset start[2], count[2]; int i, j, ret, errlen; int NProc, MyRank, NP; // Total process; Rank int fid; // Data set ID int did[2]; // IDs of dimension int vid; // IDs for variables int dims[2]; char tmp[1024], tmp2[1024]; MPI_Info Info; MPI_Comm_size(MPI_COMM_WORLD, &NP); MPI_Comm_rank(MPI_COMM_WORLD, &MyRank); if (NP == 1) { // Act if there is WIDTH processes for easy debugging. Most debugger supports only single processes. NProc = SINGLEPROCNP; MyRank = SINGLEPROCRANK; } else{ NProc = NP; } if (MyRank < MAXPROCESSES) { // Ensure each process have a independent buffer directory MPI_Info_create(&Info); if (enable_log) { MPI_Info_set(Info, "pnetcdf_log", "enable"); } // Create new cdf file ret = ncmpi_create(MPI_COMM_WORLD, fname, NC_CLOBBER, Info, &fid); if (ret != NC_NOERR) { printf("Error create file\n"); goto ERROR; } ret = ncmpi_set_fill(fid, NC_FILL, NULL); if (ret != NC_NOERR) { printf("Error set fill\n"); goto ERROR; } ret = ncmpi_def_dim(fid, "X", NProc, did); // X if (ret != NC_NOERR) { printf("Error def dim X\n"); goto ERROR; } ret = ncmpi_def_dim(fid, "Y", NProc, did + 1); // Y if (ret != NC_NOERR) { printf("Error def dim Y\n"); goto ERROR; } ret = ncmpi_def_var(fid, "M", NC_INT, 2, did, vid); if (ret != NC_NOERR) { printf("Error def var M\n"); goto ERROR; } ret = ncmpi_enddef(fid); if (ret != NC_NOERR) { printf("Error enddef\n"); goto ERROR; } // Indep mode ret = ncmpi_begin_indep_data(fid); if (ret != NC_NOERR) { printf("Error begin indep\n"); goto ERROR; } // We all write rank from now on for (i = 0; i < NProc; i++) { buffer[i] = MyRank; } // put_vara count[0] = 1; count[1] = NProc; start[0] = MyRank; start[1] = 0; ret = ncmpi_put_vara_int(fid, vid, start, count, buffer); if (ret != NC_NOERR) { MPI_Error_string(ret, tmp, &errlen); printf("Error put_varn: %d\n%s\n", errlen, tmp); goto ERROR; } // Collective mode ncmpi_end_indep_data(fid); if (ret != NC_NOERR) { printf("Error end indep"); goto ERROR; } ncmpi_close(fid); // Close file if (ret != NC_NOERR) { printf("Error close"); goto ERROR; } } ERROR: return 0; }
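One caveat about the error reporting above: PnetCDF status codes are not MPI error codes, so decoding them with MPI_Error_string gives unreliable text; ncmpi_strerror is the matching routine. A minimal sketch:

/* Sketch: decode a PnetCDF status code with ncmpi_strerror() rather than
 * MPI_Error_string(). */
if (ret != NC_NOERR)
    printf("Error put_vara: %s\n", ncmpi_strerror(ret));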
int main(int argc, char *argv[]) { int iarrayOfSizes[2], iarrayOfSubsizes[2], iarrayOfStarts[2], ilocal_size; int nproc[2], periods[2], icoord[2]; int m, n, i, j, wsize, wrank, crank, ndims, lrows, lcols, grow, gcol, err; MPI_Datatype filetype; MPI_File fh; MPI_Comm cartcomm; MPI_Info info0, info3; double t, topen, twrite, tclose, wrate; double *local_array; char nstripesStr[12], stripeUnitStr[12]; int nstripes = -1; int stripeUnit = -1; MPI_Offset headerSize = 0; MPI_Init(0,0); MPI_Comm_rank(MPI_COMM_WORLD, &wrank); /* Get global array size */ m = n = 128; /* Set default size */ /* ioda [ n ] [ m ] [ nstripes ] [ stripeunit ] [ headersize ] */ if (argc > 0) { if (argc > 1) m = atoi(argv[1]); if (argc > 2) n = atoi(argv[2]); if (argc > 3) nstripes = atoi(argv[3]); if (argc > 4) stripeUnit = atoi(argv[4]); if (argc > 5) headerSize = atoi(argv[5]); if (argc > 6) { if (wrank == 0) fprintf(stderr,"Unrecognized argument %s\n", argv[6]); MPI_Abort(MPI_COMM_WORLD,1); } } if (wrank == 0) printf("Matrix is [%d,%d]; file dir = %s\n", m, n, MYSCRATCHDIR ); /* The default number of stripes = totalsize/1M */ if (nstripes < 0) { nstripes = n * m * sizeof(double) / (1024*1024); if (nstripes < 1) nstripes = 1; } if (wrank == 0) printf("nstripes = %d, stripeUnit = %d, header size = %d\n", nstripes, stripeUnit, (int)headerSize); /* Use topology routines to get decomposition and coordinates */ MPI_Comm_size(MPI_COMM_WORLD, &wsize); nproc[0] = 0; nproc[1] = 0; ndims = 2; MPI_Dims_create(wsize, ndims, nproc); periods[0] = 0; periods[1] = 0; MPI_Cart_create(MPI_COMM_WORLD, ndims, nproc, periods, 1, &cartcomm); MPI_Comm_rank(cartcomm, &crank); MPI_Cart_coords(cartcomm, crank, ndims, icoord); iarrayOfSizes[0] = m; iarrayOfSizes[1] = n; iarrayOfSubsizes[0] = m/nproc[0]; iarrayOfSubsizes[1] = n/nproc[1]; iarrayOfStarts[0] = icoord[0] * iarrayOfSubsizes[0]; iarrayOfStarts[1] = icoord[1] * iarrayOfSubsizes[1]; /* Initialize my block of the data */ ilocal_size = iarrayOfSubsizes[0] * iarrayOfSubsizes[1]; lrows = iarrayOfSubsizes[0]; lcols = iarrayOfSubsizes[1]; local_array = (double *)malloc(lrows*lcols*sizeof(double)); gcol = iarrayOfStarts[1]; grow = iarrayOfStarts[0]; for (i=0; i<lrows; i++) { for (j=0; j<lcols; j++) { local_array[j*lrows+i] = (grow+i) + (gcol+j)*m; } } /* Fortran order simply means the data is stored by columns */ MPI_Type_create_subarray(ndims, iarrayOfSizes, iarrayOfSubsizes, iarrayOfStarts, MPI_ORDER_FORTRAN, MPI_DOUBLE, &filetype); MPI_Type_commit(&filetype); info0 = MPI_INFO_NULL; info3 = MPI_INFO_NULL; if (nstripes > 0 || stripeUnit > 0) { MPI_Info_create(&info0); if (nstripes > 0) { snprintf(nstripesStr, sizeof(nstripesStr), "%d", nstripes); MPI_Info_set(info0, "striping_factor", nstripesStr); MPI_Info_set(info0, "cb_nodes", nstripesStr); } if (stripeUnit > 0) { snprintf(stripeUnitStr, sizeof(stripeUnitStr), "%d", stripeUnit); MPI_Info_set(info0, "striping_unit", stripeUnitStr); } MPI_Info_dup(info0, &info3); MPI_Info_set(info3, "romio_no_indep_rw", "true"); /* Other hints to consider: direct_io=true The default cb_buffer_size is 16777216 , but is overridden by the striping unit, which is smaller by default. 
*/ } /* level - 3 */ MPI_Barrier(MPI_COMM_WORLD); t = MPI_Wtime(); err = MPI_File_open(cartcomm, MYSCRATCHDIR "testfile-3.out", MPI_MODE_CREATE | MPI_MODE_RDWR, info3, &fh); topen = MPI_Wtime() - t; if (err != MPI_SUCCESS) myAbort(err, "open testfile-3.out"); if (headerSize > 0) { /* Simulate writing a header */ if (wrank == 0) { char *header; header = (char *)calloc(1,(size_t)headerSize); MPI_File_write(fh, header, headerSize, MPI_BYTE, MPI_STATUS_IGNORE); free(header); } MPI_Barrier(cartcomm); } MPI_File_set_view(fh, headerSize, MPI_DOUBLE, filetype, "native", MPI_INFO_NULL); MPI_Barrier(MPI_COMM_WORLD); t = MPI_Wtime(); err = MPI_File_write_all(fh, local_array, ilocal_size, MPI_DOUBLE, MPI_STATUS_IGNORE); twrite = MPI_Wtime() - t; if (err != MPI_SUCCESS) myAbort(err, "collective write"); err = MPI_File_close(&fh); tclose = MPI_Wtime() - t; /* tclose is the time for the write(s) + the close, in case the implementation delays (some of) the writes until the close */ if (err != MPI_SUCCESS) myAbort(err, "close testfile-3.out"); MPI_Allreduce(MPI_IN_PLACE, &topen, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &twrite, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &tclose, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); if (twrite > 0) wrate = (double)m * (double)n * sizeof(double)/twrite; if (wrank == 0) printf("%d\t[%d,%d]\t%d\t%.2e\t%.2e\t%.2e\t%.2e\n", wsize, m, n, nstripes, topen, twrite, tclose, wrate); /* level - 0 */ MPI_Barrier(MPI_COMM_WORLD); t = MPI_Wtime(); err = MPI_File_open(cartcomm, MYSCRATCHDIR "testfile-0.out", MPI_MODE_CREATE | MPI_MODE_RDWR, info0, &fh); topen = MPI_Wtime() - t; if (err != MPI_SUCCESS) myAbort(err, "open testfile-0.out"); if (headerSize > 0) { /* Simulate writing a header */ if (wrank == 0) { char *header; header = (char *)calloc(1,(size_t)headerSize); MPI_File_write(fh, header, headerSize, MPI_BYTE, MPI_STATUS_IGNORE); free(header); } MPI_Barrier(cartcomm); } MPI_Barrier(MPI_COMM_WORLD); t = MPI_Wtime(); gcol = iarrayOfStarts[1]; grow = iarrayOfStarts[0]; for (j=0; j<lcols; j++) { MPI_Offset offset = headerSize + ((MPI_Offset)(grow) + (MPI_Offset)(gcol+j)*m) * sizeof(double); err = MPI_File_write_at(fh, offset, local_array+j*lrows, lrows, MPI_DOUBLE, MPI_STATUS_IGNORE); if (err != MPI_SUCCESS) myAbort(err, "write at"); } twrite = MPI_Wtime() - t; err = MPI_File_close(&fh); tclose = MPI_Wtime() - t; /* tclose is the time for the write(s) + the close, in case the implementation delays (some of) the writes until the close */ if (err != MPI_SUCCESS) myAbort(err, "close testfile-0"); MPI_Allreduce(MPI_IN_PLACE, &topen, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &twrite, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &tclose, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); if (twrite > 0) wrate = (double)m * (double)n * sizeof(double)/twrite; if (wrank == 0) printf("%d\t[%d,%d]\t%d\t%.2e\t%.2e\t%.2e\t%.2e\n", wsize, m, n, nstripes, topen, twrite, tclose, wrate); if (info0 != MPI_INFO_NULL) { MPI_Info_free(&info0); MPI_Info_free(&info3); } free(local_array); MPI_Finalize(); return 0; }
void SID_init(int *argc, char **argv[], SID_args args[], void *mpi_comm_as_void){ int status; int i_level; int i_char; int flag_continue; int flag_passed_comm; // MPI-specific things #if USE_MPI int n_keys; int i_key; char key[256]; char key_value[256]; int key_exists; char nodes_string[256]; SID_fp fp_tmp; FILE *fp_hack; int node_name_length; MPI_Comm mpi_comm; #if USE_MPI_IO MPI_Info info_disp; #endif if (mpi_comm_as_void == NULL) { flag_passed_comm = 0; MPI_Init(argc,argv); MPI_Comm_dup(MPI_COMM_WORLD, &mpi_comm); } else { mpi_comm = *((MPI_Comm *) mpi_comm_as_void); flag_passed_comm = 1; } MPI_Comm_size(mpi_comm, &(SID.n_proc)); MPI_Comm_rank(mpi_comm, &(SID.My_rank)); SID.My_node =(char *)SID_malloc(SID_MAXLENGTH_PROCESSOR_NAME * sizeof(char)); #if USE_MPI MPI_Get_processor_name(SID.My_node, &node_name_length); #else sprintf(SID.My_node,"localhost"); node_name_length=strlen(SID.My_node); #endif if (node_name_length >= SID_MAXLENGTH_PROCESSOR_NAME-1) SID_trap_error("SID_MAXLENGTH_PROCESSOR_NAME needs to be increased",ERROR_LOGIC); // Make my_rank=MASTER_RANK the master if(SID.My_rank==MASTER_RANK) SID.I_am_Master=TRUE; else SID.I_am_Master=FALSE; // Identify the last rank if(SID.My_rank==SID.n_proc-1) SID.I_am_last_rank=TRUE; else SID.I_am_last_rank=FALSE; #if USE_MPI_IO // Fetch collective buffering defaults MPI_Info_create(&(SID.file_info)); if(SID.I_am_Master){ fp_hack=fopen(".tmp.SID","w+"); fclose(fp_hack); } MPI_Barrier(mpi_comm); MPI_File_open(mpi_comm, ".tmp.SID", MPI_MODE_WRONLY, MPI_INFO_NULL, &(fp_tmp.fp)); MPI_File_get_info(fp_tmp.fp,&info_disp); MPI_Info_get_nkeys(info_disp,&n_keys); for(i_key=0;i_key<n_keys;i_key++){ MPI_Info_get_nthkey(info_disp,i_key,key); MPI_Info_get(info_disp,key,MPI_MAX_INFO_VAL,key_value,&key_exists); if(key_exists) MPI_Info_set((SID.file_info),key,key_value); } MPI_File_close(&(fp_tmp.fp)); if(SID.I_am_Master) remove(".tmp.SID"); // Set user-defined colective buffering optimizations sprintf(nodes_string,"%d",MIN(SID.n_proc,N_IO_FILES_MAX)); MPI_Info_set((SID.file_info),"cb_nodes", nodes_string); MPI_Info_set((SID.file_info),"cb_config_list", "*:1"); #endif #else SID.My_rank=MASTER_RANK; SID.n_proc =1; #endif /* #if !USE_MPI_IO SID.n_groups=SID.n_proc/N_IO_FILES_MAX; if(SID.n_proc%N_IO_FILES_MAX) SID.n_groups++; SID.My_group=SID.My_rank/N_IO_FILES_MAX; #endif */ // Set ranks to the left and right SID.rank_to_right =(SID.My_rank+1)%SID.n_proc; SID.rank_to_left = SID.My_rank-1; if(SID.rank_to_left<0) SID.rank_to_left = SID.n_proc-1; // Intitialize log timing information SID.time_start_level=(time_t *)SID_malloc(sizeof(time_t)*SID_LOG_MAX_LEVELS); SID.time_stop_level =(time_t *)SID_malloc(sizeof(time_t)*SID_LOG_MAX_LEVELS); SID.time_total_level=(int *)SID_malloc(sizeof(int) *SID_LOG_MAX_LEVELS); SID.IO_size =(double *)SID_malloc(sizeof(double)*SID_LOG_MAX_LEVELS); SID.flag_use_timer =(int *)SID_malloc(sizeof(int) *SID_LOG_MAX_LEVELS); for(i_level=0;i_level<SID_LOG_MAX_LEVELS;i_level++){ SID.time_start_level[i_level]=0; SID.time_stop_level[i_level] =0; SID.time_total_level[i_level]=0; SID.IO_size[i_level] =0.; SID.flag_use_timer[i_level] =FALSE; } // Initialize other log information #if USE_MPI if(*argc>1) SID.fp_in =fopen((*argv)[1],"r"); else SID.fp_in =NULL; #else SID.fp_in =stdin; #endif if (flag_passed_comm) SID.fp_log = NULL; else SID.fp_log = stderr; SID.level =0; SID.indent =TRUE; SID.awake =TRUE; SID.flag_results_on=FALSE; SID.verbosity =SID_LOG_MAX_LEVELS; // Store the name of the binary executable that brought us here 
strcpy(SID.My_binary,(*argv)[0]); strip_path(SID.My_binary); // Initialize argument information if(args!=NULL){ if((status=SID_parse_args(*argc,*argv,args))>0){ SID_print_syntax(*argc,*argv,args); SID_exit(status); } } else SID.args=NULL; #if USE_MPI_IO if(SID.I_am_Master){ fp_hack=fopen(".tmp.SID","w+"); fclose(fp_hack); } MPI_Barrier(mpi_comm); SID_fopen(".tmp.SID","w",&fp_tmp); MPI_File_get_info(fp_tmp.fp,&info_disp); if(SID.I_am_Master){ fprintf(stdout,"\n"); fprintf(stdout,"MPI-I/O Configuration:\n"); fprintf(stdout,"---------------------\n"); MPI_Info_get_nkeys(info_disp,&n_keys); for(i_key=0;i_key<n_keys;i_key++){ MPI_Info_get_nthkey(info_disp,i_key,key); MPI_Info_get(info_disp,key,MPI_MAX_INFO_VAL,key_value,&key_exists); if(key_exists) fprintf(stdout,"key %2d of %d: {%s}={%s}\n",i_key+1,n_keys,key,key_value); } fprintf(stdout,"\n"); } SID_fclose(&fp_tmp); if(SID.I_am_Master) remove(".tmp.SID"); #else #if USE_MPI if(SID.I_am_Master) fprintf(stdout,"MPI-I/O switched off.\n\n"); #endif #endif // Create private COMM_WORLD SID_Comm_init(&(SID.COMM_WORLD)); #if USE_MPI MPI_Comm_dup(mpi_comm, &((SID.COMM_WORLD)->comm)); MPI_Comm_group((SID.COMM_WORLD)->comm,&((SID.COMM_WORLD)->group)); MPI_Comm_size(SID.COMM_WORLD->comm, &((SID.COMM_WORLD)->n_proc)); MPI_Comm_rank(SID.COMM_WORLD->comm, &((SID.COMM_WORLD)->My_rank)); // We have duplicated our duplicate mpi communicator - now we can free the // original duplicate MPI_Comm_free(&mpi_comm); #else SID.COMM_WORLD->comm =NULL; SID.COMM_WORLD->group =NULL; SID.COMM_WORLD->n_proc =1; SID.COMM_WORLD->My_rank=MASTER_RANK; #endif // Start total-run-ime timer (void)time(&(SID.time_start)); // Default max wallclock SID.max_wallclock=DEFAULT_MAX_WALLCLOCK_TIME; }
int main( int argc, char *argv[] ) { MPI_Info infos[MAX_INFOS]; char key[64], value[64]; int errs = 0; int i, j; MTest_Init( &argc, &argv ); /* We create max_info items, then delete the middle third of them, then recreate them, then check them, then delete them all. This checks that the MPICH algorithm for handling large numbers of items works correctly; other MPI implementations should also be able to handle this */ /* Create them all */ for (i=0; i<MAX_INFOS; i++) { MPI_Info_create( &infos[i] ); DBGPRINTF( ( "Info handle is %x\n", infos[i] ) ); for (j=0; j<info_list; j++) { sprintf( key, "key%d-%d", i, j ); sprintf( value, "value%d-%d", i, j ); DBGPRINTF( ( "Creating key/value %s=%s\n", key, value )); MPI_Info_set( infos[i], key, value ); } #ifdef DBG { int nkeys; MPI_Info_get_nkeys( infos[0], &nkeys ); if (nkeys != info_list) { printf( "infos[0] changed at %d info\n", i );} } #endif } /* Delete the middle set */ for (i=MAX_INFOS/3; i<(2*MAX_INFOS/3); i++) { MPI_Info_free( &infos[i] ); } /* Recreate the middle set */ for (i=MAX_INFOS/3; i<(2*MAX_INFOS/3); i++) { MPI_Info_create( &infos[i] ); DBGPRINTF( ( "Info handle is %x\n", infos[i] ) ); for (j=0; j<info_list; j++) { sprintf( key, "key%d-%d", i, j ); sprintf( value, "value%d-%d", i, j ); DBGPRINTF( ( "Creating key/value %s=%s\n", key, value )); MPI_Info_set( infos[i], key, value ); } } /* Now, check that they are still valid */ for (i=0; i<MAX_INFOS; i++) { int nkeys; /*printf( "info = %x\n", infos[i] ); print_handle( infos[i] ); printf( "\n" );*/ MPI_Info_get_nkeys( infos[i], &nkeys ); if (nkeys != info_list) { errs++; if (errs < MAX_ERRORS) { printf( "Wrong number of keys for info %d; got %d, should be %d\n", i, nkeys, info_list ); } } for (j=0; j<nkeys; j++) { char keystr[64]; char valstr[64]; int flag; MPI_Info_get_nthkey( infos[i], j, key ); sprintf( keystr, "key%d-%d", i, j ); if (strcmp( keystr, key ) != 0) { errs++; if (errs < MAX_ERRORS) { printf( "Wrong key for info %d; got %s expected %s\n", i, key, keystr ); } continue; } MPI_Info_get( infos[i], key, 64, value, &flag ); if (!flag) { errs++; if (errs < MAX_ERRORS) { printf( "Get failed to return value for info %d\n", i ); } continue; } sprintf( valstr, "value%d-%d", i, j ); if (strcmp( valstr, value ) != 0) { errs++; if (errs < MAX_ERRORS) { printf( "Wrong value for info %d; got %s expected %s\n", i, value, valstr ); } } } } for (i=0; i<MAX_INFOS; i++) { MPI_Info_free( &infos[i] ); } MTest_Finalize( errs ); MPI_Finalize( ); return 0; }
int main(int argc, char** argv) { extern int optind; char *filename="testfile.nc", exec[128]; int i, j, k, n, rank, nprocs, verbose=1, err; int ncid, cmode, varid[4], dimid[2], nreqs, reqs[4], sts[4]; unsigned int *buffer[4]; int num_segs[4], req_lens[4]; MPI_Offset ***starts, ***counts; MPI_Info info; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); strcpy(exec, argv[0]); /* get command-line arguments */ while ((i = getopt(argc, argv, "hq")) != EOF) switch(i) { case 'q': verbose = 0; break; case 'h': default: if (rank==0) usage(argv[0]); MPI_Finalize(); return 0; } argc -= optind; argv += optind; if (argc == 1) filename = argv[0]; /* optional argument */ if (nprocs != 4 && rank == 0 && verbose) printf("Warning: %s is intended to run on 4 processes\n",exec); /* set an MPI-IO hint to disable file offset alignment for fix-sized * variables */ MPI_Info_create(&info); MPI_Info_set(info, "nc_var_align_size", "1"); /* create a new file for writing ----------------------------------------*/ cmode = NC_CLOBBER | NC_64BIT_DATA; err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); ERR MPI_Info_free(&info); /* create a global array of size NY * NX */ err = ncmpi_def_dim(ncid, "Y", NY, &dimid[0]); ERR err = ncmpi_def_dim(ncid, "X", NX, &dimid[1]); ERR err = ncmpi_def_var(ncid, "var0", NC_UINT, NDIMS, dimid, &varid[0]); ERR err = ncmpi_def_var(ncid, "var1", NC_UINT, NDIMS, dimid, &varid[1]); ERR err = ncmpi_def_var(ncid, "var2", NC_UINT, NDIMS, dimid, &varid[2]); ERR err = ncmpi_def_var(ncid, "var3", NC_UINT, NDIMS, dimid, &varid[3]); ERR err = ncmpi_enddef(ncid); ERR /* allocate space for starts and counts */ starts = calloc_3D(4, 6, NDIMS); counts = calloc_3D(4, 6, NDIMS); n = rank % 4; num_segs[n] = 4; /* number of segments for this request */ starts[n][0][0]=0; starts[n][0][1]=5; counts[n][0][0]=1; counts[n][0][1]=2; starts[n][1][0]=1; starts[n][1][1]=0; counts[n][1][0]=1; counts[n][1][1]=1; starts[n][2][0]=2; starts[n][2][1]=6; counts[n][2][0]=1; counts[n][2][1]=2; starts[n][3][0]=3; starts[n][3][1]=0; counts[n][3][0]=1; counts[n][3][1]=3; /* starts[n][][] n_counts[n][][] indicate the following: ("-" means skip) - - - - - X X - - - X - - - - - - - - - - - - - - - X X - - X X X - - - - - - - */ n = (rank+1) % 4; num_segs[n] = 6; /* number of segments for this request */ starts[n][0][0]=0; starts[n][0][1]=3; counts[n][0][0]=1; counts[n][0][1]=2; starts[n][1][0]=0; starts[n][1][1]=8; counts[n][1][0]=1; counts[n][1][1]=2; starts[n][2][0]=1; starts[n][2][1]=5; counts[n][2][0]=1; counts[n][2][1]=2; starts[n][3][0]=2; starts[n][3][1]=0; counts[n][3][0]=1; counts[n][3][1]=2; starts[n][4][0]=2; starts[n][4][1]=8; counts[n][4][0]=1; counts[n][4][1]=2; starts[n][5][0]=3; starts[n][5][1]=4; counts[n][5][0]=1; counts[n][5][1]=3; /* starts[n][][] counts[n][][] indicate the following pattern. - - - X X - - - X X - - - - - X X - - - X X - - - - - - X X - - - - X X X - - - */ n = (rank+2) % 4; num_segs[n] = 5; /* number of segments for this request */ starts[n][0][0]=0; starts[n][0][1]=7; counts[n][0][0]=1; counts[n][0][1]=1; starts[n][1][0]=1; starts[n][1][1]=1; counts[n][1][0]=1; counts[n][1][1]=3; starts[n][2][0]=1; starts[n][2][1]=7; counts[n][2][0]=1; counts[n][2][1]=3; starts[n][3][0]=2; starts[n][3][1]=2; counts[n][3][0]=1; counts[n][3][1]=1; starts[n][4][0]=3; starts[n][4][1]=3; counts[n][4][0]=1; counts[n][4][1]=1; /* starts[n][][] counts[n][][] indicate the following pattern. 
- - - - - - - X - - - X X X - - - X X X - - X - - - - - - - - - - X - - - - - - */ n = (rank+3) % 4; num_segs[n] = 4; /* number of segments for this request */ starts[n][0][0]=0; starts[n][0][1]=0; counts[n][0][0]=1; counts[n][0][1]=3; starts[n][1][0]=1; starts[n][1][1]=4; counts[n][1][0]=1; counts[n][1][1]=1; starts[n][2][0]=2; starts[n][2][1]=3; counts[n][2][0]=1; counts[n][2][1]=3; starts[n][3][0]=3; starts[n][3][1]=7; counts[n][3][0]=1; counts[n][3][1]=3; /*starts[n][][] counts[n][][] indicate the following pattern. X X X - - - - - - - - - - - X - - - - - - - - X X X - - - - - - - - - - - X X X */ /* only rank 0, 1, 2, and 3 do I/O: * each of ranks 0 to 3 write 4 nonblocking requests */ nreqs = 4; if (rank >= 4) nreqs = 0; /* bufsize must be max of data type converted before and after */ MPI_Offset bufsize = 0; /* calculate length of each varn request, number of segments in each * varn request, and allocate write buffer */ for (i=0; i<nreqs; i++) { req_lens[i] = 0; /* total length this request */ for (j=0; j<num_segs[i]; j++) { MPI_Offset req_len=1; for (k=0; k<NDIMS; k++) req_len *= counts[i][j][k]; req_lens[i] += req_len; } if (verbose) printf("req_lens[%d]=%d\n",i,req_lens[i]); /* allocate I/O buffer and initialize its contents */ buffer[i] = (unsigned int*) malloc(req_lens[i] * sizeof(unsigned int)); for (j=0; j<req_lens[i]; j++) buffer[i][j] = rank; bufsize += req_lens[i]; } bufsize *= sizeof(unsigned int); if (verbose) printf("%d: Attach buffer size %lld\n", rank, bufsize); /* give PnetCDF a space to buffer the nonblocking requests */ if (bufsize > 0) { err = ncmpi_buffer_attach(ncid, bufsize); ERR }
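The listing stops right after attaching the buffer. In PnetCDF's buffered nonblocking API the attach is normally followed by bput calls, a wait, and a detach; a sketch of that continuation using the varn request layout set up above (ncmpi_bput_varn_uint is assumed; whether the original program continues exactly this way is not shown here):

/* Sketch of the usual continuation after ncmpi_buffer_attach(): post buffered
 * varn writes, flush them, detach the internal buffer, and close the file.
 * (The listing above is truncated before this point.) */
for (i=0; i<nreqs; i++) {
    err = ncmpi_bput_varn_uint(ncid, varid[i], num_segs[i], starts[i],
                               counts[i], buffer[i], &reqs[i]); ERR
}
err = ncmpi_wait_all(ncid, nreqs, reqs, sts); ERR
if (bufsize > 0) { err = ncmpi_buffer_detach(ncid); ERR }
err = ncmpi_close(ncid); ERR
for (i=0; i<nreqs; i++) free(buffer[i]);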
int MTestGetWin(MPI_Win * win, int mustBePassive) { static char actbuf[1024]; static char *pasbuf; char *buf; int n, rank, merr; MPI_Info info; if (mem_keyval == MPI_KEYVAL_INVALID) { /* Create the keyval */ merr = MPI_Win_create_keyval(MPI_WIN_NULL_COPY_FN, MPI_WIN_NULL_DELETE_FN, &mem_keyval, 0); if (merr) MTestPrintError(merr); } switch (win_index) { case 0: /* Active target window */ merr = MPI_Win_create(actbuf, 1024, 1, MPI_INFO_NULL, MPI_COMM_WORLD, win); if (merr) MTestPrintError(merr); winName = "active-window"; merr = MPI_Win_set_attr(*win, mem_keyval, (void *) 0); if (merr) MTestPrintError(merr); break; case 1: /* Passive target window */ merr = MPI_Alloc_mem(1024, MPI_INFO_NULL, &pasbuf); if (merr) MTestPrintError(merr); merr = MPI_Win_create(pasbuf, 1024, 1, MPI_INFO_NULL, MPI_COMM_WORLD, win); if (merr) MTestPrintError(merr); winName = "passive-window"; merr = MPI_Win_set_attr(*win, mem_keyval, (void *) 2); if (merr) MTestPrintError(merr); break; case 2: /* Active target; all windows different sizes */ merr = MPI_Comm_rank(MPI_COMM_WORLD, &rank); if (merr) MTestPrintError(merr); n = rank * 64; if (n) buf = (char *) malloc(n); else buf = 0; merr = MPI_Win_create(buf, n, 1, MPI_INFO_NULL, MPI_COMM_WORLD, win); if (merr) MTestPrintError(merr); winName = "active-all-different-win"; merr = MPI_Win_set_attr(*win, mem_keyval, (void *) 1); if (merr) MTestPrintError(merr); break; case 3: /* Active target, no locks set */ merr = MPI_Comm_rank(MPI_COMM_WORLD, &rank); if (merr) MTestPrintError(merr); n = rank * 64; if (n) buf = (char *) malloc(n); else buf = 0; merr = MPI_Info_create(&info); if (merr) MTestPrintError(merr); merr = MPI_Info_set(info, (char *) "nolocks", (char *) "true"); if (merr) MTestPrintError(merr); merr = MPI_Win_create(buf, n, 1, info, MPI_COMM_WORLD, win); if (merr) MTestPrintError(merr); merr = MPI_Info_free(&info); if (merr) MTestPrintError(merr); winName = "active-nolocks-all-different-win"; merr = MPI_Win_set_attr(*win, mem_keyval, (void *) 1); if (merr) MTestPrintError(merr); break; default: win_index = -1; } win_index++; return win_index; }
/* * Function: h5_set_info_object * Purpose: Process environment variables setting to set up MPI Info * object. * Return: 0 if all is fine; otherwise non-zero. * Programmer: Albert Cheng, 2002/05/21. * Modifications: * Bill Wendling, 2002/05/31 * Modified so that the HDF5_MPI_INFO environment variable can * be a semicolon separated list of "key=value" pairings. Most * of the code is to remove any whitespaces which might be * surrounding the "key=value" pairs. */ int h5_set_info_object(void) { char *envp; /* environment pointer */ int ret_value=0; /* handle any MPI INFO hints via $HDF5_MPI_INFO */ if ((envp = getenv("HDF5_MPI_INFO")) != NULL){ char *next, *valp; valp = envp = next = HDstrdup(envp); if (!valp) return 0; /* create an INFO object if not created yet */ if (h5_io_info_g == MPI_INFO_NULL) MPI_Info_create(&h5_io_info_g); do { size_t len; char *key_val, *endp, *namep; if (*valp == ';') valp++; /* copy key/value pair into temporary buffer */ len = strcspn(valp, ";"); next = &valp[len]; key_val = (char *)HDcalloc(1, len + 1); /* increment the next pointer past the terminating semicolon */ if (*next == ';') ++next; namep = HDstrncpy(key_val, valp, len); /* pass up any beginning whitespaces */ while (*namep && (*namep == ' ' || *namep == '\t')) namep++; if (!*namep) continue; /* was all white space, so move to next k/v pair */ /* eat up any ending white spaces */ endp = &namep[HDstrlen(namep) - 1]; while (endp && (*endp == ' ' || *endp == '\t')) *endp-- = '\0'; /* find the '=' */ valp = HDstrchr(namep, '='); if (valp != NULL) { /* it's a valid key/value pairing */ char *tmp_val = valp + 1; /* change '=' to \0, move valp down one */ *valp-- = '\0'; /* eat up ending whitespace on the "key" part */ while (*valp == ' ' || *valp == '\t') *valp-- = '\0'; valp = tmp_val; /* eat up beginning whitespace on the "value" part */ while (*valp == ' ' || *valp == '\t') *valp++ = '\0'; /* actually set the darned thing */ if (MPI_SUCCESS != MPI_Info_set(h5_io_info_g, namep, valp)) { printf("MPI_Info_set failed\n"); ret_value = -1; } } valp = next; HDfree(key_val); } while (next && *next); HDfree(envp); } return ret_value; }
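A usage sketch for the parser above: HDF5_MPI_INFO holds a semicolon-separated list of key=value pairs, with optional whitespace around the '='. Seeding it programmatically (the hint names and values here are illustrative only; setenv requires <stdlib.h>):

/* Sketch: exercise h5_set_info_object() with an explicit environment value. */
static int demo_hdf5_mpi_info(void)
{
    setenv("HDF5_MPI_INFO",
           "ind_rd_buffer_size = 2097152; romio_cb_write=enable", 1);
    if (h5_set_info_object() != 0)
        return -1;
    /* h5_io_info_g now holds both hints and can be passed as the MPI_Info
     * argument of H5Pset_fapl_mpio(). */
    return 0;
}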
int main(int argc, char* argv[]) { int err = 0; int ecode = 0; int ncid; int cmode, format; int nprocs, rank; MPI_Comm comm=MPI_COMM_SELF; MPI_Info info=MPI_INFO_NULL; printf("\n*** Testing nc_inq_format_extended for pnetcdf..."); MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); if (nprocs > 1 && rank == 0) printf("This test program is intended to run on ONE process\n"); if (rank > 0) goto fn_exit; /* first, use PnetCDF to create a file with default header/variable alignment */ #ifdef DISABLE_PNETCDF_ALIGNMENT MPI_Info_create(&info); MPI_Info_set(info, "nc_header_align_size", "1"); MPI_Info_set(info, "nc_var_align_size", "1"); #endif /* test CDF-1 file format */ cmode = NC_PNETCDF | NC_CLOBBER; if (nc_create_par(FILENAME, cmode, comm, info, &ncid)) ERR_RET; if (nc_enddef(ncid)) ERR; if(nc_inq_format_extended(ncid,&format,&cmode)) ERR; if((cmode & NC_PNETCDF) != NC_PNETCDF) { printf("***FAIL at line %d: mode was %08x ; expected %08x\n",__LINE__,cmode,NC_PNETCDF); ecode = 1; ERR; } if(format != NC_FORMATX_PNETCDF) { printf("***FAIL at line %d: format was %d ; expected %d\n",__LINE__,format,NC_FORMATX_PNETCDF); ecode = 1; ERR; } /* test CDF-2 file format */ cmode = NC_PNETCDF | NC_CLOBBER | NC_64BIT_OFFSET; if (nc_create_par(FILENAME, cmode, comm, info, &ncid)) ERR_RET; if (nc_enddef(ncid)) ERR; if(nc_inq_format_extended(ncid,&format,&cmode)) ERR; if((cmode & NC_64BIT_OFFSET) != NC_64BIT_OFFSET) { printf("***FAIL at line %d: mode was %08x ; expected %08x\n",__LINE__,cmode,NC_64BIT_OFFSET); ecode = 1; ERR; } if(format != NC_FORMATX_PNETCDF) { printf("***FAIL at line %d: format was %d ; expected %d\n",__LINE__,format,NC_FORMATX_PNETCDF); ecode = 1; ERR; } /* test CDF-5 file format */ cmode = NC_PNETCDF | NC_CLOBBER | NC_64BIT_DATA; if (nc_create_par(FILENAME, cmode, comm, info, &ncid)) ERR_RET; if (nc_enddef(ncid)) ERR; if(nc_inq_format_extended(ncid,&format,&cmode)) ERR; if((cmode & NC_64BIT_DATA) != NC_64BIT_DATA) { printf("***FAIL at line %d: mode was %08x ; expected %08x\n",__LINE__,cmode,NC_64BIT_DATA); ecode = 1; ERR; } if(format != NC_FORMATX_PNETCDF) { printf("***FAIL at line %d: format was %d ; expected %d\n",__LINE__,format,NC_FORMATX_PNETCDF); ecode = 1; ERR; } if (nc_abort(ncid)) ERR; fn_exit: MPI_Finalize(); SUMMARIZE_ERR; FINAL_RESULTS; return ecode; }
int main(int argc, char **argv) { MPI_Info info_in, info_out; int errors = 0, all_errors = 0; MPI_Win win; void *base; char invalid_key[] = "invalid_test_key"; char buf[MPI_MAX_INFO_VAL]; int flag; MPI_Comm shm_comm = MPI_COMM_NULL; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nproc); /* Test#1: setting a valid key at window-create time */ MPI_Info_create(&info_in); MPI_Info_set(info_in, "no_locks", "true"); MPI_Win_allocate(sizeof(int), sizeof(int), info_in, MPI_COMM_WORLD, &base, &win); errors += check_win_info_get(win, "no_locks", "true"); MPI_Info_free(&info_in); /* We create a new window with no info argument for the next text to ensure that we have the * default settings */ MPI_Win_free(&win); MPI_Win_allocate(sizeof(int), sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &base, &win); /* Test#2: setting and getting invalid key */ win_info_set(win, invalid_key, "true"); MPI_Win_get_info(win, &info_out); MPI_Info_get(info_out, invalid_key, MPI_MAX_INFO_VAL, buf, &flag); #ifndef USE_STRICT_MPI /* Check if our invalid key was ignored. Note, this check's MPICH's * behavior, but this behavior may not be required for a standard * conforming MPI implementation. */ if (flag) { printf("%d: %s was not ignored\n", rank, invalid_key); errors++; } #endif MPI_Info_free(&info_out); /* Test#3: setting info key "no_lock" (no default value) */ win_info_set(win, "no_locks", "false"); errors += check_win_info_get(win, "no_locks", "false"); win_info_set(win, "no_locks", "true"); errors += check_win_info_get(win, "no_locks", "true"); /* Test#4: getting/setting "accumulate_ordering" */ /* #4.1: is the default "rar,raw,war,waw" as stated in the standard? */ errors += check_win_info_get(win, "accumulate_ordering", "rar,raw,war,waw"); /* #4.2: setting "accumulate_ordering" to "none" */ win_info_set(win, "accumulate_ordering", "none"); errors += check_win_info_get(win, "accumulate_ordering", "none"); /* #4.3: setting "accumulate_ordering" to "rar,waw" */ win_info_set(win, "accumulate_ordering", "rar,waw"); errors += check_win_info_get(win, "accumulate_ordering", "rar,waw"); /* Test#5: getting/setting "accumulate_ops" */ /* #5.1: is the default "same_op_no_op" as stated in the standard? */ errors += check_win_info_get(win, "accumulate_ops", "same_op_no_op"); /* #5.2: setting "accumulate_ops" to "same_op" */ win_info_set(win, "accumulate_ops", "same_op"); errors += check_win_info_get(win, "accumulate_ops", "same_op"); /* Test#6: setting "same_size" (no default value) */ win_info_set(win, "same_size", "false"); errors += check_win_info_get(win, "same_size", "false"); win_info_set(win, "same_size", "true"); errors += check_win_info_get(win, "same_size", "true"); /* Test#7: setting "same_disp_unit" (no default value) */ win_info_set(win, "same_disp_unit", "false"); errors += check_win_info_get(win, "same_disp_unit", "false"); win_info_set(win, "same_disp_unit", "true"); errors += check_win_info_get(win, "same_disp_unit", "true"); /* TODO: check alloc_shm as implementation-specific test */ /* Test#8: setting "alloc_shared_noncontig" (no default value) in shared window. 
*/ MPI_Win_free(&win); /* #8.1: setting at window allocation */ MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &shm_comm); MPI_Info_create(&info_in); MPI_Info_set(info_in, "alloc_shared_noncontig", "true"); MPI_Win_allocate_shared(sizeof(int), sizeof(int), info_in, shm_comm, &base, &win); errors += check_win_info_get(win, "alloc_shared_noncontig", "true"); MPI_Info_free(&info_in); /* #8.2: setting info */ win_info_set(win, "alloc_shared_noncontig", "false"); errors += check_win_info_get(win, "alloc_shared_noncontig", "false"); MPI_Comm_free(&shm_comm); MPI_Win_free(&win); MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); if (rank == 0 && all_errors == 0) printf(" No Errors\n"); MPI_Finalize(); return 0; }
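The helpers win_info_set() and check_win_info_get() are called throughout this test but are not shown in the listing; a minimal sketch of what win_info_set() presumably wraps, using the MPI-3 MPI_Win_set_info call:

/* Sketch of the helper used above (the real implementation is not part of
 * this listing): set a single key/value on a window via a one-key info. */
static void win_info_set(MPI_Win win, const char *key, const char *value)
{
    MPI_Info info;
    MPI_Info_create(&info);
    MPI_Info_set(info, key, value);
    MPI_Win_set_info(win, info);
    MPI_Info_free(&info);
}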
int main(int argc, char** argv) { extern int optind; char *filename="testfile.nc"; int i, rank, nprocs, verbose=1, err; int ncid, cmode, varid, dimid[2], num_reqs, *buffer, **bufs, *nvarids; MPI_Offset w_len, **starts, **counts, *bufcounts; MPI_Datatype *datatypes; MPI_Info info; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* get command-line arguments */ while ((i = getopt(argc, argv, "hq")) != EOF) switch(i) { case 'q': verbose = 0; break; case 'h': default: if (rank==0) usage(argv[0]); MPI_Finalize(); return 0; } argc -= optind; argv += optind; if (argc == 1) filename = argv[0]; /* optional argument */ if (nprocs != 4 && rank == 0 && verbose) printf("Warning: this program is intended to run on 4 processes\n"); /* set an MPI-IO hint to disable file offset alignment for fix-sized * variables */ MPI_Info_create(&info); MPI_Info_set(info, "nc_var_align_size", "1"); /* create a new file for writing ----------------------------------------*/ cmode = NC_CLOBBER | NC_64BIT_DATA; err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); ERR MPI_Info_free(&info); /* create a global array of size NY * NX */ err = ncmpi_def_dim(ncid, "Y", NY, &dimid[0]); ERR err = ncmpi_def_dim(ncid, "X", NX, &dimid[1]); ERR err = ncmpi_def_var(ncid, "var", NC_INT, NDIMS, dimid, &varid); ERR err = ncmpi_enddef(ncid); ERR /* pick arbitrary numbers of requests for 4 processes */ num_reqs = 0; if (rank == 0) num_reqs = 4; else if (rank == 1) num_reqs = 6; else if (rank == 2) num_reqs = 5; else if (rank == 3) num_reqs = 4; starts = (MPI_Offset**) malloc(num_reqs * sizeof(MPI_Offset*)); counts = (MPI_Offset**) malloc(num_reqs * sizeof(MPI_Offset*)); starts[0] = (MPI_Offset*) calloc(num_reqs * NDIMS, sizeof(MPI_Offset)); counts[0] = (MPI_Offset*) calloc(num_reqs * NDIMS, sizeof(MPI_Offset)); for (i=1; i<num_reqs; i++) { starts[i] = starts[i-1] + NDIMS; counts[i] = counts[i-1] + NDIMS; } /* assign arbitrary starts and counts */ const int y=0, x=1; if (rank == 0) { starts[0][y] = 0; starts[0][x] = 5; counts[0][y] = 1; counts[0][x] = 2; starts[1][y] = 1; starts[1][x] = 0; counts[1][y] = 1; counts[1][x] = 1; starts[2][y] = 2; starts[2][x] = 6; counts[2][y] = 1; counts[2][x] = 2; starts[3][y] = 3; starts[3][x] = 0; counts[3][y] = 1; counts[3][x] = 3; /* rank 0 is writing the followings: ("-" means skip) - - - - - 0 0 - - - 0 - - - - - - - - - - - - - - - 0 0 - - 0 0 0 - - - - - - - */ } else if (rank ==1) { starts[0][y] = 0; starts[0][x] = 3; counts[0][y] = 1; counts[0][x] = 2; starts[1][y] = 0; starts[1][x] = 8; counts[1][y] = 1; counts[1][x] = 2; starts[2][y] = 1; starts[2][x] = 5; counts[2][y] = 1; counts[2][x] = 2; starts[3][y] = 2; starts[3][x] = 0; counts[3][y] = 1; counts[3][x] = 2; starts[4][y] = 2; starts[4][x] = 8; counts[4][y] = 1; counts[4][x] = 2; starts[5][y] = 3; starts[5][x] = 4; counts[5][y] = 1; counts[5][x] = 3; /* rank 1 is writing the followings: ("-" means skip) - - - 1 1 - - - 1 1 - - - - - 1 1 - - - 1 1 - - - - - - 1 1 - - - - 1 1 1 - - - */ } else if (rank ==2) { starts[0][y] = 0; starts[0][x] = 7; counts[0][y] = 1; counts[0][x] = 1; starts[1][y] = 1; starts[1][x] = 1; counts[1][y] = 1; counts[1][x] = 3; starts[2][y] = 1; starts[2][x] = 7; counts[2][y] = 1; counts[2][x] = 3; starts[3][y] = 2; starts[3][x] = 2; counts[3][y] = 1; counts[3][x] = 1; starts[4][y] = 3; starts[4][x] = 3; counts[4][y] = 1; counts[4][x] = 1; /* rank 2 is writing the followings: ("-" means skip) - - - - - - - 2 - - - 2 2 2 - - - 2 2 2 - - 2 - - - - - - - - - - 2 
- - - - - - */ } else if (rank ==3) { starts[0][y] = 0; starts[0][x] = 0; counts[0][y] = 1; counts[0][x] = 3; starts[1][y] = 1; starts[1][x] = 4; counts[1][y] = 1; counts[1][x] = 1; starts[2][y] = 2; starts[2][x] = 3; counts[2][y] = 1; counts[2][x] = 3; starts[3][y] = 3; starts[3][x] = 7; counts[3][y] = 1; counts[3][x] = 3; /* rank 3 is writing the followings: ("-" means skip) 3 3 3 - - - - - - - - - - - 3 - - - - - - - - 3 3 3 - - - - - - - - - - - 3 3 3 */ } nvarids = (int*) malloc(num_reqs * sizeof(int)); bufcounts = (MPI_Offset*) malloc(num_reqs * sizeof(MPI_Offset)); datatypes = (MPI_Datatype*) malloc(num_reqs * sizeof(MPI_Datatype)); w_len = 0; for (i=0; i<num_reqs; i++) { nvarids[i] = varid; bufcounts[i] = counts[i][x]; datatypes[i] = MPI_INT; w_len += bufcounts[i]; } /* allocate I/O buffer and initialize its contents */ buffer = (int*) malloc(w_len * sizeof(int)); for (i=0; i<w_len; i++) buffer[i] = rank; /* set the buffer pointers to different offsets to the I/O buffer */ bufs = (int**) malloc(num_reqs * sizeof(int*)); bufs[0] = buffer; for (i=1; i<num_reqs; i++) bufs[i] = bufs[i-1] + bufcounts[i-1]; err = ncmpi_mput_vara_all(ncid, num_reqs, nvarids, starts, counts, (void**)bufs, bufcounts, datatypes); ERR err = ncmpi_close(ncid); ERR free(buffer); free(bufs); free(nvarids); free(bufcounts); free(datatypes); free(starts[0]); free(counts[0]); free(starts); free(counts); /* check if there is any PnetCDF internal malloc residue */ MPI_Offset malloc_size, sum_size; err = ncmpi_inq_malloc_size(&malloc_size); if (err == NC_NOERR) { MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); if (rank == 0 && sum_size > 0) printf("heap memory allocated by PnetCDF internally has %lld bytes yet to be freed\n", sum_size); } MPI_Finalize(); return 0; }
int main(int argc, char ** argv) { int Block_order; size_t Block_size; size_t Colblock_size; int Tile_order=32; int tiling; int Num_procs; /* Number of ranks */ int order; /* overall matrix order */ int send_to, recv_from; /* communicating ranks */ size_t bytes; /* total amount of data to be moved */ int my_ID; /* rank */ int root=0; /* root rank of a communicator */ int iterations; /* number of times to run the pipeline algorithm */ int i, j, it, jt, ID;/* dummies */ int iter; /* index of iteration */ int phase; /* phase in the staged communication */ size_t colstart; /* sequence number of first column owned by calling rank */ int error=0; /* error flag */ double *A_p; /* original matrix column block */ double *B_p; /* transposed matrix column block */ double *Work_in_p; /* workspace for the transpose function */ double *Work_out_p;/* workspace for the transpose function */ double abserr, abserr_tot; /* computed error */ double epsilon = 1.e-8; /* error tolerance */ double local_trans_time, /* timing parameters */ trans_time, avgtime; MPI_Status status; /* completion status of message */ MPI_Win shm_win_A; /* Shared Memory window object */ MPI_Win shm_win_B; /* Shared Memory window object */ MPI_Win shm_win_Work_in; /* Shared Memory window object */ MPI_Win shm_win_Work_out; /* Shared Memory window object */ MPI_Info rma_winfo;/* info for window */ MPI_Comm shm_comm_prep;/* Shared Memory prep Communicator */ MPI_Comm shm_comm; /* Shared Memory Communicator */ int shm_procs; /* # of ranks in shared domain */ int shm_ID; /* MPI rank within coherence domain */ int group_size; /* number of ranks per shared memory group */ int Num_groups; /* number of shared memory group */ int group_ID; /* sequence number of shared memory group */ int size_mul; /* size multiplier; 0 for non-root ranks in coherence domain*/ int istart; MPI_Request send_req, recv_req; /********************************************************************************* ** Initialize the MPI environment **********************************************************************************/ MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_ID); MPI_Comm_size(MPI_COMM_WORLD, &Num_procs); root = 0; /********************************************************************* ** process, test and broadcast input parameter *********************************************************************/ if (my_ID == root){ if (argc != 4 && argc !=5){ printf("Usage: %s <#ranks per coherence domain> <# iterations> <matrix order> [tile size]\n", *argv); error = 1; goto ENDOFTESTS; } group_size = atoi(*++argv); if (group_size < 1) { printf("ERROR: # ranks per coherence domain must be >= 1 : %d \n",group_size); error = 1; goto ENDOFTESTS; } if (Num_procs%group_size) { printf("ERROR: toal # %d ranks not divisible by ranks per coherence domain %d\n", Num_procs, group_size); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); if (iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } order = atoi(*++argv); if (order < Num_procs) { printf("ERROR: matrix order %d should at least # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } if (order%Num_procs) { printf("ERROR: matrix order %d should be divisible by # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } if (argc == 5) Tile_order = atoi(*++argv); ENDOFTESTS:; } bail_out(error); /* Broadcast input data to all ranks */ MPI_Bcast(&order, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD); 
MPI_Bcast(&Tile_order, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&group_size, 1, MPI_INT, root, MPI_COMM_WORLD); if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("MPI+SHM Matrix transpose: B = A^T\n"); printf("Number of ranks = %d\n", Num_procs); printf("Rank group size = %d\n", group_size); printf("Matrix order = %d\n", order); printf("Number of iterations = %d\n", iterations); if ((Tile_order > 0) && (Tile_order < order)) printf("Tile size = %d\n", Tile_order); else printf("Untiled\n"); #ifndef SYNCHRONOUS printf("Non-"); #endif printf("Blocking messages\n"); } /* Setup for Shared memory regions */ /* first divide WORLD in groups of size group_size */ MPI_Comm_split(MPI_COMM_WORLD, my_ID/group_size, my_ID%group_size, &shm_comm_prep); /* derive from that a SHM communicator */ MPI_Comm_split_type(shm_comm_prep, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm); MPI_Comm_rank(shm_comm, &shm_ID); MPI_Comm_size(shm_comm, &shm_procs); /* do sanity check, making sure groups did not shrink in second comm split */ if (shm_procs != group_size) MPI_Abort(MPI_COMM_WORLD, 666); /* a non-positive tile size means no tiling of the local transpose */ tiling = (Tile_order > 0) && (Tile_order < order); bytes = 2 * sizeof(double) * order * order; /********************************************************************* ** The matrix is broken up into column blocks that are mapped one to a ** rank. Each column block is made up of Num_procs smaller square ** blocks of order block_order. *********************************************************************/ Num_groups = Num_procs/group_size; Block_order = order/Num_groups; group_ID = my_ID/group_size; colstart = Block_order * group_ID; Colblock_size = order * Block_order; Block_size = Block_order * Block_order; /********************************************************************* ** Create the column block of the test matrix, the column block of the ** transposed matrix, and workspace (workspace only if #procs>1) *********************************************************************/ /* RMA win info */ MPI_Info_create(&rma_winfo); /* This key indicates that passive target RMA will not be used. * It is the one info key that MPICH actually uses for optimization. 
*/ MPI_Info_set(rma_winfo, "no_locks", "true"); /* only the root of each SHM domain specifies window of nonzero size */ size_mul = (shm_ID==0); int offset = 32; MPI_Aint size= (Colblock_size+offset)*sizeof(double)*size_mul; int disp_unit; MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, (void *) &A_p, &shm_win_A); MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_A); MPI_Win_shared_query(shm_win_A, MPI_PROC_NULL, &size, &disp_unit, (void *)&A_p); if (A_p == NULL){ printf(" Error allocating space for original matrix on node %d\n",my_ID); error = 1; } bail_out(error); A_p += offset; /* recompute memory size (overwritten by prior query */ size= (Colblock_size+offset)*sizeof(double)*size_mul; MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, (void *) &B_p, &shm_win_B); MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_B); MPI_Win_shared_query(shm_win_B, MPI_PROC_NULL, &size, &disp_unit, (void *)&B_p); if (B_p == NULL){ printf(" Error allocating space for transposed matrix by group %d\n",group_ID); error = 1; } bail_out(error); B_p += offset; if (Num_groups>1) { size = Block_size*sizeof(double)*size_mul; MPI_Win_allocate_shared(size, sizeof(double),rma_winfo, shm_comm, (void *) &Work_in_p, &shm_win_Work_in); MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_Work_in); MPI_Win_shared_query(shm_win_Work_in, MPI_PROC_NULL, &size, &disp_unit, (void *)&Work_in_p); if (Work_in_p == NULL){ printf(" Error allocating space for in block by group %d\n",group_ID); error = 1; } bail_out(error); /* recompute memory size (overwritten by prior query */ size = Block_size*sizeof(double)*size_mul; MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, (void *) &Work_out_p, &shm_win_Work_out); MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_Work_out); MPI_Win_shared_query(shm_win_Work_out, MPI_PROC_NULL, &size, &disp_unit, (void *)&Work_out_p); if (Work_out_p == NULL){ printf(" Error allocating space for out block by group %d\n",group_ID); error = 1; } bail_out(error); } /* Fill the original column matrix */ istart = 0; int chunk_size = Block_order/group_size; if (tiling) { for (j=shm_ID*chunk_size;j<(shm_ID+1)*chunk_size;j+=Tile_order) { for (i=0;i<order; i+=Tile_order) for (jt=j; jt<MIN((shm_ID+1)*chunk_size,j+Tile_order); jt++) for (it=i; it<MIN(order,i+Tile_order); it++) { A(it,jt) = (double) ((double)order*(jt+colstart) + it); B(it,jt) = -1.0; } } } else { for (j=shm_ID*chunk_size;j<(shm_ID+1)*chunk_size;j++) for (i=0;i<order; i++) { A(i,j) = (double)((double)order*(j+colstart) + i); B(i,j) = -1.0; } } /* NEED A STORE FENCE HERE */ MPI_Win_sync(shm_win_A); MPI_Win_sync(shm_win_B); MPI_Barrier(shm_comm); for (iter=0; iter<=iterations; iter++) { /* start timer after a warmup iteration */ if (iter == 1) { MPI_Barrier(MPI_COMM_WORLD); local_trans_time = wtime(); } /* do the local transpose */ istart = colstart; if (!tiling) { for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i++) { for (j=0; j<Block_order; j++) B(j,i) = A(i,j); } } else { for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i+=Tile_order) { for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { B(jt,it) = A(it,jt); } } } for (phase=1; phase<Num_groups; phase++){ recv_from = ((group_ID + phase )%Num_groups); send_to = ((group_ID - phase + Num_groups)%Num_groups); istart = send_to*Block_order; if (!tiling) { for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i++) for (j=0; j<Block_order; j++){ Work_out(j,i) = A(i,j); } } else { for 
(i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i+=Tile_order) for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { Work_out(jt,it) = A(it,jt); } } /* NEED A LOAD/STORE FENCE HERE */ MPI_Win_sync(shm_win_Work_in); MPI_Win_sync(shm_win_Work_out); MPI_Barrier(shm_comm); if (shm_ID==0) { #ifndef SYNCHRONOUS /* if we place the Irecv outside this block, it would not be protected by a local barrier, which creates a race */ MPI_Irecv(Work_in_p, Block_size, MPI_DOUBLE, recv_from*group_size, phase, MPI_COMM_WORLD, &recv_req); MPI_Isend(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size, phase, MPI_COMM_WORLD, &send_req); MPI_Wait(&recv_req, &status); MPI_Wait(&send_req, &status); #else MPI_Sendrecv(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size, phase, Work_in_p, Block_size, MPI_DOUBLE, recv_from*group_size, phase, MPI_COMM_WORLD, &status); #endif } /* NEED A LOAD FENCE HERE */ MPI_Win_sync(shm_win_Work_in); MPI_Win_sync(shm_win_Work_out); MPI_Barrier(shm_comm); istart = recv_from*Block_order; /* scatter received block to transposed matrix; no need to tile */ for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++) for (i=0; i<Block_order; i++) B(i,j) = Work_in(i,j); } /* end of phase loop */ } /* end of iterations */ local_trans_time = wtime() - local_trans_time; MPI_Reduce(&local_trans_time, &trans_time, 1, MPI_DOUBLE, MPI_MAX, root, MPI_COMM_WORLD); abserr = 0.0; istart = 0; /* for (j=shm_ID;j<Block_order;j+=group_size) for (i=0;i<order; i++) { */ for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++) for (i=0;i<order; i++) { abserr += ABS(B(i,j) - (double)((double)order*i + j+colstart)); } MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD); if (my_ID == root) { if (abserr_tot < epsilon) { printf("Solution validates\n"); avgtime = trans_time/(double)iterations; printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime); #ifdef VERBOSE printf("Summed errors: %f \n", abserr_tot); #endif } else { printf("ERROR: Aggregate squared error %e exceeds threshold %e\n", abserr_tot, epsilon); error = 1; } } bail_out(error); MPI_Win_unlock_all(shm_win_A); MPI_Win_unlock_all(shm_win_B); MPI_Win_free(&shm_win_A); MPI_Win_free(&shm_win_B); if (Num_groups>1) { MPI_Win_unlock_all(shm_win_Work_in); MPI_Win_unlock_all(shm_win_Work_out); MPI_Win_free(&shm_win_Work_in); MPI_Win_free(&shm_win_Work_out); } MPI_Info_free(&rma_winfo); MPI_Finalize(); exit(EXIT_SUCCESS); } /* end of main */
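/* The transpose kernel's shared-memory window setup is easy to lose among the
 * benchmarking logic. Below is a minimal sketch of the same MPI-3 pattern
 * under simplified assumptions (a single shared-memory node, an illustrative
 * 1024-element buffer): only rank 0 of the node communicator contributes
 * memory, the other ranks map it via MPI_Win_shared_query, and fences are
 * used here in place of the lock_all/Win_sync scheme above. */
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Comm shm_comm;
    MPI_Win win;
    MPI_Info winfo;
    double *base;
    MPI_Aint size;
    int disp_unit, shm_rank;

    MPI_Init(&argc, &argv);
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &shm_comm);
    MPI_Comm_rank(shm_comm, &shm_rank);

    MPI_Info_create(&winfo);
    MPI_Info_set(winfo, "no_locks", "true");  /* no passive-target RMA here */

    /* only rank 0 asks for a nonzero allocation */
    size = (shm_rank == 0) ? 1024 * sizeof(double) : 0;
    MPI_Win_allocate_shared(size, sizeof(double), winfo, shm_comm, &base, &win);

    /* every rank retrieves the base address of rank 0's segment */
    MPI_Win_shared_query(win, 0, &size, &disp_unit, &base);

    MPI_Win_fence(0, win);                 /* open an access epoch */
    if (shm_rank == 0) base[0] = 42.0;     /* direct store into shared memory */
    MPI_Win_fence(0, win);                 /* make the store visible */
    printf("rank %d sees %.1f\n", shm_rank, base[0]);

    MPI_Info_free(&winfo);
    MPI_Win_free(&win);
    MPI_Comm_free(&shm_comm);
    MPI_Finalize();
    return 0;
}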
void ADIOI_GEN_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) { /* if fd->info is null, create a new info object. Initialize fd->info to default values. Initialize fd->hints to default values. Examine the info object passed by the user. If it contains values that ROMIO understands, override the default. */ MPI_Info info; char *value; int flag, intval, tmp_val, nprocs=0, nprocs_is_valid = 0, len; static char myname[] = "ADIOI_GEN_SETINFO"; if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info)); info = fd->info; /* Note that fd->hints is allocated at file open time; thus it is * not necessary to allocate it, or check for allocation, here. */ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); if (value == NULL) { /* NEED TO HANDLE ENOMEM */ } /* initialize info and hints to default values if they haven't been * previously initialized */ if (!fd->hints->initialized) { /* buffer size for collective I/O */ MPI_Info_set(info, "cb_buffer_size", ADIOI_CB_BUFFER_SIZE_DFLT); fd->hints->cb_buffer_size = atoi(ADIOI_CB_BUFFER_SIZE_DFLT); /* default is to let romio automatically decide when to use * collective buffering */ MPI_Info_set(info, "romio_cb_read", "automatic"); fd->hints->cb_read = ADIOI_HINT_AUTO; MPI_Info_set(info, "romio_cb_write", "automatic"); fd->hints->cb_write = ADIOI_HINT_AUTO; fd->hints->cb_config_list = NULL; /* number of processes that perform I/O in collective I/O */ MPI_Comm_size(fd->comm, &nprocs); nprocs_is_valid = 1; ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", nprocs); MPI_Info_set(info, "cb_nodes", value); fd->hints->cb_nodes = nprocs; /* hint indicating that no indep. I/O will be performed on this file */ MPI_Info_set(info, "romio_no_indep_rw", "false"); fd->hints->no_indep_rw = 0; /* deferred_open derrived from no_indep_rw and cb_{read,write} */ fd->hints->deferred_open = 0; /* buffer size for data sieving in independent reads */ MPI_Info_set(info, "ind_rd_buffer_size", ADIOI_IND_RD_BUFFER_SIZE_DFLT); fd->hints->ind_rd_buffer_size = atoi(ADIOI_IND_RD_BUFFER_SIZE_DFLT); /* buffer size for data sieving in independent writes */ MPI_Info_set(info, "ind_wr_buffer_size", ADIOI_IND_WR_BUFFER_SIZE_DFLT); fd->hints->ind_wr_buffer_size = atoi(ADIOI_IND_WR_BUFFER_SIZE_DFLT); /* default is to let romio automatically decide when to use data * sieving */ MPI_Info_set(info, "romio_ds_read", "automatic"); fd->hints->ds_read = ADIOI_HINT_AUTO; MPI_Info_set(info, "romio_ds_write", "automatic"); fd->hints->ds_write = ADIOI_HINT_AUTO; fd->hints->initialized = 1; } /* add in user's info if supplied */ if (users_info != MPI_INFO_NULL) { MPI_Info_get(users_info, "cb_buffer_size", MPI_MAX_INFO_VAL, value, &flag); if (flag && ((intval=atoi(value)) > 0)) { tmp_val = intval; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != intval) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "cb_buffer_size", error_code); return; } /* --END ERROR HANDLING-- */ MPI_Info_set(info, "cb_buffer_size", value); fd->hints->cb_buffer_size = intval; } /* new hints for enabling/disabling coll. 
buffering on * reads/writes */ MPI_Info_get(users_info, "romio_cb_read", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { MPI_Info_set(info, "romio_cb_read", value); fd->hints->cb_read = ADIOI_HINT_ENABLE; } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { /* romio_cb_read overrides no_indep_rw */ MPI_Info_set(info, "romio_cb_read", value); MPI_Info_set(info, "romio_no_indep_rw", "false"); fd->hints->cb_read = ADIOI_HINT_DISABLE; fd->hints->no_indep_rw = ADIOI_HINT_DISABLE; } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) { MPI_Info_set(info, "romio_cb_read", value); fd->hints->cb_read = ADIOI_HINT_AUTO; } tmp_val = fd->hints->cb_read; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != fd->hints->cb_read) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_read", error_code); return; } /* --END ERROR HANDLING-- */ } MPI_Info_get(users_info, "romio_cb_write", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { MPI_Info_set(info, "romio_cb_write", value); fd->hints->cb_write = ADIOI_HINT_ENABLE; } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { /* romio_cb_write overrides no_indep_rw, too */ MPI_Info_set(info, "romio_cb_write", value); MPI_Info_set(info, "romio_no_indep_rw", "false"); fd->hints->cb_write = ADIOI_HINT_DISABLE; fd->hints->no_indep_rw = ADIOI_HINT_DISABLE; } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) { MPI_Info_set(info, "romio_cb_write", value); fd->hints->cb_write = ADIOI_HINT_AUTO; } tmp_val = fd->hints->cb_write; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != fd->hints->cb_write) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_cb_write", error_code); return; } /* --END ERROR HANDLING-- */ } /* new hint for specifying no indep. 
read/write will be performed */ MPI_Info_get(users_info, "romio_no_indep_rw", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (!strcmp(value, "true") || !strcmp(value, "TRUE")) { /* if 'no_indep_rw' set, also hint that we will do * collective buffering: if we aren't doing independent io, * then we have to do collective */ MPI_Info_set(info, "romio_no_indep_rw", value); MPI_Info_set(info, "romio_cb_write", "enable"); MPI_Info_set(info, "romio_cb_read", "enable"); fd->hints->no_indep_rw = 1; fd->hints->cb_read = 1; fd->hints->cb_write = 1; tmp_val = 1; } else if (!strcmp(value, "false") || !strcmp(value, "FALSE")) { MPI_Info_set(info, "romio_no_indep_rw", value); fd->hints->no_indep_rw = 0; tmp_val = 0; } else { /* default is above */ tmp_val = 0; } MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != fd->hints->no_indep_rw) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "romio_no_indep_rw", error_code); return; } /* --END ERROR HANDLING-- */ } /* new hints for enabling/disabling data sieving on * reads/writes */ MPI_Info_get(users_info, "romio_ds_read", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { MPI_Info_set(info, "romio_ds_read", value); fd->hints->ds_read = ADIOI_HINT_ENABLE; } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { MPI_Info_set(info, "romio_ds_read", value); fd->hints->ds_read = ADIOI_HINT_DISABLE; } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) { MPI_Info_set(info, "romio_ds_read", value); fd->hints->ds_read = ADIOI_HINT_AUTO; } /* otherwise ignore */ } MPI_Info_get(users_info, "romio_ds_write", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { MPI_Info_set(info, "romio_ds_write", value); fd->hints->ds_write = ADIOI_HINT_ENABLE; } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { MPI_Info_set(info, "romio_ds_write", value); fd->hints->ds_write = ADIOI_HINT_DISABLE; } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) { MPI_Info_set(info, "romio_ds_write", value); fd->hints->ds_write = ADIOI_HINT_AUTO; } /* otherwise ignore */ } MPI_Info_get(users_info, "cb_nodes", MPI_MAX_INFO_VAL, value, &flag); if (flag && ((intval=atoi(value)) > 0)) { tmp_val = intval; MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); /* --BEGIN ERROR HANDLING-- */ if (tmp_val != intval) { MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, "cb_nodes", error_code); return; } /* --END ERROR HANDLING-- */ if (!nprocs_is_valid) { /* if hints were already initialized, we might not * have already gotten this? */ MPI_Comm_size(fd->comm, &nprocs); nprocs_is_valid = 1; } if (intval <= nprocs) { MPI_Info_set(info, "cb_nodes", value); fd->hints->cb_nodes = intval; } } MPI_Info_get(users_info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &flag); if (flag && ((intval = atoi(value)) > 0)) { MPI_Info_set(info, "ind_wr_buffer_size", value); fd->hints->ind_wr_buffer_size = intval; } MPI_Info_get(users_info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, &flag); if (flag && ((intval = atoi(value)) > 0)) { MPI_Info_set(info, "ind_rd_buffer_size", value); fd->hints->ind_rd_buffer_size = intval; } MPI_Info_get(users_info, "cb_config_list", MPI_MAX_INFO_VAL, value, &flag); if (flag) { if (fd->hints->cb_config_list == NULL) { /* only set cb_config_list if it isn't already set. 
* Note that since we set it below, this ensures that * the cb_config_list hint will be set at file open time * either by the user or to the default */ MPI_Info_set(info, "cb_config_list", value); len = (strlen(value)+1) * sizeof(char); fd->hints->cb_config_list = ADIOI_Malloc(len); if (fd->hints->cb_config_list == NULL) { /* NEED TO HANDLE ENOMEM */ } ADIOI_Strncpy(fd->hints->cb_config_list, value, len); } /* if it has been set already, we ignore it the second time. * otherwise we would get an error if someone used the same * info value with a cb_config_list value in it in a couple * of calls, which would be irritating. */ } } /* handle cb_config_list default value here; avoids an extra * free/alloc and insures it is always set */ if (fd->hints->cb_config_list == NULL) { MPI_Info_set(info, "cb_config_list", ADIOI_CB_CONFIG_LIST_DFLT); len = (strlen(ADIOI_CB_CONFIG_LIST_DFLT)+1) * sizeof(char); fd->hints->cb_config_list = ADIOI_Malloc(len); if (fd->hints->cb_config_list == NULL) { /* NEED TO HANDLE ENOMEM */ } ADIOI_Strncpy(fd->hints->cb_config_list, ADIOI_CB_CONFIG_LIST_DFLT, len); } /* deferred_open won't be set by callers, but if the user doesn't * explicitly disable collecitve buffering (two-phase) and does hint that * io w/o independent io is going on, we'll set this internal hint as a * convenience */ if ( ( (fd->hints->cb_read != ADIOI_HINT_DISABLE) \ && (fd->hints->cb_write != ADIOI_HINT_DISABLE)\ && fd->hints->no_indep_rw ) ) { fd->hints->deferred_open = 1; } else { /* setting romio_no_indep_rw enable and romio_cb_{read,write} * disable at the same time doesn't make sense. honor * romio_cb_{read,write} and force the no_indep_rw hint to * 'disable' */ MPI_Info_set(info, "romio_no_indep_rw", "false"); fd->hints->no_indep_rw = 0; fd->hints->deferred_open = 0; } if ((fd->file_system == ADIO_PIOFS) || (fd->file_system == ADIO_PVFS) || (fd->file_system == ADIO_PVFS2) ) { /* no data sieving for writes in PIOFS, PVFS and PVFS2, because they do not support file locking */ MPI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &flag); if (flag) { /* get rid of this value if it is set */ MPI_Info_delete(info, "ind_wr_buffer_size"); } /* note: leave ind_wr_buffer_size alone; used for other cases * as well. -- Rob Ross, 04/22/2003 */ MPI_Info_set(info, "romio_ds_write", "disable"); fd->hints->ds_write = ADIOI_HINT_DISABLE; } ADIOI_Free(value); *error_code = MPI_SUCCESS; }
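/* ADIOI_GEN_SetInfo above is the library-side half of the hint protocol. The
 * sketch below shows the user-facing half: pass ROMIO hints at open time and
 * read back what the implementation actually kept. The file name is
 * hypothetical and the two hint values are arbitrary examples. */
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_File fh;
    MPI_Info hints, used;
    char key[MPI_MAX_INFO_KEY + 1], value[MPI_MAX_INFO_VAL + 1];
    int rank, flag, nkeys, i;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Info_create(&hints);
    MPI_Info_set(hints, "cb_buffer_size", "16777216");  /* 16 MiB collective buffer */
    MPI_Info_set(hints, "romio_cb_write", "enable");

    MPI_File_open(MPI_COMM_WORLD, "hints_demo.dat",
                  MPI_MODE_CREATE | MPI_MODE_WRONLY, hints, &fh);
    MPI_Info_free(&hints);

    /* the implementation returns the hint values it actually honours */
    MPI_File_get_info(fh, &used);
    MPI_Info_get_nkeys(used, &nkeys);
    if (rank == 0) {
        for (i = 0; i < nkeys; i++) {
            MPI_Info_get_nthkey(used, i, key);
            MPI_Info_get(used, key, MPI_MAX_INFO_VAL, value, &flag);
            if (flag) printf("%s = %s\n", key, value);
        }
    }
    MPI_Info_free(&used);

    MPI_File_close(&fh);
    MPI_Finalize();
    return 0;
}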
/*------------------------------------------------------------------------- * Function: test_fapl_mpio_dup * * Purpose: Test if fapl_mpio property list keeps a duplicate of the * communicator and INFO objects given when set; and returns * duplicates of its components when H5Pget_fapl_mpio is called. * * Return: Success: None * * Failure: Abort * * Programmer: Albert Cheng * January 9, 2003 * * Modifications: *------------------------------------------------------------------------- */ void test_fapl_mpio_dup(void) { int mpi_size, mpi_rank; MPI_Comm comm, comm_tmp; int mpi_size_old, mpi_rank_old; int mpi_size_tmp, mpi_rank_tmp; MPI_Info info = MPI_INFO_NULL; MPI_Info info_tmp = MPI_INFO_NULL; int mrc; /* MPI return value */ hid_t acc_pl; /* File access properties */ herr_t ret; /* hdf5 return value */ int nkeys, nkeys_tmp; if (VERBOSE_MED) printf("Verify fapl_mpio duplicates communicator and INFO objects\n"); /* set up MPI parameters */ MPI_Comm_size(MPI_COMM_WORLD,&mpi_size); MPI_Comm_rank(MPI_COMM_WORLD,&mpi_rank); if (VERBOSE_MED) printf("rank/size of MPI_COMM_WORLD are %d/%d\n", mpi_rank, mpi_size); /* Create a new communicator that has the same processes as MPI_COMM_WORLD. * Use MPI_Comm_split because it is simplier than MPI_Comm_create */ mrc = MPI_Comm_split(MPI_COMM_WORLD, 0, 0, &comm); VRFY((mrc==MPI_SUCCESS), "MPI_Comm_split"); MPI_Comm_size(comm,&mpi_size_old); MPI_Comm_rank(comm,&mpi_rank_old); if (VERBOSE_MED) printf("rank/size of comm are %d/%d\n", mpi_rank_old, mpi_size_old); /* create a new INFO object with some trivial information. */ mrc = MPI_Info_create(&info); VRFY((mrc==MPI_SUCCESS), "MPI_Info_create"); mrc = MPI_Info_set(info, "hdf_info_name", "XYZ"); VRFY((mrc==MPI_SUCCESS), "MPI_Info_set"); if (MPI_INFO_NULL != info){ mrc=MPI_Info_get_nkeys(info, &nkeys); VRFY((mrc==MPI_SUCCESS), "MPI_Info_get_nkeys"); } if (VERBOSE_MED) h5_dump_info_object(info); acc_pl = H5Pcreate (H5P_FILE_ACCESS); VRFY((acc_pl >= 0), "H5P_FILE_ACCESS"); ret = H5Pset_fapl_mpio(acc_pl, comm, info); VRFY((ret >= 0), ""); /* Case 1: * Free the created communicator and INFO object. * Check if the access property list is still valid and can return * valid communicator and INFO object. */ mrc = MPI_Comm_free(&comm); VRFY((mrc==MPI_SUCCESS), "MPI_Comm_free"); if (MPI_INFO_NULL!=info){ mrc = MPI_Info_free(&info); VRFY((mrc==MPI_SUCCESS), "MPI_Info_free"); } ret = H5Pget_fapl_mpio(acc_pl, &comm_tmp, &info_tmp); VRFY((ret >= 0), "H5Pget_fapl_mpio"); MPI_Comm_size(comm_tmp,&mpi_size_tmp); MPI_Comm_rank(comm_tmp,&mpi_rank_tmp); if (VERBOSE_MED) printf("After H5Pget_fapl_mpio: rank/size of comm are %d/%d\n", mpi_rank_tmp, mpi_size_tmp); VRFY((mpi_size_tmp==mpi_size), "MPI_Comm_size"); VRFY((mpi_rank_tmp==mpi_rank), "MPI_Comm_rank"); if (MPI_INFO_NULL != info_tmp){ mrc=MPI_Info_get_nkeys(info_tmp, &nkeys_tmp); VRFY((mrc==MPI_SUCCESS), "MPI_Info_get_nkeys"); VRFY((nkeys_tmp==nkeys), "new and old nkeys equal"); } if (VERBOSE_MED) h5_dump_info_object(info_tmp); /* Case 2: * Free the retrieved communicator and INFO object. * Check if the access property list is still valid and can return * valid communicator and INFO object. * Also verify the NULL argument option. */ mrc = MPI_Comm_free(&comm_tmp); VRFY((mrc==MPI_SUCCESS), "MPI_Comm_free"); if (MPI_INFO_NULL!=info_tmp){ mrc = MPI_Info_free(&info_tmp); VRFY((mrc==MPI_SUCCESS), "MPI_Info_free"); } /* check NULL argument options. 
*/ ret = H5Pget_fapl_mpio(acc_pl, &comm_tmp, NULL); VRFY((ret >= 0), "H5Pget_fapl_mpio Comm only"); mrc = MPI_Comm_free(&comm_tmp); VRFY((mrc==MPI_SUCCESS), "MPI_Comm_free"); ret = H5Pget_fapl_mpio(acc_pl, NULL, &info_tmp); VRFY((ret >= 0), "H5Pget_fapl_mpio Info only"); if (MPI_INFO_NULL!=info_tmp){ mrc = MPI_Info_free(&info_tmp); VRFY((mrc==MPI_SUCCESS), "MPI_Info_free"); } ret = H5Pget_fapl_mpio(acc_pl, NULL, NULL); VRFY((ret >= 0), "H5Pget_fapl_mpio neither"); /* now get both and check validity too. */ /* Donot free the returned objects which are used in the next case. */ ret = H5Pget_fapl_mpio(acc_pl, &comm_tmp, &info_tmp); VRFY((ret >= 0), "H5Pget_fapl_mpio"); MPI_Comm_size(comm_tmp,&mpi_size_tmp); MPI_Comm_rank(comm_tmp,&mpi_rank_tmp); if (VERBOSE_MED) printf("After second H5Pget_fapl_mpio: rank/size of comm are %d/%d\n", mpi_rank_tmp, mpi_size_tmp); VRFY((mpi_size_tmp==mpi_size), "MPI_Comm_size"); VRFY((mpi_rank_tmp==mpi_rank), "MPI_Comm_rank"); if (MPI_INFO_NULL != info_tmp){ mrc=MPI_Info_get_nkeys(info_tmp, &nkeys_tmp); VRFY((mrc==MPI_SUCCESS), "MPI_Info_get_nkeys"); VRFY((nkeys_tmp==nkeys), "new and old nkeys equal"); } if (VERBOSE_MED) h5_dump_info_object(info_tmp); /* Case 3: * Close the property list and verify the retrieved communicator and INFO * object are still valid. */ H5Pclose(acc_pl); MPI_Comm_size(comm_tmp,&mpi_size_tmp); MPI_Comm_rank(comm_tmp,&mpi_rank_tmp); if (VERBOSE_MED) printf("After Property list closed: rank/size of comm are %d/%d\n", mpi_rank_tmp, mpi_size_tmp); if (MPI_INFO_NULL != info_tmp){ mrc=MPI_Info_get_nkeys(info_tmp, &nkeys_tmp); VRFY((mrc==MPI_SUCCESS), "MPI_Info_get_nkeys"); } if (VERBOSE_MED) h5_dump_info_object(info_tmp); /* clean up */ mrc = MPI_Comm_free(&comm_tmp); VRFY((mrc==MPI_SUCCESS), "MPI_Comm_free"); if (MPI_INFO_NULL!=info_tmp){ mrc = MPI_Info_free(&info_tmp); VRFY((mrc==MPI_SUCCESS), "MPI_Info_free"); } }
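/* The HDF5 test above verifies that the access property list keeps working
 * after the caller's communicator and info object are freed, i.e. that
 * duplicates were taken at set time. The standalone sketch below illustrates
 * the underlying MPI guarantee using MPI_Info_dup alone, reusing the same
 * trivial key/value pair as the test. */
#include <stdio.h>
#include <string.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Info info, dup;
    char value[MPI_MAX_INFO_VAL + 1];
    int flag;

    MPI_Init(&argc, &argv);

    MPI_Info_create(&info);
    MPI_Info_set(info, "hdf_info_name", "XYZ");

    MPI_Info_dup(info, &dup);  /* deep copy of all key/value pairs */
    MPI_Info_free(&info);      /* freeing the original must not affect the copy */

    MPI_Info_get(dup, "hdf_info_name", MPI_MAX_INFO_VAL, value, &flag);
    if (flag && strcmp(value, "XYZ") == 0)
        printf("duplicate kept the key/value pair\n");

    MPI_Info_free(&dup);
    MPI_Finalize();
    return 0;
}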
int main (int argc, char *argv[]) { int rank, destrank, nprocs, i; int size, page_size, no_hints = 0; char *A, *B; char *s_buf, *r_buf; MPI_Group comm_group, group; MPI_Win win; MPI_Info win_info; double t_start=0.0, t_end=0.0; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); if (nprocs != 2) { if (rank == 0) { fprintf(stderr, "This test requires exactly two processes\n"); } MPI_Finalize(); return EXIT_FAILURE; } while (1) { static struct option long_options[] = {{"no-hints", no_argument, NULL, 'n'}, {0, 0, 0, 0}}; int option, index; option = getopt_long (argc, argv, "n::", long_options, &index); if (option == -1) { break; } switch (option) { case 'n': no_hints = 1; break; default: if (rank == 0) { fprintf(stderr, "Invalid Option \n"); } MPI_Finalize(); return EXIT_FAILURE; } } if (no_hints == 0) { /* Providing MVAPICH2 specific hint to allocate memory * in shared space. MVAPICH2 optimizes communication * on windows created in this memory */ MPI_Info_create(&win_info); MPI_Info_set(win_info, "alloc_shm", "true"); MPI_Alloc_mem (MYBUFSIZE, win_info, &A); } else { MPI_Alloc_mem (MYBUFSIZE, MPI_INFO_NULL, &A); } if (NULL == A) { fprintf(stderr, "[%d] Buffer Allocation Failed \n", rank); exit(-1); } MPI_Alloc_mem (MYBUFSIZE, MPI_INFO_NULL, &B); if (NULL == B) { fprintf(stderr, "[%d] Buffer Allocation Failed \n", rank); exit(-1); } page_size = getpagesize(); assert(page_size <= MAX_ALIGNMENT); s_buf = (char *) (((unsigned long) A + (page_size - 1)) / page_size * page_size); r_buf = (char *) (((unsigned long) B + (page_size - 1)) / page_size * page_size); memset(r_buf, 0, MAX_SIZE); memset(s_buf, 1, MAX_SIZE); if (rank == 0) { fprintf(stdout, HEADER); fprintf(stdout, "%-*s%*s\n", 10, "# Size", FIELD_WIDTH, "Latency (us)"); fflush(stdout); } MPI_Comm_group(MPI_COMM_WORLD, &comm_group); for (size = 0; size <= MAX_SIZE; size = (size ? size * 2 : 1)) { if (size > large_message_size) { loop = loop_large; skip = skip_large; } MPI_Win_create(s_buf, size, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win); if (rank == 0) { destrank = 1; MPI_Group_incl(comm_group, 1, &destrank, &group); MPI_Barrier(MPI_COMM_WORLD); for (i = 0; i < skip + loop; i++) { MPI_Win_start(group, 0, win); if (i == skip) { t_start = MPI_Wtime (); } MPI_Get(r_buf, size, MPI_CHAR, 1, 0, size, MPI_CHAR, win); MPI_Win_complete(win); MPI_Win_post(group, 0, win); MPI_Win_wait(win); } t_end = MPI_Wtime (); } else { /* rank=1 */ destrank = 0; MPI_Group_incl(comm_group, 1, &destrank, &group); MPI_Barrier(MPI_COMM_WORLD); for (i = 0; i < skip + loop; i++) { MPI_Win_post(group, 0, win); MPI_Win_wait(win); MPI_Win_start(group, 0, win); MPI_Get(r_buf, size, MPI_CHAR, 0, 0, size, MPI_CHAR, win); MPI_Win_complete(win); } } MPI_Barrier(MPI_COMM_WORLD); if (rank == 0) { fprintf(stdout, "%-*d%*.*f\n", 10, size, FIELD_WIDTH, FLOAT_PRECISION, (t_end - t_start) * 1.0e6 / loop / 2); fflush(stdout); } MPI_Group_free(&group); MPI_Win_free(&win); } if (no_hints == 0) { MPI_Info_free(&win_info); } MPI_Free_mem(A); MPI_Free_mem(B); MPI_Group_free(&comm_group); MPI_Finalize(); return EXIT_SUCCESS; }
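/* The latency benchmark above drives its RMA transfers with
 * post-start-complete-wait (PSCW) synchronization. The fragment below is a
 * stripped-down sketch of that pattern, assuming exactly two ranks and a
 * single-int payload in place of the timed message sizes. */
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    int rank, nprocs, peer, value = -1, exposed;
    MPI_Win win;
    MPI_Group world_group, peer_group;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    if (nprocs != 2) {  /* the sketch only makes sense with two ranks */
        MPI_Finalize();
        return 1;
    }
    peer = 1 - rank;

    exposed = (rank == 1) ? 42 : 0;
    MPI_Win_create(&exposed, sizeof(int), sizeof(int),
                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    MPI_Comm_group(MPI_COMM_WORLD, &world_group);
    MPI_Group_incl(world_group, 1, &peer, &peer_group);

    if (rank == 0) {
        MPI_Win_start(peer_group, 0, win);  /* access epoch targeting rank 1 */
        MPI_Get(&value, 1, MPI_INT, 1, 0, 1, MPI_INT, win);
        MPI_Win_complete(win);
        printf("rank 0 fetched %d\n", value);
    } else {
        MPI_Win_post(peer_group, 0, win);   /* exposure epoch for rank 0 */
        MPI_Win_wait(win);
    }

    MPI_Group_free(&peer_group);
    MPI_Group_free(&world_group);
    MPI_Win_free(&win);
    MPI_Finalize();
    return 0;
}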
int main(int argc, char *argv[]) { int error; int rank, size; char *argv1[2] = { (char*)"connector", NULL }; char *argv2[2] = { (char*)"acceptor", NULL }; MPI_Comm comm_connector, comm_acceptor, comm_parent, comm; char port[MPI_MAX_PORT_NAME]; MPI_Status status; MPI_Info spawn_path = MPI_INFO_NULL; int verbose = 0; if (getenv("MPITEST_VERBOSE")) { verbose = 1; } IF_VERBOSE(("init.\n")); error = MPI_Init(&argc, &argv); check_error(error, "MPI_Init"); /* To improve reporting of problems about operations, we change the error handler to errors return */ MPI_Comm_set_errhandler( MPI_COMM_WORLD, MPI_ERRORS_RETURN ); MPI_Comm_set_errhandler( MPI_COMM_SELF, MPI_ERRORS_RETURN ); IF_VERBOSE(("size.\n")); error = MPI_Comm_size(MPI_COMM_WORLD, &size); check_error(error, "MPI_Comm_size"); IF_VERBOSE(("rank.\n")); error = MPI_Comm_rank(MPI_COMM_WORLD, &rank); check_error(error, "MPI_Comm_rank"); if (argc == 1) { /* Make sure that the current directory is in the path. Not all implementations may honor or understand this, but it is highly recommended as it gives users a clean way to specify the location of the executable without specifying a particular directory format (e.g., this should work with both Windows and Unix implementations) */ error = MPI_Info_create( &spawn_path ); check_error( error, "MPI_Info_create" ); error = MPI_Info_set( spawn_path, (char*)"path", (char*)"." ); check_error( error, "MPI_Info_set" ); IF_VERBOSE(("spawn connector.\n")); error = MPI_Comm_spawn((char*)"spaconacc", argv1, 1, spawn_path, 0, MPI_COMM_SELF, &comm_connector, MPI_ERRCODES_IGNORE); check_error(error, "MPI_Comm_spawn"); IF_VERBOSE(("spawn acceptor.\n")); error = MPI_Comm_spawn((char*)"spaconacc", argv2, 1, spawn_path, 0, MPI_COMM_SELF, &comm_acceptor, MPI_ERRCODES_IGNORE); check_error(error, "MPI_Comm_spawn"); error = MPI_Info_free( &spawn_path ); check_error( error, "MPI_Info_free" ); MPI_Comm_set_errhandler( comm_connector, MPI_ERRORS_RETURN ); MPI_Comm_set_errhandler( comm_acceptor, MPI_ERRORS_RETURN ); IF_VERBOSE(("recv port.\n")); error = MPI_Recv(port, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, comm_acceptor, &status); check_error(error, "MPI_Recv"); IF_VERBOSE(("send port.\n")); error = MPI_Send(port, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, comm_connector); check_error(error, "MPI_Send"); IF_VERBOSE(("barrier acceptor.\n")); error = MPI_Barrier(comm_acceptor); check_error(error, "MPI_Barrier"); IF_VERBOSE(("barrier connector.\n")); error = MPI_Barrier(comm_connector); check_error(error, "MPI_Barrier"); error = MPI_Comm_free(&comm_acceptor); check_error(error, "MPI_Comm_free"); error = MPI_Comm_free(&comm_connector); check_error(error, "MPI_Comm_free"); printf(" No Errors\n"); } else if ((argc == 2) && (strcmp(argv[1], "acceptor") == 0)) { IF_VERBOSE(("get_parent.\n")); error = MPI_Comm_get_parent(&comm_parent); check_error(error, "MPI_Comm_get_parent"); if (comm_parent == MPI_COMM_NULL) { printf("acceptor's parent is NULL.\n");fflush(stdout); MPI_Abort(MPI_COMM_WORLD, -1); } IF_VERBOSE(("open_port.\n")); error = MPI_Open_port(MPI_INFO_NULL, port); check_error(error, "MPI_Open_port"); MPI_Comm_set_errhandler( comm_parent, MPI_ERRORS_RETURN ); IF_VERBOSE(("0: opened port: <%s>\n", port)); IF_VERBOSE(("send.\n")); error = MPI_Send(port, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, comm_parent); check_error(error, "MPI_Send"); IF_VERBOSE(("accept.\n")); error = MPI_Comm_accept(port, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm); check_error(error, "MPI_Comm_accept"); IF_VERBOSE(("close_port.\n")); error = MPI_Close_port(port); 
check_error(error, "MPI_Close_port"); IF_VERBOSE(("disconnect.\n")); error = MPI_Comm_disconnect(&comm); check_error(error, "MPI_Comm_disconnect"); IF_VERBOSE(("barrier.\n")); error = MPI_Barrier(comm_parent); check_error(error, "MPI_Barrier"); MPI_Comm_free( &comm_parent ); } else if ((argc == 2) && (strcmp(argv[1], "connector") == 0)) { IF_VERBOSE(("get_parent.\n")); error = MPI_Comm_get_parent(&comm_parent); check_error(error, "MPI_Comm_get_parent"); if (comm_parent == MPI_COMM_NULL) { printf("connector's parent is NULL.\n");fflush(stdout); MPI_Abort(MPI_COMM_WORLD, -1); } MPI_Comm_set_errhandler( comm_parent, MPI_ERRORS_RETURN ); IF_VERBOSE(("recv.\n")); error = MPI_Recv(port, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, comm_parent, &status); check_error(error, "MPI_Recv"); IF_VERBOSE(("1: received port: <%s>\n", port)); IF_VERBOSE(("connect.\n")); error = MPI_Comm_connect(port, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm); check_error(error, "MPI_Comm_connect"); MPI_Comm_set_errhandler( comm, MPI_ERRORS_RETURN ); IF_VERBOSE(("disconnect.\n")); error = MPI_Comm_disconnect(&comm); check_error(error, "MPI_Comm_disconnect"); IF_VERBOSE(("barrier.\n")); error = MPI_Barrier(comm_parent); check_error(error, "MPI_Barrier"); MPI_Comm_free( &comm_parent ); } else { printf("invalid command line.\n");fflush(stdout); { int i; for (i=0; i<argc; i++) { printf("argv[%d] = <%s>\n", i, argv[i]); } } fflush(stdout); MPI_Abort(MPI_COMM_WORLD, -2); } MPI_Finalize(); return 0; }
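/* The spawn test above relies on the optional "path" info key so the
 * implementation can locate the child executable. The sketch below is a
 * minimal version of the same idea: it re-spawns its own binary (argv[0]) so
 * the child branch is exercised, assumes it is launched as a single rank, and
 * the count of two children is arbitrary. */
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Comm parent, inter;
    MPI_Info spawn_info;
    int errcodes[2];

    MPI_Init(&argc, &argv);
    MPI_Comm_get_parent(&parent);

    if (parent == MPI_COMM_NULL) {
        /* parent: hint that the executable lives in the current directory */
        MPI_Info_create(&spawn_info);
        MPI_Info_set(spawn_info, "path", ".");
        MPI_Comm_spawn(argv[0], MPI_ARGV_NULL, 2, spawn_info, 0,
                       MPI_COMM_SELF, &inter, errcodes);
        MPI_Info_free(&spawn_info);
        MPI_Barrier(inter);        /* simple rendezvous with the children */
    } else {
        /* child: the parent is reachable through the inter-communicator */
        inter = parent;
        MPI_Barrier(inter);
    }

    MPI_Comm_disconnect(&inter);
    MPI_Finalize();
    return 0;
}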
/* Fortran binding shim: the Fortran integer handle is reinterpreted as an
 * MPI_Info. key and value are passed through unchanged, so they are assumed
 * to arrive NUL-terminated, which a Fortran caller does not normally
 * guarantee. */
void mpi_info_set_(int *info, char *key, char *value, int *ierr)
{
    *ierr = MPI_Info_set(*(MPI_Info *) info, key, value);
}
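/* Because Fortran passes fixed-length, blank-padded strings plus hidden
 * length arguments, a more defensive variant would copy and trim the strings
 * before calling MPI_Info_set. The sketch below assumes the common convention
 * that the hidden lengths follow the declared arguments (their exact type and
 * position are compiler-dependent); the name mpi_info_set_safer_ is
 * hypothetical. */
#include <stdlib.h>
#include <string.h>
#include <mpi.h>

/* Copy a blank-padded Fortran string of length len into a freshly allocated,
 * NUL-terminated C string, trimming trailing blanks. */
static char *f2c_string(const char *str, int len)
{
    char *out;
    while (len > 0 && str[len - 1] == ' ')
        len--;
    out = malloc((size_t) len + 1);
    memcpy(out, str, (size_t) len);
    out[len] = '\0';
    return out;
}

void mpi_info_set_safer_(int *info, char *key, char *value, int *ierr,
                         int key_len, int value_len)
{
    char *ckey = f2c_string(key, key_len);
    char *cval = f2c_string(value, value_len);
    *ierr = MPI_Info_set(*(MPI_Info *) info, ckey, cval);
    free(ckey);
    free(cval);
}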