template <typename T>
int gather(T *send_local, T *recv_root) {
    // Each rank contributes one element; rank 0 receives one element per rank.
    return MPI_Gather(send_local, 1, detail::MpiDataType<T>::value,
                      recv_root,  1, detail::MpiDataType<T>::value,
                      0, MPI_COMM_WORLD);
}
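// A minimal sketch of the detail::MpiDataType trait the wrapper above depends
// on -- hypothetical definitions, since the real ones are not shown here: each
// specialization maps a C++ element type to its MPI datatype handle.
#include <mpi.h>

namespace detail {
template <typename T> struct MpiDataType;   // primary template intentionally undefined
template <> struct MpiDataType<int>    { static const MPI_Datatype value; };
template <> struct MpiDataType<double> { static const MPI_Datatype value; };
const MPI_Datatype MpiDataType<int>::value    = MPI_INT;
const MPI_Datatype MpiDataType<double>::value = MPI_DOUBLE;
}

// Usage sketch (hypothetical names): every rank contributes one double and
// rank 0 receives one value per rank.
//   double local = compute_local_value();
//   std::vector<double> all(num_ranks);   // significant on rank 0 only
//   gather(&local, all.data());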
/** @param sbuf the (filled) array of ice grid values for this MPI node. */
void GCMCoupler::couple_to_ice(
    double time_s,
    int nfields,            // Number of fields in sbuf.  Not all will necessarily
                            // be filled, in the case of heterogeneous ice models.
    giss::DynArray<SMBMsg> &sbuf,   // Values, already converted to ice model
                                    // inputs (from gcm outputs)
    std::vector<giss::VectorSparseVector<int,double>> &gcm_ivals)
                            // Root node only: already-allocated space to put
                            // output values.  Members as defined by the
                            // CouplingContract GCMCoupler::gcm_inputs
{
    // TODO: Convert this to use giss::gather_msg_array() instead!!!

    // Gather buffers on root node
    int num_mpi_nodes;
    MPI_Comm_size(gcm_params.gcm_comm, &num_mpi_nodes);
    int const rank = gcm_params.gcm_rank;

    printf("[%d] BEGIN GCMCoupler::couple_to_ice() time_s=%f, sbuf.size=%d, sbuf.ele_size=%d\n",
        gcm_params.gcm_rank, time_s, sbuf.size, sbuf.ele_size);

    // MPI_Gather the count
    std::unique_ptr<int[]> rcounts;
    rcounts.reset(new int[num_mpi_nodes]);
    for (int i=0; i<num_mpi_nodes; ++i) rcounts[i] = 0;

    printf("[%d] EE1\n", rank);

    int nele_l = sbuf.size;
    MPI_Gather(&nele_l, 1, MPI_INT, &rcounts[0], 1, MPI_INT,
        gcm_params.gcm_root, gcm_params.gcm_comm);

    // Compute displacements as prefix sum of rcounts
    std::unique_ptr<int[]> displs;
    std::unique_ptr<giss::DynArray<SMBMsg>> rbuf;

    printf("[%d] EE2\n", rank);

    displs.reset(new int[num_mpi_nodes+1]);
    displs[0] = 0;
    for (int i=0; i<num_mpi_nodes; ++i) displs[i+1] = displs[i] + rcounts[i];
    int nele_g = displs[num_mpi_nodes];

    // Create receive buffer, and gather into it
    // (There's an extra item in the array for a sentinel)
    rbuf.reset(new giss::DynArray<SMBMsg>(SMBMsg::size(nfields), nele_g+1));

    printf("[%d] EE3\n", rank);

    MPI_Datatype mpi_type(SMBMsg::new_MPI_struct(nfields));
    MPI_Gatherv(sbuf.begin().get(), sbuf.size, mpi_type,
        rbuf->begin().get(), &rcounts[0], &displs[0], mpi_type,
        gcm_params.gcm_root, gcm_params.gcm_comm);
    MPI_Type_free(&mpi_type);

    printf("[%d] EE4\n", rank);

    if (am_i_root()) {
        printf("[%d] EE5\n", rank);
        // (ONLY ON GCM ROOT)
        // Clear output arrays, which will be filled in additively
        // on each ice model
        // for (auto ov=gcm_ivals.begin(); ov != gcm_ivals.end(); ++ov) *ov = 0;

        // Add a sentinel
        (*rbuf)[rbuf->size-1].sheetno = 999999;

        // Sort the receive buffer so items in the same ice sheet
        // are found together
        qsort(rbuf->begin().get(), rbuf->size, rbuf->ele_size, &SMBMsg::compar);

        printf("[%d] EE6\n", rank);

        // (ONLY ON GCM ROOT)
        // Figure out which ice sheets we have data for
        auto lscan(rbuf->begin());
        auto rscan(lscan);
        std::map<int, CallIceModelParams> im_params;
        while (rscan < rbuf->end()) {
            if (rscan->sheetno != lscan->sheetno) {
                int sheetno = lscan->sheetno;
                auto cimp(CallIceModelParams(sheetno, lscan.get(), rscan.get()));
                im_params[sheetno] = cimp;
                lscan = rscan;
            }
            ++rscan;
        }

        // (ONLY ON GCM ROOT)
        // NOTE: call_ice_model() is called (below) even on NON-ROOT
        // Call all our ice models
        for (auto model = models.begin(); model != models.end(); ++model) {
            int sheetno = model.key();
            // Assume we have data for all ice models
            // (So we can easily maintain MPI SIMD operation)
            auto params(im_params.find(sheetno));
            call_ice_model(&*model, sheetno, time_s, *rbuf,
                params->second.begin, params->second.next);

            // Convert to variables the GCM wants (but still on the ice grid)
            model->set_gcm_inputs(0);   // Fills in gcm_ivals_I

            // Free ice_ovals_I
            model->free_ice_ovals_I();
        }

        regrid_gcm_inputs_onroot(gcm_ivals, 0);
    } else {
        // (ONLY ON NOT GCM ROOT)
        // We're not root --- we have no data to send to ice models; we just
        // call through anyway because we will receive data in an upcoming
        // MPI_Scatter.

        // Call all our ice models
        for (auto model = models.begin(); model != models.end(); ++model) {
            int sheetno = model.key();
            // Assume we have data for all ice models
            // (So we can easily maintain MPI SIMD operation)
            call_ice_model(&*model, sheetno, time_s, *rbuf, NULL, NULL);
        }
    }   // if (gcm_params.gcm_rank == gcm_params.gcm_root)

    printf("[%d] END GCMCoupler::couple_to_ice()\n", gcm_params.gcm_rank);
}
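// A self-contained sketch of the count-gather / prefix-sum / MPI_Gatherv
// pattern used in couple_to_ice() above, with hypothetical names and a plain
// MPI_DOUBLE payload instead of the SMBMsg struct type.
#include <mpi.h>
#include <vector>

std::vector<double> gather_varlen(const std::vector<double> &local,
                                  int root, MPI_Comm comm) {
    int rank, np;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &np);

    // Gather each rank's element count onto the root.
    int nele_l = static_cast<int>(local.size());
    std::vector<int> rcounts(np, 0), displs(np + 1, 0);
    MPI_Gather(&nele_l, 1, MPI_INT, rcounts.data(), 1, MPI_INT, root, comm);

    // Displacements are the prefix sum of the counts (meaningful on root only).
    for (int i = 0; i < np; ++i) displs[i + 1] = displs[i] + rcounts[i];

    std::vector<double> global(rank == root ? displs[np] : 0);
    MPI_Gatherv(local.data(), nele_l, MPI_DOUBLE,
                global.data(), rcounts.data(), displs.data(), MPI_DOUBLE,
                root, comm);
    return global;  // non-empty on the root only
}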
int main(int argc, char *argv[])
{
    ps_status_t rv = PS_SUCCESS;
    int log_level = LOG_LEVEL_NONE;
    char *search = NULL, *path = NULL;
    size_t search_len = 0;
    int i = 0, c = 0;
    int number_of_procs = 0, own_rank = 0;
    int *slave_nodes = NULL;
    unsigned long chunk_size = DEFAULT_CHUNK_SIZE;
    ps_searcher_t *searcher = NULL;
    ps_search_task_t *task = NULL;
    char *result = NULL;
    size_t result_len = 0, total_result_len = 0, *all_result_len = NULL;
    int search_col = PS_CSV_ALL_COL;

#ifdef TIME_MEASUREMENT
    float total_seconds = 0, total_search_time = 0, total_file_io_time = 0,
          total_setup_time = 0, total_reduce_time = 0;
    struct timeval time_start, current_time;
    gettimeofday(&time_start, NULL);
    memcpy(&current_time, &time_start, sizeof(struct timeval));
#endif

    out_fd = stdout; /* for logging */

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &number_of_procs);
    MPI_Comm_rank(MPI_COMM_WORLD, &own_rank);

    /* Set log level until arguments are passed and processed */
    set_log_level(log_level);

    if (own_rank == MASTER) {
        opterr = 0;
        while ((c = getopt(argc, argv, "hds:f:c:l:")) != -1) {
            switch (c) {
            case 'd':
                log_level = LOG_LEVEL_DEBUG;
                break;
            case 'f':
                path = optarg;
                break;
            case 's':
                search = optarg;
                search_len = strlen(search);
                break;
            case 'c':
                PS_CHECK_ZERO_GO_ERR((chunk_size = atol(optarg)),
                                     PS_ERROR_WRONG_CHUNK_SIZE);
                break;
            case 'l':
                search_col = atoi(optarg);
                break;
            case 'h':
                break;
            default:
                if (optopt == 's')
                    fprintf(stderr, "Option -%c requires an argument.\n", optopt);
                else if (isprint(optopt))
                    fprintf(stderr, "Unknown option `-%c'.\n", optopt);
                else
                    fprintf(stderr, "Unknown option character `\\x%x'.\n", optopt);
                rv = PS_ERROR_WRONG_ARGUMENTS;
                goto error;
            }
        }
    }

    /* Communicate and set log_level */
    PS_MPI_CHECK_ERR(MPI_Bcast(&log_level, 1, MPI_INT, MASTER, MPI_COMM_WORLD));
    set_log_level(log_level);

    /* Communicate the token to search for */
    PS_MPI_CHECK_ERR(MPI_Bcast(&search_len, 1, MPI_UNSIGNED_LONG, MASTER,
                               MPI_COMM_WORLD));
    if (own_rank != MASTER) {
        PS_MALLOC(search, sizeof(char) * (search_len + 1));
    }
    PS_MPI_CHECK_ERR(MPI_Bcast(search, search_len + 1, MPI_CHAR, MASTER,
                               MPI_COMM_WORLD));
    log_debug("Process %d: search_len:%lu search:%s", own_rank, search_len, search);

#ifdef TIME_MEASUREMENT
    if (own_rank == MASTER) {
        update_timestamp_and_total_seconds(&current_time, &total_setup_time);
    }
#endif

    if (own_rank == MASTER) {
        log_debug("search = %s, path = %s, chunk_size=%lu", search, path, chunk_size);
        log_debug("sizeof(ps_search_task_t):%lu", sizeof(ps_search_task_t));
        for (i = optind; i < argc; i++) {
            log_debug("Non-option argument %s", argv[i]);
        }

        /*-------------------Processing of arguments done!-----------*/
        log_debug("Number of procs:%d", number_of_procs);

        PS_MALLOC(slave_nodes, sizeof(int) * (number_of_procs));
        for (i = 0; i < number_of_procs - 1; i++) {
            slave_nodes[i] = i + 1;
        }
        PS_CHECK_GOTO_ERROR(distribute_path_and_search_range(path,
                number_of_procs, slave_nodes, chunk_size, search_col,
                MPI_COMM_WORLD, &task));

        PS_CHECK_GOTO_ERROR(ps_file_searcher_create(&searcher, search, task));
        PS_CHECK_GOTO_ERROR(ps_file_searcher_search(searcher, &result, &result_len));
        log_debug("Process %d: result_len: %lu", own_rank, result_len);
        PS_CHECK_GOTO_ERROR(ps_file_searcher_free(&searcher));
        PS_MALLOC(all_result_len, sizeof(size_t) * number_of_procs);
    } else {
        /* Slaves receive path_length and search_task */
        PS_CHECK_GOTO_ERROR(recv_task(&task, own_rank, MASTER, MPI_COMM_WORLD));
        PS_CHECK_GOTO_ERROR(ps_file_searcher_create(&searcher, search, task));
        PS_CHECK_GOTO_ERROR(ps_file_searcher_search(searcher, &result, &result_len));
        log_debug("Process %d: result_len: %lu", own_rank, result_len);
        PS_CHECK_GOTO_ERROR(ps_file_searcher_free(&searcher));
    }

#ifdef TIME_MEASUREMENT
    if (own_rank == MASTER) {
        gettimeofday(&current_time, NULL);
    }
#endif

    PS_MPI_CHECK_ERR(MPI_Gather(&result_len, 1, MPI_UNSIGNED_LONG,
                                all_result_len, 1, MPI_UNSIGNED_LONG,
                                MASTER, MPI_COMM_WORLD));

    if (own_rank == MASTER) {
        for (i = 0; i < number_of_procs; i++) {
            total_result_len += all_result_len[i];
        }
        log_debug("Process %d: total_result_len:%lu", own_rank, total_result_len);

        PS_REALLOC(result, total_result_len);
        for (i = 1; i < number_of_procs; i++) {
            /* Append each slave's result at the current end of the buffer */
            PS_MPI_CHECK_ERR(MPI_Recv(result + result_len, all_result_len[i],
                                      MPI_CHAR, i, PS_MPI_TAG_RESULT,
                                      MPI_COMM_WORLD, MPI_STATUS_IGNORE));
            result_len += all_result_len[i];
        }
        write(STDOUT_FILENO, result, result_len);
    } else {
        PS_MPI_CHECK_ERR(MPI_Send(result, result_len, MPI_CHAR, MASTER,
                                  PS_MPI_TAG_RESULT, MPI_COMM_WORLD));
        PS_FREE(search);
    }
    log_debug("Process %d finished", own_rank);

#ifdef TIME_MEASUREMENT
    if (own_rank == MASTER) {
        update_timestamp_and_total_seconds(&current_time, &total_reduce_time);
    }
    printf("Process %d: process_search_time: %f, process_file_io_time: %f\n",
           own_rank, process_search_time, process_file_io_time);
    PS_MPI_CHECK_ERR(MPI_Reduce(&process_search_time, &total_search_time, 1,
                                MPI_FLOAT, MPI_SUM, MASTER, MPI_COMM_WORLD));
    PS_MPI_CHECK_ERR(MPI_Reduce(&process_file_io_time, &total_file_io_time, 1,
                                MPI_FLOAT, MPI_SUM, MASTER, MPI_COMM_WORLD));
    update_timestamp_and_total_seconds(&time_start, &total_seconds);
    if (own_rank == MASTER) {
        printf("Total-Time: %.3fs\n"
               "\ttotal_setup_time: %.3fs\n"
               "\ttotal_reduce_time: %.3fs\n"
               "\taverage-io-time: %.3fs\n"
               "\taverage-search-time: %.3fs\n"
               "processes: %d\n"
               "chunksize: %lu Bytes\n",
               total_seconds, total_setup_time, total_reduce_time,
               total_file_io_time / number_of_procs,
               total_search_time / number_of_procs,
               number_of_procs, chunk_size);
    }
#endif

    MPI_Finalize();
    PS_FREE(slave_nodes);
    PS_FREE(result);
    PS_FREE(all_result_len);
    return EXIT_SUCCESS;

/*-----------------ERROR-Handling------------------------------*/
error:
    log_err("Process %d finished with error: %d", own_rank, rv);
    MPI_Finalize();
    if (own_rank != MASTER) {
        PS_FREE(search);
    }
    PS_FREE(slave_nodes);
    if (searcher) {
        PS_FREE(searcher->task);
    }
    PS_FREE(result);
    PS_FREE(all_result_len);
    return rv;
}
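/* A minimal sketch (hypothetical helper, not part of the program above) of how
 * the length-gather plus the per-slave MPI_Recv loop could be collapsed into a
 * single MPI_Gatherv call. Returns the concatenated buffer on rank 0. */
#include <mpi.h>
#include <stdlib.h>

char *gather_results(char *result, int result_len, int np, int rank, int *total_out)
{
    int *lens = NULL, *displs = NULL;
    char *all = NULL;

    if (rank == 0)
        lens = malloc(np * sizeof(int));

    /* Collect each rank's byte count on rank 0 */
    MPI_Gather(&result_len, 1, MPI_INT, lens, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        int i, total = 0;
        displs = malloc(np * sizeof(int));
        for (i = 0; i < np; i++) { displs[i] = total; total += lens[i]; }
        all = malloc(total > 0 ? total : 1);
        *total_out = total;
    }

    /* One collective moves all variable-length pieces into place */
    MPI_Gatherv(result, result_len, MPI_CHAR,
                all, lens, displs, MPI_CHAR, 0, MPI_COMM_WORLD);
    free(lens);
    free(displs);
    return all; /* non-NULL on rank 0 only */
}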
int main(int argc, char **argv)
{
    int num1, num2, proc_num, proc_rank, comp_result, i;
    int buf1[10], buf2[10], buf_result[10];
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &proc_num);
    MPI_Comm_rank(MPI_COMM_WORLD, &proc_rank);

    if (10 != proc_num) {   // require exactly 10 processes
        if (0 == proc_rank)
            printf("Wrong number of processes!\n");
        MPI_Finalize();
        return 0;
    }

    if (0 == proc_rank) {   // rank 0 reads the decimal numbers; only it knows them
        scanf("%d%d", &num1, &num2);
        if (num1 > 1000)    // clamp to the allowed range
            num1 = MAX_NUM;
        if (num2 > 1000)
            num2 = MAX_NUM;
        // TODO: create a separate communicator and broadcast only to its members?
        MPI_Send(&num1, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
        MPI_Send(&num2, 1, MPI_INT, 2, 0, MPI_COMM_WORLD); // send the numbers to ranks 1 and 2
    }

    if (1 == proc_rank) {   // ranks 1 and 2 convert to binary
        MPI_Recv(&num1, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
        for (i = 0; i < 10; i++)
            buf1[i] = *(dec_to_bin(num1) + i);
        printf("The first number is: ");
        for (i = 0; i < 10; i++)
            printf("%d ", buf1[i]);
        printf("\n");
    }
    if (2 == proc_rank) {
        MPI_Recv(&num2, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
        for (i = 0; i < 10; i++)
            buf2[i] = *(dec_to_bin(num2) + i);
        printf("The second number is: ");
        for (i = 0; i < 10; i++)
            printf("%d ", buf2[i]);
        printf("\n");
    }

    MPI_Bcast(buf1, 10, MPI_INT, 1, MPI_COMM_WORLD);    // now every rank has both binary forms
    MPI_Bcast(buf2, 10, MPI_INT, 2, MPI_COMM_WORLD);

    comp_result = (buf1[proc_rank] == buf2[proc_rank]) ? 0 : 1; // each rank compares one digit pair
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Gather(&comp_result, 1, MPI_INT, buf_result, 1, MPI_INT, 0,
               MPI_COMM_WORLD);     // comparison results collected on rank 0

    if (0 == proc_rank) {
        for (i = 0; i < 10; i++)
            printf("%d ", buf_result[i]);
        i = 0;
        int flag = 1;
        // scan the digits in order until a difference is found
        while (i < 10 && 0 == buf_result[i])
            i++;
        if (10 == i) {
            printf("Numbers are equal!\n");
            flag = 0;
        }
        if (flag) {
            if (buf1[i] > buf2[i])
                printf("\nFirst number is bigger!\n");
            else
                printf("\nSecond number is bigger!\n");
        }
    }

    MPI_Finalize();
    return 0;
}
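/* A minimal sketch of the dec_to_bin() helper the program above assumes (its
 * real definition is not shown): returns a pointer to a static 10-element
 * array holding the binary digits of n, most significant digit first. */
int *dec_to_bin(int n)
{
    static int bits[10];
    for (int i = 9; i >= 0; i--) {
        bits[i] = n & 1;    /* least significant bit goes into the last slot */
        n >>= 1;
    }
    return bits;
}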
int main(int argc, char* argv[])
{
    int myrank;
    int nodes;
    double *a;
    int i, j, k, noTest;
    Arguments arguments;
    arguments.matrix_size = -1;
    arguments.matrice = arguments.solution = NULL;
    arguments.imprimer = 0;
    arguments.pivot = 0;
    arguments.write = NULL;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &nodes);

    for (i = 1; i < argc; i++) {
        if (argv[i][0] == '-') {
            switch (argv[i][1]) {
            case 'm':
                if (i+1 < argc) { arguments.matrix_size = atoi(argv[++i]); }
                break;
            case 'f':
                if (i+1 < argc) { arguments.matrice = argv[++i]; }
                break;
            case 's':
                if (i+1 < argc) { arguments.solution = argv[++i]; }
                break;
            case 'p':
                arguments.pivot = 1;
                break;
            case 'i':
                arguments.imprimer = 1;
                break;
            case 'w':
                if (i+1 < argc) { arguments.write = argv[++i]; }
                break;
            default:
                fprintf(stderr, "Usage: %s -m <matrixSize> "
                        "[-f fileName] [-s solution]\n", argv[0]);
                exit(EXIT_FAILURE);
            }
        }
    }
    if (arguments.matrix_size < 0) {
        fprintf(stderr, "Error: you must supply a matrix size.\n");
        exit(EXIT_FAILURE);
    }
    if (arguments.matrix_size % nodes != 0) {
        fprintf(stderr, "Error: the number of nodes must divide "
                "the matrix size.\n");
        exit(EXIT_FAILURE);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    int nbLines = arguments.matrix_size / nodes;
    if (arguments.matrice) {
        a = readMatrixCyclic(arguments.matrice, arguments.matrix_size,
                             nbLines, myrank, nodes);
    } else {
        a = (double *) malloc(nbLines * arguments.matrix_size * sizeof(double));
        initializeMatrix(arguments.matrix_size, nbLines, a);
    }

    double tempsEcoule;
    MPI_Barrier(MPI_COMM_WORLD);
    tempsEcoule = -MPI_Wtime();
    factoriserLU(a, arguments.matrix_size, nbLines, MPI_COMM_WORLD, myrank,
                 arguments.pivot, nodes);
    double tempsMax;
    tempsEcoule += MPI_Wtime();

    // Reduce the maximum elapsed time onto rank 0
    MPI_Reduce(&tempsEcoule, &tempsMax, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (arguments.solution) {
        // Verify the result
        double *resultat = readMatrixCyclic(arguments.solution,
                arguments.matrix_size, nbLines, myrank, nodes);
        checkResult(resultat, a, arguments.matrix_size, nbLines, MPI_COMM_WORLD);
        if (myrank == 0) {
            printf("LU factorization result is correct.\n");
        }
        if (resultat) free(resultat);
    }

    if (myrank == 0) {
        printf("LU factorization %s pivoting done in %.3f msecs\n",
               (arguments.pivot ? "with" : "without"), 1000 * tempsEcoule);
    }

    if (arguments.imprimer || arguments.write) {
        // Gather the complete matrix onto rank 0
        double *matriceComplete = NULL;
        if (myrank == 0) {
            matriceComplete = (double *) malloc(arguments.matrix_size *
                    arguments.matrix_size * sizeof(double));
        }
        // Rows are distributed cyclically, so gather them one local row at a time
        for (i = 0; i < nbLines; i++) {
            MPI_Gather(&a[i * arguments.matrix_size], arguments.matrix_size,
                       MPI_DOUBLE,
                       myrank == 0 ? &matriceComplete[i * nodes * arguments.matrix_size] : NULL,
                       arguments.matrix_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);
        }
        if (myrank == 0) {
            if (arguments.imprimer) {
                printMatrix(arguments.matrix_size, arguments.matrix_size,
                            matriceComplete);
            }
            if (arguments.write) {
                writeMatrix(arguments.matrix_size, arguments.matrix_size,
                            matriceComplete, arguments.write);
            }
            if (matriceComplete) free(matriceComplete);
        }
    }

#ifndef NDEBUG
    printf("rank G:%i\n", myrank);
#endif
    if (a) free(a);
    MPI_Finalize();
    return 0;
}
void AllgatherDomains(std::set<int> &setOfDomain)
{
    int i = 0;
    int numLDomains = (int)setOfDomain.size();
    std::vector<int> domainsarray(numLDomains);
    std::set<int>::iterator iter = setOfDomain.begin();
    for (; iter != setOfDomain.end(); iter++)
        domainsarray[i++] = *iter;

    std::vector<int> numGDomains(P_size());
    MPI_Gather(&numLDomains, 1, MPI_INT, numGDomains.data(), 1, MPI_INT,
               0, MPI_COMM_WORLD);
    // debug:
    // if (!P_pid())
    //     for (i = 0; i < P_size(); i++)
    //         printf("rank %d receives %d domains from rank %d\n", P_pid(), numGDomains[i], i);

    // allocate enough space to receive nodes from all processors
    int *recv_buffer2 = 0, *displacements = 0;
    int totalDoms = 0;
    if (!P_pid()) {
        for (i = 0; i < P_size(); i++) totalDoms += numGDomains[i];
        recv_buffer2 = new int[totalDoms];      // only the root processor allocates memory
        displacements = new int[P_size()];
        displacements[0] = 0;
        for (int i = 1; i < P_size(); i++)
            displacements[i] = displacements[i-1] + numGDomains[i-1];
    }

    // now it's time to send nodes to the root processor
    MPI_Gatherv(domainsarray.data(), numLDomains, MPI_INT,
                recv_buffer2, numGDomains.data(), displacements, MPI_INT,
                0, MPI_COMM_WORLD);
    // debug:
    // if (!P_pid())
    //     for (i = 0; i < totalDoms; i++)
    //         printf("rank %d domains %d\n", P_pid(), recv_buffer2[i]);

    // filter the domain flags on the root to remove repeated values
    setOfDomain.clear();
    if (!P_pid()) {
        for (i = 0; i < totalDoms; i++)
            setOfDomain.insert(recv_buffer2[i]);
        delete [] recv_buffer2;
        delete [] displacements;
    }

    // send the de-duplicated domain flags to all processes
    i = 0;
    int numGDomains2 = (int)setOfDomain.size();     // non-zero on root only
    numGDomains2 = P_getSumInt(numGDomains2);       // every rank learns the root's count
    std::vector<int> domainsGarray(numGDomains2);
    for (iter = setOfDomain.begin(); iter != setOfDomain.end(); iter++)
        domainsGarray[i++] = *iter;                 // fills the array on the root
    MPI_Bcast(domainsGarray.data(), numGDomains2, MPI_INT, 0, MPI_COMM_WORLD);
    for (i = 0; i < numGDomains2; i++)
        setOfDomain.insert(domainsGarray[i]);
}
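// Hypothetical usage sketch: every rank inserts the domain ids it touches,
// and the call leaves the same global, de-duplicated set on all ranks.
//   std::set<int> doms;
//   doms.insert(localDomainId);   // assumed per-rank value
//   AllgatherDomains(doms);       // afterwards doms is identical everywhere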
int main(int argc, char *argv[])
{
    int np = 1, rank = 0;
    int splitrank, splitsize;
    int rc = 0;
    nssi_service xfer_svc;
    int server_index = 0;
    int rank_in_server = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &np);
    MPI_Barrier(MPI_COMM_WORLD);

    Teuchos::oblackholestream blackhole;
    std::ostream &out = (rank == 0 ? std::cout : blackhole);

    struct xfer_args args;

    const int num_io_methods = 8;
    const int io_method_vals[] = {
        XFER_WRITE_ENCODE_SYNC, XFER_WRITE_ENCODE_ASYNC,
        XFER_WRITE_RDMA_SYNC, XFER_WRITE_RDMA_ASYNC,
        XFER_READ_ENCODE_SYNC, XFER_READ_ENCODE_ASYNC,
        XFER_READ_RDMA_SYNC, XFER_READ_RDMA_ASYNC};
    const char * io_method_names[] = {
        "write-encode-sync", "write-encode-async",
        "write-rdma-sync", "write-rdma-async",
        "read-encode-sync", "read-encode-async",
        "read-rdma-sync", "read-rdma-async"};

    const int num_nssi_transports = 4;
    const int nssi_transport_vals[] = {
        NSSI_RPC_PTL, NSSI_RPC_IB, NSSI_RPC_GEMINI, NSSI_RPC_MPI};
    const char * nssi_transport_names[] = {
        "ptl", "ib", "gni", "mpi"};

    // Initialize arguments
    args.transport = NSSI_DEFAULT_TRANSPORT;
    args.len = 1;
    args.delay = 1;
    args.io_method = XFER_WRITE_RDMA_SYNC;
    args.debug_level = LOG_WARN;
    args.num_trials = 1;
    args.num_reqs = 1;
    args.result_file_mode = "a";
    args.result_file = "";
    args.url_file = "";
    args.logfile = "";
    args.client_flag = true;
    args.server_flag = true;
    args.num_servers = 1;
    args.num_threads = 0;
    args.timeout = 500;
    args.num_retries = 5;
    args.validate_flag = true;
    args.block_distribution = true;

    bool success = true;

    /**
     * We make extensive use of the \ref Teuchos::CommandLineProcessor for
     * command-line options to control the behavior of the test code.  To
     * evaluate performance, the "num-trials", "num-reqs", and "len" options
     * control the amount of data transferred between client and server.  The
     * "io-method" option selects the type of data transfer.  The server-url
     * option specifies the URL of the server.  If running as a server, the
     * server-url provides a recommended URL when initializing the network
     * transport.
     */
    try {
        //out << Teuchos::Teuchos_Version() << std::endl << std::endl;

        // Creating an empty command line processor looks like:
        Teuchos::CommandLineProcessor parser;
        parser.setDocString(
            "This example program demonstrates a simple data-transfer service "
            "built using the NEtwork Scalable Service Interface (Nessie).");

        /* To set an option, it must be given a name and default value.
           Additionally, each option can be given a help std::string.  Although
           it is not necessary, a help std::string aids a user's comprehension
           of the acceptable command line arguments.  Some examples of setting
           command line options are: */
        parser.setOption("delay", &args.delay, "time(s) for client to wait for server to start");
        parser.setOption("timeout", &args.timeout, "time(ms) to wait for server to respond");
        parser.setOption("server", "no-server", &args.server_flag, "Run the server");
        parser.setOption("client", "no-client", &args.client_flag, "Run the client");
        parser.setOption("len", &args.len, "The number of structures in an input buffer");
        parser.setOption("debug", (int*)(&args.debug_level), "Debug level");
        parser.setOption("logfile", &args.logfile, "log file");
        parser.setOption("num-trials", &args.num_trials, "Number of trials (experiments)");
        parser.setOption("num-reqs", &args.num_reqs, "Number of reqs/trial");
        parser.setOption("result-file", &args.result_file, "Where to store results");
        parser.setOption("result-file-mode", &args.result_file_mode, "Write mode for the result");
        parser.setOption("server-url-file", &args.url_file, "File that has URL client uses to find server");
        parser.setOption("validate", "no-validate", &args.validate_flag, "Validate the data");
        parser.setOption("num-servers", &args.num_servers, "Number of server processes");
        parser.setOption("num-threads", &args.num_threads, "Number of threads used by each server process");
        parser.setOption("block-distribution", "rr-distribution", &args.block_distribution,
            "Use a block distribution scheme to assign clients to servers");

        // Set an enumeration command line option for the io_method
        parser.setOption("io-method", &args.io_method, num_io_methods,
            io_method_vals, io_method_names,
            "I/O Methods for the example: \n"
            "\t\t\twrite-encode-sync : Write data through the RPC args, synchronous\n"
            "\t\t\twrite-encode-async: Write data through the RPC args - asynchronous\n"
            "\t\t\twrite-rdma-sync : Write data using RDMA (server pulls) - synchronous\n"
            "\t\t\twrite-rdma-async: Write data using RDMA (server pulls) - asynchronous\n"
            "\t\t\tread-encode-sync : Read data through the RPC result - synchronous\n"
            "\t\t\tread-encode-async: Read data through the RPC result - asynchronous\n"
            "\t\t\tread-rdma-sync : Read data using RDMA (server puts) - synchronous\n"
            "\t\t\tread-rdma-async: Read data using RDMA (server puts) - asynchronous");

        // Set an enumeration command line option for the transport
        parser.setOption("transport", &args.transport, num_nssi_transports,
            nssi_transport_vals, nssi_transport_names,
            "NSSI transports (not all are available on every platform): \n"
            "\t\t\tportals : Cray or Schutt\n"
            "\t\t\tinfiniband : libibverbs\n"
            "\t\t\tgemini : Cray\n"
            "\t\t\tmpi : isend/irecv implementation\n");

        /* There are also two methods that control the behavior of the command
           line processor.  First, for the command line processor to allow an
           unrecognized command line option to be ignored (and only have a
           warning printed), use: */
        parser.recogniseAllOptions(true);

        /* Second, by default, if the parser finds a command line option it
           doesn't recognize or finds the --help option, it will throw a
           std::exception.  If you want to prevent a command line processor
           from throwing a std::exception (which is important in this program
           since we don't have a try/catch around this) when it encounters an
           unrecognized option or help is printed, use: */
        parser.throwExceptions(false);

        /* We now parse the command line where argc and argv are passed to the
           parse method.  Note that since we have turned off std::exception
           throwing above, we had better grab the return argument so that we
           can see what happened and act accordingly. */
        Teuchos::CommandLineProcessor::EParseCommandLineReturn parseReturn =
            parser.parse(argc, argv);

        if (parseReturn == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) {
            return 0;
        }
        if (parseReturn != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL) {
            return 1;   // Error!
        }

        // Here is where you would use these command line arguments, but for
        // this example program we will just print the help message with the
        // new values of the command-line arguments.
        //if (rank == 0)
        //    out << "\nPrinting help message with new values of command-line arguments ...\n\n";
        //parser.printHelpMessage(argv[0], out);
    }
    TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);

    log_debug(args.debug_level, "%d: Finished processing arguments", rank);

    if (!success) {
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    /* initialize the logger */
    if (!args.server_flag && args.client_flag) {
        if (args.logfile.empty()) {
            logger_init(args.debug_level, NULL);
        } else {
            char fn[1024];
            sprintf(fn, "%s.client.%03d.log", args.logfile.c_str(), rank);
            logger_init(args.debug_level, fn);
        }
    } else if (args.server_flag && !args.client_flag) {
        if (args.logfile.empty()) {
            logger_init(args.debug_level, NULL);
        } else {
            char fn[1024];
            sprintf(fn, "%s.server.%03d.log", args.logfile.c_str(), rank);
            logger_init(args.debug_level, fn);
        }
    } else if (args.server_flag && args.client_flag) {
        if (args.logfile.empty()) {
            logger_init(args.debug_level, NULL);
        } else {
            char fn[1024];
            sprintf(fn, "%s.%03d.log", args.logfile.c_str(), rank);
            logger_init(args.debug_level, fn);
        }
    }

    log_level debug_level = args.debug_level;

    // Communicator used for both client and server (may split if using client and server)
    MPI_Comm comm;

    log_debug(debug_level, "%d: Starting xfer-service test", rank);

#ifdef TRIOS_ENABLE_COMMSPLITTER
    if (args.transport == NSSI_RPC_MPI) {
        MPI_Pcontrol(0);
    }
#endif

    /**
     * Since this test can be run as a server, client, or both, we need to play
     * some fancy MPI games to get the communicators working correctly.  If
     * we're executing as both a client and a server, we split the communicator
     * so that the client thinks it's running by itself.
     */
    int color = 0;  // color=0-->server, color=1-->client
    if (args.client_flag && args.server_flag) {
        if (np < 2) {
            log_error(debug_level, "Must use at least 2 MPI processes for client and server mode");
            MPI_Abort(MPI_COMM_WORLD, -1);
        }

        // Split the communicators.  Put all the servers as the first ranks.
        if (rank < args.num_servers) {
            color = 0;
            log_debug(debug_level, "rank=%d is a server", rank);
        } else {
            color = 1;  // all others are clients
            log_debug(debug_level, "rank=%d is a client", rank);
        }
        MPI_Comm_split(MPI_COMM_WORLD, color, rank, &comm);
    } else {
        if (args.client_flag) {
            color = 1;
            log_debug(debug_level, "rank=%d is a client", rank);
        } else if (args.server_flag) {
            color = 0;
            log_debug(debug_level, "rank=%d is a server", rank);
        } else {
            log_error(debug_level, "Must be either a client or a server");
            MPI_Abort(MPI_COMM_WORLD, -1);
        }
        MPI_Comm_split(MPI_COMM_WORLD, color, rank, &comm);
    }

    MPI_Comm_rank(comm, &splitrank);
    MPI_Comm_size(comm, &splitsize);

    log_debug(debug_level, "%d: Finished splitting communicators", rank);

    /**
     * Initialize the Nessie interface by specifying a transport, encoding
     * scheme, and a recommended URL.  \ref NSSI_DEFAULT_TRANSPORT is usually
     * the best choice, since it is often the case that only one type of
     * transport exists on a particular platform.  Currently supported
     * transports are \ref NSSI_RPC_PTL, \ref NSSI_RPC_GNI, and
     * \ref NSSI_RPC_IB.  We only support one type of encoding scheme, so
     * NSSI_DEFAULT_ENCODE should always be used for the second argument.  The
     * URL can be specified (as we did for the server) or NULL (as we did for
     * the client).  This is a recommended value.  Use the \ref nssi_get_url
     * function to find the actual value.
     */
    nssi_rpc_init((nssi_rpc_transport)args.transport, NSSI_DEFAULT_ENCODE, NULL);

    // Get the Server URL
    std::string my_url(NSSI_URL_LEN, '\0');
    nssi_get_url((nssi_rpc_transport)args.transport, &my_url[0], NSSI_URL_LEN);

    // If running as both client and server, gather and distribute
    // the server URLs to all the clients.
    if (args.server_flag && args.client_flag) {
        std::string all_urls;
        // This needs to be a vector of chars, not a string
        all_urls.resize(args.num_servers * NSSI_URL_LEN, '\0');

        // Have servers gather their URLs
        if (color == 0) {
            assert(args.num_servers == splitsize);  // these should be equal
            log_debug(debug_level, "%d: Gathering urls: my_url=%s", rank, my_url.c_str());
            // gather all urls to rank 0 of the server comm (also rank 0 of MPI_COMM_WORLD)
            MPI_Gather(&my_url[0], NSSI_URL_LEN, MPI_CHAR,
                       &all_urls[0], NSSI_URL_LEN, MPI_CHAR, 0, comm);
        }

        // broadcast the full set of server urls to all processes
        MPI_Bcast(&all_urls[0], all_urls.size(), MPI_CHAR, 0, MPI_COMM_WORLD);
        log_debug(debug_level, "%d: Bcast urls, urls.size=%d", rank, all_urls.size());

        if (color == 1) {
            // For the block distribution scheme use the utility function (in xfer_util.cpp)
            if (args.block_distribution) {
                // Use this utility function to calculate the server_index
                xfer_block_partition(args.num_servers, splitsize, splitrank,
                                     &server_index, &rank_in_server);
            }
            // Use a simple round robin distribution scheme
            else {
                server_index   = splitrank % args.num_servers;
                rank_in_server = splitrank / args.num_servers;
            }

            // Copy the server url out of the list of urls
            int offset = server_index * NSSI_URL_LEN;
            args.server_url = all_urls.substr(offset, NSSI_URL_LEN);
            log_debug(debug_level, "client %d assigned to server \"%s\"",
                      splitrank, args.server_url.c_str());
        }

        log_debug(debug_level, "%d: Finished distributing server urls, server_url=%s",
                  rank, args.server_url.c_str());
    }
    // If running as a client only, we have to get the list of servers from the url file.
    else if (!args.server_flag && args.client_flag) {
        sleep(args.delay);  // give the server time to get started

        std::vector<std::string> urlbuf;
        xfer_read_server_url_file(args.url_file.c_str(), urlbuf, comm);
        args.num_servers = urlbuf.size();

        // For the block distribution scheme use the utility function (in xfer_util.cpp)
        if (args.block_distribution) {
            // Use this utility function to calculate the server_index
            xfer_block_partition(args.num_servers, splitsize, splitrank,
                                 &server_index, &rank_in_server);
        }
        // Use a simple round robin distribution scheme
        else {
            server_index   = splitrank % args.num_servers;
            rank_in_server = splitrank / args.num_servers;
        }

        args.server_url = urlbuf[server_index];
        log_debug(debug_level, "client %d assigned to server \"%s\"",
                  splitrank, args.server_url.c_str());
    }
    else if (args.server_flag && !args.client_flag) {
        args.server_url = my_url;
        if (args.url_file.empty()) {
            log_error(debug_level, "Must set --url-file");
            MPI_Abort(MPI_COMM_WORLD, -1);
        }
        xfer_write_server_url_file(args.url_file.c_str(), my_url.c_str(), comm);
    }

    // Set the debug level for the xfer service.
    xfer_debug_level = args.debug_level;

    // Print the arguments after they've all been set.
    args.io_method_name = std::string(io_method_names[args.io_method]);
    args.transport_name = std::string(nssi_transport_names[args.transport]);

    log_debug(debug_level, "%d: server_url=%s", rank, args.server_url.c_str());

    print_args(out, args, "%");

    log_debug(debug_level, "server_url=%s", args.server_url.c_str());

    //--------------------------------------------------------------------------
    /** If we're running this job with a server, the server always executes on
     *  node 0.  In this example, the server is a single process.
     */
    if (color == 0) {
        rc = xfer_server_main((nssi_rpc_transport)args.transport, args.num_threads, comm);
        log_debug(debug_level, "Server is finished");
    }
    //--------------------------------------------------------------------------
    /** The parallel client will execute this branch.  The root node, node 0,
     *  of the client connects with the server using the \ref nssi_get_service
     *  function.  Then the root broadcasts the service description to the
     *  other clients before starting the main loop of the client code by
     *  calling \ref xfer_client_main.
     */
    else {
        int i;
        int client_rank;

        // get rank within the client communicator
        MPI_Comm_rank(comm, &client_rank);

        nssi_init((nssi_rpc_transport)args.transport);

        // Only one process needs to connect to the service
        // TODO: Make get_service a collective call (some transports do not need a connection)
        //if (client_rank == 0) {
        {
            // connect to remote server
            for (i = 0; i < args.num_retries; i++) {
                log_debug(debug_level, "Try to connect to server: attempt #%d, url=%s",
                          i, args.server_url.c_str());
                rc = nssi_get_service((nssi_rpc_transport)args.transport,
                                      args.server_url.c_str(), args.timeout, &xfer_svc);
                if (rc == NSSI_OK)
                    break;
                else if (rc != NSSI_ETIMEDOUT) {
                    log_error(xfer_debug_level, "could not get svc description: %s",
                              nssi_err_str(rc));
                    break;
                }
            }
        }

        // wait for all the clients to connect
        MPI_Barrier(comm);

        //MPI_Bcast(&rc, 1, MPI_INT, 0, comm);

        if (rc == NSSI_OK) {
            if (client_rank == 0)
                log_debug(debug_level, "Connected to service on attempt %d\n", i);

            // Broadcast the service description to the other clients
            //log_debug(xfer_debug_level, "Bcasting svc to other clients");
            //MPI_Bcast(&xfer_svc, sizeof(nssi_service), MPI_BYTE, 0, comm);

            log_debug(debug_level, "Starting client main");
            // Start the client code
            xfer_client_main(args, xfer_svc, comm);

            MPI_Barrier(comm);

            // Tell one of the clients to kill the server
            if (rank_in_server == 0) {
                log_debug(debug_level, "%d: Halting xfer service", rank);
                rc = nssi_kill(&xfer_svc, 0, 5000);
            }
        } else {
            if (client_rank == 0)
                log_error(debug_level, "Failed to connect to service after %d attempts: ABORTING", i);
            success = false;
            //MPI_Abort(MPI_COMM_WORLD, -1);
        }

        nssi_fini((nssi_rpc_transport)args.transport);
    }

    log_debug(debug_level, "%d: clean up nssi", rank);
    MPI_Barrier(MPI_COMM_WORLD);

    // Clean up nssi_rpc
    rc = nssi_rpc_fini((nssi_rpc_transport)args.transport);
    if (rc != NSSI_OK)
        log_error(debug_level, "Error in nssi_rpc_fini");

    log_debug(debug_level, "%d: MPI_Finalize()", rank);
    MPI_Finalize();

    logger_fini();

    if (success && (rc == NSSI_OK))
        out << "\nEnd Result: TEST PASSED" << std::endl;
    else
        out << "\nEnd Result: TEST FAILED" << std::endl;

    return ((success && (rc == NSSI_OK)) ? 0 : 1);
}
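// A minimal sketch (invented names, not part of the program above) of the
// gather-then-broadcast step used to distribute fixed-width server URLs:
// server ranks gather onto rank 0 of their communicator (which is also world
// rank 0 here), and rank 0 then broadcasts the whole table to every process.
#include <mpi.h>
#include <string>

std::string share_urls(std::string my_url, int url_len, int num_servers,
                       bool is_server, MPI_Comm server_comm)
{
    std::string all(static_cast<size_t>(num_servers) * url_len, '\0');
    my_url.resize(url_len, '\0');   // pad to the fixed slot width
    if (is_server)                  // only server ranks contribute
        MPI_Gather(&my_url[0], url_len, MPI_CHAR,
                   &all[0], url_len, MPI_CHAR, 0, server_comm);
    MPI_Bcast(&all[0], static_cast<int>(all.size()), MPI_CHAR, 0, MPI_COMM_WORLD);
    return all;     // every rank now holds num_servers fixed-width URL slots
}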
void trainOneEpochDenseCPU(int itask, float *data, float *numerator,
                           float *denominator, float *codebook,
                           unsigned int nSomX, unsigned int nSomY,
                           unsigned int nDimensions, unsigned int nVectors,
                           unsigned int nVectorsPerRank, float radius,
                           float scale, string mapType, int *globalBmus)
{
    unsigned int p1[2] = {0, 0};
    unsigned int *bmus = new unsigned int[nVectorsPerRank*2];

    #pragma omp parallel default(shared) private(p1)
    {
        #pragma omp for
        for (unsigned int n = 0; n < nVectorsPerRank; n++) {
            if (itask*nVectorsPerRank+n < nVectors) {
                /// get the best matching unit
                get_bmu_coord(codebook, data, nSomY, nSomX, nDimensions, p1, n);
                bmus[2*n] = p1[0];
                bmus[2*n+1] = p1[1];
            }
        }
    }

    float *localNumerator = new float[nSomY*nSomX*nDimensions];
    float *localDenominator = new float[nSomY*nSomX];

    #pragma omp parallel default(shared)
    {
        #pragma omp for
        for (unsigned int som_y = 0; som_y < nSomY; som_y++) {
            for (unsigned int som_x = 0; som_x < nSomX; som_x++) {
                localDenominator[som_y*nSomX + som_x] = 0.0;
                for (unsigned int d = 0; d < nDimensions; d++)
                    localNumerator[som_y*nSomX*nDimensions + som_x*nDimensions + d] = 0.0;
            }
        }

        /// Accumulate denoms and numers
        #pragma omp for
        for (unsigned int som_y = 0; som_y < nSomY; som_y++) {
            for (unsigned int som_x = 0; som_x < nSomX; som_x++) {
                for (unsigned int n = 0; n < nVectorsPerRank; n++) {
                    if (itask*nVectorsPerRank+n < nVectors) {
                        float dist = 0.0f;
                        if (mapType == "planar") {
                            dist = euclideanDistanceOnPlanarMap(som_x, som_y,
                                       bmus[2*n], bmus[2*n+1]);
                        } else if (mapType == "toroid") {
                            dist = euclideanDistanceOnToroidMap(som_x, som_y,
                                       bmus[2*n], bmus[2*n+1], nSomX, nSomY);
                        }
                        float neighbor_fuct = getWeight(dist, radius, scale);
                        for (unsigned int d = 0; d < nDimensions; d++) {
                            localNumerator[som_y*nSomX*nDimensions + som_x*nDimensions + d] +=
                                neighbor_fuct * (*(data + n*nDimensions + d));
                        }
                        localDenominator[som_y*nSomX + som_x] += neighbor_fuct;
                    }
                }
            }
        }
    }

#ifdef HAVE_MPI
    MPI_Reduce(localNumerator, numerator, nSomY*nSomX*nDimensions,
               MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(localDenominator, denominator, nSomY*nSomX,
               MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
    // bmus holds unsigned ints, but the values are small grid coordinates,
    // so gathering them as MPI_INT into the int globalBmus array is safe here.
    MPI_Gather(bmus, nVectorsPerRank*2, MPI_INT,
               globalBmus, nVectorsPerRank*2, MPI_INT, 0, MPI_COMM_WORLD);
#else
    for (unsigned int i = 0; i < nSomY*nSomX*nDimensions; ++i)
        numerator[i] = localNumerator[i];
    for (unsigned int i = 0; i < nSomY*nSomX; ++i)
        denominator[i] = localDenominator[i];
    for (unsigned int i = 0; i < 2*nVectorsPerRank; ++i)
        globalBmus[i] = bmus[i];
#endif

    delete [] bmus;
    delete [] localNumerator;
    delete [] localDenominator;
}
int main(int argc, char** argv)
{
    // Initialize the MPI environment
    MPI_Init(&argc, &argv);

    // Get the number of processes
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    // Get the rank of the process
    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    char string_buffer[LEN];
    char* rbuf = NULL;
    if (world_rank == 0)
        rbuf = malloc(world_size * LEN * sizeof(char));

    // Get the name of the processor
    char processor_name[50];
    int name_len = 50;
    if (gethostname(processor_name, name_len) != 0) {
        printf("Error with hostname");
        exit(1);
    }

    // Get current time on host
    struct timeval time;
    if (gettimeofday(&time, NULL) != 0) {
        printf("Error with time");
        exit(1);
    }

    // Generate output
    time_t curtime = time.tv_sec;
    char time_buffer[30];
    strftime(time_buffer, 30, "%Y-%m-%d %T.", localtime(&curtime));
    sprintf(string_buffer, "%s: %s%li", processor_name, time_buffer, time.tv_usec);

    // Gather output
    int rc = MPI_Gather(string_buffer, LEN, MPI_CHAR, rbuf, LEN, MPI_CHAR, 0,
                        MPI_COMM_WORLD);
    if (rc != MPI_SUCCESS) {
        printf("Error while gathering, rc is: %d", rc);
        exit(1);
    }

    // Print output
    if (world_rank == 0) {
        for (int i = 0; i < world_size; ++i)
            printf("%.*s\n", LEN, rbuf + LEN * i);
    }

    // Get microseconds
    int usec = time.tv_usec;
    int *rbuf_usec = NULL;
    if (world_rank == 0)
        rbuf_usec = malloc(world_size * sizeof(int));

    // Reduce microseconds (only one int is actually written on the root)
    if (MPI_Reduce(&usec, rbuf_usec, 1, MPI_INT, MPI_MIN, 0, MPI_COMM_WORLD)
            != MPI_SUCCESS) {
        printf("Error in MPI_Reduce\n");
        exit(1);
    }

    // Print the minimum microsecond value found by the reduction
    if (world_rank == 0)
        printf("%d\n", rbuf_usec[0]);

    if (MPI_Barrier(MPI_COMM_WORLD) != MPI_SUCCESS) {
        printf("Error with barrier");
        exit(1);
    }
    printf("Rank %d is finishing now!\n", world_rank);

    // Finalize the MPI environment.
    MPI_Finalize();
    free(rbuf);
    free(rbuf_usec);
    return 0;
}
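/* A small sketch (not in the original program): MPI_MINLOC would return the
 * owning rank together with the minimum, instead of just the minimum value. */
// struct { int value; int rank; } in = { usec, world_rank }, out;
// MPI_Reduce(&in, &out, 1, MPI_2INT, MPI_MINLOC, 0, MPI_COMM_WORLD);
// if (world_rank == 0)
//     printf("min usec %d contributed by rank %d\n", out.value, out.rank);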
//-----------------------------------------------------------------------------
//
//-----------------------------------------------------------------------------
void Image_Exchanger::sync_fragment_info(OverLap_FootPrint* ofp,
                                         ImageFragment_Tile* ift,
                                         int nviewer)
{
#ifdef _DEBUG7
    fprintf(stderr, "**** %s:%s() ****\n", __FILE__, __func__);
#endif

    std::vector<int> infobuf;
    int count = ofp->save_overlap_info(infobuf);

#ifdef _DEBUG6
    fprintf(stderr, "%d: %s: olcount=%d, olbuffer size=%ld\n",
            m_rank, __func__, count, infobuf.size());
#endif

    int c = infobuf.size();
    // fprintf(stderr, "%d: nviewer=%d, gather MPI_INT %d\n", m_rank, nviewer, c);

    memset(m_rcounts, 0, m_runsize*sizeof(unsigned int));
    if (nviewer == 1) {
        MPI_Gather(&c, 1, MPI_INT, m_rcounts, 1, MPI_INT, 0, MPI_COMM_WORLD);
    } else {
        MPI_Allgather(&c, 1, MPI_INT, m_rcounts, 1, MPI_INT, MPI_COMM_WORLD);
    }

    // vector throws a length_error if resized above max_size:
    //   terminate called after throwing an instance of 'std::length_error'
    //   what(): vector::_M_fill_insert
    std::vector<int> ainfobuf(1, 0);

    memset(m_rdispls, 0, m_runsize*sizeof(unsigned int));

    if ((nviewer == 1 && m_rank == 0) || (nviewer > 1)) {
        int total = 0;
        for (int i = 0; i < m_runsize; i++) total += m_rcounts[i];
        // fprintf(stderr, "std::vector max size=%ld, resize to %d\n",
        //         ainfobuf.max_size(), total);
        assert(total > 0);
        ainfobuf.resize(total, 0);
    }

    for (int i = 0; i < m_runsize-1; i++)
        m_rdispls[i+1] = m_rdispls[i] + m_rcounts[i];

    // make &infobuf[0] a legal expression even when nothing is sent
    if (c == 0) infobuf.resize(1);

    if (nviewer == 1) {
        MPI_Gatherv(&infobuf[0], c, MPI_INT,
                    &ainfobuf[0], m_rcounts, m_rdispls, MPI_INT,
                    0, MPI_COMM_WORLD);
    } else {
        MPI_Allgatherv(&infobuf[0], c, MPI_INT,
                       &ainfobuf[0], (int*)m_rcounts, (int*)m_rdispls, MPI_INT,
                       MPI_COMM_WORLD);
    }

    // Only a viewer needs all fragments and the counts for receiving;
    // a non-viewer only needs the send counts for its own fragments.
    if (m_rank < nviewer) {
        ift->retrieve_fragments(ainfobuf);
    } else if (c > 0) {
        ift->retrieve_fragments(infobuf);
    }
}
int main(int argc, char **argv)
{
    int i, j, k;
    double start, end;

    /* Time array */
    double time[9];
    double comm_time = 0;
    double comp_time = 0;
    int chunkSize;
    MPI_Status status;

    /* Used in FFT */
    float data[N][N];
    /* Used in matrix multiplication */
    float input_1[N][N], input_2[N][N];
    /* Local matrix for FFT */
    float local_data[N][N];

    /* World rank and size, related to MPI_COMM_WORLD */
    int world_id;
    int world_processor;
    /* Rank and size within a task group, related to taskcomm */
    int task_id;
    int task_processor;

    /* A complex array storing the temporary row to run the FFT on */
    complex temp_data[N];

    /* Initialize the rank and the number of processors for MPI */
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_id);
    MPI_Comm_size(MPI_COMM_WORLD, &world_processor);

    /* Initialize a new vector type for distributing columns */
    MPI_Datatype column, col;
    /* Column vector */
    MPI_Type_vector(N, 1, N, MPI_FLOAT, &col);
    MPI_Type_commit(&col);
    MPI_Type_create_resized(col, 0, 1*sizeof(float), &column);
    MPI_Type_commit(&column);

    int task = world_id % 4;
    MPI_Comm taskcomm;

    /* Split MPI_COMM_WORLD into four task groups */
    MPI_Comm_split(MPI_COMM_WORLD, task, world_id, &taskcomm);
    MPI_Comm_rank(taskcomm, &task_id);
    MPI_Comm_size(taskcomm, &task_processor);

    /* Inter-communicators between task groups */
    MPI_Comm t1_t3_comm, t2_t3_comm, t3_t4_comm;

    /* Calculate chunkSize */
    chunkSize = N / task_processor;

    /* Get the start time of the whole program */
    if (world_id == 0) {
        printf("2D convolution using MPI task and data parallelism\n");
        start = MPI_Wtime();
    }

    /* Each group completes its work and sends results over inter-communicators */
    if (task == 0) {    /* task 1 */
        /* Create an inter-communicator for task 1 and task 3 */
        MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 2, 1, &t1_t3_comm);

        if (task_id == 0) {
            time[0] = MPI_Wtime();
            /* Read file */
            readIm1File(data);
            time[1] = MPI_Wtime();
            printf("Group 1: Reading file 1_im1 takes %f s.\n", time[1] - time[0]);
        }

        /* Scatter data to local ranks */
        MPI_Scatter(data, chunkSize*N, MPI_FLOAT, local_data, chunkSize*N,
                    MPI_FLOAT, 0, taskcomm);

        /* Time for distributing the data */
        if (task_id == 0) {
            time[2] = MPI_Wtime();
            printf("Group 1: Scattering 1_im1(row) to each processor takes %f s.\n",
                   time[2] - time[1]);
        }

        /* 2D FFT of 1_im1 */
        /* Row FFT */
        for (i = 0; i < chunkSize; i++) {
            for (j = 0; j < N; j++) {   /* FFT each row of im1 */
                temp_data[j].r = local_data[i][j];
                temp_data[j].i = 0;
            }
            c_fft1d(temp_data, N, -1);
            for (j = 0; j < N; j++)
                local_data[i][j] = temp_data[j].r;
        }

        if (task_id == 0) {
            time[3] = MPI_Wtime();
            printf("Group 1: FFT each row for 1_im1 takes %f s.\n", time[3] - time[2]);
        }

        /* Gather all the row data of 1_im1 */
        MPI_Gather(local_data, chunkSize*N, MPI_FLOAT, data, chunkSize*N,
                   MPI_FLOAT, 0, taskcomm);

        if (task_id == 0) {
            time[4] = MPI_Wtime();
            printf("Group 1: Gathering all the data of 1_im1(row) takes %f s.\n",
                   time[4] - time[3]);
        }

        /* Scatter the data again, this time by columns */
        MPI_Scatter(data, chunkSize, column, local_data, chunkSize, column, 0,
                    taskcomm);

        if (task_id == 0) {
            time[5] = MPI_Wtime();
            printf("Group 1: Scattering 1_im1(column) to each processor takes %f s.\n",
                   time[5] - time[4]);
        }

        /* Column FFT */
        for (i = 0; i < chunkSize; i++) {
            for (j = 0; j < N; j++) {   /* FFT each column of im1 */
                temp_data[j].r = local_data[j][i];
                temp_data[j].i = 0;
            }
            c_fft1d(temp_data, N, -1);
            for (j = 0; j < N; j++)
                local_data[j][i] = temp_data[j].r;
        }

        if (task_id == 0) {
            time[6] = MPI_Wtime();
            printf("Group 1: FFT each column for 1_im1 takes %f s.\n",
                   time[6] - time[5]);
        }

        /* Gather all the columns from each rank */
        MPI_Gather(local_data, chunkSize, column, data, chunkSize, column, 0,
                   taskcomm);

        /* Compute timings and send the data on for matrix multiplication */
        if (task_id == 0) {
            time[7] = MPI_Wtime();
            printf("Group 1: Gathering all the data of 1_im1(column) takes %f s.\n",
                   time[7] - time[6]);
            /* Total time */
            printf("Group 1: Total time for task 1 in group 1 takes %f s.\n",
                   time[7] - time[0]);
            comm_time += time[7] - time[6] + time[5] - time[3] + time[2] - time[1];
            comp_time += time[6] - time[5] + time[3] - time[2];

            /* Send data to group 3 via the inter-communicator */
            MPI_Send(data, N*N, MPI_FLOAT, task_id, 13, t1_t3_comm);
        }
    } else if (task == 1) {     /* task 2 */
        /* Create an inter-communicator for task 2 and task 3 */
        MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 2, 2, &t2_t3_comm);

        if (task_id == 0) {
            time[0] = MPI_Wtime();
            /* Read file */
            readIm2File(data);
            time[1] = MPI_Wtime();
            printf("Group 2: Reading file 1_im2 takes %f s.\n", time[1] - time[0]);
        }

        /* Scatter data to local ranks */
        MPI_Scatter(data, chunkSize*N, MPI_FLOAT, local_data, chunkSize*N,
                    MPI_FLOAT, 0, taskcomm);

        if (task_id == 0) {
            time[2] = MPI_Wtime();
            printf("Group 2: Scatter 1_im2(row) to each processor takes %f s.\n",
                   time[2] - time[1]);
        }

        /* 2D FFT of 1_im2 */
        /* Row FFT */
        for (i = 0; i < chunkSize; i++) {
            for (j = 0; j < N; j++) {   /* FFT each row of im2 */
                temp_data[j].r = local_data[i][j];
                temp_data[j].i = 0;
            }
            c_fft1d(temp_data, N, -1);
            for (j = 0; j < N; j++)
                local_data[i][j] = temp_data[j].r;
        }

        if (task_id == 0) {
            time[3] = MPI_Wtime();
            printf("Group 2: FFT each row for 1_im2 takes %f s.\n", time[3] - time[2]);
        }

        /* Gather all the row data of 1_im2 */
        MPI_Gather(local_data, chunkSize*N, MPI_FLOAT, data, chunkSize*N,
                   MPI_FLOAT, 0, taskcomm);

        if (task_id == 0) {
            time[4] = MPI_Wtime();
            printf("Group 2: Gather all the data of 1_im2(row) takes %f s.\n",
                   time[4] - time[3]);
        }

        /* Scatter the data again, this time by columns */
        MPI_Scatter(data, chunkSize, column, local_data, chunkSize, column, 0,
                    taskcomm);

        if (task_id == 0) {
            time[5] = MPI_Wtime();
            printf("Group 2: Scatter 1_im2(column) to each processor takes %f s.\n",
                   time[5] - time[4]);
        }

        /* Column FFT */
        for (i = 0; i < chunkSize; i++) {
            for (j = 0; j < N; j++) {   /* FFT each column of im2 */
                temp_data[j].r = local_data[j][i];
                temp_data[j].i = 0;
            }
            c_fft1d(temp_data, N, -1);
            for (j = 0; j < N; j++)
                local_data[j][i] = temp_data[j].r;
        }

        if (task_id == 0) {
            time[6] = MPI_Wtime();
            printf("Group 2: FFT each column for 1_im2 takes %f s.\n",
                   time[6] - time[5]);
        }

        MPI_Gather(local_data, chunkSize, column, data, chunkSize, column, 0,
                   taskcomm);

        /* Compute timings and send the data on for matrix multiplication */
        if (task_id == 0) {
            time[7] = MPI_Wtime();
            printf("Group 2: Gather all the data of 1_im2(column) takes %f s.\n",
                   time[7] - time[6]);
            /* Total time */
            printf("Group 2: Total time for task 2 in group 2 takes %f s.\n",
                   time[7] - time[0]);
            comm_time += time[7] - time[6] + time[5] - time[3] + time[2] - time[1];
            comp_time += time[6] - time[5] + time[3] - time[2];

            /* Send data to group 3 via the inter-communicator */
            MPI_Send(data, N*N, MPI_FLOAT, task_id, 23, t2_t3_comm);
        }
    } else if (task == 2) {     /* task 3 */
        /* Local matrix for matrix multiplication */
        float local_data2[chunkSize][N];

        /* Create inter-communicators for tasks 1/3, 2/3 and 3/4 */
        MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 0, 1, &t1_t3_comm);
        MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 1, 2, &t2_t3_comm);
        MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 3, 3, &t3_t4_comm);

        /* Receive data from group 1 and group 2 */
        if (task_id == 0) {
            time[0] = MPI_Wtime();
            MPI_Recv(input_1, N*N, MPI_FLOAT, task_id, 13, t1_t3_comm, &status);
            MPI_Recv(input_2, N*N, MPI_FLOAT, task_id, 23, t2_t3_comm, &status);
            time[1] = MPI_Wtime();
            printf("Group 3: Receiving data from group 1 and group 2 takes %f s.\n",
                   time[1] - time[0]);
        }

        /* Matrix multiplication: scatter input_1, broadcast input_2 */
        MPI_Scatter(input_1, chunkSize*N, MPI_FLOAT, local_data, chunkSize*N,
                    MPI_FLOAT, 0, taskcomm);
        MPI_Bcast(input_2, N*N, MPI_FLOAT, 0, taskcomm);

        if (task_id == 0) {
            time[2] = MPI_Wtime();
            printf("Group 3: Scattering data for multiplication takes %f s.\n",
                   time[2] - time[1]);
        }

        /* Matrix multiplication */
        for (i = 0; i < chunkSize; i++)
            for (j = 0; j < N; j++) {
                local_data2[i][j] = 0;
                for (k = 0; k < N; k++)
                    local_data2[i][j] += local_data[i][k] * input_2[k][j];
            }

        if (task_id == 0) {
            time[3] = MPI_Wtime();
            printf("Group 3: Matrix multiplication takes %f s.\n", time[3] - time[2]);
        }

        /* Collect the multiplication result from each rank */
        MPI_Gather(local_data2, chunkSize*N, MPI_FLOAT, data, chunkSize*N,
                   MPI_FLOAT, 0, taskcomm);

        if (task_id == 0) {
            time[4] = MPI_Wtime();
            printf("Group 3: Gathering data after Matrix multiplication takes %f s.\n",
                   time[4] - time[3]);
            /* Total time */
            printf("Group 3: Total time for task 3 in group 3 takes %f s.\n",
                   time[4] - time[0]);

            /* Send the result of the matrix multiplication to group 4 */
            MPI_Send(data, N*N, MPI_FLOAT, task_id, 34, t3_t4_comm);

            /* time[] is only set on the group root, so accumulate there */
            comm_time += time[4] - time[3] + time[2] - time[0];
            comp_time += time[3] - time[2];
        }

        MPI_Comm_free(&t1_t3_comm);
        MPI_Comm_free(&t2_t3_comm);
    } else {    /* task 4 */
        /* Create an inter-communicator for task 3 and task 4 */
        MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 2, 3, &t3_t4_comm);

        /* Receive data from group 3 */
        if (task_id == 0) {
            time[0] = MPI_Wtime();
            MPI_Recv(data, N*N, MPI_FLOAT, task_id, 34, t3_t4_comm, &status);
            time[1] = MPI_Wtime();
            printf("Group 4: Receiving data from group 3 takes %f s.\n",
                   time[1] - time[0]);
        }

        /* Scatter data to each processor */
        MPI_Scatter(data, chunkSize*N, MPI_FLOAT, local_data, chunkSize*N,
                    MPI_FLOAT, 0, taskcomm);

        if (task_id == 0) {
            time[2] = MPI_Wtime();
            printf("Group 4: Scattering data of rows to each processor takes %f s.\n",
                   time[2] - time[1]);
        }

        /* Inverse 2D FFT (rows) */
        for (i = 0; i < chunkSize; i++) {
            for (j = 0; j < N; j++) {
                temp_data[j].r = local_data[i][j];
                temp_data[j].i = 0;
            }
            c_fft1d(temp_data, N, 1);
            for (j = 0; j < N; j++)
                local_data[i][j] = temp_data[j].r;
        }

        if (task_id == 0) {
            time[3] = MPI_Wtime();
            printf("Group 4: Inverse-2DFFT(row) takes %f s.\n", time[3] - time[2]);
        }

        /* Gather all the row data */
        MPI_Gather(local_data, chunkSize*N, MPI_FLOAT, data, chunkSize*N,
                   MPI_FLOAT, 0, taskcomm);

        if (task_id == 0) {
            time[4] = MPI_Wtime();
            printf("Group 4: Gathering data of Inverse-2DFFT(row) takes %f s.\n",
                   time[4] - time[3]);
        }

        MPI_Scatter(data, chunkSize, column, local_data, chunkSize, column, 0,
                    taskcomm);

        if (task_id == 0) {
            time[5] = MPI_Wtime();
            printf("Group 4: Scattering data of columns to each processor takes %f s.\n",
                   time[5] - time[4]);
        }

        /* Inverse 2D FFT (columns) for the output file */
        for (i = 0; i < chunkSize; i++) {
            for (j = 0; j < N; j++) {
                temp_data[j].r = local_data[j][i];
                temp_data[j].i = 0;
            }
            c_fft1d(temp_data, N, 1);
            for (j = 0; j < N; j++)
                local_data[j][i] = temp_data[j].r;
        }

        if (task_id == 0) {
            time[6] = MPI_Wtime();
            printf("Group 4: Inverse-2DFFT(column) takes %f s.\n", time[6] - time[5]);
        }

        /* Gather all the columns of the output file from each rank */
        MPI_Gather(local_data, chunkSize, column, data, chunkSize, column, 0,
                   taskcomm);

        if (task_id == 0) {
            time[7] = MPI_Wtime();
            printf("Group 4: Gathering data of Inverse-2DFFT(column) takes %f s.\n",
                   time[7] - time[6]);
            writeFile(data);
            time[8] = MPI_Wtime();
            printf("Group 4: Writing file to out_1 takes %f s.\n", time[8] - time[7]);
            comm_time += time[7] - time[6] + time[5] - time[3] + time[2] - time[0];
            comp_time += time[6] - time[5] + time[3] - time[2];
        }

        MPI_Comm_free(&t3_t4_comm);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    if (world_id == 0) {
        end = MPI_Wtime();
        printf("Total communication time of 2D convolution using MPI task parallel takes %f s.\n", comm_time);
        printf("Total computing time of 2D convolution using MPI task parallel takes %f s.\n", comp_time);
        printf("Total running time without loading/writing of 2D convolution using MPI task parallel takes %f s.\n", comm_time + comp_time);
        printf("Total running time of 2D convolution using MPI task parallel takes %f s.\n", end - start);
    }

    /* Free the vector types and the task communicator */
    MPI_Type_free(&column);
    MPI_Type_free(&col);
    MPI_Comm_free(&taskcomm);
    MPI_Finalize();
    return 0;
}
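/* A minimal standalone sketch (assumed values, not part of the program above)
 * of the resized-vector trick used for the column scatters: a column of an
 * N x N float matrix is N elements with stride N, and resizing the type's
 * extent to one float lets MPI_Scatter step from column to column.  Assumes
 * the number of ranks divides N. */
#include <mpi.h>
#define N 8

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank, np;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &np);

    static float a[N][N], cols[N][N];   /* cols receives N/np columns per rank */
    MPI_Datatype col, column;
    MPI_Type_vector(N, 1, N, MPI_FLOAT, &col);              /* one strided column */
    MPI_Type_commit(&col);
    MPI_Type_create_resized(col, 0, sizeof(float), &column); /* extent = 1 float */
    MPI_Type_commit(&column);

    /* Rank r receives columns r*N/np .. (r+1)*N/np - 1, stored as the first
     * N/np columns of its cols array. */
    MPI_Scatter(a, N / np, column, cols, N / np, column, 0, MPI_COMM_WORLD);

    MPI_Type_free(&column);
    MPI_Type_free(&col);
    MPI_Finalize();
    return 0;
}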
void online_measurement(const int traj, const int id, const int ieo)
{
    int i, j, t, tt, t0;
    double *Cpp = NULL, *Cpa = NULL, *Cp4 = NULL;
    double res = 0., respa = 0., resp4 = 0.;
    double atime, etime;
    float tmp;
    operator * optr;
#ifdef MPI
    double mpi_res = 0., mpi_respa = 0., mpi_resp4 = 0.;
    // send buffers for MPI_Gather
    double *sCpp = NULL, *sCpa = NULL, *sCp4 = NULL;
#endif
    FILE *ofs;
    char *filename;
    char buf[100];
    spinor phi;

    filename = buf;
    sprintf(filename, "%s%.6d", "onlinemeas.", traj);

    init_operators();
    if (no_operators < 1) {
        if (g_proc_id == 0) {
            fprintf(stderr, "Warning! no operators defined in input file, cannot perform online correlator measurements!\n");
        }
        return;
    }
    if (no_operators > 1 && g_proc_id == 0) {
        fprintf(stderr, "Warning! number of operators defined larger than 1, using only the first!\n");
    }
    optr = &operator_list[0];
    // we don't want to do the inversion twice for this purpose here
    optr->DownProp = 0;
    if (optr->type != TMWILSON && optr->type != WILSON && optr->type != CLOVER) {
        if (g_proc_id == 0) {
            fprintf(stderr, "Warning! correlator online measurement currently only implemented for TMWILSON, WILSON and CLOVER\n");
            fprintf(stderr, "Cannot perform online measurement!\n");
        }
        return;
    }

    /* generate random timeslice */
    if (ranlxs_init == 0) {
        rlxs_init(1, 123456);
    }
    ranlxs(&tmp, 1);
    t0 = (int)(measurement_list[id].max_source_slice * tmp);
#ifdef MPI
    MPI_Bcast(&t0, 1, MPI_INT, 0, MPI_COMM_WORLD);
#endif
    if (g_debug_level > 1 && g_proc_id == 0) {
        printf("# timeslice set to %d (T=%d) for online measurement\n", t0, g_nproc_t*T);
        printf("# online measurements parameters: kappa = %g, mu = %g\n",
               g_kappa, g_mu/2./g_kappa);
    }
    atime = gettime();

#ifdef MPI
    sCpp = (double*) calloc(T, sizeof(double));
    sCpa = (double*) calloc(T, sizeof(double));
    sCp4 = (double*) calloc(T, sizeof(double));
    if (g_mpi_time_rank == 0) {
        Cpp = (double*) calloc(g_nproc_t*T, sizeof(double));
        Cpa = (double*) calloc(g_nproc_t*T, sizeof(double));
        Cp4 = (double*) calloc(g_nproc_t*T, sizeof(double));
    }
#else
    Cpp = (double*) calloc(T, sizeof(double));
    Cpa = (double*) calloc(T, sizeof(double));
    Cp4 = (double*) calloc(T, sizeof(double));
#endif

    source_generation_pion_only(g_spinor_field[0], g_spinor_field[1], t0, 0, traj);
    optr->sr0 = g_spinor_field[0];
    optr->sr1 = g_spinor_field[1];
    optr->prop0 = g_spinor_field[2];
    optr->prop1 = g_spinor_field[3];

    // op_id = 0, index_start = 0, write_prop = 0
    optr->inverter(0, 0, 0);

    /* now we bring it to normal format */
    /* here we use implicitly DUM_MATRIX and DUM_MATRIX+1 */
    convert_eo_to_lexic(g_spinor_field[DUM_MATRIX], g_spinor_field[2], g_spinor_field[3]);

    /* now we sum only over local space for every t */
    for (t = 0; t < T; t++) {
        j = g_ipt[t][0][0][0];
        res = 0.;
        respa = 0.;
        resp4 = 0.;
        for (i = j; i < j + LX*LY*LZ; i++) {
            res += _spinor_prod_re(g_spinor_field[DUM_MATRIX][i],
                                   g_spinor_field[DUM_MATRIX][i]);
            _gamma0(phi, g_spinor_field[DUM_MATRIX][i]);
            respa += _spinor_prod_re(g_spinor_field[DUM_MATRIX][i], phi);
            _gamma5(phi, phi);
            resp4 += _spinor_prod_im(g_spinor_field[DUM_MATRIX][i], phi);
        }

#if defined MPI
        MPI_Reduce(&res, &mpi_res, 1, MPI_DOUBLE, MPI_SUM, 0, g_mpi_time_slices);
        res = mpi_res;
        MPI_Reduce(&respa, &mpi_respa, 1, MPI_DOUBLE, MPI_SUM, 0, g_mpi_time_slices);
        respa = mpi_respa;
        MPI_Reduce(&resp4, &mpi_resp4, 1, MPI_DOUBLE, MPI_SUM, 0, g_mpi_time_slices);
        resp4 = mpi_resp4;
        sCpp[t] = +res/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)*2.;
        sCpa[t] = -respa/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)*2.;
        sCp4[t] = +resp4/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)*2.;
#else
        Cpp[t] = +res/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)*2.;
        Cpa[t] = -respa/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)*2.;
        Cp4[t] = +resp4/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)*2.;
#endif
    }

#ifdef MPI
    /* some gymnastics needed in case of parallelisation */
    if (g_mpi_time_rank == 0) {
        MPI_Gather(sCpp, T, MPI_DOUBLE, Cpp, T, MPI_DOUBLE, 0, g_mpi_SV_slices);
        MPI_Gather(sCpa, T, MPI_DOUBLE, Cpa, T, MPI_DOUBLE, 0, g_mpi_SV_slices);
        MPI_Gather(sCp4, T, MPI_DOUBLE, Cp4, T, MPI_DOUBLE, 0, g_mpi_SV_slices);
    }
#endif

    /* and write everything into a file */
    if (g_mpi_time_rank == 0 && g_proc_coords[0] == 0) {
        ofs = fopen(filename, "w");
        fprintf(ofs, "1 1 0 %e %e\n", Cpp[t0], 0.);
        for (t = 1; t < g_nproc_t*T/2; t++) {
            tt = (t0 + t) % (g_nproc_t*T);
            fprintf(ofs, "1 1 %d %e ", t, Cpp[tt]);
            tt = (t0 + g_nproc_t*T - t) % (g_nproc_t*T);
            fprintf(ofs, "%e\n", Cpp[tt]);
        }
        tt = (t0 + g_nproc_t*T/2) % (g_nproc_t*T);
        fprintf(ofs, "1 1 %d %e %e\n", t, Cpp[tt], 0.);

        fprintf(ofs, "2 1 0 %e %e\n", Cpa[t0], 0.);
        for (t = 1; t < g_nproc_t*T/2; t++) {
            tt = (t0 + t) % (g_nproc_t*T);
            fprintf(ofs, "2 1 %d %e ", t, Cpa[tt]);
            tt = (t0 + g_nproc_t*T - t) % (g_nproc_t*T);
            fprintf(ofs, "%e\n", Cpa[tt]);
        }
        tt = (t0 + g_nproc_t*T/2) % (g_nproc_t*T);
        fprintf(ofs, "2 1 %d %e %e\n", t, Cpa[tt], 0.);

        fprintf(ofs, "6 1 0 %e %e\n", Cp4[t0], 0.);
        for (t = 1; t < g_nproc_t*T/2; t++) {
            tt = (t0 + t) % (g_nproc_t*T);
            fprintf(ofs, "6 1 %d %e ", t, Cp4[tt]);
            tt = (t0 + g_nproc_t*T - t) % (g_nproc_t*T);
            fprintf(ofs, "%e\n", Cp4[tt]);
        }
        tt = (t0 + g_nproc_t*T/2) % (g_nproc_t*T);
        fprintf(ofs, "6 1 %d %e %e\n", t, Cp4[tt], 0.);
        fclose(ofs);
    }

#ifdef MPI
    if (g_mpi_time_rank == 0) {
        free(Cpp); free(Cpa); free(Cp4);
    }
    free(sCpp); free(sCpa); free(sCp4);
#else
    free(Cpp); free(Cpa); free(Cp4);
#endif

    etime = gettime();
    if (g_proc_id == 0 && g_debug_level > 0) {
        printf("ONLINE: measurement done in t/s = %1.4e\n", etime - atime);
    }
    return;
}
int main( int argc, char * argv[]) { //int argc; //char *argv; int th_id; int num_th; char *relative_path_to_the_input_file; char *relative_path_to_the_output_file; relative_path_to_the_input_file = argv[1]; relative_path_to_the_output_file= argv[2]; MPI_Init( &argc, &argv ); MPI_Comm_rank( MPI_COMM_WORLD, &th_id ); MPI_Comm_size( MPI_COMM_WORLD, &num_th ); // **** Your SPMD program goes here **** char lines[100000][15]; unsigned int slice_size = 100000 / num_th; // assumes 100000 is divisible by num_th; otherwise the tail lines are silently dropped (see the MPI_Scatterv sketch below) char line_buffer[15]; char lines_slice[slice_size][15]; char search_string[15]; int gather_array[num_th][2]; int i; double t1 = MPI_Wtime(); // start timer // populate large array if (0 == th_id) { FILE *input_file; if (NULL != relative_path_to_the_input_file) input_file = fopen(relative_path_to_the_input_file, "r"); else input_file = fopen("partA.txt", "r"); fgets(line_buffer, 15, input_file); // skip process amount fgets(line_buffer, 15, input_file); // skip slice amount fgets(search_string, 15, input_file); for (i = 0; i < 100000; ++i) fgets(lines[i], 15, input_file); for (i = 1; i < num_th; ++i) MPI_Send(&search_string, 15, MPI_CHAR, i, 0, MPI_COMM_WORLD); // send search string to other processes (a single MPI_Bcast would do the same) fclose(input_file); } // other processes receive search string else { MPI_Status status; MPI_Recv(&search_string, 15, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &status); } // distribute lines array to processes MPI_Scatter(&lines, slice_size * 15, MPI_CHAR, &lines_slice, slice_size * 15, MPI_CHAR, 0, MPI_COMM_WORLD); // each process searches its own slice int data[2]; data[0] = th_id; data[1] = -1; for (i = 0; i < slice_size; ++i) if (!strcmp(search_string, lines_slice[i])) { data[1] = (th_id * slice_size) + i; break; } // gather result back to process 0 (master) MPI_Gather(&data, 2, MPI_INT, &gather_array, 2, MPI_INT, 0, MPI_COMM_WORLD); // End Timer double t2 = MPI_Wtime(); // process 0 (master) writes results to file if (th_id == 0) { //FILE *output_file; //if (NULL != relative_path_to_the_output_file) // output_file = fopen(relative_path_to_the_output_file, "a"); //else // output_file = fopen("outputA.txt", "a"); for (i = 0; i < num_th; ++i) { if (-1 != gather_array[i][1]) { //fprintf(output_file, "Process %d, found yes, slice %d, position %d\n", gather_array[i], gather_array[i], gather_array[i + 1]); printf("Process %d, found yes, slice %d, position %d\n", gather_array[i][0], gather_array[i][0], gather_array[i][1]); } else { //fprintf(output_file, "Process %d, found no, slice -1, position -1\n", gather_array[i]); printf("Process %d, found no, slice -1, position -1\n", gather_array[i][0]); } } //fprintf(output_file, "Total execution time: %d ms\n", t2-t1); //printf("t1: %f, t2: %f\n", t1, t2); printf("Total execution time: %f ms\n", (t2-t1) * 1000); //fclose(output_file); } // Finish Processes MPI_Finalize(); return 0; }
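If the line count did not divide evenly by the number of processes, MPI_Scatterv with per-rank counts would avoid dropping the tail. A hedged sketch under the same constants (NLINES lines of LINE_LEN chars each; the function name is mine, not the original's):

#include <mpi.h>
#include <stdlib.h>
#define NLINES 100000
#define LINE_LEN 15

/* scatter NLINES fixed-width lines as evenly as possible; returns each rank's line count */
void scatter_lines(char (*lines)[LINE_LEN], char (*slice)[LINE_LEN], int *my_lines, MPI_Comm comm)
{
    int np, rank;
    MPI_Comm_size(comm, &np);
    MPI_Comm_rank(comm, &rank);
    int *counts = (int*) malloc(np * sizeof(int));
    int *displs = (int*) malloc(np * sizeof(int));
    for (int r = 0, off = 0; r < np; r++) {
        int nl = NLINES / np + (r < NLINES % np ? 1 : 0); /* spread the remainder over the first ranks */
        counts[r] = nl * LINE_LEN;
        displs[r] = off;
        off += counts[r];
    }
    MPI_Scatterv(lines, counts, displs, MPI_CHAR, slice, counts[rank], MPI_CHAR, 0, comm);
    *my_lines = counts[rank] / LINE_LEN;
    free(counts); free(displs);
}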
int hypre_BoomerAMGSetupStats( void *amg_vdata, hypre_ParCSRMatrix *A ) { MPI_Comm comm = hypre_ParCSRMatrixComm(A); hypre_ParAMGData *amg_data = amg_vdata; /*hypre_SeqAMGData *seq_data = hypre_ParAMGDataSeqData(amg_data);*/ /* Data Structure variables */ hypre_ParCSRMatrix **A_array; hypre_ParCSRMatrix **P_array; hypre_CSRMatrix *A_diag; double *A_diag_data; int *A_diag_i; hypre_CSRMatrix *A_offd; double *A_offd_data; int *A_offd_i; hypre_CSRMatrix *P_diag; double *P_diag_data; int *P_diag_i; hypre_CSRMatrix *P_offd; double *P_offd_data; int *P_offd_i; int numrows; HYPRE_BigInt *row_starts; int num_levels; int coarsen_type; int interp_type; int measure_type; double global_nonzeros; double *send_buff; double *gather_buff; /* Local variables */ int level; int j; HYPRE_BigInt fine_size; int min_entries; int max_entries; int num_procs,my_id, num_threads; double min_rowsum; double max_rowsum; double sparse; int i; HYPRE_BigInt coarse_size; int entries; double avg_entries; double rowsum; double min_weight; double max_weight; int global_min_e; int global_max_e; double global_min_rsum; double global_max_rsum; double global_min_wt; double global_max_wt; double *num_coeffs; double *num_variables; double total_variables; double operat_cmplxty; double grid_cmplxty; /* amg solve params */ int max_iter; int cycle_type; int *num_grid_sweeps; int *grid_relax_type; int relax_order; int **grid_relax_points; double *relax_weight; double *omega; double tol; int one = 1; int minus_one = -1; int zero = 0; int smooth_type; int smooth_num_levels; int agg_num_levels; /*int seq_cg = 0;*/ /*if (seq_data) seq_cg = 1;*/ MPI_Comm_size(comm, &num_procs); MPI_Comm_rank(comm,&my_id); num_threads = hypre_NumThreads(); if (my_id == 0) printf("\nNumber of MPI processes: %d , Number of OpenMP threads: %d\n", num_procs, num_threads); A_array = hypre_ParAMGDataAArray(amg_data); P_array = hypre_ParAMGDataPArray(amg_data); num_levels = hypre_ParAMGDataNumLevels(amg_data); coarsen_type = hypre_ParAMGDataCoarsenType(amg_data); interp_type = hypre_ParAMGDataInterpType(amg_data); measure_type = hypre_ParAMGDataMeasureType(amg_data); smooth_type = hypre_ParAMGDataSmoothType(amg_data); smooth_num_levels = hypre_ParAMGDataSmoothNumLevels(amg_data); agg_num_levels = hypre_ParAMGDataAggNumLevels(amg_data); /*---------------------------------------------------------- * Get the amg_data data *----------------------------------------------------------*/ num_levels = hypre_ParAMGDataNumLevels(amg_data); max_iter = hypre_ParAMGDataMaxIter(amg_data); cycle_type = hypre_ParAMGDataCycleType(amg_data); num_grid_sweeps = hypre_ParAMGDataNumGridSweeps(amg_data); grid_relax_type = hypre_ParAMGDataGridRelaxType(amg_data); grid_relax_points = hypre_ParAMGDataGridRelaxPoints(amg_data); relax_weight = hypre_ParAMGDataRelaxWeight(amg_data); relax_order = hypre_ParAMGDataRelaxOrder(amg_data); omega = hypre_ParAMGDataOmega(amg_data); tol = hypre_ParAMGDataTol(amg_data); /*block_mode = hypre_ParAMGDataBlockMode(amg_data);*/ send_buff = hypre_CTAlloc(double, 6); #ifdef HYPRE_NO_GLOBAL_PARTITION gather_buff = hypre_CTAlloc(double,6); #else gather_buff = hypre_CTAlloc(double,6*num_procs); #endif if (my_id==0) { printf("\nBoomerAMG SETUP PARAMETERS:\n\n"); printf(" Max levels = %d\n",hypre_ParAMGDataMaxLevels(amg_data)); printf(" Num levels = %d\n\n",num_levels); printf(" Strength Threshold = %f\n", hypre_ParAMGDataStrongThreshold(amg_data)); printf(" Interpolation Truncation Factor = %f\n", hypre_ParAMGDataTruncFactor(amg_data)); printf(" Maximum Row Sum 
Threshold for Dependency Weakening = %f\n\n", hypre_ParAMGDataMaxRowSum(amg_data)); if (coarsen_type == 0) { printf(" Coarsening Type = Cleary-Luby-Jones-Plassman\n"); } else if (abs(coarsen_type) == 1) { printf(" Coarsening Type = Ruge\n"); } else if (abs(coarsen_type) == 2) { printf(" Coarsening Type = Ruge2B\n"); } else if (abs(coarsen_type) == 3) { printf(" Coarsening Type = Ruge3\n"); } else if (abs(coarsen_type) == 4) { printf(" Coarsening Type = Ruge 3c \n"); } else if (abs(coarsen_type) == 5) { printf(" Coarsening Type = Ruge relax special points \n"); } else if (abs(coarsen_type) == 6) { printf(" Coarsening Type = Falgout-CLJP \n"); } else if (abs(coarsen_type) == 8) { printf(" Coarsening Type = PMIS \n"); } else if (abs(coarsen_type) == 10) { printf(" Coarsening Type = HMIS \n"); } else if (abs(coarsen_type) == 11) { printf(" Coarsening Type = Ruge 1st pass only \n"); } else if (abs(coarsen_type) == 9) { printf(" Coarsening Type = PMIS fixed random \n"); } else if (abs(coarsen_type) == 7) { printf(" Coarsening Type = CLJP, fixed random \n"); } if (coarsen_type > 0) { printf(" Hybrid Coarsening (switch to CLJP when coarsening slows)\n"); } if (coarsen_type) printf(" measures are determined %s\n\n", (measure_type ? "globally" : "locally")); if (agg_num_levels) printf(" no. of levels of aggressive coarsening: %d\n\n", agg_num_levels); #ifdef HYPRE_NO_GLOBAL_PARTITION printf( "\n No global partition option chosen.\n\n"); #endif if (interp_type == 0) { printf(" Interpolation = modified classical interpolation\n"); } else if (interp_type == 1) { printf(" Interpolation = LS interpolation \n"); } else if (interp_type == 2) { printf(" Interpolation = modified classical interpolation for hyperbolic PDEs\n"); } else if (interp_type == 3) { printf(" Interpolation = direct interpolation with separation of weights\n"); } else if (interp_type == 4) { printf(" Interpolation = multipass interpolation\n"); } else if (interp_type == 5) { printf(" Interpolation = multipass interpolation with separation of weights\n"); } else if (interp_type == 6) { printf(" Interpolation = extended+i interpolation\n"); } else if (interp_type == 7) { printf(" Interpolation = extended+i interpolation (only when needed)\n"); } else if (interp_type == 8) { printf(" Interpolation = standard interpolation\n"); } else if (interp_type == 9) { printf(" Interpolation = standard interpolation with separation of weights\n"); } else if (interp_type == 12) { printf(" FF interpolation \n"); } else if (interp_type == 13) { printf(" FF1 interpolation \n"); } { printf( "\nOperator Matrix Information:\n\n"); } #if HYPRE_LONG_LONG printf(" nonzero entries p"); printf("er row row sums\n"); printf("lev rows entries sparse min max "); printf("avg min max\n"); printf("======================================="); printf("==================================\n"); #else printf(" nonzero entries p"); printf("er row row sums\n"); printf("lev rows entries sparse min max "); printf("avg min max\n"); printf("======================================="); printf("============================\n"); #endif } /*----------------------------------------------------- * Enter Statistics Loop *-----------------------------------------------------*/ num_coeffs = hypre_CTAlloc(double,num_levels); num_variables = hypre_CTAlloc(double,num_levels); for (level = 0; level < num_levels; level++) { { A_diag = hypre_ParCSRMatrixDiag(A_array[level]); A_diag_data = hypre_CSRMatrixData(A_diag); A_diag_i = hypre_CSRMatrixI(A_diag); A_offd = 
hypre_ParCSRMatrixOffd(A_array[level]); A_offd_data = hypre_CSRMatrixData(A_offd); A_offd_i = hypre_CSRMatrixI(A_offd); row_starts = hypre_ParCSRMatrixRowStarts(A_array[level]); fine_size = hypre_ParCSRMatrixGlobalNumRows(A_array[level]); global_nonzeros = hypre_ParCSRMatrixDNumNonzeros(A_array[level]); num_coeffs[level] = global_nonzeros; num_variables[level] = (double) fine_size; sparse = global_nonzeros /((double) fine_size * (double) fine_size); min_entries = 0; max_entries = 0; min_rowsum = 0.0; max_rowsum = 0.0; if (hypre_CSRMatrixNumRows(A_diag)) { min_entries = (A_diag_i[1]-A_diag_i[0])+(A_offd_i[1]-A_offd_i[0]); for (j = A_diag_i[0]; j < A_diag_i[1]; j++) min_rowsum += A_diag_data[j]; for (j = A_offd_i[0]; j < A_offd_i[1]; j++) min_rowsum += A_offd_data[j]; max_rowsum = min_rowsum; for (j = 0; j < hypre_CSRMatrixNumRows(A_diag); j++) { entries = (A_diag_i[j+1]-A_diag_i[j])+(A_offd_i[j+1]-A_offd_i[j]); min_entries = hypre_min(entries, min_entries); max_entries = hypre_max(entries, max_entries); rowsum = 0.0; for (i = A_diag_i[j]; i < A_diag_i[j+1]; i++) rowsum += A_diag_data[i]; for (i = A_offd_i[j]; i < A_offd_i[j+1]; i++) rowsum += A_offd_data[i]; min_rowsum = hypre_min(rowsum, min_rowsum); max_rowsum = hypre_max(rowsum, max_rowsum); } } avg_entries = global_nonzeros / ((double) fine_size); } #ifdef HYPRE_NO_GLOBAL_PARTITION numrows = (int)(row_starts[1]-row_starts[0]); if (!numrows) /* if we don't have any rows, then don't have this count toward min row sum or min num entries */ { min_entries = 1000000; min_rowsum = 1.0e7; } send_buff[0] = - (double) min_entries; send_buff[1] = (double) max_entries; send_buff[2] = - min_rowsum; send_buff[3] = max_rowsum; MPI_Reduce(send_buff, gather_buff, 4, MPI_DOUBLE, MPI_MAX, 0, comm); if (my_id ==0) { global_min_e = - gather_buff[0]; global_max_e = gather_buff[1]; global_min_rsum = - gather_buff[2]; global_max_rsum = gather_buff[3]; #ifdef HYPRE_LONG_LONG printf( "%2d %12lld %8.0f %0.3f %4d %4d", level, fine_size, global_nonzeros, sparse, global_min_e, global_max_e); #else printf( "%2d %7d %8.0f %0.3f %4d %4d", level, fine_size, global_nonzeros, sparse, global_min_e, global_max_e); #endif printf(" %4.1f %10.3e %10.3e\n", avg_entries, global_min_rsum, global_max_rsum); } #else send_buff[0] = (double) min_entries; send_buff[1] = (double) max_entries; send_buff[2] = min_rowsum; send_buff[3] = max_rowsum; MPI_Gather(send_buff,4,MPI_DOUBLE,gather_buff,4,MPI_DOUBLE,0,comm); if (my_id == 0) { global_min_e = 1000000; global_max_e = 0; global_min_rsum = 1.0e7; global_max_rsum = 0.0; for (j = 0; j < num_procs; j++) { numrows = row_starts[j+1]-row_starts[j]; if (numrows) { global_min_e = hypre_min(global_min_e, (int) gather_buff[j*4]); global_min_rsum = hypre_min(global_min_rsum, gather_buff[j*4 +2]); } global_max_e = hypre_max(global_max_e, (int) gather_buff[j*4 +1]); global_max_rsum = hypre_max(global_max_rsum, gather_buff[j*4 +3]); } #ifdef HYPRE_LONG_LONG printf( "%2d %12lld %8.0f %0.3f %4d %4d", level, fine_size, global_nonzeros, sparse, global_min_e, global_max_e); #else printf( "%2d %7d %8.0f %0.3f %4d %4d", level, fine_size, global_nonzeros, sparse, global_min_e, global_max_e); #endif printf(" %4.1f %10.3e %10.3e\n", avg_entries, global_min_rsum, global_max_rsum); } #endif } if (my_id == 0) { { printf( "\n\nInterpolation Matrix Information:\n\n"); } #if HYPRE_LONG_LONG printf(" entries/row min max"); printf(" row sums\n"); printf("lev rows x cols min max "); printf(" weight weight min max \n"); printf("======================================="); 
printf("======================================\n"); #else printf(" entries/row min max"); printf(" row sums\n"); printf("lev rows cols min max "); printf(" weight weight min max \n"); printf("======================================="); printf("==========================\n"); #endif } /*----------------------------------------------------- * Enter Statistics Loop *-----------------------------------------------------*/ for (level = 0; level < num_levels-1; level++) { { P_diag = hypre_ParCSRMatrixDiag(P_array[level]); P_diag_data = hypre_CSRMatrixData(P_diag); P_diag_i = hypre_CSRMatrixI(P_diag); P_offd = hypre_ParCSRMatrixOffd(P_array[level]); P_offd_data = hypre_CSRMatrixData(P_offd); P_offd_i = hypre_CSRMatrixI(P_offd); row_starts = hypre_ParCSRMatrixRowStarts(P_array[level]); fine_size = hypre_ParCSRMatrixGlobalNumRows(P_array[level]); coarse_size = hypre_ParCSRMatrixGlobalNumCols(P_array[level]); global_nonzeros = hypre_ParCSRMatrixNumNonzeros(P_array[level]); min_weight = 1.0; max_weight = 0.0; max_rowsum = 0.0; min_rowsum = 0.0; min_entries = 0; max_entries = 0; if (hypre_CSRMatrixNumRows(P_diag)) { if (hypre_CSRMatrixNumCols(P_diag)) min_weight = P_diag_data[0]; for (j = P_diag_i[0]; j < P_diag_i[1]; j++) { min_weight = hypre_min(min_weight, P_diag_data[j]); if (P_diag_data[j] != 1.0) max_weight = hypre_max(max_weight, P_diag_data[j]); min_rowsum += P_diag_data[j]; } for (j = P_offd_i[0]; j < P_offd_i[1]; j++) { min_weight = hypre_min(min_weight, P_offd_data[j]); if (P_offd_data[j] != 1.0) max_weight = hypre_max(max_weight, P_offd_data[j]); min_rowsum += P_offd_data[j]; } max_rowsum = min_rowsum; min_entries = (P_diag_i[1]-P_diag_i[0])+(P_offd_i[1]-P_offd_i[0]); max_entries = 0; for (j = 0; j < hypre_CSRMatrixNumRows(P_diag); j++) { entries = (P_diag_i[j+1]-P_diag_i[j])+(P_offd_i[j+1]-P_offd_i[j]); min_entries = hypre_min(entries, min_entries); max_entries = hypre_max(entries, max_entries); rowsum = 0.0; for (i = P_diag_i[j]; i < P_diag_i[j+1]; i++) { min_weight = hypre_min(min_weight, P_diag_data[i]); if (P_diag_data[i] != 1.0) max_weight = hypre_max(max_weight, P_diag_data[i]); rowsum += P_diag_data[i]; } for (i = P_offd_i[j]; i < P_offd_i[j+1]; i++) { min_weight = hypre_min(min_weight, P_offd_data[i]); if (P_offd_data[i] != 1.0) max_weight = hypre_max(max_weight, P_offd_data[i]); rowsum += P_offd_data[i]; } min_rowsum = hypre_min(rowsum, min_rowsum); max_rowsum = hypre_max(rowsum, max_rowsum); } } avg_entries = ((double) global_nonzeros) / ((double) fine_size); } #ifdef HYPRE_NO_GLOBAL_PARTITION numrows = (int)(row_starts[1]-row_starts[0]); if (!numrows) /* if we don't have any rows, then don't have this count toward min row sum or min num entries */ { min_entries = 1000000; min_rowsum = 1.0e7; min_weight = 1.0e7; } send_buff[0] = - (double) min_entries; send_buff[1] = (double) max_entries; send_buff[2] = - min_rowsum; send_buff[3] = max_rowsum; send_buff[4] = - min_weight; send_buff[5] = max_weight; MPI_Reduce(send_buff, gather_buff, 6, MPI_DOUBLE, MPI_MAX, 0, comm); if (my_id == 0) { global_min_e = - gather_buff[0]; global_max_e = gather_buff[1]; global_min_rsum = -gather_buff[2]; global_max_rsum = gather_buff[3]; global_min_wt = -gather_buff[4]; global_max_wt = gather_buff[5]; #ifdef HYPRE_LONG_LONG printf( "%2d %12lld x %-12lld %3d %3d", level, fine_size, coarse_size, global_min_e, global_max_e); #else printf( "%2d %5d x %-5d %3d %3d", level, fine_size, coarse_size, global_min_e, global_max_e); #endif printf(" %10.3e %9.3e %9.3e %9.3e\n", global_min_wt, global_max_wt, 
global_min_rsum, global_max_rsum); } #else send_buff[0] = (double) min_entries; send_buff[1] = (double) max_entries; send_buff[2] = min_rowsum; send_buff[3] = max_rowsum; send_buff[4] = min_weight; send_buff[5] = max_weight; MPI_Gather(send_buff,6,MPI_DOUBLE,gather_buff,6,MPI_DOUBLE,0,comm); if (my_id == 0) { global_min_e = 1000000; global_max_e = 0; global_min_rsum = 1.0e7; global_max_rsum = 0.0; global_min_wt = 1.0e7; global_max_wt = 0.0; for (j = 0; j < num_procs; j++) { numrows = row_starts[j+1] - row_starts[j]; if (numrows) { global_min_e = hypre_min(global_min_e, (int) gather_buff[j*6]); global_min_rsum = hypre_min(global_min_rsum, gather_buff[j*6+2]); global_min_wt = hypre_min(global_min_wt, gather_buff[j*6+4]); } global_max_e = hypre_max(global_max_e, (int) gather_buff[j*6+1]); global_max_rsum = hypre_max(global_max_rsum, gather_buff[j*6+3]); global_max_wt = hypre_max(global_max_wt, gather_buff[j*6+5]); } #ifdef HYPRE_LONG_LONG printf( "%2d %12lld x %-12lld %3d %3d", level, fine_size, coarse_size, global_min_e, global_max_e); #else printf( "%2d %5d x %-5d %3d %3d", level, fine_size, coarse_size, global_min_e, global_max_e); #endif printf(" %10.3e %9.3e %9.3e %9.3e\n", global_min_wt, global_max_wt, global_min_rsum, global_max_rsum); } #endif } total_variables = 0; operat_cmplxty = 0; for (j=0;j<hypre_ParAMGDataNumLevels(amg_data);j++) { operat_cmplxty += num_coeffs[j] / num_coeffs[0]; total_variables += num_variables[j]; } if (num_variables[0] != 0) grid_cmplxty = total_variables / num_variables[0]; if (my_id == 0 ) { printf("\n\n Complexity: grid = %f\n",grid_cmplxty); printf(" operator = %f\n",operat_cmplxty); } if (my_id == 0) printf("\n\n"); if (my_id == 0) { printf("\n\nBoomerAMG SOLVER PARAMETERS:\n\n"); printf( " Maximum number of cycles: %d \n",max_iter); printf( " Stopping Tolerance: %e \n",tol); printf( " Cycle type (1 = V, 2 = W, etc.): %d\n\n", cycle_type); printf( " Relaxation Parameters:\n"); printf( " Visiting Grid: down up coarse\n"); printf( " Number of partial sweeps: %4d %2d %4d \n", num_grid_sweeps[1], num_grid_sweeps[2],num_grid_sweeps[3]); printf( " Type 0=Jac, 3=hGS, 6=hSGS, 9=GE: %4d %2d %4d \n", grid_relax_type[1], grid_relax_type[2],grid_relax_type[3]); #if 1 /* TO DO: may not want this to print if CG in the coarse grid */ printf( " Point types, partial sweeps (1=C, -1=F):\n"); if (grid_relax_points) { printf( " Pre-CG relaxation (down):"); for (j = 0; j < num_grid_sweeps[1]; j++) printf(" %2d", grid_relax_points[1][j]); printf( "\n"); printf( " Post-CG relaxation (up):"); for (j = 0; j < num_grid_sweeps[2]; j++) printf(" %2d", grid_relax_points[2][j]); printf( "\n"); printf( " Coarsest grid:"); for (j = 0; j < num_grid_sweeps[3]; j++) printf(" %2d", grid_relax_points[3][j]); printf( "\n\n"); } else if (relax_order == 1) { printf( " Pre-CG relaxation (down):"); for (j = 0; j < num_grid_sweeps[1]; j++) printf(" %2d %2d", one, minus_one); printf( "\n"); printf( " Post-CG relaxation (up):"); for (j = 0; j < num_grid_sweeps[2]; j++) printf(" %2d %2d", minus_one, one); printf( "\n"); printf( " Coarsest grid:"); for (j = 0; j < num_grid_sweeps[3]; j++) printf(" %2d", zero); printf( "\n\n"); } else { printf( " Pre-CG relaxation (down):"); for (j = 0; j < num_grid_sweeps[1]; j++) printf(" %2d", zero); printf( "\n"); printf( " Post-CG relaxation (up):"); for (j = 0; j < num_grid_sweeps[2]; j++) printf(" %2d", zero); printf( "\n"); printf( " Coarsest grid:"); for (j = 0; j < num_grid_sweeps[3]; j++) printf(" %2d", zero); printf( "\n\n"); } #endif if (smooth_type == 
6) for (j=0; j < smooth_num_levels; j++) printf( " Schwarz Relaxation Weight %f level %d\n", hypre_ParAMGDataSchwarzRlxWeight(amg_data),j); for (j=0; j < num_levels; j++) if (relax_weight[j] != 1) printf( " Relaxation Weight %f level %d\n",relax_weight[j],j); for (j=0; j < num_levels; j++) if (omega[j] != 1) printf( " Outer relaxation weight %f level %d\n",omega[j],j); } /*if (seq_cg) { hypre_seqAMGSetupStats(amg_data,num_coeffs[0],num_variables[0], operat_cmplxty, grid_cmplxty ); }*/ hypre_TFree(num_coeffs); hypre_TFree(num_variables); hypre_TFree(send_buff); hypre_TFree(gather_buff); return(0); }
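A detail worth isolating from the HYPRE_NO_GLOBAL_PARTITION branches above: by negating the minima before the reduction, a single MPI_Reduce with MPI_MAX delivers both global minima and maxima in one collective, since max(-x) = -min(x). A minimal sketch:

#include <mpi.h>

/* one MPI_MAX reduction yields both extrema; results valid on the root only */
void minmax_reduce(double local_min, double local_max, double *gmin, double *gmax, MPI_Comm comm)
{
    double snd[2] = { -local_min, local_max }, rcv[2];
    MPI_Reduce(snd, rcv, 2, MPI_DOUBLE, MPI_MAX, 0, comm);
    *gmin = -rcv[0];
    *gmax =  rcv[1];
}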
int main(int argc, char *argv[]) { int p, my_rank; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); MPI_Comm_size(MPI_COMM_WORLD, &p); int local_n; double *A, *A_root = NULL, *x, *C, *C_root = NULL; // root-only pointers initialized so non-root ranks pass well-defined (ignored) arguments //Allocating x on all the processors x=(double *)malloc(SIZE*sizeof(double)); if (my_rank==0) { //Scanning the matrix A and allocating the memory only on the root processor A_root=(double *)malloc(SIZE*SIZE*sizeof(double)); C_root=(double *)malloc(SIZE*sizeof(double)); for (int i = 0; i < SIZE*SIZE; i++) { scanf("%lf",&A_root[i]); } for (int i = 0; i < SIZE; i++) { scanf("%lf",&x[i]); } } MPI_Barrier(MPI_COMM_WORLD); MPI_Bcast(x, SIZE, MPI_DOUBLE,0,MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); local_n=SIZE/p; A=(double *)malloc(SIZE*local_n*sizeof(double)); C=(double *)malloc(local_n*sizeof(double)); //Scattering the matrix to different processors MPI_Scatter(A_root, SIZE*local_n, MPI_DOUBLE, A, SIZE*local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD); for (int i = 0; i < local_n; i++) { C[i]=0; for (int j = 0; j < SIZE; j++) { // each local row has SIZE entries, so the inner loop runs over the full row, not local_n C[i]+=A[i*SIZE+j]*x[j]; } } //Finally gathering all the elements on the root process: local_n entries per rank, not SIZE MPI_Gather(C,local_n,MPI_DOUBLE,C_root,local_n,MPI_DOUBLE,0,MPI_COMM_WORLD); if (my_rank==0) { for (int i = 0; i < SIZE; i++) { printf("%lf\n", C_root[i]); } } MPI_Finalize(); return 0; }
int main(int argc, char *argv[]) { MPI_Init(&argc, &argv); int POPSIZE = atoi(argv[1]); int GENERATION = atoi(argv[2]); int NUM_GAMES = atoi(argv[3]); float CROSSOVER = atof(argv[4]); float MUTATION = atof(argv[5]); int i,j,k,q,s,b2d,count; int world_rank,world_size; unsigned int temp[2], temp2[2]; float RANDOM2; srand48(time(NULL)); /* seed before any drand48/lrand48 call */ pop *player = NULL; pop p[2]; MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); MPI_Comm_size(MPI_COMM_WORLD, &world_size); /*------------------------MPI_STRUCT----------------------------------*/ MPI_Datatype mpi_pop; MPI_Datatype types[3] = {MPI_UNSIGNED,MPI_UNSIGNED,MPI_UNSIGNED}; int block[3] = {4,1,1}; MPI_Aint offset[3] = {offsetof(pop,history),offsetof(pop,fitness),offsetof(pop,move)}; MPI_Type_create_struct(3,block,offset,types,&mpi_pop); MPI_Type_commit(&mpi_pop); if (world_rank == 0) { if (world_size > POPSIZE/4) { printf("Too many processes for this population size.\n"); MPI_Abort(MPI_COMM_WORLD, 1); } } /*---------------Allocate the memory for the players------------------*/ if (world_rank == 0) { player = malloc(POPSIZE*sizeof(pop)); for (i=0; i<POPSIZE; i++) { player[i].fitness = 0; for(j=0; j<4; j++) { player[i].history[j] = lrand48() % 2; } } printf("Processor %d has data:\n", world_rank); for (i=0; i<POPSIZE; i++) { for (j=0; j<4; j++) { printf("%d ", player[i].history[j]); } printf("\n"); } } int ARRAY_SIZE = POPSIZE/world_size; /* each rank's share; assumes POPSIZE is divisible by world_size so the scatter covers the whole population */ pop *sub_arrays = malloc(ARRAY_SIZE*sizeof(pop)); /* allocated on every rank: the root also receives its own chunk from MPI_Scatter */ /*-----------------------RUN THE ALGORITHM---------------------------*/ for (k=0; k<GENERATION; k++) { MPI_Scatter(player, ARRAY_SIZE, mpi_pop, sub_arrays, ARRAY_SIZE, mpi_pop, 0, MPI_COMM_WORLD); if (world_rank != 0) { for (i=0; i<ARRAY_SIZE; i++) { for (j=ARRAY_SIZE-1; j>=0; j--) { /* j starts at ARRAY_SIZE-1 to stay in bounds */ p[0] = sub_arrays[i]; p[1] = sub_arrays[j]; /* workers play within their own scattered slice, not the root-only player array */ for(q=0; q<NUM_GAMES; q++) { b2d = ((p[0].history[0]*8) + (p[0].history[1]*4) + (p[0].history[2]*2) + p[0].history[3]); Strategy(p[0], b2d); b2d = ((p[1].history[0]*8) + (p[1].history[1]*4) + (p[1].history[2]*2) + p[1].history[3]); Strategy(p[1], b2d); Fitness(p); for (s=3; s>0; s--) { /* shift the 4-entry history; s=4 would index out of bounds */ p[0].history[s] = p[0].history[s-1]; p[1].history[s] = p[1].history[s-1]; } p[0].history[0] = p[0].move; p[1].history[0] = p[1].move; } sub_arrays[i] = p[0]; sub_arrays[j] = p[1]; } } } MPI_Barrier(MPI_COMM_WORLD); MPI_Gather(sub_arrays, ARRAY_SIZE, mpi_pop, player, ARRAY_SIZE, mpi_pop, 0, MPI_COMM_WORLD); /*-----------------------Perform Selection-----------------------------*/ if (world_rank == 0) { int sumFitness = 0; for (i=0; i<POPSIZE; i++) { sumFitness += player[i].fitness; } for(count=0; count<2; count++) { /* roulette-wheel selection: draw a point, then walk the cumulative fitness until it is reached */ int RANDOM = (sumFitness > 0) ? lrand48() % sumFitness : 0; int acc = 0; for (i=0; i<POPSIZE; i++) { acc += player[i].fitness; if (acc >= RANDOM) { p[count] = player[i]; break; } } } /*------------------------Crossover-------------------------------------*/ RANDOM2 = drand48(); /* fresh draw each generation, after seeding */ if (RANDOM2 < CROSSOVER) { temp [0] = p[0].history[2]; temp [1] = p[0].history[3]; temp2[0] = p[1].history[2]; temp2[1] = p[1].history[3]; p[0].history[2] = temp2[0]; p[0].history[3] = temp2[1]; p[1].history[2] = temp[ 0]; p[1].history[3] = temp[ 1]; } /*---------------------Mutate Players------------------------------------*/ if (RANDOM2 < MUTATION) { int mp = lrand48() % 4; for (count=0; count<2; count++) { if (p[count].history[mp] == 0) { p[count].history[mp] = 1; } else { p[count].history[mp] = 0; } } } player[lrand48() % POPSIZE] = p[0]; player[lrand48() % POPSIZE] = p[1]; printf("Processor %d has data:\n", world_rank); for (i=0; i<POPSIZE; i++) { for (j=0; j<4; j++) { printf("%d ", player[i].history[j]); } printf("\n"); } } /* printing only on the root: player is NULL on the other ranks */ } if (world_rank == 0) { for(i=0; i<POPSIZE; i++) { for (j=0; j<4; j++) { printf("%d", player[i].history[j]); } printf("\n"); printf("Fitness: %d\n", player[i].fitness); } free(player); } free(sub_arrays); MPI_Type_free(&mpi_pop); MPI_Finalize(); return 0; }
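The struct registration above is the standard offsetof recipe. One refinement worth knowing is MPI_Type_create_resized, which pins the type's extent to sizeof(pop) so that arrays of the struct stride correctly even if the compiler inserts trailing padding. A sketch, with the pop layout assumed from the code above:

#include <mpi.h>
#include <stddef.h>
typedef struct { unsigned int history[4]; unsigned int fitness; unsigned int move; } pop;

MPI_Datatype make_pop_type(void)
{
    MPI_Datatype raw, fixed;
    int blocks[3] = {4, 1, 1};
    MPI_Aint offs[3] = { offsetof(pop, history), offsetof(pop, fitness), offsetof(pop, move) };
    MPI_Datatype types[3] = { MPI_UNSIGNED, MPI_UNSIGNED, MPI_UNSIGNED };
    MPI_Type_create_struct(3, blocks, offs, types, &raw);
    MPI_Type_create_resized(raw, 0, (MPI_Aint)sizeof(pop), &fixed); /* honor any trailing padding */
    MPI_Type_free(&raw);
    MPI_Type_commit(&fixed);
    return fixed;
}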
void GaussianMean1DRegressionCompute(const QUESO::BaseEnvironment& env, double priorMean, double priorVar, const likelihoodData& dat) { // parameter space: 1-D on (-infinity, infinity) QUESO::VectorSpace<P_V, P_M> paramSpace( env, // queso environment "param_", // name prefix 1, // dimensions NULL); // names P_V paramMin(paramSpace.zeroVector()); P_V paramMax(paramSpace.zeroVector()); paramMin[0] = -INFINITY; paramMax[0] = INFINITY; QUESO::BoxSubset<P_V, P_M> paramDomain( "paramBox_", // name prefix paramSpace, // vector space paramMin, // min values paramMax); // max values // gaussian prior with user supplied mean and variance P_V priorMeanVec(paramSpace.zeroVector()); P_V priorVarVec(paramSpace.zeroVector()); priorMeanVec[0] = priorMean; priorVarVec[0] = priorVar; QUESO::GaussianVectorRV<P_V, P_M> priorRv("prior_", paramDomain, priorMeanVec, priorVarVec); // likelihood is important QUESO::GenericScalarFunction<P_V, P_M> likelihoodFunctionObj( "like_", // name prefix paramDomain, // image set LikelihoodFunc<P_V, P_M>, // routine (void *) &dat, // routine data ptr true); // routineIsForLn QUESO::GenericVectorRV<P_V, P_M> postRv( "post_", // name prefix paramSpace); // image set // Initialize and solve the Inverse Problem with Bayes multi-level sampling QUESO::StatisticalInverseProblem<P_V, P_M> invProb( "", // name prefix NULL, // alt options priorRv, // prior RV likelihoodFunctionObj, // likelihood fcn postRv); // posterior RV invProb.solveWithBayesMLSampling(); // compute mean and second moment of samples on each proc via Knuth online mean/variance algorithm int N = invProb.postRv().realizer().subPeriod(); double subMean = 0.0; double subM2 = 0.0; double delta; P_V sample(paramSpace.zeroVector()); for (int n = 1; n <= N; n++) { invProb.postRv().realizer().realization(sample); delta = sample[0] - subMean; subMean += delta / n; subM2 += delta * (sample[0] - subMean); } // gather all Ns, means, and M2s to proc 0 std::vector<int> unifiedNs(env.inter0Comm().NumProc()); std::vector<double> unifiedMeans(env.inter0Comm().NumProc()); std::vector<double> unifiedM2s(env.inter0Comm().NumProc()); MPI_Gather(&N, 1, MPI_INT, &(unifiedNs[0]), 1, MPI_INT, 0, env.inter0Comm().Comm()); MPI_Gather(&subMean, 1, MPI_DOUBLE, &(unifiedMeans[0]), 1, MPI_DOUBLE, 0, env.inter0Comm().Comm()); MPI_Gather(&subM2, 1, MPI_DOUBLE, &(unifiedM2s[0]), 1, MPI_DOUBLE, 0, env.inter0Comm().Comm()); // get the total number of likelihood calls at proc 0 unsigned long totalLikelihoodCalls = 0; MPI_Reduce(&likelihoodCalls, &totalLikelihoodCalls, 1, MPI_UNSIGNED_LONG, MPI_SUM, 0, env.inter0Comm().Comm()); // compute global posterior mean and std via Chan algorithm, output results on proc 0 if (env.inter0Rank() == 0) { int postN = unifiedNs[0]; double postMean = unifiedMeans[0]; double postVar = unifiedM2s[0]; for (unsigned int i = 1; i < unifiedNs.size(); i++) { delta = unifiedMeans[i] - postMean; postMean = (postN * postMean + unifiedNs[i] * unifiedMeans[i]) / (postN + unifiedNs[i]); postVar += unifiedM2s[i] + delta * delta * (((double)postN * unifiedNs[i]) / (postN + unifiedNs[i])); postN += unifiedNs[i]; } postVar /= postN; //compute exact answer - available in this case since the exact posterior is a gaussian N = dat.dataSet.size(); double dataSum = 0.0; for (int i = 0; i < N; i++) dataSum += dat.dataSet[i]; double datMean = dataSum / N; double postMeanExact = (N * priorVar / (N * priorVar + dat.samplingVar)) * datMean + (dat.samplingVar / (N * priorVar + dat.samplingVar)) * priorMean; double postVarExact = 1.0 / (N / 
dat.samplingVar + 1.0 / priorVar); std::cout << "Number of posterior samples: " << postN << std::endl; std::cout << "Estimated posterior mean: " << postMean << " +/- " << std::sqrt(postVar) << std::endl; std::cout << "Likelihood function calls: " << totalLikelihoodCalls << std::endl; std::cout << "\nExact posterior: Gaussian with mean " << postMeanExact << ", standard deviation " << std::sqrt(postVarExact) << std::endl; } }
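The root-side merge above is the pairwise-update formula of Chan et al. for combining per-process (n, mean, M2) triples into global statistics; isolated, it reads:

/* combine two (count, mean, sum-of-squared-deviations) triples */
typedef struct { long n; double mean, m2; } moments;

moments combine(moments a, moments b)
{
    moments c;
    double delta = b.mean - a.mean;
    c.n = a.n + b.n;
    c.mean = (a.n * a.mean + b.n * b.mean) / c.n;
    c.m2 = a.m2 + b.m2 + delta * delta * ((double)a.n * b.n / c.n);
    return c; /* population variance = c.m2 / c.n */
}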
int main (int argc, char *argv[]) { int err; double time, time_limit, time_maxMsg; int iter, iter_limit; size_t size, messStart, messStop, mem_limit; int testFlags, ndims, partsize; int k; char hostname[256]; char* hostnames; int root = 0; struct argList args; /* process the command-line arguments, printing usage info on error */ if (!processArgs(argc, argv, &args)) { usage(); } iter = args.iters; messStart = args.messStart; messStop = args.messStop; mem_limit = args.memLimit; time_limit = args.timeLimit; testFlags = args.testFlags; check_buffers = args.checkBuffers; ndims = args.ndims; partsize = args.partSize; /* initialize MPI */ err = MPI_Init(&argc, &argv); if (err) { printf("Error in MPI_Init\n"); exit(1); } /* determine who we are in the MPI world */ MPI_Comm_rank(MPI_COMM_WORLD, &rank_local); MPI_Comm_size(MPI_COMM_WORLD, &rank_count); #ifdef PRINT_ENV /* Print environment as part of Sequoia SOW MPI requirements */ extern void printEnv(void); if (rank_local == 0) { printEnv(); } #endif /* mark start of mpiBench output */ if (rank_local == 0) { printf("START mpiBench_Bcast v%s\n", VERS); } /* collect hostnames of all the processes and print rank layout */ gethostname(hostname, sizeof(hostname)); hostnames = (char*) _ALLOC_MAIN_(sizeof(hostname)*rank_count, "Hostname array"); MPI_Gather(hostname, sizeof(hostname), MPI_CHAR, hostnames, sizeof(hostname), MPI_CHAR, 0, MPI_COMM_WORLD); if (rank_local == 0) { for(k=0; k<rank_count; k++) { printf("%d : %s\n", k, &hostnames[k*sizeof(hostname)]); } } /* allocate message buffers and initialize timing functions */ while(messStop*((size_t)rank_count)*2 > mem_limit && messStop > 0) messStop /= 2; buffer_size = messStop * rank_count; sbuffer = (char*) _ALLOC_MAIN_(messStop * rank_count, "Send Buffer"); rbuffer = (char*) _ALLOC_MAIN_(messStop * rank_count, "Receive Buffer"); sendcounts = (int*) _ALLOC_MAIN_(sizeof(int) * rank_count, "Send Counts"); sdispls = (int*) _ALLOC_MAIN_(sizeof(int) * rank_count, "Send Displacements"); recvcounts = (int*) _ALLOC_MAIN_(sizeof(int) * rank_count, "Recv Counts"); rdispls = (int*) _ALLOC_MAIN_(sizeof(int) * rank_count, "Recv Displacements"); /*time_maxMsg = 2*time_limit; */ time_maxMsg = 0.0; /* if partsize was specified, calculate the number of partitions we need */ int partitions = 0; if (partsize > 0) { /* keep dividing comm in half until we get to partsize */ int currentsize = rank_count; while (currentsize >= partsize) { partitions++; currentsize >>= 1; } }
mpi_filebuf * mpi_filebuf::flush() { const double start_time = mpi_wall_time(); int result = -1 ; // Failure return value if ( nullptr != comm_buffer && comm_output ) { // Open for write int err = 0 ; result = 0 ; // Determine the local length: char * cur_buf = comm_buffer ; unsigned int cur_len = pptr() - cur_buf ; // Determine the global lengths char * recv_buf = nullptr ; int * recv_len = nullptr ; int * recv_disp = nullptr ; int nproc = 1 ; // if ( nullptr != comm_root_fp ) { // It should not be necessary to allocate recv_len on non-root // nodes, but the MPI_Gatherv on Janus always accesses recv_len // even on non-root processors which causes a segmentation // violation if recv_len is set to nullptr. if ( MPI_SUCCESS != ( err = MPI_Comm_size(comm,&nproc) ) ) MPI_Abort( comm , err ); recv_len = static_cast<int*>(std::malloc( sizeof(int) * nproc )); if ( nullptr == recv_len ) MPI_Abort( comm , MPI_ERR_UNKNOWN ); for (int j = 0 ; j < nproc ; ++j ) recv_len[j] = 0; // } // Gather buffer lengths on the root processor if ( MPI_SUCCESS != ( err = MPI_Gather(&cur_len,1,MPI_INT,recv_len,1,MPI_INT,comm_root,comm))) MPI_Abort( comm , err ); // Root processor must allocate enough buffer space: if ( nullptr != comm_root_fp ) { recv_len[ comm_root ] = 0 ; // Don't send to self if ( nullptr == ( recv_disp = static_cast<int*>(std::malloc( sizeof(int) * (nproc + 1) )) ) ) result = -1 ; if ( 0 == result ) { // Allocation succeeded recv_disp[0] = 0 ; for (int i = 0 ; i < nproc ; ++i ) recv_disp[i+1] = recv_disp[i] + recv_len[i] ; if ( 0 < recv_disp[nproc] ) { if ( nullptr == ( recv_buf = static_cast<char*>(std::malloc( recv_disp[nproc] ) ) )) result = -1 ; } else { result = 1 ; // No need to gather! } if ( -1 != result ) { // Write the root processor's buffer if ( 0 < cur_len ) { if ( std::fwrite(cur_buf,1,cur_len,comm_root_fp) != cur_len ) result = -1 ; // Write failed cur_len = 0 ; // Wrote this buffer } } } std::fflush( comm_root_fp ); } // Root process broadcasts that all is well with the allocation if ( MPI_SUCCESS != ( err = MPI_Bcast(&result,1,MPI_INT,comm_root,comm))) MPI_Abort( comm , err ); if ( 0 == result ) { // All-is-well, need to gather and write // Gather the buffers to the root processor if ( MPI_SUCCESS != ( err = MPI_Gatherv(cur_buf, cur_len, MPI_BYTE, recv_buf, recv_len, recv_disp, MPI_BYTE, comm_root, comm ) ) ) MPI_Abort( comm , err ); // Output the buffers, beginning with 'comm_root' if ( nullptr != comm_root_fp ) { for (int i = 1 ; i < nproc && 0 == result ; ++i ) { const int j = ( i + comm_root ) % nproc ; const unsigned int len = recv_len[j] ; if ( 0 < len ) if ( std::fwrite(recv_buf+recv_disp[j],1,len,comm_root_fp) != len ) result = -1 ; // Write failed } std::fflush( comm_root_fp ); } // Broadcast that the write succeeded if ( MPI_SUCCESS != ( err = MPI_Bcast(&result,1,MPI_INT,comm_root,comm))) MPI_Abort( comm , err ); } else if ( 1 == result ) { // Did not need to gather result = 0 ; } // Reset the output buffer setp( comm_buffer , epptr() ); // Clean up allocated memory if ( nullptr != recv_buf ) std::free( recv_buf ); if ( nullptr != recv_len ) std::free( recv_len ); if ( nullptr != recv_disp ) std::free( recv_disp ); } comm_time += mpi_wall_time() - start_time ; return -1 == result ? nullptr : this ; }
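flush() above is the canonical variable-length gather: MPI_Gather the per-rank byte counts, prefix-sum them into displacements on the root, then MPI_Gatherv the payload. A self-contained C sketch of just that skeleton (error handling omitted; the function name is mine):

#include <mpi.h>
#include <stdlib.h>

/* returns the gathered bytes on the root (caller frees), NULL elsewhere; *total set on root only */
char *gather_bytes(char *buf, int len, int root, MPI_Comm comm, int *total)
{
    int np, rank;
    MPI_Comm_size(comm, &np);
    MPI_Comm_rank(comm, &rank);
    int *lens = NULL, *disp = NULL;
    char *out = NULL;
    if (rank == root) { lens = (int*) malloc(np * sizeof(int)); disp = (int*) malloc((np + 1) * sizeof(int)); }
    MPI_Gather(&len, 1, MPI_INT, lens, 1, MPI_INT, root, comm);
    if (rank == root) {
        disp[0] = 0;
        for (int i = 0; i < np; i++) disp[i + 1] = disp[i] + lens[i]; /* prefix sum -> displacements */
        *total = disp[np];
        out = (char*) malloc(*total > 0 ? *total : 1);
    }
    MPI_Gatherv(buf, len, MPI_BYTE, out, lens, disp, MPI_BYTE, root, comm);
    if (rank == root) { free(lens); free(disp); }
    return out;
}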
int main(int argc, char* argv[]){ int rank, size, n, i, j, elementiXproc, stage, length, next; orderedAfterSwap *m; char *binary; FILE *file; float *elementi, *mieiElementi, *result; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); if(argc<2) { printf("Not enough arguments: %d given, %d required", argc-1, 1); MPI_Abort(MPI_COMM_WORLD, 0); return 1; } if(rank==0) { writeFile(); file = fopen(argv[1],"rb"); if(file==NULL) { printf("Could not open file: %s", argv[1]); MPI_Abort(MPI_COMM_WORLD, 0); return 1; } fread(&n, sizeof(int), 1, file); elementiXproc = n/size; mieiElementi = malloc(sizeof(float)*elementiXproc); elementi = malloc(sizeof(float)*elementiXproc); fread(mieiElementi, sizeof(float), elementiXproc, file); for(i=1; i<size; i++){ MPI_Send (&elementiXproc, 1, MPI_INT, i, 0, MPI_COMM_WORLD); fread(elementi, sizeof(float), elementiXproc, file); MPI_Send (elementi, elementiXproc, MPI_FLOAT, i, 0, MPI_COMM_WORLD); } fclose(file); result = malloc(sizeof(float)*n); } else { MPI_Recv (&elementiXproc, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); mieiElementi = malloc(sizeof(float)*elementiXproc); MPI_Recv (mieiElementi, elementiXproc, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); elementi = malloc(sizeof(float)*elementiXproc); } qsort(mieiElementi, elementiXproc, sizeof(float), floatcomp); length = log(size)/log(2); /* number of hypercube stages; assumes size is a power of two */ binary = intToBinary(rank,length); for(stage=0; stage<length; stage++) { if(binary[stage]=='0'){ binary[stage] = '1'; next = binaryToInt(binary, length); binary[stage] = '0'; MPI_Send (mieiElementi, elementiXproc, MPI_FLOAT, next, 0, MPI_COMM_WORLD); MPI_Recv (elementi, elementiXproc, MPI_FLOAT, next, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); m = swapMin(mieiElementi,elementi,elementiXproc); mieiElementi = m->mieiElementi; } else { binary[stage] = '0'; next = binaryToInt(binary, length); binary[stage] = '1'; MPI_Recv (elementi, elementiXproc, MPI_FLOAT, next, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); MPI_Send (mieiElementi, elementiXproc, MPI_FLOAT, next, 0, MPI_COMM_WORLD); m = swapMax(mieiElementi,elementi,elementiXproc); mieiElementi = m->mieiElementi; } } MPI_Gather(mieiElementi, elementiXproc, MPI_FLOAT, result, elementiXproc, MPI_FLOAT, 0, MPI_COMM_WORLD); if(rank==0){ printf("[ "); for(j=0; j<n; j++) { printf("%f ", result[j]); } printf("] \n"); free(result); } free(m); /* note: only the last swap result is freed here; earlier stage results leak */ free(binary); free(mieiElementi); free(elementi); MPI_Finalize(); return 0; }
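The partner computation above round-trips through a binary string, but in a hypercube exchange the stage-s partner is simply the rank with bit s flipped, and the keep-min/keep-max role follows from that same bit:

/* equivalent partner/role computation via bit operations, no string conversion needed */
int partner_at_stage(int rank, int stage) { return rank ^ (1 << stage); }
int keeps_min(int rank, int stage)       { return (rank & (1 << stage)) == 0; }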
void saveParticle_HDF(Domain D,int iteration,int s,double minPx) { int i,j,k,istart,iend,jstart,jend,kstart,kend; int nxSub,nySub,nzSub,cnt,totalCnt,start,index; int minXSub,minYSub,minZSub; double dx,dy,dz,lambda,tmpDouble; char name[100]; double *saveDouble; int *saveInt,offset[2]; Particle ***particle; particle=D.particle; ptclList *p; LoadList *LL; int myrank, nTasks; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_size(MPI_COMM_WORLD, &nTasks); int recv[nTasks]; void saveParticleComp_Double(); void saveParticleComp_Int(); hid_t file_id; herr_t status; istart=D.istart; iend=D.iend; jstart=D.jstart; jend=D.jend; kstart=D.kstart; kend=D.kend; nxSub=D.nxSub; nySub=D.nySub; nzSub=D.nzSub; dx=D.dx; dy=D.dy; dz=D.dz; lambda=D.lambda; minXSub=D.minXSub; minYSub=D.minYSub; minZSub=D.minZSub; sprintf(name,"%dParticle%d.h5",s,iteration); // plist_id=H5Pcreate(H5P_FILE_ACCESS); // H5Pset_fapl_mpio(plist_id,MPI_COMM_WORLD,MPI_INFO_NULL); // H5Pset_fclose_degree(plist_id,H5F_CLOSE_SEMI); if(myrank==0) { file_id=H5Fcreate(name,H5F_ACC_TRUNC,H5P_DEFAULT,H5P_DEFAULT); H5Fclose(file_id); } else ; MPI_Barrier(MPI_COMM_WORLD); switch(D.dimension) { //2D case 2: k=0; cnt=0; for(i=istart; i<iend; i++) for(j=jstart; j<jend; j++) { p=particle[i][j][k].head[s]->pt; while(p) { if(p->p1>=minPx) cnt++; else ; p=p->next; } } saveDouble = (double *)malloc(cnt*sizeof(double )); saveInt = (int *)malloc(cnt*sizeof(int )); MPI_Gather(&cnt,1,MPI_INT,recv,1,MPI_INT,0,MPI_COMM_WORLD); MPI_Bcast(recv,nTasks,MPI_INT,0,MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); start=0; for(i=0; i<myrank; i++) start+=recv[i]; totalCnt=0; for(i=0; i<nTasks; i++) totalCnt+=recv[i]; if(myrank==0) saveIntMeta(name,"totalCnt",&totalCnt); else ; MPI_Barrier(MPI_COMM_WORLD); index=0; for(i=istart; i<iend; i++) for(j=jstart; j<jend; j++) { p=particle[i][j][k].head[s]->pt; while(p) { if(p->p1>=minPx) { tmpDouble=((i-istart+minXSub)+p->x)*dx*lambda; saveDouble[index]=tmpDouble; index++; } else ; p=p->next; } } MPI_Barrier(MPI_COMM_WORLD); saveParticleComp_Double(saveDouble,name,"x",totalCnt,cnt,start); index=0; for(i=istart; i<iend; i++) for(j=jstart; j<jend; j++) { p=particle[i][j][k].head[s]->pt; while(p) { if(p->p1>=minPx) { tmpDouble=((j-jstart+minYSub)+p->y)*dy*lambda; saveDouble[index]=tmpDouble; index++; } else ; p=p->next; } } MPI_Barrier(MPI_COMM_WORLD); saveParticleComp_Double(saveDouble,name,"y",totalCnt,cnt,start); index=0; for(i=istart; i<iend; i++) for(j=jstart; j<jend; j++) { p=particle[i][j][k].head[s]->pt; while(p) { if(p->p1>=minPx) { saveDouble[index]=p->p1; index++; } else ; p=p->next; } } MPI_Barrier(MPI_COMM_WORLD); saveParticleComp_Double(saveDouble,name,"px",totalCnt,cnt,start); index=0; for(i=istart; i<iend; i++) for(j=jstart; j<jend; j++) { p=particle[i][j][k].head[s]->pt; while(p) { if(p->p1>=minPx) { saveDouble[index]=p->p2; index++; } else ; p=p->next; } } MPI_Barrier(MPI_COMM_WORLD); saveParticleComp_Double(saveDouble,name,"py",totalCnt,cnt,start); index=0; for(i=istart; i<iend; i++) for(j=jstart; j<jend; j++) { p=particle[i][j][k].head[s]->pt; while(p) { if(p->p1>=minPx) { saveDouble[index]=p->p3; index++; } else ; p=p->next; } } MPI_Barrier(MPI_COMM_WORLD); saveParticleComp_Double(saveDouble,name,"pz",totalCnt,cnt,start); index=0; for(i=istart; i<iend; i++) for(j=jstart; j<jend; j++) { p=particle[i][j][k].head[s]->pt; while(p) { if(p->p1>=minPx) { saveInt[index]=p->index; index++; } else ; p=p->next; } } MPI_Barrier(MPI_COMM_WORLD); saveParticleComp_Int(saveInt,name,"index",totalCnt,cnt,start); index=0; 
for(i=istart; i<iend; i++) for(j=jstart; j<jend; j++) { p=particle[i][j][k].head[s]->pt; while(p) { if(p->p1>=minPx) { saveInt[index]=p->core; index++; } else ; p=p->next; } } MPI_Barrier(MPI_COMM_WORLD); saveParticleComp_Int(saveInt,name,"core",totalCnt,cnt,start); free(saveDouble); free(saveInt); break; //3D case 3: cnt=0; for(i=istart; i<iend; i++) for(j=jstart; j<jend; j++) for(k=kstart; k<kend; k++) { p=particle[i][j][k].head[s]->pt; while(p) { if(p->p1>=minPx) cnt++; else ; p=p->next; } } saveDouble = (double *)malloc(cnt*sizeof(double )); saveInt = (int *)malloc(cnt*sizeof(int )); MPI_Gather(&cnt,1,MPI_INT,recv,1,MPI_INT,0,MPI_COMM_WORLD); MPI_Bcast(recv,nTasks,MPI_INT,0,MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); start=0; for(i=0; i<myrank; i++) start+=recv[i]; totalCnt=0; for(i=0; i<nTasks; i++) totalCnt+=recv[i]; if(myrank==0) saveIntMeta(name,"totalCnt",&totalCnt); else ; MPI_Barrier(MPI_COMM_WORLD); index=0; for(i=istart; i<iend; i++) for(j=jstart; j<jend; j++) for(k=kstart; k<kend; k++) { p=particle[i][j][k].head[s]->pt; while(p) { if(p->p1>=minPx) { tmpDouble=((i-istart+minXSub)+p->x)*dx*lambda; saveDouble[index]=tmpDouble; index++; } else ; p=p->next; } } MPI_Barrier(MPI_COMM_WORLD); saveParticleComp_Double(saveDouble,name,"x",totalCnt,cnt,start); index=0; for(i=istart; i<iend; i++) for(j=jstart; j<jend; j++) for(k=kstart; k<kend; k++) { p=particle[i][j][k].head[s]->pt; while(p) { if(p->p1>=minPx) { tmpDouble=((j-jstart+minYSub)+p->y)*dy*lambda; saveDouble[index]=tmpDouble; index++; } else ; p=p->next; } } MPI_Barrier(MPI_COMM_WORLD); saveParticleComp_Double(saveDouble,name,"y",totalCnt,cnt,start); index=0; for(i=istart; i<iend; i++) for(j=jstart; j<jend; j++) for(k=kstart; k<kend; k++) { p=particle[i][j][k].head[s]->pt; while(p) { if(p->p1>=minPx) { tmpDouble=((k-kstart+minZSub)+p->z)*dz*lambda; saveDouble[index]=tmpDouble; index++; } else ; p=p->next; } } MPI_Barrier(MPI_COMM_WORLD); saveParticleComp_Double(saveDouble,name,"z",totalCnt,cnt,start); index=0; for(i=istart; i<iend; i++) for(j=jstart; j<jend; j++) for(k=kstart; k<kend; k++) { p=particle[i][j][k].head[s]->pt; while(p) { if(p->p1>=minPx) { saveDouble[index]=p->p1; index++; } else ; p=p->next; } } MPI_Barrier(MPI_COMM_WORLD); saveParticleComp_Double(saveDouble,name,"px",totalCnt,cnt,start); index=0; for(i=istart; i<iend; i++) for(j=jstart; j<jend; j++) for(k=kstart; k<kend; k++) { p=particle[i][j][k].head[s]->pt; while(p) { if(p->p1>=minPx) { saveDouble[index]=p->p2; index++; } else ; p=p->next; } } MPI_Barrier(MPI_COMM_WORLD); saveParticleComp_Double(saveDouble,name,"py",totalCnt,cnt,start); index=0; for(i=istart; i<iend; i++) for(j=jstart; j<jend; j++) for(k=kstart; k<kend; k++) { p=particle[i][j][k].head[s]->pt; while(p) { if(p->p1>=minPx) { saveDouble[index]=p->p3; index++; } else ; p=p->next; } } MPI_Barrier(MPI_COMM_WORLD); saveParticleComp_Double(saveDouble,name,"pz",totalCnt,cnt,start); index=0; for(i=istart; i<iend; i++) for(j=jstart; j<jend; j++) for(k=kstart; k<kend; k++) { p=particle[i][j][k].head[s]->pt; while(p) { if(p->p1>=minPx) { saveInt[index]=p->index; index++; } else ; p=p->next; } } MPI_Barrier(MPI_COMM_WORLD); saveParticleComp_Int(saveInt,name,"index",totalCnt,cnt,start); index=0; for(i=istart; i<iend; i++) for(j=jstart; j<jend; j++) for(k=kstart; k<kend; k++) { p=particle[i][j][k].head[s]->pt; while(p) { if(p->p1>=minPx) { saveInt[index]=p->core; index++; } else ; p=p->next; } } MPI_Barrier(MPI_COMM_WORLD); saveParticleComp_Int(saveInt,name,"core",totalCnt,cnt,start); free(saveDouble); 
free(saveInt); break; } //End of switch(dimension....) }
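The count bookkeeping above (an MPI_Gather of cnt to rank 0 followed by an MPI_Bcast of the whole array) is equivalent to a single MPI_Allgather, after which every rank can derive its write offset and the global total locally. A sketch:

#include <mpi.h>

/* every rank learns all counts at once; start = sum of counts of lower ranks */
void counts_and_offset(int cnt, int *start, int *totalCnt, MPI_Comm comm)
{
    int np, rank;
    MPI_Comm_size(comm, &np);
    MPI_Comm_rank(comm, &rank);
    int cnts[np]; /* C99 VLA, as in the example above */
    MPI_Allgather(&cnt, 1, MPI_INT, cnts, 1, MPI_INT, comm);
    *start = 0; *totalCnt = 0;
    for (int i = 0; i < np; i++) {
        if (i < rank) *start += cnts[i];
        *totalCnt += cnts[i];
    }
}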
int main(int argc, char * argv[]) { int rank, np; int * D; int * a; int i; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &np); int res=-1; int * results = NULL; /* allocated on the root only; initialized so non-root ranks pass a well-defined (ignored) argument */ srand(rank + time(0)); for(i = 20; i<100; i+=2) { // the matrix that contains the compatibilities D = (int*) malloc( sizeof(int)*i*i ); // the array that contains a solution a = (int*) malloc( sizeof(int)*i ); initArray(a, -1, i); if(rank==0) { //initialize the matrix genMatrix(D, i); // allocate the array to receive the results results = (int*) malloc( sizeof(int)*np ); } // generate a solution genSolution(a, i); //send compatibility matrix and initial solution to other processes MPI_Bcast(D, sizeof(int)*i*i, MPI_BYTE, 0, MPI_COMM_WORLD); //MPI_Bcast(a, sizeof(int)*i, MPI_BYTE, 0, MPI_COMM_WORLD); res = alg2(i, D, a, rank); //MPI_Barrier(MPI_COMM_WORLD); MPI_Gather(&res, 1, MPI_INT, results, 1, MPI_INT, 0, MPI_COMM_WORLD); if(rank==0) { printf("%d\t%d\n", i, getMin(results, np) ); // clean free(results); } free(D); free(a); } MPI_Finalize(); return 0; }
//---------------------------------------------------------------------------// int gatherv(Node &send_node, Node &recv_node, int root, MPI_Comm mpi_comm) { Node n_snd_compact; send_node.compact_to(n_snd_compact); int m_size = mpi::size(mpi_comm); int m_rank = mpi::rank(mpi_comm); std::string schema_str = n_snd_compact.schema().to_json(); int schema_len = schema_str.length() + 1; int data_len = n_snd_compact.total_bytes(); // to do the conduit gatherv, first need a gather to get the // schema and data buffer sizes int snd_sizes[] = {schema_len, data_len}; Node n_rcv_sizes; if( m_rank == root ) { Schema s; s["schema_len"].set(DataType::c_int()); s["data_len"].set(DataType::c_int()); n_rcv_sizes.list_of(s,m_size); } int mpi_error = MPI_Gather( snd_sizes, // local data 2, // two ints per rank MPI_INT, // send ints n_rcv_sizes.data_ptr(), // rcv buffer 2, // two ints per rank MPI_INT, // rcv ints root, // id of root for gather op mpi_comm); // mpi comm CONDUIT_CHECK_MPI_ERROR(mpi_error); Node n_rcv_tmp; int *schema_rcv_counts = NULL; int *schema_rcv_displs = NULL; char *schema_rcv_buff = NULL; int *data_rcv_counts = NULL; int *data_rcv_displs = NULL; char *data_rcv_buff = NULL; // we only need rcv params on the gather root if( m_rank == root ) { // alloc data for the mpi gather counts and displ arrays n_rcv_tmp["schemas/counts"].set(DataType::c_int(m_size)); n_rcv_tmp["schemas/displs"].set(DataType::c_int(m_size)); n_rcv_tmp["data/counts"].set(DataType::c_int(m_size)); n_rcv_tmp["data/displs"].set(DataType::c_int(m_size)); // get pointers to counts and displs schema_rcv_counts = n_rcv_tmp["schemas/counts"].value(); schema_rcv_displs = n_rcv_tmp["schemas/displs"].value(); data_rcv_counts = n_rcv_tmp["data/counts"].value(); data_rcv_displs = n_rcv_tmp["data/displs"].value(); int schema_curr_displ = 0; int data_curr_displ = 0; int i=0; NodeIterator itr = n_rcv_sizes.children(); while(itr.has_next()) { Node &curr = itr.next(); int schema_curr_count = curr["schema_len"].value(); int data_curr_count = curr["data_len"].value(); schema_rcv_counts[i] = schema_curr_count; schema_rcv_displs[i] = schema_curr_displ; schema_curr_displ += schema_curr_count; data_rcv_counts[i] = data_curr_count; data_rcv_displs[i] = data_curr_displ; data_curr_displ += data_curr_count; i++; } n_rcv_tmp["schemas/data"].set(DataType::c_char(schema_curr_displ)); schema_rcv_buff = n_rcv_tmp["schemas/data"].value(); } mpi_error = MPI_Gatherv( const_cast <char*>(schema_str.c_str()), schema_len, MPI_CHAR, schema_rcv_buff, schema_rcv_counts, schema_rcv_displs, MPI_CHAR, root, mpi_comm); CONDUIT_CHECK_MPI_ERROR(mpi_error); // build all schemas from JSON, compact them. Schema rcv_schema; if( m_rank == root ) { //TODO: should we make it easier to create a compact schema? Schema s_tmp; for(int i=0; i < m_size; i++) { Schema &s = s_tmp.append(); s.set(&schema_rcv_buff[schema_rcv_displs[i]]); } s_tmp.compact_to(rcv_schema); } if( m_rank == root ) { // allocate data to hold the gather result recv_node.set(rcv_schema); data_rcv_buff = (char*)recv_node.data_ptr(); } mpi_error = MPI_Gatherv( n_snd_compact.data_ptr(), data_len, MPI_CHAR, data_rcv_buff, data_rcv_counts, data_rcv_displs, MPI_CHAR, root, mpi_comm); CONDUIT_CHECK_MPI_ERROR(mpi_error); return mpi_error; }
int main(int argc, char **argv) { int rank, M, j,i, *d_graph; int *local_matrix, *row_matrix, *col_matrix, *res_matrix, *rowIds, *colIds; int P, N, q, p_row, p_col; double start, finish; MPI_Status status; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &P); MPI_Comm_rank(MPI_COMM_WORLD, &rank); //INPUT HANDLED BY THE ROOT PROCESSOR if (rank == ROOT){ scanf("%d", &N); q = check_fox_conditions(P,N); //Checks whether Fox's conditions are met if(q == 0){ MPI_Abort(MPI_COMM_WORLD, 0); return 1; //error } d_graph = (int*)malloc((N*N) * sizeof(int)); for(i=0; i < N; i++){ for(j=0; j < N; j++){ scanf("%d", &d_graph[GET_MTRX_POS(i,j,N)]); if (d_graph[GET_MTRX_POS(i,j,N)] == 0 && i != j) { d_graph[GET_MTRX_POS(i,j,N)] = INF; } } } MPI_Bcast(&q, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD); if(q > 1) divide_matrix( d_graph, N, q); } else{ MPI_Bcast(&q, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD); } //---------------COMMON------------------ int lngth = N / q; local_matrix = (int*)malloc((lngth*lngth) * sizeof(int)); row_matrix = (int*)malloc((lngth*lngth) * sizeof(int)); col_matrix = (int*)malloc((lngth*lngth) * sizeof(int)); res_matrix = (int*)malloc((lngth*lngth) * sizeof(int)); if(q>1) chnkd_MPI_Recv(local_matrix, lngth*lngth, MPI_INT, 0); else local_matrix = d_graph; p_row = ( rank / q ); p_col = ( rank % q ); //CREATE COMMUNICATORS MPI_Group MPI_GROUP_WORLD; MPI_Comm_group(MPI_COMM_WORLD, &MPI_GROUP_WORLD); MPI_Group row_group, col_group; MPI_Comm row_comm, col_comm, grid_comm; int tmp_row, tmp_col, proc; int row_process_ranks[q], col_process_ranks[q]; for(proc = 0; proc < q; proc++){ row_process_ranks[proc] = (p_row * q) + proc; col_process_ranks[proc] = ((p_col + proc*q) %(q*q)); } radixsort(col_process_ranks, q); radixsort(row_process_ranks, q); MPI_Group_incl(MPI_GROUP_WORLD, q, row_process_ranks, &row_group); MPI_Group_incl(MPI_GROUP_WORLD, q, col_process_ranks, &col_group); MPI_Comm_create(MPI_COMM_WORLD, row_group, &row_comm); MPI_Comm_create(MPI_COMM_WORLD, col_group, &col_comm); if ((rank / q) == (rank % q)) { memcpy(row_matrix, local_matrix, (lngth*lngth) * sizeof(int)); } int ln,d,flag; int step, rotation_src, rotation_dest, src; int count = 0; memcpy(res_matrix, local_matrix, (lngth*lngth) * sizeof(int)); rotation_src = (p_row + 1) % q; rotation_dest = ((p_row - 1) + q) % q; ln = (lngth*q) << 1; start = MPI_Wtime(); for (d = 2; d < ln; d = d << 1) { memcpy(col_matrix, local_matrix, (lngth*lngth) * sizeof(int)); for ( step = 0; step < q; step++) { src = (p_row + step) % q; count++; if (src == p_col) { MPI_Bcast(local_matrix, lngth*lngth, MPI_INT, src, row_comm); floyd_warshall( local_matrix, col_matrix, res_matrix, lngth); } else { MPI_Bcast(row_matrix, lngth*lngth, MPI_INT, src, row_comm); floyd_warshall( row_matrix, col_matrix, res_matrix, lngth); } if( step < q-1) MPI_Sendrecv_replace(col_matrix, lngth*lngth, MPI_INT, rotation_dest, STD_TAG,rotation_src, STD_TAG, col_comm, MPI_STATUS_IGNORE); } memcpy(local_matrix, res_matrix, (lngth*lngth) * sizeof(int)); } int *sol; sol = malloc(N*N*sizeof(int)); MPI_Gather(res_matrix, lngth*lngth, MPI_INT, sol, lngth*lngth, MPI_INT, 0, MPI_COMM_WORLD); if (rank == 0) { finish = MPI_Wtime(); printf("Execution time %f\n",finish - start); } if (rank == 0) { int row, col, pos_x, pos_y, pos, tmp_y, tmp_x; for (i = 0; i < P; i++) { pos_x = i / q; pos_y = i % q; pos = i * lngth*lngth; for (row = 0; row < lngth; row++) { for (col = 0; col < lngth; col++) { tmp_x = 
GET_MTRX_POS(pos_x,row,lngth); tmp_y = GET_MTRX_POS(pos_y,col,lngth); if (sol[GET_MTRX_POS(row,col,lngth) + pos] == INF) d_graph[GET_MTRX_POS(tmp_x,tmp_y,N)] = 0; else d_graph[GET_MTRX_POS(tmp_x,tmp_y,N)] = sol[GET_MTRX_POS(row,col,lngth) + pos]; } } } prints_matrix(d_graph,N); } MPI_Finalize(); return 0; }
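The root-side reshuffle above can be avoided altogether: describe one lngth x lngth block of the N x N matrix as a subarray type, resize its extent to lngth ints, and let MPI_Gatherv drop each rank's block straight into place. A hedged sketch of that alternative (not what the program above does; the function name is mine):

#include <mpi.h>

/* gather per-rank lngth x lngth blocks directly into the N x N matrix on rank 0 */
void gather_blocks(int *block, int *global, int N, int q, MPI_Comm comm)
{
    int lngth = N / q, np = q * q;
    int sizes[2] = {N, N}, subsizes[2] = {lngth, lngth}, starts[2] = {0, 0};
    MPI_Datatype sub, blk;
    MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &sub);
    MPI_Type_create_resized(sub, 0, (MPI_Aint)(lngth * sizeof(int)), &blk);
    MPI_Type_commit(&blk);
    int counts[np], displs[np];            /* significant on the root only */
    for (int r = 0; r < np; r++) {
        counts[r] = 1;                     /* one block per rank */
        displs[r] = (r / q) * N + (r % q); /* in units of the resized extent (lngth ints) */
    }
    MPI_Gatherv(block, lngth * lngth, MPI_INT, global, counts, displs, blk, 0, comm);
    MPI_Type_free(&blk); MPI_Type_free(&sub);
}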
int main(int argc, char *argv[]) { int rank, nprocs, i, *counter_mem, *get_array, *get_idx, *acc_idx, mask, nlevels, level, idx, tmp_rank, pof2; MPI_Datatype get_type, acc_type; MPI_Win win; int errs = 0, *results, *counter_vals; MTest_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD,&nprocs); MPI_Comm_rank(MPI_COMM_WORLD,&rank); if (rank == 0) { /* allocate counter memory and initialize to 0 */ /* find the next power-of-two >= nprocs */ pof2 = 1; while (pof2 < nprocs) pof2 *= 2; /* counter_mem = (int *) calloc(pof2*2, sizeof(int)); */ i = MPI_Alloc_mem(pof2*2*sizeof(int), MPI_INFO_NULL, &counter_mem); if (i) { printf("Can't allocate memory in test program\n"); MPI_Abort(MPI_COMM_WORLD, 1); } for (i=0; i<(pof2*2); i++) counter_mem[i] = 0; MPI_Win_create(counter_mem, pof2*2*sizeof(int), sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &win); MPI_Win_free(&win); /* free(counter_mem) */ MPI_Free_mem(counter_mem); /* gather the results from other processes, sort them, and check whether they represent a counter being incremented by 1 */ results = (int *) malloc(NTIMES*nprocs*sizeof(int)); for (i=0; i<NTIMES*nprocs; i++) results[i] = -1; MPI_Gather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, results, NTIMES, MPI_INT, 0, MPI_COMM_WORLD); qsort(results+NTIMES, NTIMES*(nprocs-1), sizeof(int), compar); for (i=NTIMES+1; i<(NTIMES*nprocs); i++) if (results[i] != results[i-1] + 1) errs++; free(results); } else { /* Get the largest power of two smaller than nprocs */ mask = 1; nlevels = 0; while (mask < nprocs) { mask <<= 1; nlevels++; } mask >>= 1; get_array = (int *) malloc(nlevels * sizeof(int)); get_idx = (int *) malloc(nlevels * sizeof(int)); acc_idx = (int *) malloc(nlevels * sizeof(int)); level = 0; idx = 0; tmp_rank = rank; while (mask >= 1) { if (tmp_rank < mask) { /* go to left for acc_idx, go to right for get_idx. set idx=acc_idx for next iteration */ acc_idx[level] = idx + 1; get_idx[level] = idx + mask*2; idx = idx + 1; } else { /* go to right for acc_idx, go to left for get_idx. set idx=acc_idx for next iteration */ acc_idx[level] = idx + mask*2; get_idx[level] = idx + 1; idx = idx + mask*2; } level++; tmp_rank = tmp_rank % mask; mask >>= 1; } /* for (i=0; i<nlevels; i++) printf("Rank %d, acc_idx[%d]=%d, get_idx[%d]=%d\n", rank, i, acc_idx[i], i, get_idx[i]); */ MPI_Type_create_indexed_block(nlevels, 1, get_idx, MPI_INT, &get_type); MPI_Type_create_indexed_block(nlevels, 1, acc_idx, MPI_INT, &acc_type); MPI_Type_commit(&get_type); MPI_Type_commit(&acc_type); /* allocate array to store the values obtained from the fetch-and-add counter */ counter_vals = (int *) malloc(NTIMES * sizeof(int)); MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win); for (i=0; i<NTIMES; i++) { Get_nextval_tree(win, get_array, get_type, acc_type, nlevels, counter_vals+i); /* printf("Rank %d, counter %d\n", rank, value); */ } MPI_Win_free(&win); free(get_array); free(get_idx); free(acc_idx); MPI_Type_free(&get_type); MPI_Type_free(&acc_type); /* gather the results to the root */ MPI_Gather(counter_vals, NTIMES, MPI_INT, NULL, 0, MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD); free(counter_vals); } MTest_Finalize(errs); MPI_Finalize(); return MTestReturnValue( errs ); }
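Note how the two MPI_Gather call sites above pair up into one collective: the root passes MPI_IN_PLACE, leaving its own slot in results untouched (which is why the verification skips the first NTIMES entries), while the other ranks pass receive arguments that MPI ignores on non-roots. Schematically:

#include <mpi.h>

/* root contributes "in place"; non-roots contribute vals and receive nothing */
void gather_in_place(int *vals, int n, int *results, int rank, MPI_Comm comm)
{
    if (rank == 0)
        MPI_Gather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, results, n, MPI_INT, 0, comm);
    else
        MPI_Gather(vals, n, MPI_INT, NULL, 0, MPI_DATATYPE_NULL, 0, comm);
}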
/**
 * Create a parms_Map object based on the output of ParMetis.
 *
 * @param self    A parms_Map object created.
 * @param vtxdist An integer array of size np+1, where np is the number of
 *                PEs. This array indicates the range of vertices that are
 *                local to each processor: PE i stores vertices in the
 *                range [vtxdist[i], vtxdist[i+1]).
 * @param part    An array of size equal to the number of locally-stored
 *                vertices. part[j] indicates the ID of the PE to which the
 *                vertex with local index j and global index vtxdist[pid]+j
 *                belongs (pid is the ID of the local PE).
 * @param comm    MPI communicator.
 * @param offset  The start index:
 *                - 1 FORTRAN
 *                - 0 C
 * @param dof     The number of variables associated with each vertex.
 * @param vtype   Assuming the variables u_i, v_i are associated with
 *                vertex i, two styles of numbering variables are as
 *                follows:
 *                - INTERLACED.    Variables are numbered in the order
 *                  \f$u_1, v_1, u_2, v_2, \cdots\f$;
 *                - NONINTERLACED. Variables are numbered in the order
 *                  \f$u_1, u_2, u_3, \cdots, v_1, v_2, \cdots\f$.
 *
 * @return 0 on success.
 */
int parms_MapCreateFromDist(parms_Map *self, int *vtxdist, int *part,
                            MPI_Comm comm, int offset, int dof,
                            VARSTYPE vtype)
{
  parms_Map newMap;
  int npro, pid, i, j, l, gsize;
  int gv_size, lsize, nl, ind, gindex;
  int *num, *nums, *num_rcv, *disp, *snd_buf, *rcv_buf;
  MPI_Comm newComm;

  MPI_Comm_dup(comm, &newComm);
  PARMS_NEW0((newMap));
  newMap->ref = 1;
  MPI_Comm_rank(newComm, &newMap->pid);
  MPI_Comm_size(newComm, &newMap->npro);
  newMap->comm = newComm;
  npro = newMap->npro;
  pid  = newMap->pid;

  /* the number of local vertices */
  nl = vtxdist[pid+1] - vtxdist[pid];
  /* the total number of vertices */
  gsize = vtxdist[npro] - vtxdist[0];
  /* the total number of variables */
  gv_size = newMap->gsize = gsize * dof;
  newMap->start = offset;
  newMap->dof = dof;
  newMap->vtype = vtype;
  newMap->isserial = false;
  if (newMap->npro == 1) {
    newMap->isserial = true;
  }
  newMap->isperm = false;
  newMap->isvecperm = false;
  newMap->ispermalloc = false;
  newMap->isdatalloc = false;

  if (!newMap->isserial) {
    PARMS_NEWARRAY(snd_buf, nl);
    /* create a hash table */
    parms_TableCreate(&newMap->table, NULL, nl);
    PARMS_NEWARRAY0(num, npro);
    PARMS_NEWARRAY(nums, npro);
    /* num[i] stores the number of locally-stored variables being
       distributed to PE i */
    for (i = 0; i < nl; i++) {
      num[part[i]-offset]++;
    }
    MPI_Allreduce(num, nums, npro, MPI_INT, MPI_SUM, newComm);
    /* nums[i] stores the number of variables on PE i */
    lsize = newMap->lsize = nums[pid]*dof;
    PARMS_NEWARRAY(newMap->lvars, lsize);
    PARMS_FREE(nums);

    PARMS_NEWARRAY(disp, npro+1);
    PARMS_NEWARRAY(num_rcv, npro);
    /* num_rcv stores the number of data items received from other
       processors */
    for (i = 0; i < npro; i++) {
      MPI_Gather(&num[i], 1, MPI_INT, num_rcv, 1, MPI_INT, i, newComm);
      /* snd_buf stores the data sent to PE i */
      ind = 0;
      for (j = 0; j < nl; j++) {
        if (part[j]-offset == i) {
          snd_buf[ind++] = vtxdist[pid] + j - offset;
        }
      }
      if (pid == i) {
        /* disp[i] specifies the displacement relative to rcv_buf at which
           to place the incoming data from PE i */
        disp[0] = 0;
        for (j = 0; j < npro; j++) {
          disp[j+1] = disp[j] + num_rcv[j];
        }
        /* gather the incoming vertex list into rcv_buf (stored C-style);
           lvars is then filled from it according to the numbering style */
        PARMS_NEWARRAY(rcv_buf, disp[npro]);
        MPI_Gatherv(snd_buf, num[i], MPI_INT, rcv_buf, num_rcv, disp,
                    MPI_INT, i, newComm);
        if (vtype == INTERLACED) {
          ind = 0;
          for (j = 0; j < disp[npro]; j++) {
            for (l = 0; l < dof; l++) {
              gindex = dof*rcv_buf[j]+l;
              parms_TablePut(newMap->table, gindex, ind);
              newMap->lvars[ind++] = gindex;
            }
          }
        }
        else if (vtype == NONINTERLACED) {
          for (j = 0; j < disp[npro]; j++) {
            newMap->lvars[j] = rcv_buf[j];
            parms_TablePut(newMap->table, rcv_buf[j], j);
          }
          ind = disp[npro];
          for (j = 0; j < disp[npro]; j++) {
            for (l = 1; l < dof; l++) {
              gindex = gsize*l + rcv_buf[j];
              parms_TablePut(newMap->table, gindex, ind);
              newMap->lvars[ind++] = gindex;
            }
          }
        }
        PARMS_FREE(rcv_buf);
      }
      else {
        /* the receive arguments are ignored on non-root ranks */
        MPI_Gatherv(snd_buf, num[i], MPI_INT, NULL, NULL, NULL,
                    MPI_INT, i, newComm);
      }
    }
    PARMS_FREE(snd_buf);
    PARMS_FREE(num);
    PARMS_FREE(num_rcv);
    PARMS_FREE(disp);

    newMap->ispermalloc = true;
    PARMS_NEWARRAY0(newMap->perm,  lsize);
    PARMS_NEWARRAY0(newMap->iperm, lsize);
    for (i = 0; i < lsize; i++) {
      newMap->perm[i] = -1;
    }
  }
  else {
    lsize = gv_size;
    newMap->lsize = gv_size;
  }

  newMap->nint  = lsize;
  newMap->ninf  = 0;
  newMap->n_ext = 0;
  *self = newMap;

  /* Define a complex data type for MPI if complex code is compiled. */
#if defined(DBL_CMPLX)
  parms_InitComplex();
#endif
  return 0;
}
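A hedged usage sketch of the function above; the toy vtxdist/part values, the identity partition, and the parms_MapFree() call are illustrative assumptions (in practice part[] would come from ParMetis, e.g. ParMETIS_V3_PartKway):

/* Hedged usage sketch: two PEs, four vertices each, one variable per
   vertex. Error handling omitted; the pARMS umbrella header and
   parms_MapFree() are assumed available. */
#include "parms.h"

void example_map(MPI_Comm comm)
{
    parms_Map map;
    int pid;
    MPI_Comm_rank(comm, &pid);

    /* PE 0 owns global vertices 0..3, PE 1 owns 4..7 */
    int vtxdist[3] = {0, 4, 8};
    /* toy partition: every vertex stays on its current PE */
    int part[4] = {pid, pid, pid, pid};

    parms_MapCreateFromDist(&map, vtxdist, part, comm,
                            0 /* C-style indexing */,
                            1 /* dof */, INTERLACED);
    /* ... build matrices and vectors on the map ... */
    parms_MapFree(&map);
}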
#include <stdio.h>
#include <sys/time.h>
#include <mpi.h>

/* Gather the distributed Game-of-Life board onto rank 0 and print it.
   Each rank owns effective_cols_size columns, of which the first and last
   are ghost columns. Returns the time spent in MPI_Gather (ms). */
int DisplayGoL(int N, int effective_cols_size,
               int matrix[N][effective_cols_size], int rank)
{
    int realColumnSize = effective_cols_size - 2;  /* ghost columns excluded */
    int arraySize = N * realColumnSize;
    int tempArray[arraySize];
    int tempTempArray[N*N];
    int displaymatrix[N][N];
    int count = 0;
    int r, c;
    int currentGatherTime = 0;
    struct timeval send1s, send1e;

    /* Pack the local columns (column-major) into a contiguous send buffer. */
    for (c = 1; c < effective_cols_size-1; c++) {
        for (r = 0; r < N; r++) {
            tempArray[count++] = matrix[r][c];
        }
    }

    gettimeofday(&send1s, NULL);
    /* The receive arguments are only significant at the root. */
    MPI_Gather(tempArray, N*realColumnSize, MPI_INT,
               rank == 0 ? tempTempArray : NULL, N*realColumnSize, MPI_INT,
               0, MPI_COMM_WORLD);
    gettimeofday(&send1e, NULL);
    currentGatherTime += (send1e.tv_sec - send1s.tv_sec)*1000
                       + (send1e.tv_usec - send1s.tv_usec)/1000;

    if (rank == 0) {
        /* Unpack: element c of the gathered buffer is row c%N of global
           column c/N, since every column arrives as N consecutive ints. */
        for (c = 0; c < N*N; c++) {
            displaymatrix[c%N][c/N] = tempTempArray[c];
        }
        for (r = 0; r < N; r++) {
            for (c = 0; c < N; c++)
                printf("V_G-%d-%d = %d ", r, c, displaymatrix[r][c]);
            printf("\n");
        }
    }
    return currentGatherTime;
}
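A hedged sketch of how the returned gather time might be consumed by a driver; NUM_GENERATIONS, the update step, and the reduction are illustrative:

/* Hedged driver sketch: accumulate the per-call gather time over a run
   and report the slowest rank's total at the end. */
int totalGatherTime = 0, maxGatherTime;
for (int gen = 0; gen < NUM_GENERATIONS; gen++) {
    /* ... advance the board one generation ... */
    totalGatherTime += DisplayGoL(N, effective_cols_size, matrix, rank);
}
MPI_Reduce(&totalGatherTime, &maxGatherTime, 1, MPI_INT, MPI_MAX,
           0, MPI_COMM_WORLD);
if (rank == 0)
    printf("max total gather time: %d ms\n", maxGatherTime);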
/*************** REQ_GETPARTS ************/
/* The surrounding ESPResSo headers (cells, grid, communication, ...) are
   assumed to be included by this translation unit. */
void cuda_mpi_get_particles(CUDA_particle_data *particle_data_host)
{
  int n_part;
  int g, pnode;
  Cell *cell;
  int c;
  MPI_Status status;
  int i;
  int *sizes = (int *) Utils::malloc(sizeof(int)*n_nodes);

  n_part = cells_get_n_particles();

  /* first collect the number of particles on each node */
  MPI_Gather(&n_part, 1, MPI_INT, sizes, 1, MPI_INT, 0, comm_cart);

  if (this_node > 0) {
    /* slaves send their particle data to the master */
    cuda_mpi_get_particles_slave();
  }
  else {
    /* master: fetch particle information into 'particle_data_host' */
    g = 0;
    for (pnode = 0; pnode < n_nodes; pnode++) {
      if (sizes[pnode] > 0) {
        if (pnode == 0) {
          for (c = 0; c < local_cells.n; c++) {
            Particle *part;
            int npart;
            int dummy[3] = {0,0,0};
            double pos[3];

            cell = local_cells.cell[c];
            part = cell->part;
            npart = cell->n;
            for (i = 0; i < npart; i++) {
              /* fold the position into the primary box before converting */
              memmove(pos, part[i].r.p, 3*sizeof(double));
              fold_position(pos, dummy);

              particle_data_host[i+g].p[0] = (float)pos[0];
              particle_data_host[i+g].p[1] = (float)pos[1];
              particle_data_host[i+g].p[2] = (float)pos[2];

              particle_data_host[i+g].v[0] = (float)part[i].m.v[0];
              particle_data_host[i+g].v[1] = (float)part[i].m.v[1];
              particle_data_host[i+g].v[2] = (float)part[i].m.v[2];
#ifdef IMMERSED_BOUNDARY
              particle_data_host[i+g].isVirtual = part[i].p.isVirtual;
#endif
#ifdef DIPOLES
              particle_data_host[i+g].dip[0] = (float)part[i].r.dip[0];
              particle_data_host[i+g].dip[1] = (float)part[i].r.dip[1];
              particle_data_host[i+g].dip[2] = (float)part[i].r.dip[2];
#endif
#ifdef SHANCHEN
              // SAW TODO: does this really need to be copied every time?
              int ii;
              for (ii = 0; ii < 2*LB_COMPONENTS; ii++) {
                particle_data_host[i+g].solvation[ii] =
                    (float)part[i].p.solvation[ii];
              }
#endif
#ifdef LB_ELECTROHYDRODYNAMICS
              particle_data_host[i+g].mu_E[0] = (float)part[i].p.mu_E[0];
              particle_data_host[i+g].mu_E[1] = (float)part[i].p.mu_E[1];
              particle_data_host[i+g].mu_E[2] = (float)part[i].p.mu_E[2];
#endif
#ifdef ELECTROSTATICS
              particle_data_host[i+g].q = (float)part[i].p.q;
#endif
#ifdef ROTATION
              particle_data_host[i+g].quatu[0] = (float)part[i].r.quatu[0];
              particle_data_host[i+g].quatu[1] = (float)part[i].r.quatu[1];
              particle_data_host[i+g].quatu[2] = (float)part[i].r.quatu[2];
#endif
#ifdef ENGINE
              particle_data_host[i+g].swim.v_swim   = (float)part[i].swim.v_swim;
              particle_data_host[i+g].swim.f_swim   = (float)part[i].swim.f_swim;
              particle_data_host[i+g].swim.quatu[0] = (float)part[i].r.quatu[0];
              particle_data_host[i+g].swim.quatu[1] = (float)part[i].r.quatu[1];
              particle_data_host[i+g].swim.quatu[2] = (float)part[i].r.quatu[2];
#if defined(LB) || defined(LB_GPU)
              particle_data_host[i+g].swim.push_pull     = part[i].swim.push_pull;
              particle_data_host[i+g].swim.dipole_length =
                  (float)part[i].swim.dipole_length;
#endif
              particle_data_host[i+g].swim.swimming = part[i].swim.swimming;
#endif
            }
            g += npart;
          }
        }
        else {
          MPI_Recv(&particle_data_host[g],
                   sizes[pnode]*sizeof(CUDA_particle_data), MPI_BYTE,
                   pnode, REQ_CUDAGETPARTS, comm_cart, &status);
          g += sizes[pnode];
        }
      }
    }
  }
  COMM_TRACE(fprintf(stderr, "%d: finished get\n", this_node));
  free(sizes);
}
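The slave counterpart cuda_mpi_get_particles_slave() is called but not shown. A hedged sketch of its shape, as implied by the master loop above; the field-by-field packing (elided) would mirror the pnode == 0 branch, and the real ESPResSo function may differ in detail:

/* Hedged sketch of the slave side: the count was already contributed to
   the MPI_Gather above, so the slave only packs its local particles and
   ships them as raw bytes with the matching tag. */
static void cuda_mpi_get_particles_slave()
{
  int n_part = cells_get_n_particles();
  if (n_part == 0) return;  /* master skips nodes with sizes[pnode] == 0 */

  CUDA_particle_data *buf = (CUDA_particle_data *)
      Utils::malloc(n_part * sizeof(CUDA_particle_data));
  /* ... fill buf[] from local_cells exactly as the pnode == 0 branch
     fills particle_data_host[] ... */
  MPI_Send(buf, n_part*sizeof(CUDA_particle_data), MPI_BYTE,
           0, REQ_CUDAGETPARTS, comm_cart);
  free(buf);
}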
void cuda_mpi_send_forces(float *host_forces, float *host_torques,
                          CUDA_fluid_composition *host_composition)
{
  int n_part;
  int g, pnode;
  Cell *cell;
  int c;
  int i;
  int *sizes = (int *) Utils::malloc(sizeof(int)*n_nodes);

  n_part = cells_get_n_particles();

  /* first collect the number of particles on each node */
  MPI_Gather(&n_part, 1, MPI_INT, sizes, 1, MPI_INT, 0, comm_cart);

  if (this_node > 0) {
    /* slaves receive their forces from the master */
    cuda_mpi_send_forces_slave();
  }
  else {
    /* master: apply its own slice, then send each slave its slice */
    g = 0;
    for (pnode = 0; pnode < n_nodes; pnode++) {
      if (sizes[pnode] > 0) {
        if (pnode == 0) {
          for (c = 0; c < local_cells.n; c++) {
            int npart;
            cell = local_cells.cell[c];
            npart = cell->n;
            for (i = 0; i < npart; i++) {
              cell->part[i].f.f[0] += (double)host_forces[(i+g)*3+0];
              cell->part[i].f.f[1] += (double)host_forces[(i+g)*3+1];
              cell->part[i].f.f[2] += (double)host_forces[(i+g)*3+2];
#ifdef ROTATION
              cell->part[i].f.torque[0] += (double)host_torques[(i+g)*3+0];
              cell->part[i].f.torque[1] += (double)host_torques[(i+g)*3+1];
              cell->part[i].f.torque[2] += (double)host_torques[(i+g)*3+2];
#endif
#ifdef SHANCHEN
              for (int ii = 0; ii < LB_COMPONENTS; ii++) {
                cell->part[i].r.composition[ii] =
                    (double)host_composition[i+g].weight[ii];
              }
#endif
            }
            g += npart;
          }
        }
        else {
          /* send the slave node its slice of the host arrays */
          MPI_Send(&host_forces[3*g], 3*sizes[pnode]*sizeof(float), MPI_BYTE,
                   pnode, REQ_CUDAGETFORCES, comm_cart);
#ifdef ROTATION
          MPI_Send(&host_torques[3*g], 3*sizes[pnode]*sizeof(float), MPI_BYTE,
                   pnode, REQ_CUDAGETFORCES, comm_cart);
#endif
#ifdef SHANCHEN
          MPI_Send(&host_composition[g],
                   sizes[pnode]*sizeof(CUDA_fluid_composition), MPI_BYTE,
                   pnode, REQ_CUDAGETPARTS, comm_cart);
#endif
          g += sizes[pnode];
        }
      }
    }
  }
  COMM_TRACE(fprintf(stderr, "%d: finished send\n", this_node));
  free(sizes);
}
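Here the counts travel through MPI_Gather while the payloads go point-to-point, which lets the #ifdef-dependent arrays be shipped independently. If only the force array were needed, a single collective could replace the send loop; a hedged alternative sketch (all ranks would call it; local_forces and the counts/displs setup are illustrative names, not part of the code above):

/* Hedged alternative sketch: one MPI_Scatterv for the force array.
   sizes[] is only meaningful at the root, which matches Scatterv's
   semantics (sendcounts/displs are ignored elsewhere); local_forces is
   an illustrative 3*n_part float buffer on each rank. */
int *counts = NULL, *displs = NULL;
if (this_node == 0) {
  counts = (int *) Utils::malloc(n_nodes*sizeof(int));
  displs = (int *) Utils::malloc(n_nodes*sizeof(int));
  int off = 0;
  for (int p = 0; p < n_nodes; p++) {
    counts[p] = 3*sizes[p];  /* three floats per particle */
    displs[p] = off;
    off += counts[p];
  }
}
MPI_Scatterv(host_forces, counts, displs, MPI_FLOAT,
             local_forces, 3*n_part, MPI_FLOAT, 0, comm_cart);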
/* NX, NY, NO_STEPS, ROOT, the dmn type and the Manage_*/Call_* helpers
   come from this project's header. */
int main(int argc, char *argv[])
{
    // Auxiliary variables
    int rank;
    int npcs;
    int step;
    dmn domain;
    double wtime;

    // Solution arrays
    double *g_u;  // allocated on ROOT only
    double *t_u;
    double *t_un;

    // Initialize MPI
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &npcs);

    // Manage domain sizes
    domain = Manage_Domain(rank, npcs);

    // Allocate memory
    Manage_Memory(0, domain, &g_u, &t_u, &t_un);

    // ROOT mode: build the initial condition, then scatter it to all processes
    // (t_u+NX*NY skips the leading ghost slab; g_u is significant only at ROOT)
    if (domain.rank == ROOT) Call_IC(2, g_u);
    MPI_Scatter(g_u, domain.size, MPI_DOUBLE, t_u+NX*NY, domain.size,
                MPI_DOUBLE, ROOT, MPI_COMM_WORLD);

    // Exchange halo regions
    Manage_Comms(domain, &t_u);
    MPI_Barrier(MPI_COMM_WORLD);

    // ROOT mode: record the starting time
    if (rank == ROOT) wtime = MPI_Wtime();

    // Asynchronous MPI solver
    for (step = 0; step < NO_STEPS; step += 2) {
        // print the iteration in ROOT mode
        if (rank == ROOT && step%10000 == 0)
            printf(" Step %d of %d\n", step, (int)NO_STEPS);

        // Exchange boundaries and compute the stencil
        Call_Laplace(domain, &t_u, &t_un); Manage_Comms(domain, &t_un); // 1st iter
        Call_Laplace(domain, &t_un, &t_u); Manage_Comms(domain, &t_u);  // 2nd iter
    }
    MPI_Barrier(MPI_COMM_WORLD);

    // ROOT mode: record the final time
    if (rank == ROOT) {
        wtime = MPI_Wtime() - wtime;
        printf("\n Wall clock elapsed seconds = %f\n\n", wtime);
    }

    // Gather the solution to ROOT and write it out in ROOT mode
    MPI_Gather(t_u+NX*NY, domain.size, MPI_DOUBLE, g_u, domain.size,
               MPI_DOUBLE, ROOT, MPI_COMM_WORLD);
    if (rank == ROOT) Save_Results(g_u);

    // Free memory
    Manage_Memory(1, domain, &g_u, &t_u, &t_un);
    MPI_Barrier(MPI_COMM_WORLD);

    // Terminate MPI
    MPI_Finalize();

    // ROOT mode: terminate
    if (rank == ROOT) {
        printf("HEAT_MPI:\n");
        printf(" Normal end of execution.\n\n");
    }
    return 0;
}
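Manage_Memory() is project-specific. A hedged sketch of the allocation that the t_u+NX*NY offsets above imply, a 1-D z-split with one NX*NY ghost slab on each side of the local block; NZ and the exact dmn layout are assumptions:

/* Hedged sketch: assumes dmn carries at least {rank, size} and that the
   global box is NX*NY*NZ split evenly along z, so domain.size is the
   number of interior cells owned by this rank. */
void Manage_Memory(int phase, dmn domain, double **g_u, double **t_u, double **t_un)
{
    if (phase == 0) {
        /* whole box on ROOT only; local block plus two ghost slabs everywhere */
        *g_u  = (domain.rank == ROOT)
                ? (double*)malloc(NX*NY*NZ*sizeof(double)) : NULL;
        *t_u  = (double*)calloc(domain.size + 2*NX*NY, sizeof(double));
        *t_un = (double*)calloc(domain.size + 2*NX*NY, sizeof(double));
    } else {
        if (domain.rank == ROOT) free(*g_u);
        free(*t_u);
        free(*t_un);
    }
}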