/*
 * Advance the reader to the next step of the stream.
 *
 * Sequence, as performed below:
 *   1. Synchronize all reader ranks on fp->comm.
 *   2. Send a STEP flush request to the writer coordinator and keep polling
 *      (sleep via CMsleep, then re-send the request) while our step still
 *      equals fp->last_writer_step.
 *   3. Send a close message on every bridge that is both created and opened,
 *      then synchronize the ranks again.
 *   4. Increment adiosfile->current_step and mirror it into fp->mystep.
 *   5. Send an open message on every bridge that is created but not opened.
 *   6. Request DATA and then EVGROUP information from the writer coordinator.
 *
 * Parameters:
 *   adiosfile   - open flexpath stream; adiosfile->fh holds the
 *                 flexpath_reader_file.
 *   last        - not referenced by this implementation.
 *   timeout_sec - not referenced by this implementation; the wait in step 2
 *                 is unbounded ("put this on a timer" is still a TODO).
 *
 * Returns 0 on success. If the writer is flagged as finalized while we are
 * still waiting for a new step, sets adios_errno to err_end_of_stream and
 * returns err_end_of_stream.
 */
int adios_read_flexpath_advance_step(ADIOS_FILE *adiosfile, int last, float timeout_sec)
{
    flexpath_reader_file *fp = (flexpath_reader_file*)adiosfile->fh;
    int i;

    (void)last;
    (void)timeout_sec;

    MPI_Barrier(fp->comm);

    send_flush_msg(fp, fp->writer_coordinator, STEP, 1);

    /* TODO: put this on a timer, so to speak, for timeout_sec. */
    while (fp->mystep == fp->last_writer_step) {
        if (fp->writer_finalized) {
            adios_errno = err_end_of_stream;
            return err_end_of_stream;
        }
        CMsleep(fp_read_data->fp_cm, 1);
        send_flush_msg(fp, fp->writer_coordinator, STEP, 1);
    }

    /* Close the bridges used for the step we are leaving. */
    for (i = 0; i < fp->num_bridges; i++) {
        if (fp->bridges[i].created && fp->bridges[i].opened) {
            send_close_msg(fp, i);
        }
    }
    MPI_Barrier(fp->comm);

    adiosfile->current_step++;
    fp->mystep = adiosfile->current_step;

    /* Open every created bridge that is not currently open. */
    for (i = 0; i < fp->num_bridges; i++) {
        if (fp->bridges[i].created && !fp->bridges[i].opened) {
            send_open_msg(fp, i);
        }
    }

    /* need to remove selectors from each var now. */
    send_flush_msg(fp, fp->writer_coordinator, DATA, 1);

    /* should only happen if there are more steps available.
     * writer should have advanced. */
    send_flush_msg(fp, fp->writer_coordinator, EVGROUP, 1);

    return 0;
}
int adios_read_flexpath_perform_reads(const ADIOS_FILE *adiosfile, int blocking) { fp_log("FUNC", "entering perform_reads.\n"); flexpath_reader_file * fp = (flexpath_reader_file*)adiosfile->fh; fp->data_read = 0; int i,j; int num_sendees = fp->num_sendees; int total_sent = 0; fp->time_in = 0.00; double start_poll = MPI_Wtime(); for(i = 0; i<num_sendees; i++){ pthread_mutex_lock(&fp->data_mutex); int sendee = fp->sendees[i]; fp->pending_requests++; total_sent++; send_flush_msg(fp, sendee, DATA, 0); if((total_sent % FP_BATCH_SIZE == 0) || (total_sent = num_sendees)){ pthread_cond_wait(&fp->data_condition, &fp->data_mutex); pthread_mutex_unlock(&fp->data_mutex); fp->completed_requests = 0; fp->pending_requests = 0; total_sent = 0; } } double end_poll = MPI_Wtime(); free(fp->sendees); fp->sendees = NULL; fp->num_sendees = 0; fp_log("FUNC", "leaving perform_reads.\n"); return 0; }
/*
 * Advance the synchronization state of one RMA target and report whether
 * operations may be issued to it.
 *
 * Parameters:
 *   win_ptr          - window the target belongs to; its rank, outstanding_acks
 *                      and num_targets_with_pending_net_ops are read/updated.
 *   target           - target to examine; may be NULL, in which case nothing
 *                      is done.
 *   is_able_to_issue - out: set to 1 when the function completes without error
 *                      on a non-NULL target whose access_state is not
 *                      MPIDI_RMA_LOCK_ISSUED afterwards; 0 otherwise.
 *   made_progress    - out: set to 1 when a lock request or an ending
 *                      synchronization message was handled, 0 otherwise.
 *
 * Returns MPI_SUCCESS, or the error code left in mpi_errno when one of the
 * helper calls fails (MPIR_ERR_POP is expected to jump to fn_fail).
 */
static inline int check_and_switch_target_state(MPIR_Win * win_ptr, MPIDI_RMA_Target_t * target,
                                                int *is_able_to_issue, int *made_progress)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank = win_ptr->comm_ptr->rank;
    int user_event;
    int is_local;

    *made_progress = 0;
    *is_able_to_issue = 0;

    if (target == NULL)
        return mpi_errno;

    is_local = (target->target_rank == my_rank);

    /* When user event happens, move op in user pending list to network pending list */
    user_event = target->win_complete_flag ||
        target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH ||
        target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH_LOCAL ||
        target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK;

    if (user_event && target->pending_user_ops_list_head != NULL) {
        MPIDI_RMA_Op_t *moved_op = target->pending_user_ops_list_head;

        /* The target is about to gain its first pending network op. */
        if (target->pending_net_ops_list_head == NULL)
            win_ptr->num_targets_with_pending_net_ops++;

        DL_DELETE(target->pending_user_ops_list_head, moved_op);
        DL_APPEND(target->pending_net_ops_list_head, moved_op);

        if (target->next_op_to_issue == NULL)
            target->next_op_to_issue = moved_op;
    }

    switch (target->access_state) {
    case MPIDI_RMA_LOCK_CALLED:
        if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
            if (target->pending_net_ops_list_head != NULL) {
                /* if we reach WIN_UNLOCK and there is still operation existing
                 * in pending list, this operation must be the only operation
                 * and it is prepared to piggyback LOCK and UNLOCK. */
                MPIR_Assert(MPIR_CVAR_CH3_RMA_DELAY_ISSUING_FOR_PIGGYBACKING);
                MPIR_Assert(target->pending_net_ops_list_head->next == NULL);
                MPIR_Assert(target->pending_net_ops_list_head->piggyback_lock_candidate);
            } else {
                /* No RMA operation has ever been posted to this target,
                 * finish issuing, no need to acquire the lock. Cleanup
                 * function will clean it up. */
                target->access_state = MPIDI_RMA_LOCK_GRANTED;

                /* We are done with ending synchronization, unset target's sync_flag. */
                target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;

                *made_progress = 1;
            }
        } else if (target->sync.sync_flag == MPIDI_RMA_SYNC_NONE ||
                   target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH_LOCAL ||
                   target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
            /* A list "piggybacks" when its head op can carry the lock request. */
            int net_piggybacks = target->pending_net_ops_list_head != NULL &&
                target->pending_net_ops_list_head->piggyback_lock_candidate;
            int user_piggybacks = target->pending_user_ops_list_head != NULL &&
                target->pending_user_ops_list_head->piggyback_lock_candidate;

            if (!net_piggybacks && !user_piggybacks) {
                /* issue lock request */
                target->access_state = MPIDI_RMA_LOCK_ISSUED;
                if (is_local) {
                    mpi_errno = acquire_local_lock(win_ptr, target->lock_type);
                    if (mpi_errno != MPI_SUCCESS)
                        MPIR_ERR_POP(mpi_errno);
                } else {
                    mpi_errno = send_lock_msg(target->target_rank, target->lock_type, win_ptr);
                    if (mpi_errno != MPI_SUCCESS)
                        MPIR_ERR_POP(mpi_errno);
                }

                *made_progress = 1;
            }
        }
        break;

    case MPIDI_RMA_LOCK_GRANTED:
    case MPIDI_RMA_NONE:
        /* Ending synchronization is only handled once all network ops are gone. */
        if (target->pending_net_ops_list_head != NULL)
            break;

        if (target->win_complete_flag) {
            MPIDI_CH3_Pkt_flags_t flags = MPIDI_CH3_PKT_FLAG_NONE;

            if (target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH &&
                target->num_ops_flush_not_issued > 0) {
                flags |= MPIDI_CH3_PKT_FLAG_RMA_FLUSH;
                win_ptr->outstanding_acks++;
                target->sync.outstanding_acks++;
                target->num_ops_flush_not_issued = 0;
            }

            mpi_errno = send_decr_at_cnt_msg(target->target_rank, win_ptr, flags);
            if (mpi_errno != MPI_SUCCESS)
                MPIR_ERR_POP(mpi_errno);
        } else if (target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
            if (!is_local && target->num_ops_flush_not_issued > 0) {
                win_ptr->outstanding_acks++;
                target->sync.outstanding_acks++;
                target->num_ops_flush_not_issued = 0;

                mpi_errno = send_flush_msg(target->target_rank, win_ptr);
                if (mpi_errno != MPI_SUCCESS)
                    MPIR_ERR_POP(mpi_errno);
            }
        } else if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
            if (is_local) {
                mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
                if (mpi_errno != MPI_SUCCESS)
                    MPIR_ERR_POP(mpi_errno);
            } else {
                MPIDI_CH3_Pkt_flags_t unlock_flag = MPIDI_CH3_PKT_FLAG_NONE;

                if (target->num_ops_flush_not_issued != 0) {
                    win_ptr->outstanding_acks++;
                    target->sync.outstanding_acks++;
                    target->num_ops_flush_not_issued = 0;
                } else {
                    unlock_flag = MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_NO_ACK;
                }

                mpi_errno = send_unlock_msg(target->target_rank, win_ptr, unlock_flag);
                if (mpi_errno != MPI_SUCCESS)
                    MPIR_ERR_POP(mpi_errno);
            }
        } else {
            /* No ending synchronization requested for this target. */
            break;
        }

        /* We are done with ending synchronization, unset target's sync_flag. */
        target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;

        *made_progress = 1;
        break;

    default:
        break;
    }   /* end of switch */

    if (target->access_state != MPIDI_RMA_LOCK_ISSUED)
        *is_able_to_issue = 1;

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
/*
 * Advance the per-target synchronization state of one RMA target.
 *
 * Nothing is done when target is NULL or while the window-wide access state
 * is MPIDI_RMA_NONE, MPIDI_RMA_FENCE_ISSUED, MPIDI_RMA_PSCW_ISSUED or
 * MPIDI_RMA_LOCK_ALL_ISSUED.
 *
 * Parameters:
 *   win_ptr       - window the target belongs to.
 *   target        - target to examine; may be NULL.
 *   made_progress - out: set to 1 when a lock request was issued or an ending
 *                   synchronization (flush/unlock) was completed, 0 otherwise.
 *
 * Returns MPI_SUCCESS, or the error code left in mpi_errno when one of the
 * helper calls fails (MPIU_ERR_POP is expected to jump to fn_fail).
 */
static inline int check_target_state(MPID_Win * win_ptr, MPIDI_RMA_Target_t * target,
                                     int *made_progress)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank = win_ptr->comm_ptr->rank;
    int window_sync_pending;
    int is_local;

    *made_progress = 0;

    if (target == NULL)
        return mpi_errno;

    /* This check should only be performed when window-wide sync is finished, or
     * current sync is per-target sync. */
    window_sync_pending =
        win_ptr->states.access_state == MPIDI_RMA_NONE ||
        win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED ||
        win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_ISSUED;
    if (window_sync_pending)
        return mpi_errno;

    is_local = (target->target_rank == my_rank);

    switch (target->access_state) {
    case MPIDI_RMA_LOCK_CALLED:
        if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
            if (target->pending_op_list != NULL) {
                /* if we reach WIN_UNLOCK and there is still operation existing
                 * in pending list, this operation must be the only operation
                 * and it is prepared to piggyback LOCK and UNLOCK. */
                MPIU_Assert(target->pending_op_list->next == NULL);
                MPIU_Assert(target->pending_op_list->piggyback_lock_candidate);
            } else {
                /* No RMA operation has ever been posted to this target,
                 * finish issuing, no need to acquire the lock. Cleanup
                 * function will clean it up. */
                target->access_state = MPIDI_RMA_LOCK_GRANTED;

                target->sync.outstanding_acks--;
                MPIU_Assert(target->sync.outstanding_acks >= 0);

                /* We are done with ending synchronization, unset target's sync_flag. */
                target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;

                *made_progress = 1;
            }
        } else if (target->sync.sync_flag == MPIDI_RMA_SYNC_NONE ||
                   target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH_LOCAL ||
                   target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
            /* The head op "piggybacks" when it can carry the lock request itself. */
            int head_piggybacks = target->pending_op_list != NULL &&
                target->pending_op_list->piggyback_lock_candidate;

            if (!head_piggybacks) {
                /* issue lock request */
                target->access_state = MPIDI_RMA_LOCK_ISSUED;
                if (is_local) {
                    mpi_errno = acquire_local_lock(win_ptr, target->lock_type);
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
                } else {
                    mpi_errno = send_lock_msg(target->target_rank, target->lock_type, win_ptr);
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
                }

                *made_progress = 1;
            }
        }
        break;

    case MPIDI_RMA_LOCK_GRANTED:
    case MPIDI_RMA_NONE:
        /* Ending synchronization is only handled once the pending list is empty. */
        if (target->pending_op_list != NULL)
            break;

        if (target->sync.sync_flag == MPIDI_RMA_SYNC_FLUSH) {
            if (!is_local && target->put_acc_issued) {
                mpi_errno = send_flush_msg(target->target_rank, win_ptr);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            } else {
                /* Either the target is ourselves, or we did not issue
                 * PUT/ACC since the last synchronization call; in both
                 * cases we don't need ACK back */
                target->sync.outstanding_acks--;
                MPIU_Assert(target->sync.outstanding_acks >= 0);
            }
        } else if (target->sync.sync_flag == MPIDI_RMA_SYNC_UNLOCK) {
            if (is_local) {
                target->sync.outstanding_acks--;
                MPIU_Assert(target->sync.outstanding_acks >= 0);

                mpi_errno = MPIDI_CH3I_Release_lock(win_ptr);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            } else {
                MPIDI_CH3_Pkt_flags_t unlock_flag = MPIDI_CH3_PKT_FLAG_NONE;

                if (!target->put_acc_issued) {
                    /* We did not issue PUT/ACC since the last
                     * synchronization call, therefore here we
                     * don't need ACK back */
                    target->sync.outstanding_acks--;
                    MPIU_Assert(target->sync.outstanding_acks >= 0);

                    unlock_flag = MPIDI_CH3_PKT_FLAG_RMA_UNLOCK_NO_ACK;
                }

                mpi_errno = send_unlock_msg(target->target_rank, win_ptr, unlock_flag);
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        } else {
            /* No ending synchronization requested for this target. */
            break;
        }

        /* We are done with ending synchronization, unset target's sync_flag. */
        target->sync.sync_flag = MPIDI_RMA_SYNC_NONE;

        *made_progress = 1;
        break;

    default:
        break;
    }   /* end of switch */

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
/*
 * Builds "<fname>_<suffix>" in dst.
 * Returns 0 on success, -1 when formatting fails or the result does not fit
 * in cap bytes (dst is then truncated and must not be used).
 */
static int fp_open_build_filename(char *dst, size_t cap, const char *fname, const char *suffix)
{
    int written = snprintf(dst, cap, "%s_%s", fname, suffix);
    return (written < 0 || (size_t)written >= cap) ? -1 : 0;
}

/*
 * Sets up local data structure for series of reads on an adios file
 * - create evpath graph and structures
 * -- allocate the evpath stone and register the message handlers on it
 * -- rank 0 dumps the readers' contact info to a file and creates a
 *    "ready" file
 * -- wait for the writer's ready/contact files and read the writer contact
 *    list into fp->bridges
 * -- build bridge(s) to the writer(s) assigned to this rank and request the
 *    initial data from the writer coordinator
 *
 * Parameters:
 *   fname       - stream name; also the prefix of the contact/ready files.
 *   comm        - communicator of the reader ranks (collective call).
 *   lock_mode   - not referenced by this implementation.
 *   timeout_sec - not referenced by this implementation; waiting for the
 *                 writer's files is unbounded.
 *
 * Returns the new ADIOS_FILE, or NULL when memory for it cannot be allocated,
 * when the contact file names do not fit their buffers, or when the writer's
 * contact file contains no usable entry. Failures that occur while other
 * ranks are inside (or about to enter) a collective call are reported with
 * adios_error() and terminate the process with exit(1), as the existing
 * handling of the contact-file open failure already did.
 */
ADIOS_FILE*
adios_read_flexpath_open(const char * fname,
                         MPI_Comm comm,
                         enum ADIOS_LOCKMODE lock_mode,
                         float timeout_sec)
{
    fp_log("FUNC", "entering flexpath_open\n");

    char writer_ready_filename[200];
    char writer_info_filename[200];
    char reader_ready_filename[200];
    char reader_info_filename[200];

    (void)lock_mode;
    (void)timeout_sec;

    /* Build the file names first: a truncated name would make the wait loops
     * below spin forever on a file nobody creates. Nothing has been
     * allocated yet, so bailing out here is clean. */
    if (fp_open_build_filename(reader_ready_filename, sizeof reader_ready_filename,
                               fname, READER_READY_FILE) != 0 ||
        fp_open_build_filename(reader_info_filename, sizeof reader_info_filename,
                               fname, READER_CONTACT_FILE) != 0 ||
        fp_open_build_filename(writer_ready_filename, sizeof writer_ready_filename,
                               fname, WRITER_READY_FILE) != 0 ||
        fp_open_build_filename(writer_info_filename, sizeof writer_info_filename,
                               fname, WRITER_CONTACT_FILE) != 0) {
        adios_error(err_file_open_error,
                    "File name is too long to build the contact file names.\n");
        return NULL;
    }

    ADIOS_FILE *adiosfile = malloc(sizeof *adiosfile);
    if (!adiosfile) {
        adios_error(err_no_memory,
                    "Cannot allocate memory for file info.\n");
        return NULL;
    }

    flexpath_reader_file *fp = new_flexpath_reader_file(fname);
    if (!fp) {
        free(adiosfile);
        adios_error(err_no_memory,
                    "Cannot allocate memory for file info.\n");
        return NULL;
    }

    adios_errno = 0;
    fp->stone = EValloc_stone(fp_read_data->fp_cm);
    fp->comm = comm;

    MPI_Comm_size(fp->comm, &(fp->size));
    MPI_Comm_rank(fp->comm, &(fp->rank));

    /* All handlers receive adiosfile as their client data. */
    EVassoc_terminal_action(fp_read_data->fp_cm,
                            fp->stone,
                            op_format_list,
                            op_msg_handler,
                            adiosfile);

    EVassoc_terminal_action(fp_read_data->fp_cm,
                            fp->stone,
                            update_step_msg_format_list,
                            update_step_msg_handler,
                            adiosfile);

    EVassoc_terminal_action(fp_read_data->fp_cm,
                            fp->stone,
                            evgroup_format_list,
                            group_msg_handler,
                            adiosfile);

    EVassoc_raw_terminal_action(fp_read_data->fp_cm,
                                fp->stone,
                                raw_handler,
                                adiosfile);

    /* Gather the contact info from the other readers
       and write it to a file. Create a ready file so
       that the writer knows it can parse this file. */
    char data_contact_info[CONTACT_LENGTH] = "";
    char *string_list = attr_list_to_string(CMget_contact_list(fp_read_data->fp_cm));
    int contact_len = snprintf(data_contact_info, sizeof data_contact_info,
                               "%d:%s", fp->stone, string_list);
    free(string_list);
    if (contact_len < 0 || (size_t)contact_len >= sizeof data_contact_info) {
        /* A truncated contact string would be useless to the writer. */
        adios_error(err_no_memory,
                    "Contact information does not fit into the contact buffer.\n");
        exit(1);
    }

    /* The receive buffer is only significant on the root of the gather. */
    char *recvbuf = NULL;
    if (fp->rank == 0) {
        recvbuf = malloc((size_t)CONTACT_LENGTH * (size_t)fp->size);
        if (!recvbuf) {
            adios_error(err_no_memory,
                        "Cannot allocate memory for reader contact info.\n");
            exit(1);
        }
    }

    MPI_Gather(data_contact_info, CONTACT_LENGTH, MPI_CHAR, recvbuf,
               CONTACT_LENGTH, MPI_CHAR, 0, fp->comm);

    if (fp->rank == 0) {
        /* print our own contact information */
        FILE *fp_out = fopen(reader_info_filename, "w");
        int i;
        if (!fp_out) {
            adios_error(err_file_open_error,
                        "File for contact info could not be opened for writing.\n");
            exit(1);
        }
        for (i = 0; i < fp->size; i++) {
            fprintf(fp_out, "%s\n", &recvbuf[(size_t)i * CONTACT_LENGTH]);
        }
        fclose(fp_out);
        free(recvbuf);
        recvbuf = NULL;

        FILE *read_ready = fopen(reader_ready_filename, "w");
        if (!read_ready) {
            adios_error(err_file_open_error,
                        "Reader ready file could not be opened for writing.\n");
            exit(1);
        }
        fprintf(read_ready, "ready");
        fclose(read_ready);
    }
    MPI_Barrier(fp->comm);

    /* Wait for the writer's ready file, then for its contact file.
     * NOTE(review): both loops busy-wait without sleeping (a CMsleep call
     * was commented out here in the previous version). */
    FILE *fp_in = fopen(writer_ready_filename, "r");
    while (!fp_in) {
        fp_in = fopen(writer_ready_filename, "r");
    }
    fclose(fp_in);

    fp_in = fopen(writer_info_filename, "r");
    while (!fp_in) {
        fp_in = fopen(writer_info_filename, "r");
    }

    char in_contact[CONTACT_LENGTH] = "";
    char contact_fmt[32];
    int num_bridges = 0;
    int their_stone;

    /* "%d:%<CONTACT_LENGTH-1>s": bound the string conversion to the buffer. */
    snprintf(contact_fmt, sizeof contact_fmt, "%%d:%%%ds", (int)CONTACT_LENGTH - 1);

    /* change to read all numbers, dont create stones, turn bridge array into linked list */
    /* Stop at end of file or at the first entry that does not parse; testing
     * only for EOF would loop forever on a malformed line. */
    while (fscanf(fp_in, contact_fmt, &their_stone, in_contact) == 2) {
        char *contact_copy = strdup(in_contact);
        /* Keep the old pointer until the reallocation is known to succeed. */
        bridge_info *grown = realloc(fp->bridges,
                                     sizeof(bridge_info) * (num_bridges + 1));
        if (!grown || !contact_copy) {
            adios_error(err_no_memory,
                        "Cannot allocate memory for writer contact info.\n");
            exit(1);
        }
        fp->bridges = grown;
        fp->bridges[num_bridges].their_num = their_stone;
        fp->bridges[num_bridges].contact = contact_copy;
        fp->bridges[num_bridges].created = 0;
        fp->bridges[num_bridges].step = 0;
        fp->bridges[num_bridges].opened = 0;
        fp->bridges[num_bridges].scheduled = 0;
        num_bridges++;
    }
    fclose(fp_in);
    fp->num_bridges = num_bridges;

    /* clean up of writer's files */
    MPI_Barrier(fp->comm);
    if (fp->rank == 0) {
        unlink(writer_info_filename);
        unlink(writer_ready_filename);
    }

    if (num_bridges == 0) {
        /* Without any writer there is nothing to connect to, and the
         * partitioning below would divide by zero. Every rank parses the same
         * file and no collective call follows in this function, so returning
         * here is safe. adiosfile and fp are intentionally not freed: the
         * handlers registered above still hold adiosfile as client data. */
        adios_error(err_file_open_error,
                    "No writer contact information found in %s.\n",
                    writer_info_filename);
        return NULL;
    }

    adiosfile->fh = (uint64_t)fp;
    adiosfile->current_step = 0;

    /* Init with a writer to get initial scalar
       data so we can handle inq_var calls and
       also populate the ADIOS_FILE struct. */
    if (fp->size < num_bridges) {
        /* Fewer readers than writers: each rank gets a contiguous slice. */
        int mystart = (num_bridges/fp->size) * fp->rank;
        int myend = (num_bridges/fp->size) * (fp->rank+1);
        int z;
        fp->writer_coordinator = mystart;
        for (z = mystart; z < myend; z++) {
            build_bridge(&fp->bridges[z]);
        }
    } else {
        /* At least as many readers as writers: one writer per rank. */
        int writer_rank = fp->rank % num_bridges;
        build_bridge(&fp->bridges[writer_rank]);
        fp->writer_coordinator = writer_rank;
    }

    /* requesting initial data. */
    send_open_msg(fp, fp->writer_coordinator);

    fp->data_read = 0;
    send_flush_msg(fp, fp->writer_coordinator, DATA, 1);

    send_flush_msg(fp, fp->writer_coordinator, EVGROUP, 1);
    fp->data_read = 0;

    /* this has to change. Writer needs to have some way of
       taking the attributes out of the xml document
       and sending them over ffs encoded. Not yet implemented.
       the rest of this info for adiosfile gets filled in raw_handler. */
    adiosfile->nattrs = 0;
    adiosfile->attr_namelist = NULL;

    /* first step is at least one, otherwise raw_handler will not execute.
       in reality, writer might be further along, so we might have to make
       the writer explicitly send across messages each time it calls close, to
       indicate which timesteps are available. */
    adiosfile->last_step = 1;
    adiosfile->path = strdup(fname);

    /* verifies these two fields. It's not BP, so no BP version.
       It's a stream, so how can the file size be known? */
    adiosfile->version = -1;
    adiosfile->file_size = 0;
    adios_errno = err_no_error;

    fp_log("FUNC", "leaving flexpath_open\n");
    return adiosfile;
}