/*
 * Allocate and initialise a libnbc request handle for a new non-blocking
 * collective on `comm`.
 *
 * comm      communicator the collective runs on; stored in the handle
 * request   out: receives the newly allocated handle
 * comminfo  per-communicator libnbc state (tag counter, mutex and the
 *           "progress function registered" flag)
 *
 * Returns NBC_OK on success, or OMPI_ERR_OUT_OF_RESOURCE when the handle
 * could not be allocated (in that case *request is not written).
 */
int NBC_Init_handle(struct ompi_communicator_t *comm, ompi_coll_libnbc_request_t **request, ompi_coll_libnbc_module_t *comminfo)
{
  int tmp_tag;
  bool need_register = false;
  ompi_coll_libnbc_request_t *handle;

  OMPI_COLL_LIBNBC_REQUEST_ALLOC(comm, handle);
  if (NULL == handle) return OMPI_ERR_OUT_OF_RESOURCE;
  *request = handle;

  handle->tmpbuf = NULL;
  handle->req_count = 0;
  handle->req_array = NULL;
  handle->comm = comm;
  handle->schedule = NULL;
  /* first int is the schedule size */
  handle->row_offset = sizeof(int);

  /******************** Do the tag and shadow comm administration ...  ***************/

  OPAL_THREAD_LOCK(&comminfo->mutex);
  tmp_tag = comminfo->tag--;
  if (tmp_tag == MCA_COLL_BASE_TAG_NONBLOCKING_END) {
    comminfo->tag = MCA_COLL_BASE_TAG_NONBLOCKING_BASE;
    NBC_DEBUG(2,"resetting tags ...\n");
  }
  /* Snapshot the tag for this handle while the mutex is still held.
   * Reading comminfo->tag after the unlock would race with another thread
   * that is advancing the counter at the same time. */
  tmp_tag = comminfo->tag;

  if (true != comminfo->comm_registered) {
    comminfo->comm_registered = true;
    need_register = true;
  }
  OPAL_THREAD_UNLOCK(&comminfo->mutex);

  handle->tag = tmp_tag;

  /* register progress: only the first active communicator installs the
   * progress callback */
  if (need_register) {
    int32_t tmp = OPAL_THREAD_ADD32(&mca_coll_libnbc_component.active_comms, 1);
    if (tmp == 1) {
      opal_progress_register(ompi_coll_libnbc_progress);
    }
  }

  /******************** end of tag and shadow comm administration ...  ***************/
  handle->comminfo = comminfo;

  NBC_DEBUG(3, "got tag %i\n", handle->tag);

  return NBC_OK;
}
/*
 * Append a COPY operation to the current round of `schedule`.
 *
 * src/tgt         source and target buffers
 * tmpsrc/tmptgt   flags stored with the operation alongside src/tgt
 * srccount/srctype, tgtcount/tgttype
 *                 element count and datatype of each side
 * barrier         forwarded to nbc_schedule_round_append()
 *
 * Returns OMPI_SUCCESS, or the error reported by
 * nbc_schedule_round_append().
 */
int NBC_Sched_copy (void *src, char tmpsrc, int srccount, MPI_Datatype srctype, void *tgt, char tmptgt, int tgtcount, MPI_Datatype tgttype, NBC_Schedule *schedule, bool barrier)
{
  NBC_Args_copy copy_args;
  int ret;

  /* store the passed arguments */
  copy_args.type = COPY;
  copy_args.src = src;
  copy_args.tmpsrc = tmpsrc;
  copy_args.srccount = srccount;
  copy_args.srctype = srctype;
  copy_args.tgt = tgt;
  copy_args.tmptgt = tmptgt;
  copy_args.tgtcount = tgtcount;
  copy_args.tgttype = tgttype;

  /* append to the round-schedule */
  ret = nbc_schedule_round_append (schedule, &copy_args, sizeof (copy_args), barrier);
  if (OMPI_SUCCESS != ret) {
    return ret;
  }

  NBC_DEBUG(10, "added copy - ends at byte %i\n", nbc_schedule_get_size (schedule));

  return OMPI_SUCCESS;
}
/*
 * Append a two-buffer reduction (OP) entry to the current round of
 * `schedule`.
 *
 * buf1/buf2        operand buffers
 * tmpbuf1/tmpbuf2  flags stored with the entry alongside buf1/buf2
 * count, datatype  element count and datatype of the operands
 * op               reduction operation to record
 * barrier          forwarded to nbc_schedule_round_append()
 *
 * Returns OMPI_SUCCESS or the error code from nbc_schedule_round_append().
 */
int NBC_Sched_op (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule, bool barrier)
{
  NBC_Args_op entry;
  int rc;

  /* describe the operation */
  entry.type = OP;
  entry.op = op;
  entry.datatype = datatype;
  entry.count = count;
  entry.buf1 = buf1;
  entry.tmpbuf1 = tmpbuf1;
  entry.buf2 = buf2;
  entry.tmpbuf2 = tmpbuf2;

  /* hand it to the round-schedule */
  rc = nbc_schedule_round_append (schedule, &entry, sizeof (entry), barrier);
  if (OMPI_SUCCESS == rc) {
    NBC_DEBUG(10, "added op2 - ends at byte %i\n", nbc_schedule_get_size (schedule));
  }

  return rc;
}
/*
 * Append an UNPACK operation to the current round of `*schedule`.
 *
 * The schedule is grown with realloc(), so `*schedule` may change.
 *
 * Returns NBC_OK on success, or NBC_OOR when the schedule cannot be grown.
 * On NBC_OOR `*schedule` still points to the original, unmodified schedule
 * so the caller can release it.
 */
int NBC_Sched_unpack(void *inbuf, char tmpinbuf, int count, MPI_Datatype datatype, void *outbuf, char tmpoutbuf, NBC_Schedule *schedule)
{
  int size;
  char* ptr;
  void *grown;
  NBC_Fn_type type = UNPACK;
  NBC_Args_unpack unpack_args;

  /* get size of actual schedule */
  NBC_GET_SIZE(*schedule, size);

  /* Grow through a temporary: assigning realloc()'s result straight to
   * *schedule would lose (leak) the old block when realloc() fails. */
  grown = realloc(*schedule, size+sizeof(NBC_Fn_type)+sizeof(NBC_Args_unpack));
  if (NULL == grown) {
    printf("Error in realloc()\n");
    return NBC_OOR;
  }
  *schedule = (NBC_Schedule)grown;

  /* store the passed arguments */
  unpack_args.inbuf=inbuf;
  unpack_args.tmpinbuf=tmpinbuf;
  unpack_args.count=count;
  unpack_args.datatype=datatype;
  unpack_args.outbuf=outbuf;
  unpack_args.tmpoutbuf=tmpoutbuf;

  /* append to the round-schedule */
  ptr = (char*)*schedule + size;
  NBC_PUT_BYTES(ptr,type);
  NBC_PUT_BYTES(ptr,unpack_args);

  /* increase number of elements in round-schedule */
  NBC_INC_NUM_ROUND(*schedule);
  NBC_DEBUG(10, "adding unpack - ends at byte %i\n", (int)(size+sizeof(NBC_Fn_type)+sizeof(NBC_Args_unpack)));

  /* increase size of schedule */
  NBC_INC_SIZE(*schedule, sizeof(NBC_Fn_type)+sizeof(NBC_Args_unpack));

  return NBC_OK;
}
/*
 * Append a RECV operation to the current round of `*schedule`.
 *
 * The schedule is grown with realloc(), so `*schedule` may change.
 *
 * Returns NBC_OK on success, or NBC_OOR when the schedule cannot be grown.
 * On NBC_OOR `*schedule` still points to the original, unmodified schedule
 * so the caller can release it.
 */
int NBC_Sched_recv(void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule)
{
  int size;
  char* ptr;
  void *grown;
  NBC_Fn_type type = RECV;
  NBC_Args_recv recv_args;

  /* get size of actual schedule */
  NBC_GET_SIZE(*schedule, size);

  /* Grow through a temporary: assigning realloc()'s result straight to
   * *schedule would lose (leak) the old block when realloc() fails. */
  grown = realloc(*schedule, size+sizeof(NBC_Fn_type)+sizeof(NBC_Args_recv));
  if (NULL == grown) {
    printf("Error in realloc()\n");
    return NBC_OOR;
  }
  *schedule = (NBC_Schedule)grown;

  /* store the passed arguments */
  recv_args.buf=buf;
  recv_args.tmpbuf=tmpbuf;
  recv_args.count=count;
  recv_args.datatype=datatype;
  recv_args.source=source;

  /* append to the round-schedule */
  ptr = (char*)*schedule + size;
  NBC_PUT_BYTES(ptr,type);
  NBC_PUT_BYTES(ptr,recv_args);

  /* increase number of elements in round-schedule */
  NBC_INC_NUM_ROUND(*schedule);
  NBC_DEBUG(10, "adding receive - ends at byte %i\n", (int)(size+sizeof(NBC_Fn_type)+sizeof(NBC_Args_recv)));

  /* increase size of schedule */
  NBC_INC_SIZE(*schedule, sizeof(NBC_Fn_type)+sizeof(NBC_Args_recv));

  return NBC_OK;
}
/*
 * Execute one schedule element of kind `type` whose arguments are at `val`.
 *
 * T_SEND / T_RECV post a non-blocking MPI_BYTE transfer on G_GOAL_WorldComm
 * with the handle's tag and store the MPI request in `req`. When the
 * element's memtype is GOAL_SCRATCHPAD the stored buffer value is used as
 * an offset into handle->tmpbuf, otherwise it is used as the address.
 * T_OP runs the operation synchronously (a predefined one when opnum is
 * negative, the stored function pointer otherwise).
 *
 * Returns SCHED_CONTINUE for every element that was executed, and
 * SCHED_ERR when the MPI call fails or `type` is not recognised.
 */
static inline int execute_element(char type, void *val, sched_req *req, GOAL_Handle handle)
{
  switch(type) {
    case T_SEND: {
      NBC_Args_send *send = (NBC_Args_send*) val;
      NBC_DEBUG(5,"SEND (val: %lu) *buf: %p, count: %i, dest: %i, tag: %i)\n", (unsigned long)val, send->buf, send->count, send->dest, handle->tag);

      /* resolve the buffer: absolute address or scratchpad offset */
      void *data = send->buf;
      if (send->memtype == GOAL_SCRATCHPAD) {
        data = (char*)handle->tmpbuf+(long)send->buf;
      }

      int rc = MPI_Isend(data, send->count, MPI_BYTE, send->dest, handle->tag, G_GOAL_WorldComm, req);
      if (MPI_SUCCESS != rc) {
        printf("Error in MPI_Isend(%lu, %i, %lu, %i, %i, %lu) (%i)\n", (unsigned long)data, send->count, 0UL, send->dest, handle->tag, (unsigned long)G_GOAL_WorldComm, rc);
        return SCHED_ERR;
      }
      return SCHED_CONTINUE;
    }

    case T_RECV: {
      NBC_Args_recv *recv = (NBC_Args_recv*) val;
      NBC_DEBUG(5,"RECV (val: %lu) *buf: %p, count: %i, src: %i, tag: %i)\n", (unsigned long)val, recv->buf, recv->count, recv->source, handle->tag);

      /* resolve the buffer: absolute address or scratchpad offset */
      void *data = recv->buf;
      if (recv->memtype == GOAL_SCRATCHPAD) {
        data = (char*)handle->tmpbuf+(long)recv->buf;
      }

      int rc = MPI_Irecv(data, recv->count, MPI_BYTE, recv->source, handle->tag, G_GOAL_WorldComm, req);
      if (MPI_SUCCESS != rc) {
        printf("Error in MPI_Irecv(%lu, %i, %lu, %i, %i, %lu) (%i)\n", (unsigned long)data, recv->count, 0UL, recv->source, handle->tag, (unsigned long)G_GOAL_WorldComm, rc);
        return SCHED_ERR;
      }
      return SCHED_CONTINUE;
    }

    case T_OP: {
      NBC_Args_op *op = (NBC_Args_op*) val;
      if (op->opnum < 0) {
        execute_predefined_op(op->opnum, &op->args, handle->tmpbuf);
      } else {
        op->func(&op->args, handle->tmpbuf);
      }
      return SCHED_CONTINUE;
    }

    default:
      printf("[print_type_content] type %i not supported!\n", (int)type);
      break;
  }

  /* only reached for an unsupported element type */
  return SCHED_ERR;
}
#ifdef NBC_CACHE_SCHEDULE
/*
 * Create the schedule-cache tree for collective `coll` with comparison
 * function `compare` and reset its entry counter.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR when hb_tree_new() returns NULL.
 */
static int nbc_init_sched_cache(NBC_Comminfo *comminfo, int coll, dict_cmp_func compare)
{
  comminfo->NBC_Dict[coll] = hb_tree_new(compare, NBC_SchedCache_args_delete_key_dummy, NBC_SchedCache_args_delete);
  if (comminfo->NBC_Dict[coll] == NULL) {
    printf("Error in hb_tree_new()\n");
    return OMPI_ERROR;
  }
  NBC_DEBUG(1, "added tree at address %lu\n", (unsigned long)comminfo->NBC_Dict[coll]);
  comminfo->NBC_Dict_size[coll] = 0;

  return OMPI_SUCCESS;
}
#endif

/*
 * Initialise the per-communicator libnbc state in `comminfo`.
 *
 * Resets the tag counter to MCA_COLL_BASE_TAG_NONBLOCKING_BASE. When built
 * with NBC_CACHE_SCHEDULE it also creates one schedule-cache tree per
 * cached collective; initialisation stops at the first tree that cannot be
 * created. `comm` is not used by this function.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR if a cache tree could not be created.
 */
int NBC_Init_comm(MPI_Comm comm, NBC_Comminfo *comminfo)
{
  comminfo->tag= MCA_COLL_BASE_TAG_NONBLOCKING_BASE;

#ifdef NBC_CACHE_SCHEDULE
  /* initialize the NBC_ALLTOALL SchedCache tree */
  if (OMPI_SUCCESS != nbc_init_sched_cache(comminfo, NBC_ALLTOALL, (dict_cmp_func)NBC_Alltoall_args_compare)) {
    return OMPI_ERROR;
  }

  /* initialize the NBC_ALLGATHER SchedCache tree */
  if (OMPI_SUCCESS != nbc_init_sched_cache(comminfo, NBC_ALLGATHER, (dict_cmp_func)NBC_Allgather_args_compare)) {
    return OMPI_ERROR;
  }

  /* initialize the NBC_ALLREDUCE SchedCache tree */
  if (OMPI_SUCCESS != nbc_init_sched_cache(comminfo, NBC_ALLREDUCE, (dict_cmp_func)NBC_Allreduce_args_compare)) {
    return OMPI_ERROR;
  }

  /* initialize the NBC_BARRIER SchedCache tree - is not needed -
   * schedule is hung off directly */
  comminfo->NBC_Dict_size[NBC_BARRIER] = 0;

  /* initialize the NBC_BCAST SchedCache tree */
  if (OMPI_SUCCESS != nbc_init_sched_cache(comminfo, NBC_BCAST, (dict_cmp_func)NBC_Bcast_args_compare)) {
    return OMPI_ERROR;
  }

  /* initialize the NBC_GATHER SchedCache tree */
  if (OMPI_SUCCESS != nbc_init_sched_cache(comminfo, NBC_GATHER, (dict_cmp_func)NBC_Gather_args_compare)) {
    return OMPI_ERROR;
  }

  /* initialize the NBC_REDUCE SchedCache tree */
  if (OMPI_SUCCESS != nbc_init_sched_cache(comminfo, NBC_REDUCE, (dict_cmp_func)NBC_Reduce_args_compare)) {
    return OMPI_ERROR;
  }

  /* initialize the NBC_SCAN SchedCache tree */
  if (OMPI_SUCCESS != nbc_init_sched_cache(comminfo, NBC_SCAN, (dict_cmp_func)NBC_Scan_args_compare)) {
    return OMPI_ERROR;
  }

  /* initialize the NBC_SCATTER SchedCache tree */
  if (OMPI_SUCCESS != nbc_init_sched_cache(comminfo, NBC_SCATTER, (dict_cmp_func)NBC_Scatter_args_compare)) {
    return OMPI_ERROR;
  }
#endif

  return OMPI_SUCCESS;
}
/*
 * Terminate `*schedule` by appending the end-of-schedule delimiter (a
 * single 0 byte) after the last round.
 *
 * The schedule is grown with realloc(), so `*schedule` may change.
 *
 * Returns NBC_OK on success, or NBC_OOR when the schedule cannot be grown.
 * On NBC_OOR `*schedule` still points to the original, unmodified schedule
 * so the caller can release it.
 */
int NBC_Sched_commit(NBC_Schedule *schedule)
{
  int size;
  void *grown;

  /* get size of actual schedule */
  NBC_GET_SIZE(*schedule, size);

  /* Grow through a temporary: assigning realloc()'s result straight to
   * *schedule would lose (leak) the old block when realloc() fails. */
  grown = realloc(*schedule, size+sizeof(char));
  if (NULL == grown) {
    printf("Error in realloc()\n");
    return NBC_OOR;
  }
  *schedule = (NBC_Schedule)grown;

  /* add the barrier char (0) because this is the last round */
  *(char*)((char*)*schedule+size)=0;
  NBC_DEBUG(10, "closing schedule %p at byte %i\n", *schedule, (int)(size+sizeof(char)));

  /* increase size of schedule */
  NBC_INC_SIZE(*schedule, sizeof(char));

  return NBC_OK;
}
/*
 * Close `schedule`: grow it by one byte and write the terminating 0
 * delimiter that marks the end of the last round.
 *
 * Returns OMPI_SUCCESS, or the error code from nbc_schedule_grow().
 */
int NBC_Sched_commit(NBC_Schedule *schedule)
{
  const int end = nbc_schedule_get_size (schedule);
  const int rc = nbc_schedule_grow (schedule, 1);

  if (OMPI_SUCCESS != rc) {
    return rc;
  }

  /* a 0 where a round delimiter would be means "no further round" */
  schedule->data[end] = 0;

  /* account for the delimiter byte */
  nbc_schedule_inc_size (schedule, 1);

  NBC_DEBUG(10, "closed schedule %p at byte %i\n", schedule, (int)(end + 1));

  return OMPI_SUCCESS;
}
/*
 * End the current round of `*schedule`: append the round delimiter (a
 * byte with value 1) followed by a zeroed operation counter for the next
 * round.
 *
 * The schedule is grown with realloc(), so `*schedule` may change.
 *
 * Returns NBC_OK on success, or NBC_OOR when the schedule cannot be grown.
 * On NBC_OOR `*schedule` still points to the original, unmodified schedule
 * so the caller can release it.
 */
int NBC_Sched_barrier(NBC_Schedule *schedule)
{
  int size, num = 0;
  char *ptr;
  void *grown;
  char delimiter = 1;

  /* get size of actual schedule */
  NBC_GET_SIZE(*schedule, size);

  /* Grow through a temporary: assigning realloc()'s result straight to
   * *schedule would lose (leak) the old block when realloc() fails. */
  grown = realloc(*schedule, size+sizeof(char)+sizeof(int));
  if (NULL == grown) {
    printf("Error in realloc()\n");
    return NBC_OOR;
  }
  *schedule = (NBC_Schedule)grown;

  ptr = (char*)*schedule + size;
  NBC_PUT_BYTES(ptr,delimiter); /* round-schedule delimiter */
  NBC_PUT_BYTES(ptr,num);       /* initialize num=0 for next round-schedule */

  NBC_DEBUG(10, "ending round at byte %i\n", (int)(size+sizeof(char)+sizeof(int)));

  /* increase size of schedule */
  NBC_INC_SIZE(*schedule, sizeof(char)+sizeof(int));

  return NBC_OK;
}
/*
 * Append `data_size` bytes from `data` as one element of the current round
 * of `schedule`, optionally closing the round afterwards.
 *
 * When `barrier` is true a round delimiter (byte value 1) and a zeroed
 * int counter for the next round are written after the element, and
 * schedule->current_round_offset is moved to that counter.
 * A `data_size` of 0 appends no element (only the barrier, if requested).
 *
 * Returns OMPI_SUCCESS, or the error code from nbc_schedule_grow().
 */
static int nbc_schedule_round_append (NBC_Schedule *schedule, void *data, int data_size, bool barrier)
{
  const int old_size = nbc_schedule_get_size (schedule);
  int rc;

  /* reserve room for the element and, if needed, delimiter + counter */
  rc = barrier ? nbc_schedule_grow (schedule, data_size + 1 + sizeof (int))
               : nbc_schedule_grow (schedule, data_size);
  if (OMPI_SUCCESS != rc) {
    return rc;
  }

  if (data_size) {
    /* append to the round-schedule */
    memcpy (schedule->data + old_size, data, data_size);

    /* one more element in this round, data_size more bytes in total */
    nbc_schedule_inc_round (schedule);
    nbc_schedule_inc_size (schedule, data_size);
  }

  if (barrier) {
    const int round_end = old_size + data_size;

    /* round delimiter */
    schedule->data[round_end] = 1;

    /* next round starts with an element count of 0 */
    memset (schedule->data + round_end + 1, 0, sizeof (int));

    NBC_DEBUG(10, "ended round at byte %i\n", round_end + 1);

    schedule->current_round_offset = round_end + 1;

    /* delimiter byte plus the new counter */
    nbc_schedule_inc_size (schedule, sizeof (int) + 1);
  }

  return OMPI_SUCCESS;
}
/*
 * Append a RECV entry to the current round of `schedule`.
 *
 * buf, tmpbuf     receive buffer and the flag stored with it
 * count/datatype  element count and datatype to receive
 * source          rank to receive from
 * local           flag stored with the entry
 * barrier         forwarded to nbc_schedule_round_append()
 *
 * Returns OMPI_SUCCESS or the error code from nbc_schedule_round_append().
 */
static int NBC_Sched_recv_internal (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, bool local, NBC_Schedule *schedule, bool barrier)
{
  NBC_Args_recv entry;
  int rc;

  /* describe the receive */
  entry.type = RECV;
  entry.source = source;
  entry.local = local;
  entry.buf = buf;
  entry.tmpbuf = tmpbuf;
  entry.count = count;
  entry.datatype = datatype;

  /* hand it to the round-schedule */
  rc = nbc_schedule_round_append (schedule, &entry, sizeof (entry), barrier);
  if (OMPI_SUCCESS == rc) {
    NBC_DEBUG(10, "added receive - ends at byte %d\n", nbc_schedule_get_size (schedule));
  }

  return rc;
}
/*
 * Append a SEND entry to the current round of `schedule`.
 *
 * buf, tmpbuf     send buffer and the flag stored with it
 * count/datatype  element count and datatype to send
 * dest            destination rank
 * local           flag stored with the entry
 * barrier         forwarded to nbc_schedule_round_append()
 *
 * Returns OMPI_SUCCESS or the error code from nbc_schedule_round_append().
 */
static int NBC_Sched_send_internal (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, bool local, NBC_Schedule *schedule, bool barrier)
{
  NBC_Args_send entry;
  int rc;

  /* describe the send */
  entry.type = SEND;
  entry.dest = dest;
  entry.local = local;
  entry.buf = buf;
  entry.tmpbuf = tmpbuf;
  entry.count = count;
  entry.datatype = datatype;

  /* hand it to the round-schedule */
  rc = nbc_schedule_round_append (schedule, &entry, sizeof (entry), barrier);
  if (OMPI_SUCCESS == rc) {
    NBC_DEBUG(10, "added send - ends at byte %i\n", nbc_schedule_get_size (schedule));
  }

  return rc;
}
/*
 * Append a three-buffer OP operation (buf1, buf2 and buf3 with their
 * tmpbuf flags) to the current round of `*schedule`.
 *
 * The schedule is grown with realloc(), so `*schedule` may change.
 *
 * Returns NBC_OK on success, or NBC_OOR when the schedule cannot be grown.
 * On NBC_OOR `*schedule` still points to the original, unmodified schedule
 * so the caller can release it.
 */
int NBC_Sched_op(void *buf3, char tmpbuf3, void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule)
{
  int size;
  char* ptr;
  void *grown;
  NBC_Fn_type type = OP;
  NBC_Args_op op_args;

  /* get size of actual schedule */
  NBC_GET_SIZE(*schedule, size);

  /* Grow through a temporary: assigning realloc()'s result straight to
   * *schedule would lose (leak) the old block when realloc() fails. */
  grown = realloc(*schedule, size+sizeof(NBC_Fn_type)+sizeof(NBC_Args_op));
  if (NULL == grown) {
    printf("Error in realloc()\n");
    return NBC_OOR;
  }
  *schedule = (NBC_Schedule)grown;

  /* store the passed arguments */
  op_args.buf1=buf1;
  op_args.buf2=buf2;
  op_args.buf3=buf3;
  op_args.tmpbuf1=tmpbuf1;
  op_args.tmpbuf2=tmpbuf2;
  op_args.tmpbuf3=tmpbuf3;
  op_args.count=count;
  op_args.op=op;
  op_args.datatype=datatype;

  /* append to the round-schedule */
  ptr = (char*)*schedule + size;
  NBC_PUT_BYTES(ptr,type);
  NBC_PUT_BYTES(ptr,op_args);

  /* increase number of elements in round-schedule */
  NBC_INC_NUM_ROUND(*schedule);
  NBC_DEBUG(10, "adding op - ends at byte %i\n", (int)(size+sizeof(NBC_Fn_type)+sizeof(NBC_Args_op)));

  /* increase size of schedule */
  NBC_INC_SIZE(*schedule, sizeof(NBC_Fn_type)+sizeof(NBC_Args_op));

  return NBC_OK;
}
/*
 * Append a COPY operation (src -> tgt, each with its own count, datatype
 * and tmp flag) to the current round of `*schedule`.
 *
 * The schedule is grown with realloc(), so `*schedule` may change.
 *
 * Returns NBC_OK on success, or NBC_OOR when the schedule cannot be grown.
 * On NBC_OOR `*schedule` still points to the original, unmodified schedule
 * so the caller can release it.
 */
int NBC_Sched_copy(void *src, char tmpsrc, int srccount, MPI_Datatype srctype, void *tgt, char tmptgt, int tgtcount, MPI_Datatype tgttype, NBC_Schedule *schedule)
{
  int size;
  char* ptr;
  void *grown;
  NBC_Fn_type type = COPY;
  NBC_Args_copy copy_args;

  /* get size of actual schedule */
  NBC_GET_SIZE(*schedule, size);

  /* Grow through a temporary: assigning realloc()'s result straight to
   * *schedule would lose (leak) the old block when realloc() fails. */
  grown = realloc(*schedule, size+sizeof(NBC_Fn_type)+sizeof(NBC_Args_copy));
  if (NULL == grown) {
    printf("Error in realloc()\n");
    return NBC_OOR;
  }
  *schedule = (NBC_Schedule)grown;

  /* store the passed arguments */
  copy_args.src=src;
  copy_args.tmpsrc=tmpsrc;
  copy_args.srccount=srccount;
  copy_args.srctype=srctype;
  copy_args.tgt=tgt;
  copy_args.tmptgt=tmptgt;
  copy_args.tgtcount=tgtcount;
  copy_args.tgttype=tgttype;

  /* append to the round-schedule */
  ptr = (char*)*schedule + size;
  NBC_PUT_BYTES(ptr,type);
  NBC_PUT_BYTES(ptr,copy_args);

  /* increase number of elements in round-schedule */
  NBC_INC_NUM_ROUND(*schedule);
  NBC_DEBUG(10, "adding copy - ends at byte %i\n", (int)(size+sizeof(NBC_Fn_type)+sizeof(NBC_Args_copy)));

  /* increase size of schedule */
  NBC_INC_SIZE(*schedule, sizeof(NBC_Fn_type)+sizeof(NBC_Args_copy));

  return NBC_OK;
}
/*
 * Append an UNPACK entry to the current round of `schedule`.
 *
 * inbuf/tmpinbuf    packed input buffer and the flag stored with it
 * count/datatype    element count and datatype to unpack
 * outbuf/tmpoutbuf  destination buffer and the flag stored with it
 * barrier           forwarded to nbc_schedule_round_append()
 *
 * Returns OMPI_SUCCESS or the error code from nbc_schedule_round_append().
 */
int NBC_Sched_unpack (void *inbuf, char tmpinbuf, int count, MPI_Datatype datatype, void *outbuf, char tmpoutbuf, NBC_Schedule *schedule, bool barrier)
{
  NBC_Args_unpack entry;
  int rc;

  /* describe the unpack */
  entry.type = UNPACK;
  entry.count = count;
  entry.datatype = datatype;
  entry.inbuf = inbuf;
  entry.tmpinbuf = tmpinbuf;
  entry.outbuf = outbuf;
  entry.tmpoutbuf = tmpoutbuf;

  /* hand it to the round-schedule */
  rc = nbc_schedule_round_append (schedule, &entry, sizeof (entry), barrier);
  if (OMPI_SUCCESS == rc) {
    NBC_DEBUG(10, "added unpack - ends at byte %i\n", nbc_schedule_get_size (schedule));
  }

  return rc;
}
/*
 * Progress a request.
 *
 * To be called *only* from the progress thread!
 *
 * Checks the outstanding sub-requests of the current round. Once all of
 * them are complete the round is finished: the request array is released
 * and either the schedule ends (delimiter byte 0) or the next round is
 * started via NBC_Start_round().
 *
 * Returns
 *   NBC_OK        the handle was already complete, or the last round has
 *                 just finished
 *   NBC_CONTINUE  the schedule is still in progress
 *   other         the MPI error recorded from a failed sub-request, or the
 *                 error returned by NBC_Start_round()
 *
 * When the schedule completes or is aborted, non-persistent handles are
 * released with NBC_Free().
 */
int NBC_Progress(NBC_Handle *handle)
{
  int res;
  bool flag;
  unsigned long size = 0;
  char *delim;

  if (handle->nbc_complete) {
    return NBC_OK;
  }

  flag = true;

  if ((handle->req_count > 0) && (handle->req_array != NULL)) {
    NBC_DEBUG(50, "NBC_Progress: testing for %i requests\n", handle->req_count);
#ifdef NBC_TIMING
    Test_time -= MPI_Wtime();
#endif
    /* don't call ompi_request_test_all as it causes a recursive call into opal_progress */
    while (handle->req_count) {
      ompi_request_t *subreq = handle->req_array[handle->req_count - 1];
      if (REQUEST_COMPLETE(subreq)) {
        if(OPAL_UNLIKELY( OMPI_SUCCESS != subreq->req_status.MPI_ERROR )) {
          NBC_Error ("MPI Error in NBC subrequest %p : %d", subreq, subreq->req_status.MPI_ERROR);
          /* copy the error code from the underlying request and let the
           * round finish */
          handle->super.req_status.MPI_ERROR = subreq->req_status.MPI_ERROR;
        }
        handle->req_count--;
        ompi_request_free(&subreq);
      } else {
        /* requests are consumed from the back; stop at the first pending one */
        flag = false;
        break;
      }
    }
#ifdef NBC_TIMING
    Test_time += MPI_Wtime();
#endif
  }

  /* a round is finished */
  if (flag) {
    /* reset handle for next round */
    if (NULL != handle->req_array) {
      /* free request array */
      free (handle->req_array);
      handle->req_array = NULL;
    }

    handle->req_count = 0;

    /* previous round had an error */
    if (OPAL_UNLIKELY(OMPI_SUCCESS != handle->super.req_status.MPI_ERROR)) {
      res = handle->super.req_status.MPI_ERROR;
      /* cast so the argument matches the %li conversion */
      NBC_Error("NBC_Progress: an error %d was found during schedule %p at row-offset %li - aborting the schedule\n", res, handle->schedule, (long) handle->row_offset);
      handle->nbc_complete = true;
      if (!handle->super.req_persistent) {
        NBC_Free(handle);
      }
      return res;
    }

    /* adjust delim to start of current round */
    NBC_DEBUG(5, "NBC_Progress: going in schedule %p to row-offset: %li\n", handle->schedule, (long) handle->row_offset);
    delim = handle->schedule->data + handle->row_offset;
    NBC_DEBUG(10, "delim: %p\n", delim);
    nbc_get_round_size(delim, &size);
    /* size is unsigned long; cast so it matches the %li conversion */
    NBC_DEBUG(10, "size: %li\n", (long) size);
    /* adjust delim to end of current round -> delimiter */
    delim = delim + size;

    if (*delim == 0) {
      /* this was the last round - we're done */
      NBC_DEBUG(5, "NBC_Progress last round finished - we're done\n");

      handle->nbc_complete = true;
      if (!handle->super.req_persistent) {
        NBC_Free(handle);
      }

      return NBC_OK;
    }

    NBC_DEBUG(5, "NBC_Progress round finished - goto next round\n");
    /* move delim to start of next round */
    /* initializing handle for new virgin round */
    handle->row_offset = (intptr_t) (delim + 1) - (intptr_t) handle->schedule->data;
    /* kick it off */
    res = NBC_Start_round(handle);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      NBC_Error ("Error in NBC_Start_round() (%i)", res);
      return res;
    }
  }

  return NBC_CONTINUE;
}
/*
 * Start the round of handle->schedule located at handle->row_offset.
 *
 * The round begins with an int holding the number of operations, followed
 * by the operations themselves. SEND and RECV post non-blocking PML
 * requests that are collected in handle->req_array; OP, COPY and UNPACK
 * are executed immediately. For every buffer whose tmp flag is set, the
 * stored buffer value is used as an offset into handle->tmpbuf.
 *
 * Unless handle->row_offset is 0, NBC_Progress() is called once at the end
 * to try to advance the schedule right away.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE if the request array
 * cannot be grown, the error of a failing isend/irecv/copy/unpack, or
 * OMPI_ERROR for an unknown operation type or a failing NBC_Progress().
 */
static inline int NBC_Start_round(NBC_Handle *handle)
{
  int num; /* number of operations */
  int res;
  char* ptr;
  MPI_Request *tmp;
  NBC_Fn_type type;
  NBC_Args_send     sendargs;
  NBC_Args_recv     recvargs;
  NBC_Args_op         opargs;
  NBC_Args_copy     copyargs;
  NBC_Args_unpack unpackargs;
  void *buf1,  *buf2;

  /* get round-schedule address */
  ptr = handle->schedule->data + handle->row_offset;

  NBC_GET_BYTES(ptr,num);
  /* cast so the argument matches the %d conversion */
  NBC_DEBUG(10, "start_round round at offset %d : posting %i operations\n", (int) handle->row_offset, num);

  for (int i = 0 ; i < num ; ++i) {
    /* long, because every message below prints it with %li */
    long offset = (long)(ptr - handle->schedule->data);

    /* peek at the type without consuming it: it is the first member of
     * the argument struct read by NBC_GET_BYTES below */
    memcpy (&type, ptr, sizeof (type));
    switch(type) {
      case SEND:
        NBC_DEBUG(5,"  SEND (offset %li) ", offset);
        NBC_GET_BYTES(ptr,sendargs);
        NBC_DEBUG(5,"*buf: %p, count: %i, type: %p, dest: %i, tag: %i)\n", sendargs.buf, sendargs.count, sendargs.datatype, sendargs.dest, handle->tag);
        /* get buffer */
        if(sendargs.tmpbuf) {
          buf1=(char*)handle->tmpbuf+(long)sendargs.buf;
        } else {
          buf1=(void *)sendargs.buf;
        }
#ifdef NBC_TIMING
        Isend_time -= MPI_Wtime();
#endif
        /* get an additional request; req_count is only bumped once the
         * array really has room, so both stay consistent on failure */
        tmp = (MPI_Request *) realloc ((void *) handle->req_array, (handle->req_count + 1) * sizeof (MPI_Request));
        if (NULL == tmp) {
          return OMPI_ERR_OUT_OF_RESOURCE;
        }

        handle->req_array = tmp;
        handle->req_count++;

        res = MCA_PML_CALL(isend(buf1, sendargs.count, sendargs.datatype, sendargs.dest, handle->tag, MCA_PML_BASE_SEND_STANDARD, sendargs.local?handle->comm->c_local_comm:handle->comm, handle->req_array+handle->req_count - 1));
        if (OMPI_SUCCESS != res) {
          NBC_Error ("Error in MPI_Isend(%lu, %i, %p, %i, %i, %lu) (%i)", (unsigned long)buf1, sendargs.count, sendargs.datatype, sendargs.dest, handle->tag, (unsigned long)handle->comm, res);
          return res;
        }
#ifdef NBC_TIMING
        Isend_time += MPI_Wtime();
#endif
        break;
      case RECV:
        NBC_DEBUG(5, "  RECV (offset %li) ", offset);
        NBC_GET_BYTES(ptr,recvargs);
        NBC_DEBUG(5, "*buf: %p, count: %i, type: %p, source: %i, tag: %i)\n", recvargs.buf, recvargs.count, recvargs.datatype, recvargs.source, handle->tag);
        /* get buffer */
        if(recvargs.tmpbuf) {
          buf1=(char*)handle->tmpbuf+(long)recvargs.buf;
        } else {
          buf1=recvargs.buf;
        }
#ifdef NBC_TIMING
        Irecv_time -= MPI_Wtime();
#endif
        /* get an additional request - TODO: req_count NOT thread safe;
         * req_count is only bumped once the array really has room */
        tmp = (MPI_Request *) realloc ((void *) handle->req_array, (handle->req_count + 1) * sizeof (MPI_Request));
        if (NULL == tmp) {
          return OMPI_ERR_OUT_OF_RESOURCE;
        }

        handle->req_array = tmp;
        handle->req_count++;

        res = MCA_PML_CALL(irecv(buf1, recvargs.count, recvargs.datatype, recvargs.source, handle->tag, recvargs.local?handle->comm->c_local_comm:handle->comm, handle->req_array+handle->req_count-1));
        if (OMPI_SUCCESS != res) {
          NBC_Error("Error in MPI_Irecv(%lu, %i, %p, %i, %i, %lu) (%i)", (unsigned long)buf1, recvargs.count, recvargs.datatype, recvargs.source, handle->tag, (unsigned long)handle->comm, res);
          return res;
        }
#ifdef NBC_TIMING
        Irecv_time += MPI_Wtime();
#endif
        break;
      case OP:
        NBC_DEBUG(5, "  OP2  (offset %li) ", offset);
        NBC_GET_BYTES(ptr,opargs);
        NBC_DEBUG(5, "*buf1: %p, buf2: %p, count: %i, type: %p)\n", opargs.buf1, opargs.buf2, opargs.count, opargs.datatype);
        /* get buffers */
        if(opargs.tmpbuf1) {
          buf1=(char*)handle->tmpbuf+(long)opargs.buf1;
        } else {
          buf1=(void *)opargs.buf1;
        }
        if(opargs.tmpbuf2) {
          buf2=(char*)handle->tmpbuf+(long)opargs.buf2;
        } else {
          buf2=opargs.buf2;
        }
        ompi_op_reduce(opargs.op, buf1, buf2, opargs.count, opargs.datatype);
        break;
      case COPY:
        NBC_DEBUG(5, "  COPY   (offset %li) ", offset);
        NBC_GET_BYTES(ptr,copyargs);
        NBC_DEBUG(5, "*src: %lu, srccount: %i, srctype: %p, *tgt: %lu, tgtcount: %i, tgttype: %p)\n", (unsigned long) copyargs.src, copyargs.srccount, copyargs.srctype, (unsigned long) copyargs.tgt, copyargs.tgtcount, copyargs.tgttype);
        /* get buffers */
        if(copyargs.tmpsrc) {
          buf1=(char*)handle->tmpbuf+(long)copyargs.src;
        } else {
          buf1=copyargs.src;
        }
        if(copyargs.tmptgt) {
          buf2=(char*)handle->tmpbuf+(long)copyargs.tgt;
        } else {
          buf2=copyargs.tgt;
        }
        res = NBC_Copy (buf1, copyargs.srccount, copyargs.srctype, buf2, copyargs.tgtcount, copyargs.tgttype, handle->comm);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
          return res;
        }
        break;
      case UNPACK:
        NBC_DEBUG(5, "  UNPACK   (offset %li) ", offset);
        NBC_GET_BYTES(ptr,unpackargs);
        NBC_DEBUG(5, "*src: %lu, srccount: %i, srctype: %p, *tgt: %lu\n", (unsigned long) unpackargs.inbuf, unpackargs.count, unpackargs.datatype, (unsigned long) unpackargs.outbuf);
        /* get buffers */
        if(unpackargs.tmpinbuf) {
          buf1=(char*)handle->tmpbuf+(long)unpackargs.inbuf;
        } else {
          buf1=unpackargs.inbuf;
        }
        if(unpackargs.tmpoutbuf) {
          buf2=(char*)handle->tmpbuf+(long)unpackargs.outbuf;
        } else {
          buf2=unpackargs.outbuf;
        }
        res = NBC_Unpack (buf1, unpackargs.count, unpackargs.datatype, buf2, handle->comm);
        if (OMPI_SUCCESS != res) {
          NBC_Error ("NBC_Unpack() failed (code: %i)", res);
          return res;
        }
        break;
      default:
        NBC_Error ("NBC_Start_round: bad type %li at offset %li", (long)type, offset);
        return OMPI_ERROR;
    }
  }

  /* check if we can make progress - not in the first round, this allows us to leave the
   * initialization faster and to reach more overlap
   *
   * threaded case: calling progress in the first round can lead to a
   * deadlock if NBC_Free is called in this round :-( */
  if (handle->row_offset) {
    res = NBC_Progress(handle);
    if ((NBC_OK != res) && (NBC_CONTINUE != res)) {
      return OMPI_ERROR;
    }
  }

  return OMPI_SUCCESS;
}
/*
 * Progress a request.
 *
 * To be called *only* from the progress thread!
 *
 * Tests the outstanding requests of the current round with
 * ompi_request_test_all(). When they are all complete (or there were none)
 * the request array is released and either the schedule ends (delimiter
 * byte 0, the handle is released with NBC_Free()) or the next round is
 * started via NBC_Start_round().
 *
 * Returns
 *   NBC_OK        no schedule is attached, or the last round has finished
 *   NBC_CONTINUE  the schedule is still in progress
 *   other         the error from ompi_request_test_all() or
 *                 NBC_Start_round()
 */
int NBC_Progress(NBC_Handle *handle)
{
  int flag, res;
  unsigned long size = 0;
  char *delim;

  /* the handle is done if there is no schedule attached */
  if (NULL == handle->schedule) {
    return NBC_OK;
  }

  if ((handle->req_count > 0) && (handle->req_array != NULL)) {
    NBC_DEBUG(50, "NBC_Progress: testing for %i requests\n", handle->req_count);
#ifdef NBC_TIMING
    Test_time -= MPI_Wtime();
#endif
    res = ompi_request_test_all(handle->req_count, handle->req_array, &flag, MPI_STATUSES_IGNORE);
    if(res != OMPI_SUCCESS) {
      NBC_Error ("MPI Error in MPI_Testall() (%i)", res);
      return res;
    }
#ifdef NBC_TIMING
    Test_time += MPI_Wtime();
#endif
  } else {
    flag = 1; /* we had no open requests -> proceed to next round */
  }

  /* a round is finished */
  if (flag) {
    /* adjust delim to start of current round; cast so the argument
     * matches the %li conversion */
    NBC_DEBUG(5, "NBC_Progress: going in schedule %p to row-offset: %li\n", handle->schedule, (long) handle->row_offset);
    delim = handle->schedule->data + handle->row_offset;
    NBC_DEBUG(10, "delim: %p\n", delim);
    nbc_get_round_size(delim, &size);
    /* size is unsigned long; cast so it matches the %li conversion */
    NBC_DEBUG(10, "size: %li\n", (long) size);
    /* adjust delim to end of current round -> delimiter */
    delim = delim + size;

    if (NULL != handle->req_array) {
      /* free request array */
      free (handle->req_array);
      handle->req_array = NULL;
    }

    handle->req_count = 0;

    if (*delim == 0) {
      /* this was the last round - we're done */
      NBC_DEBUG(5, "NBC_Progress last round finished - we're done\n");

      NBC_Free(handle);

      return NBC_OK;
    }

    NBC_DEBUG(5, "NBC_Progress round finished - goto next round\n");
    /* move delim to start of next round */
    /* initializing handle for new virgin round */
    handle->row_offset = (intptr_t) (delim + 1) - (intptr_t) handle->schedule->data;
    /* kick it off */
    res = NBC_Start_round(handle);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      NBC_Error ("Error in NBC_Start_round() (%i)", res);
      return res;
    }
  }

  return NBC_CONTINUE;
}
/*
 * Progress a request.
 *
 * To be called *only* from the progress thread!
 *
 * Tests the outstanding requests of the current round. When they are all
 * complete (or there were none) the request array is released and either
 * the schedule ends (delimiter byte 0, the handle is released with
 * NBC_Free()) or the next round is started via NBC_Start_round().
 *
 * Returns
 *   NBC_OK        no schedule is attached, or the last round has finished
 *   NBC_CONTINUE  the schedule is still in progress
 *   other         the error from ompi_request_test_all(), NBC_Free() or
 *                 NBC_Start_round()
 */
int NBC_Progress(NBC_Handle *handle)
{
  int round_done, rc;
  long size;
  char *delim;

  /* the handle is done if there is no schedule attached */
  if (NULL == handle->schedule) {
    return NBC_OK;
  }

  if ((handle->req_count > 0) && (handle->req_array != NULL)) {
    NBC_DEBUG(50, "NBC_Progress: testing for %i requests\n", handle->req_count);
#ifdef NBC_TIMING
    Test_time -= MPI_Wtime();
#endif
    rc = ompi_request_test_all(handle->req_count, handle->req_array, &round_done, MPI_STATUSES_IGNORE);
    if (OMPI_SUCCESS != rc) {
      printf("MPI Error in MPI_Testall() (%i)\n", rc);
      return rc;
    }
#ifdef NBC_TIMING
    Test_time += MPI_Wtime();
#endif
  } else {
    /* we had no open requests -> proceed to next round */
    round_done = 1;
  }

  /* nothing more to do while the round is still running */
  if (!round_done) {
    return NBC_CONTINUE;
  }

  /* a round is finished: locate its start ... */
  NBC_DEBUG(5, "NBC_Progress: going in schedule %p to row-offset: %li\n", *handle->schedule, handle->row_offset);
  delim = (char*)*handle->schedule + handle->row_offset;
  NBC_DEBUG(10, "delim: %p\n", delim);
  NBC_GET_ROUND_SIZE(delim, size);
  NBC_DEBUG(10, "size: %li\n", size);
  /* ... and step over it to the delimiter that follows */
  delim = delim + size;

  /* release the request array (free(NULL) is a no-op) */
  free((void*)handle->req_array);
  handle->req_array = NULL;
  handle->req_count = 0;

  if (0 == *delim) {
    /* this was the last round - we're done */
    NBC_DEBUG(5, "NBC_Progress last round finished - we're done\n");

    rc = NBC_Free(handle);
    if (NBC_OK != rc) {
      printf("Error in NBC_Free() (%i)\n", rc);
      return rc;
    }

    return NBC_OK;
  }

  NBC_DEBUG(5, "NBC_Progress round finished - goto next round\n");
  /* the next round starts right behind the delimiter;
   * initializing handle for new virgin round */
  handle->row_offset = (long)(delim + 1) - (long)*handle->schedule;

  /* kick it off */
  rc = NBC_Start_round(handle);
  if (NBC_OK != rc) {
    printf("Error in NBC_Start_round() (%i)\n", rc);
    return rc;
  }

  return NBC_CONTINUE;
}
/* Posts all operations of the round that handle->row_offset points to.
 *
 * A round is read as an int holding the number of operations, followed for
 * each operation by its NBC_Fn_type and the matching NBC_Args_* record.
 * SEND and RECV are posted as non-blocking PML requests and appended to
 * handle->req_array; OP, COPY and UNPACK are executed right away. A buffer
 * whose "tmp" flag is set is stored as an offset into handle->tmpbuf.
 *
 * Unless this is the first round (row_offset == sizeof(int)), NBC_Progress()
 * is called once at the end to try to make progress immediately.
 *
 * Returns NBC_OK on success, NBC_BAD_SCHED for an unknown operation type, or
 * the error code of the call that failed.
 */
static inline int NBC_Start_round(NBC_Handle *handle) {
  int num; /* number of operations */
  int i, res, ret=NBC_OK;
  char* ptr;
  NBC_Fn_type type;
  NBC_Args_send sendargs;
  NBC_Args_recv recvargs;
  NBC_Args_op opargs;
  NBC_Args_copy copyargs;
  NBC_Args_unpack unpackargs;
  NBC_Schedule myschedule;
  void *buf1, *buf2, *buf3;

  /* get round-schedule address */
  myschedule = (NBC_Schedule*)((char*)*handle->schedule + handle->row_offset);
  ptr = (char*) myschedule;

  NBC_GET_BYTES(ptr,num);
  NBC_DEBUG(10, "start_round round at address %p : posting %i operations\n", myschedule, num);

  for (i=0; i<num; i++) {
    NBC_GET_BYTES(ptr,type);
    switch(type) {
      case SEND:
        NBC_DEBUG(5,"  SEND (offset %li) ", (long)ptr-(long)myschedule);
        NBC_GET_BYTES(ptr,sendargs);
        NBC_DEBUG(5,"*buf: %p, count: %i, type: %lu, dest: %i, tag: %i)\n", sendargs.buf, sendargs.count, (unsigned long)sendargs.datatype, sendargs.dest, handle->tag);
        /* get an additional request */
        handle->req_count++;
        /* get buffer */
        if(sendargs.tmpbuf) {
          buf1=(char*)handle->tmpbuf+(long)sendargs.buf;
        } else {
          buf1=sendargs.buf;
        }
#ifdef NBC_TIMING
        Isend_time -= MPI_Wtime();
#endif
        /* NOTE(review): assigning realloc()'s result straight back loses the
         * old array if it fails; what NBC_CHECK_NULL does in that case is
         * defined elsewhere - confirm */
        handle->req_array = (MPI_Request*)realloc((void*)handle->req_array, (handle->req_count)*sizeof(MPI_Request));
        NBC_CHECK_NULL(handle->req_array);
        /* the new request goes into the last slot of the array */
        res = MCA_PML_CALL(isend(buf1, sendargs.count, sendargs.datatype, sendargs.dest, handle->tag, MCA_PML_BASE_SEND_STANDARD, handle->comm, handle->req_array+handle->req_count-1));
        if(OMPI_SUCCESS != res) {
          printf("Error in MPI_Isend(%lu, %i, %lu, %i, %i, %lu) (%i)\n", (unsigned long)buf1, sendargs.count, (unsigned long)sendargs.datatype, sendargs.dest, handle->tag, (unsigned long)handle->comm, res);
          ret=res;
          goto error;
        }
#ifdef NBC_TIMING
        Isend_time += MPI_Wtime();
#endif
        break;

      case RECV:
        NBC_DEBUG(5, "  RECV (offset %li) ", (long)ptr-(long)myschedule);
        NBC_GET_BYTES(ptr,recvargs);
        NBC_DEBUG(5, "*buf: %p, count: %i, type: %lu, source: %i, tag: %i)\n", recvargs.buf, recvargs.count, (unsigned long)recvargs.datatype, recvargs.source, handle->tag);
        /* get an additional request - TODO: req_count NOT thread safe */
        handle->req_count++;
        /* get buffer */
        if(recvargs.tmpbuf) {
          buf1=(char*)handle->tmpbuf+(long)recvargs.buf;
        } else {
          buf1=recvargs.buf;
        }
#ifdef NBC_TIMING
        Irecv_time -= MPI_Wtime();
#endif
        /* NOTE(review): same realloc() pattern as in the SEND case */
        handle->req_array = (MPI_Request*)realloc((void*)handle->req_array, (handle->req_count)*sizeof(MPI_Request));
        NBC_CHECK_NULL(handle->req_array);
        /* the new request goes into the last slot of the array */
        res = MCA_PML_CALL(irecv(buf1, recvargs.count, recvargs.datatype, recvargs.source, handle->tag, handle->comm, handle->req_array+handle->req_count-1));
        if(OMPI_SUCCESS != res) {
          printf("Error in MPI_Irecv(%lu, %i, %lu, %i, %i, %lu) (%i)\n", (unsigned long)buf1, recvargs.count, (unsigned long)recvargs.datatype, recvargs.source, handle->tag, (unsigned long)handle->comm, res);
          ret=res;
          goto error;
        }
#ifdef NBC_TIMING
        Irecv_time += MPI_Wtime();
#endif
        break;

      case OP:
        NBC_DEBUG(5, "  OP   (offset %li) ", (long)ptr-(long)myschedule);
        NBC_GET_BYTES(ptr,opargs);
        NBC_DEBUG(5, "*buf1: %p, buf2: %p, buf3: %p, count: %i, type: %lu)\n", opargs.buf1, opargs.buf2, opargs.buf3, opargs.count, (unsigned long)opargs.datatype);
        /* get buffers */
        if(opargs.tmpbuf1) {
          buf1=(char*)handle->tmpbuf+(long)opargs.buf1;
        } else {
          buf1=opargs.buf1;
        }
        if(opargs.tmpbuf2) {
          buf2=(char*)handle->tmpbuf+(long)opargs.buf2;
        } else {
          buf2=opargs.buf2;
        }
        if(opargs.tmpbuf3) {
          buf3=(char*)handle->tmpbuf+(long)opargs.buf3;
        } else {
          buf3=opargs.buf3;
        }
        ompi_3buff_op_reduce(opargs.op, buf1, buf2, buf3, opargs.count, opargs.datatype);
        break;

      case COPY:
        NBC_DEBUG(5, "  COPY   (offset %li) ", (long)ptr-(long)myschedule);
        NBC_GET_BYTES(ptr,copyargs);
        NBC_DEBUG(5, "*src: %lu, srccount: %i, srctype: %lu, *tgt: %lu, tgtcount: %i, tgttype: %lu)\n", (unsigned long)copyargs.src, copyargs.srccount, (unsigned long)copyargs.srctype, (unsigned long)copyargs.tgt, copyargs.tgtcount, (unsigned long)copyargs.tgttype);
        /* get buffers */
        if(copyargs.tmpsrc) {
          buf1=(char*)handle->tmpbuf+(long)copyargs.src;
        } else {
          buf1=copyargs.src;
        }
        if(copyargs.tmptgt) {
          buf2=(char*)handle->tmpbuf+(long)copyargs.tgt;
        } else {
          buf2=copyargs.tgt;
        }
        res = NBC_Copy(buf1, copyargs.srccount, copyargs.srctype, buf2, copyargs.tgtcount, copyargs.tgttype, handle->comm);
        if(res != NBC_OK) {
          printf("NBC_Copy() failed (code: %i)\n", res);
          ret=res;
          goto error;
        }
        break;

      case UNPACK:
        NBC_DEBUG(5, "  UNPACK   (offset %li) ", (long)ptr-(long)myschedule);
        NBC_GET_BYTES(ptr,unpackargs);
        NBC_DEBUG(5, "*src: %lu, srccount: %i, srctype: %lu, *tgt: %lu\n", (unsigned long)unpackargs.inbuf, unpackargs.count, (unsigned long)unpackargs.datatype, (unsigned long)unpackargs.outbuf);
        /* get buffers */
        if(unpackargs.tmpinbuf) {
          buf1=(char*)handle->tmpbuf+(long)unpackargs.inbuf;
        } else {
          /* the source of the unpack is the *input* buffer; using outbuf
           * here would make NBC_Unpack read from its own target */
          buf1=unpackargs.inbuf;
        }
        if(unpackargs.tmpoutbuf) {
          buf2=(char*)handle->tmpbuf+(long)unpackargs.outbuf;
        } else {
          buf2=unpackargs.outbuf;
        }
        res = NBC_Unpack(buf1, unpackargs.count, unpackargs.datatype, buf2, handle->comm);
        if(res != NBC_OK) {
          printf("NBC_Unpack() failed (code: %i)\n", res);
          ret=res;
          goto error;
        }
        break;

      default:
        printf("NBC_Start_round: bad type %li at offset %li\n", (long)type, (long)ptr-(long)myschedule);
        ret=NBC_BAD_SCHED;
        goto error;
    }
  }

  /* check if we can make progress - not in the first round, this allows us to leave the
   * initialization faster and to reach more overlap
   *
   * threaded case: calling progress in the first round can lead to a
   * deadlock if NBC_Free is called in this round :-( */
  if(handle->row_offset != sizeof(int)) {
    res = NBC_Progress(handle);
    /* NBC_CONTINUE only means the operation is still in flight */
    if((NBC_OK != res) && (NBC_CONTINUE != res)) {
      printf("Error in NBC_Progress() (%i)\n", res);
      ret=res;
      goto error;
    }
  }

error:
  return ret;
}
/* Takes the next collective tag of the module while holding module->mutex.
 *
 * The tag counter is decremented after it has been read and is reset to
 * MCA_COLL_BASE_TAG_NONBLOCKING_BASE once MCA_COLL_BASE_TAG_NONBLOCKING_END
 * has been reached. If need_register is non-NULL and the communicator has not
 * been marked as registered yet, it is marked inside the same critical
 * section and *need_register is set to true.
 */
static int nbc_take_next_tag(ompi_coll_libnbc_module_t *module, bool *need_register) {
  int tag;

  OPAL_THREAD_LOCK(&module->mutex);
  tag = module->tag--;
  if (MCA_COLL_BASE_TAG_NONBLOCKING_END == tag) {
    tag = module->tag = MCA_COLL_BASE_TAG_NONBLOCKING_BASE;
    NBC_DEBUG(2,"resetting tags ...\n");
  }
  if (NULL != need_register && true != module->comm_registered) {
    module->comm_registered = true;
    *need_register = true;
  }
  OPAL_THREAD_UNLOCK(&module->mutex);

  return tag;
}

/* Wraps a schedule into a request and stores it in *request.
 *
 * An empty schedule (zero operations in the first round and a zero delimiter
 * behind it) is answered with a no-op request; the module tag is still
 * advanced, and the schedule and tmpbuf are released here. Otherwise a libnbc
 * request is allocated, gets the next tag, and keeps schedule and tmpbuf. The
 * first request on a communicator bumps the component's active_comms counter,
 * and the progress function is registered when that counter reaches 1.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE if no request could be
 * obtained; in the error case schedule and tmpbuf are not released here.
 */
int NBC_Schedule_request(NBC_Schedule *schedule, ompi_communicator_t *comm,
                         ompi_coll_libnbc_module_t *module, bool persistent,
                         ompi_request_t **request, void *tmpbuf) {
  ompi_coll_libnbc_request_t *handle;
  bool first_use = false;

  /* no operation (e.g. one process barrier)? */
  if (((int *)schedule->data)[0] == 0 && schedule->data[sizeof(int)] == 0) {
    if (OMPI_SUCCESS != nbc_get_noop_request(persistent, request)) {
      return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* update the module->tag here because other processes may have operations
     * and they may update the module->tag */
    (void) nbc_take_next_tag(module, NULL);

    OBJ_RELEASE(schedule);
    free(tmpbuf);

    return OMPI_SUCCESS;
  }

  OMPI_COLL_LIBNBC_REQUEST_ALLOC(comm, persistent, handle);
  if (NULL == handle) {
    return OMPI_ERR_OUT_OF_RESOURCE;
  }

  handle->req_count = 0;
  handle->req_array = NULL;
  handle->row_offset = 0;
  handle->nbc_complete = persistent ? true : false;

  /* tag and shadow comm administration */
  handle->tag = nbc_take_next_tag(module, &first_use);

  /* register progress */
  if (first_use) {
    if (1 == OPAL_THREAD_ADD_FETCH32(&mca_coll_libnbc_component.active_comms, 1)) {
      opal_progress_register(ompi_coll_libnbc_progress);
    }
  }

  handle->comm = comm;
  handle->comminfo = module;

  NBC_DEBUG(3, "got tag %i\n", handle->tag);

  /* the request keeps the temporary buffer and the schedule from here on */
  handle->tmpbuf = tmpbuf;
  handle->schedule = schedule;
  *request = (ompi_request_t *) handle;

  return OMPI_SUCCESS;
}