static int mca_coll_ml_allgather_small_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    bool rcontig = coll_op->full_message.recv_data_continguous;
    int n_ranks_in_comm = ompi_comm_size(OP_ML_MODULE(coll_op)->comm);

    void *dest = (void *)((uintptr_t)coll_op->full_message.dest_user_addr +
                          (uintptr_t)coll_op->full_message.n_bytes_delivered);
    void *src = (void *)((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr +
                         (size_t)coll_op->variable_fn_params.rbuf_offset);

    if (rcontig) {
        memcpy(dest, src, n_ranks_in_comm * coll_op->full_message.n_bytes_scheduled);
    } else {
        mca_coll_ml_convertor_unpack(src,
                                     n_ranks_in_comm * coll_op->full_message.n_bytes_scheduled,
                                     &coll_op->fragment_data.message_descriptor->recv_convertor);
    }

    return OMPI_SUCCESS;
}
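/*
 * The two frag-progress routines below share one pattern: while the
 * pipeline has free slots (n_active < pipeline_depth) and bytes remain
 * unscheduled, grab an ML buffer, carve off the next fragment, and queue
 * a new single-fragment collective descriptor.  In the allgather path the
 * fragment length reduces to "bytes remaining, capped at the per-fragment
 * size".  A minimal sketch of that computation, using only standard types
 * (the helper name is hypothetical, not part of this component):
 */
static inline size_t ml_frag_bytes_to_schedule(size_t n_bytes_total,
                                               size_t n_bytes_scheduled,
                                               size_t fragment_size)
{
    /* bytes not yet carved into fragments */
    size_t remaining = n_bytes_total - n_bytes_scheduled;

    /* cap at the configured per-fragment size */
    return remaining < fragment_size ? remaining : fragment_size;
}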
static int mca_coll_ml_allgather_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    /* local variables */
    int ret;
    size_t frag_len, dt_size;
    const void *buf;

    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc;
    mca_coll_ml_collective_operation_progress_t *new_op;

    mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op);
    bool scontig = coll_op->fragment_data.message_descriptor->send_data_continguous;

    ompi_datatype_type_size(coll_op->variable_fn_params.dtype, &dt_size);

    /* Keep the pipeline filled with fragments */
    while (coll_op->fragment_data.message_descriptor->n_active <
           coll_op->fragment_data.message_descriptor->pipeline_depth) {
        /* If an active fragment happens to have completed the collective during
         * a hop into the progress engine, then don't launch a new fragment;
         * instead break and return.
         */
        if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled ==
            coll_op->fragment_data.message_descriptor->n_bytes_total) {
            break;
        }

        /* Get an ML buffer */
        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        if (NULL == src_buffer_desc) {
            /* If there exist outstanding fragments, then break out
             * and let an active fragment deal with this later;
             * there are no buffers available.
             */
            if (0 < coll_op->fragment_data.message_descriptor->n_active) {
                return OMPI_SUCCESS;
            } else {
                /* The fragment is already on the list and
                 * we still have no ML resources.
                 * Return busy.
                 */
                if (coll_op->pending & REQ_OUT_OF_MEMORY) {
                    ML_VERBOSE(10, ("Out of resources %p", coll_op));
                    return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
                }

                coll_op->pending |= REQ_OUT_OF_MEMORY;
                opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list),
                                 (opal_list_item_t *)coll_op);
                ML_VERBOSE(10, ("Out of resources %p adding to pending queue", coll_op));
                return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
            }
        }

        /* Get a new collective descriptor and initialize it */
        new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                     ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER],
                     coll_op->fragment_data.message_descriptor->src_user_addr,
                     coll_op->fragment_data.message_descriptor->dest_user_addr,
                     coll_op->fragment_data.message_descriptor->n_bytes_total,
                     coll_op->fragment_data.message_descriptor->n_bytes_scheduled);

        new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op;
        new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor;

        /* set the task setup callback */
        new_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;

        /* MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op,
               src_buffer_desc->buffer_index, src_buffer_desc); */

        /* We need this address for pointer arithmetic in memcpy */
        buf = coll_op->fragment_data.message_descriptor->src_user_addr;

        if (!scontig) {
            frag_len = ml_module->small_message_thresholds[BCOL_ALLGATHER];
            mca_coll_ml_convertor_get_send_frag_size(ml_module, &frag_len,
                                                     coll_op->fragment_data.message_descriptor);

            mca_coll_ml_convertor_pack(
                (void *) ((uintptr_t) src_buffer_desc->data_addr +
                          frag_len * coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
                          frag_len * coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index),
                frag_len, &coll_op->fragment_data.message_descriptor->send_convertor);
        } else {
            /* calculate the new fragment length: the bytes still to be
             * scheduled, capped at the configured fragment size */
            frag_len = (coll_op->fragment_data.message_descriptor->n_bytes_total -
                        coll_op->fragment_data.message_descriptor->n_bytes_scheduled <
                        coll_op->fragment_data.fragment_size ?
                        coll_op->fragment_data.message_descriptor->n_bytes_total -
                        coll_op->fragment_data.message_descriptor->n_bytes_scheduled :
                        coll_op->fragment_data.fragment_size);

            /* everybody copies in, based on the new values */
            memcpy((void *) ((uintptr_t) src_buffer_desc->data_addr +
                             frag_len * new_op->coll_schedule->topo_info->hier_layout_info[0].offset +
                             frag_len * new_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index),
                   (void *) ((uintptr_t) buf +
                             (uintptr_t) coll_op->fragment_data.message_descriptor->n_bytes_scheduled),
                   frag_len);
        }

        new_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
        new_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;

        /* update the number of bytes scheduled */
        new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len;
        /* everyone needs an unpack function */
        new_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data;

        new_op->fragment_data.fragment_size = frag_len;
        new_op->fragment_data.buffer_desc = src_buffer_desc;

        /* Setup fragment specific data */
        ++(new_op->fragment_data.message_descriptor->n_active);

        ML_VERBOSE(10, ("Start more, My index %d ",
                        new_op->fragment_data.buffer_desc->buffer_index));

        /* this is a bit buggy */
        ML_SET_VARIABLE_PARAMS_BCAST(
            new_op,
            OP_ML_MODULE(new_op),
            frag_len /* yes, we have consistent units, so this makes sense */,
            MPI_BYTE /* we fragment according to buffer size;
                      * we don't reduce the data, so we needn't
                      * keep "whole" datatypes and may freely
                      * fragment without regard for multiples
                      * of any specific datatype */,
            src_buffer_desc,
            0,
            0,
            frag_len,
            src_buffer_desc->data_addr);

        /* initialize the first collective */
        ret = new_op->sequential_routine.seq_task_setup(new_op);
        if (OMPI_SUCCESS != ret) {
            ML_VERBOSE(3, ("Fragment failed to initialize itself"));
            return ret;
        }

        new_op->variable_fn_params.buffer_size = frag_len;
        new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor;
        new_op->variable_fn_params.root = 0;

        MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op);

        /* append this collective !! */
        OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex));
        opal_list_append(&mca_coll_ml_component.sequential_collectives,
                         (opal_list_item_t *)new_op);
        OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex));
    }

    return OMPI_SUCCESS;
}
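/*
 * In the contiguous allgather path above, each rank deposits its fragment
 * at a rank-specific slot inside the shared ML payload buffer:
 *
 *   dst = data_addr + frag_len * (hier_layout_info[0].offset +
 *                                 hier_layout_info[0].level_one_index)
 *
 * i.e. the buffer is laid out as consecutive frag_len-sized slots, one per
 * position in the level-one hierarchy.  As a worked example (illustrative
 * numbers, not taken from a real topology): with frag_len = 1024,
 * offset = 2 and level_one_index = 1, the rank writes its fragment at byte
 * offset 1024 * (2 + 1) = 3072 into the payload buffer.
 */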
static int mca_coll_ml_reduce_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    /* local variables */
    void *buf;

    size_t dt_size;
    int ret, frag_len, count;

    ptrdiff_t lb, extent;

    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc;
    mca_coll_ml_collective_operation_progress_t *new_op;

    mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op);

    ret = ompi_datatype_get_extent(coll_op->variable_fn_params.dtype, &lb, &extent);
    if (ret < 0) {
        return OMPI_ERROR;
    }

    dt_size = (size_t) extent;

    /* Keep the pipeline filled with fragments */
    while (coll_op->fragment_data.message_descriptor->n_active <
           coll_op->fragment_data.message_descriptor->pipeline_depth) {
        /* If an active fragment happens to have completed the collective during
         * a hop into the progress engine, then don't launch a new fragment;
         * instead break and return.
         */
        if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled ==
            coll_op->fragment_data.message_descriptor->n_bytes_total) {
            break;
        }

        /* Get an ML buffer */
        src_buffer_desc = mca_coll_ml_alloc_buffer(OP_ML_MODULE(coll_op));
        if (NULL == src_buffer_desc) {
            /* If there exist outstanding fragments, then break out
             * and let an active fragment deal with this later;
             * there are no buffers available.
             */
            if (0 < coll_op->fragment_data.message_descriptor->n_active) {
                return OMPI_SUCCESS;
            } else {
                /* It is useless to call progress from here, since ML
                 * progress can't be executed; as a result the ML memsync
                 * call will not complete and no memory will be recycled.
                 * So we put the element on the list and progress it later,
                 * when memsync recycles some memory. */

                /* The fragment is already on the list and
                 * we still have no ML resources.
                 * Return busy. */
                if (coll_op->pending & REQ_OUT_OF_MEMORY) {
                    ML_VERBOSE(10, ("Out of resources %p", coll_op));
                    return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
                }

                coll_op->pending |= REQ_OUT_OF_MEMORY;
                opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list),
                                 (opal_list_item_t *)coll_op);
                ML_VERBOSE(10, ("Out of resources %p adding to pending queue", coll_op));
                return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
            }
        }

        /* Get a new collective descriptor and initialize it */
        new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                     ml_module->coll_ml_reduce_functions[ML_SMALL_DATA_REDUCE],
                     coll_op->fragment_data.message_descriptor->src_user_addr,
                     coll_op->fragment_data.message_descriptor->dest_user_addr,
                     coll_op->fragment_data.message_descriptor->n_bytes_total,
                     coll_op->fragment_data.message_descriptor->n_bytes_scheduled);

        ML_VERBOSE(1, (" In Reduce fragment progress %d %d ",
                       coll_op->fragment_data.message_descriptor->n_bytes_total,
                       coll_op->fragment_data.message_descriptor->n_bytes_scheduled));

        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op,
                                              src_buffer_desc->buffer_index, src_buffer_desc);

        new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op;
        new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor;

        /* set the task setup callback */
        new_op->sequential_routine.seq_task_setup = mca_coll_ml_reduce_task_setup;

        /* We need this address for pointer arithmetic in memcpy */
        buf = (void *) coll_op->fragment_data.message_descriptor->src_user_addr;

        /* calculate the number of datatype elements in this packet:
         * the remaining bytes (converted to elements) if they fall below a
         * quarter of the small-message threshold, otherwise the full
         * per-fragment element count */
        count = (coll_op->fragment_data.message_descriptor->n_bytes_total -
                 coll_op->fragment_data.message_descriptor->n_bytes_scheduled <
                 ((size_t) OP_ML_MODULE(coll_op)->small_message_thresholds[BCOL_REDUCE] / 4) ?
                 (coll_op->fragment_data.message_descriptor->n_bytes_total -
                  coll_op->fragment_data.message_descriptor->n_bytes_scheduled) / dt_size :
                 (size_t) coll_op->variable_fn_params.count);

        /* calculate the fragment length */
        frag_len = count * dt_size;

        ret = ompi_datatype_copy_content_same_ddt(coll_op->variable_fn_params.dtype, count,
                  (char *) src_buffer_desc->data_addr,
                  (char *) ((uintptr_t) buf +
                            (uintptr_t) coll_op->fragment_data.message_descriptor->n_bytes_scheduled));
        if (ret < 0) {
            return OMPI_ERROR;
        }

        /* if root, unpack the data */
        if (ompi_comm_rank(ml_module->comm) == coll_op->global_root) {
            new_op->process_fn = mca_coll_ml_reduce_unpack;
            new_op->variable_fn_params.root_flag = true;
        } else {
            new_op->process_fn = NULL;
            new_op->variable_fn_params.root_flag = false;
        }
        new_op->variable_fn_params.root_route = coll_op->variable_fn_params.root_route;

        /* Setup fragment specific data */
        new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len;
        new_op->fragment_data.buffer_desc = src_buffer_desc;
        new_op->fragment_data.fragment_size = frag_len;
        (new_op->fragment_data.message_descriptor->n_active)++;

        /* Set the reduce buffer arguments */
        ML_SET_VARIABLE_PARAMS_BCAST(new_op, OP_ML_MODULE(new_op), count,
                                     coll_op->variable_fn_params.dtype, src_buffer_desc,
                                     0,
                                     (ml_module->payload_block->size_buffer -
                                      ml_module->data_offset) / 2,
                                     frag_len, src_buffer_desc->data_addr);

        new_op->variable_fn_params.buffer_size = frag_len;
        new_op->variable_fn_params.sbuf = src_buffer_desc->data_addr;
        new_op->variable_fn_params.rbuf = src_buffer_desc->data_addr;
        new_op->variable_fn_params.root = coll_op->variable_fn_params.root;
        new_op->global_root = coll_op->global_root;
        new_op->variable_fn_params.op = coll_op->variable_fn_params.op;
        new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor;
        new_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING;

        MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op);

        ML_VERBOSE(10, ("FFFF Contig + fragmentation [0-sk, 1-lk, 3-su, 4-lu] %d %d %d\n",
                        new_op->variable_fn_params.buffer_size,
                        new_op->fragment_data.fragment_size,
                        new_op->fragment_data.message_descriptor->n_bytes_scheduled));

        /* initialize the first collective */
        new_op->sequential_routine.seq_task_setup(new_op);

        /* append this collective !! */
        OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex));
        opal_list_append(&mca_coll_ml_component.sequential_collectives,
                         (opal_list_item_t *)new_op);
        OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex));
    }

    return OMPI_SUCCESS;
}
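/*
 * The out-of-memory branch is duplicated verbatim in both frag-progress
 * routines above: if the descriptor was already parked, report busy again;
 * otherwise flag it and park it until memsync recycles buffers.  A minimal
 * sketch of how that branch could be factored out (the helper name is
 * hypothetical; this is not part of the component):
 */
static inline int ml_park_on_memory_pressure(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    /* already parked once: just report busy again */
    if (coll_op->pending & REQ_OUT_OF_MEMORY) {
        ML_VERBOSE(10, ("Out of resources %p", coll_op));
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    /* first failure with nothing in flight: park until memsync recycles buffers */
    coll_op->pending |= REQ_OUT_OF_MEMORY;
    opal_list_append(&(OP_ML_MODULE(coll_op)->waiting_for_memory_list),
                     (opal_list_item_t *) coll_op);
    ML_VERBOSE(10, ("Out of resources %p adding to pending queue", coll_op));

    return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}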