/*
 * Build a compacted copy of the element description of pData in pTypeDesc.
 *
 * The function walks pData->desc.desc from index 0 and writes new entries into
 * an array it allocates with malloc() and stores in pTypeDesc->desc:
 *  - basic elements that start exactly where the previous one ends are merged
 *    into one pending run (last_type / last_length / last_disp / last_extent);
 *    when the merged elements have different types the run is re-expressed in
 *    bytes (OPAL_DATATYPE_UINT1);
 *  - a loop flagged OPAL_DATATYPE_FLAG_CONTIGUOUS is folded into the pending
 *    run, expanded into byte elements, or re-emitted as LOOP / byte element /
 *    END_LOOP, depending on its extent and iteration count;
 *  - any other loop is copied (and pushed on the local stack) unless it is a
 *    2-item loop that can be replaced by plain elements.
 *
 * pData     datatype whose description is read (desc, btypes).
 * count     only stored in the initial stack entry through SAVE_STACK.
 * pTypeDesc receives the new array (desc), its capacity (length) and the
 *           number of entries written minus the final END_LOOP (used).
 *
 * Returns OPAL_SUCCESS on every path.
 *
 * NOTE(review): the result of malloc() is not checked before it is written to.
 * NOTE(review): opt_elem and last_flags are assigned but never read in this
 *               function.
 */
static int32_t
opal_datatype_optimize_short( opal_datatype_t* pData,
                         int32_t count,
                         dt_type_desc_t* pTypeDesc )
{
    dt_elem_desc_t* pElemDesc;     /* next free entry in the output array */
    ddt_elem_desc_t opt_elem;
    dt_stack_t* pStack;            /* pointer to the position on the stack */
    int32_t pos_desc = 0;          /* actual position in the description of the derived datatype */
    /* last_type / last_length describe the pending (not yet emitted) run;
     * last_length == 0 means nothing is pending. */
    int32_t stack_pos = 0, last_type = OPAL_DATATYPE_UINT1, last_length = 0;
    /* nbElems counts the entries written to the output so far */
    int32_t type = OPAL_DATATYPE_LOOP, nbElems = 0, continuity;
    /* total_disp is added to element displacements; last_disp is where the
     * pending run starts; last_extent is the extent emitted with it. */
    OPAL_PTRDIFF_TYPE total_disp = 0, last_extent = 1, last_disp = 0;
    uint16_t last_flags = 0xFFFF;  /* keep all for the first datatype */
    uint32_t i;

    /* Stack lives in the caller-less frame (alloca): one entry per loop
     * counted in btypes[OPAL_DATATYPE_LOOP], plus two spare entries. */
    pStack = (dt_stack_t*)alloca( sizeof(dt_stack_t) * (pData->btypes[OPAL_DATATYPE_LOOP]+2) );
    SAVE_STACK( pStack, -1, 0, count, 0 );

    pTypeDesc->length = 2 * pData->desc.used + 1 /* for the fake OPAL_DATATYPE_END_LOOP at the end */;
    pTypeDesc->desc = pElemDesc = (dt_elem_desc_t*)malloc( sizeof(dt_elem_desc_t) * pTypeDesc->length );
    pTypeDesc->used = 0;

    SET_EMPTY_ELEMENT( &opt_elem );
    /* The entry right after the last used one must be the closing END_LOOP:
     * it is what eventually drives stack_pos below zero and ends the walk. */
    assert( OPAL_DATATYPE_END_LOOP == pData->desc.desc[pData->desc.used].elem.common.type );
    opt_elem.common.type = OPAL_DATATYPE_LOOP;
    opt_elem.common.flags = 0xFFFF;  /* keep all for the first datatype */
    opt_elem.count = 0;
    opt_elem.disp = pData->desc.desc[pData->desc.used].end_loop.first_elem_disp;
    opt_elem.extent = 0;

    while( stack_pos >= 0 ) {
        if( OPAL_DATATYPE_END_LOOP == pData->desc.desc[pos_desc].elem.common.type ) { /* end of the current loop */
            ddt_endloop_desc_t* end_loop = &(pData->desc.desc[pos_desc].end_loop);
            /* emit whatever run is still pending before closing the loop */
            if( last_length != 0 ) {
                CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, last_length, last_disp, last_extent );
                pElemDesc++; nbElems++;
                last_disp += last_length;
                last_length = 0;
            }
            /* pStack->index is presumably the nbElems value recorded when the
             * matching loop was pushed - TODO confirm against PUSH_STACK */
            CREATE_LOOP_END( pElemDesc, nbElems - pStack->index + 1,  /* # of elems in this loop */
                             end_loop->first_elem_disp, end_loop->size, end_loop->common.flags );
            pElemDesc++; nbElems++;
            if( --stack_pos >= 0 ) {  /* still something to do ? */
                /* patch the already written LOOP entry with the count field of
                 * the END_LOOP entry just written (read through the elem view) */
                ddt_loop_desc_t* pStartLoop = &(pTypeDesc->desc[pStack->index - 1].loop);
                pStartLoop->items = (pElemDesc - 1)->elem.count;
                total_disp = pStack->disp;  /* update the displacement position */
            }
            pStack--;  /* go down one position on the stack */
            pos_desc++;
            continue;
        }
        if( OPAL_DATATYPE_LOOP == pData->desc.desc[pos_desc].elem.common.type ) {
            ddt_loop_desc_t* loop = (ddt_loop_desc_t*)&(pData->desc.desc[pos_desc]);
            ddt_endloop_desc_t* end_loop = (ddt_endloop_desc_t*)&(pData->desc.desc[pos_desc + loop->items]);
            int index = GET_FIRST_NON_LOOP( &(pData->desc.desc[pos_desc]) );
            OPAL_PTRDIFF_TYPE loop_disp = pData->desc.desc[pos_desc + index].elem.disp;

            /* true when the pending run ends (in bytes) exactly where the
             * first non-loop element of this loop starts */
            continuity = ((last_disp + last_length * (OPAL_PTRDIFF_TYPE)opal_datatype_basicDatatypes[last_type]->size)
                              == (total_disp + loop_disp));
            if( loop->common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
                /* the loop is contiguous or composed by contiguous elements with a gap */
                if( loop->extent == (OPAL_PTRDIFF_TYPE)end_loop->size ) {
                    /* the whole loop is contiguous */
                    if( !continuity ) {
                        /* cannot extend the pending run: emit it and start a
                         * new one at the beginning of the loop */
                        if( 0 != last_length ) {
                            CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC,
                                         last_length, last_disp, last_extent );
                            pElemDesc++; nbElems++;
                            last_length = 0;
                        }
                        last_disp = total_disp + loop_disp;
                    }
                    /* the pending run becomes a byte run covering all iterations */
                    last_length = (last_length * opal_datatype_basicDatatypes[last_type]->size
                                   + loop->loops * end_loop->size);
                    last_type   = OPAL_DATATYPE_UINT1;
                    last_extent = 1;
                } else {
                    int counter = loop->loops;  /* iterations still to be emitted */
                    OPAL_PTRDIFF_TYPE merged_disp = 0;
                    /* if the previous data is contiguous with this piece and it has a length not ZERO */
                    if( last_length != 0 ) {
                        if( continuity ) {
                            /* absorb the first iteration into the pending run,
                             * converted to bytes */
                            last_length *= opal_datatype_basicDatatypes[last_type]->size;
                            last_length += end_loop->size;
                            last_type    = OPAL_DATATYPE_UINT1;
                            last_extent  = 1;
                            counter--;
                            merged_disp = loop->extent;  /* merged loop, update the disp of the remaining elems */
                        }
                        CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC,
                                     last_length, last_disp, last_extent );
                        pElemDesc++; nbElems++;
                        last_disp += last_length;
                        last_length = 0;
                        last_type = OPAL_DATATYPE_LOOP;
                    }
                    /**
                     * The content of the loop is contiguous (maybe with a gap before or after).
                     *
                     * If any of the loops have been merged with the previous element, then the
                     * displacement of the first element (or the displacement of all elements if the
                     * loop will be removed) must be updated accordingly.
                     */
                    if( counter <= 2 ) {
                        /* few iterations left: write one byte element per
                         * iteration instead of a loop */
                        merged_disp += end_loop->first_elem_disp;
                        while( counter > 0 ) {
                            CREATE_ELEM( pElemDesc, OPAL_DATATYPE_UINT1, OPAL_DATATYPE_FLAG_BASIC,
                                         end_loop->size, merged_disp, 1);
                            pElemDesc++; nbElems++; counter--;
                            merged_disp += loop->extent;
                        }
                    } else {
                        /* keep a loop, but with a single byte element as body */
                        CREATE_LOOP_START( pElemDesc, counter, 2, loop->extent, loop->common.flags );
                        pElemDesc++; nbElems++;
                        CREATE_ELEM( pElemDesc, OPAL_DATATYPE_UINT1, OPAL_DATATYPE_FLAG_BASIC,
                                     end_loop->size, loop_disp, 1);
                        pElemDesc++; nbElems++;
                        CREATE_LOOP_END( pElemDesc, 2, end_loop->first_elem_disp + merged_disp,
                                         end_loop->size, end_loop->common.flags );
                        pElemDesc++; nbElems++;
                    }
                }
                /* the whole source loop, END_LOOP included, has been consumed */
                pos_desc += loop->items + 1;
            } else {
                ddt_elem_desc_t* elem = (ddt_elem_desc_t*)&(pData->desc.desc[pos_desc+1]);
                /* a loop that is not flagged contiguous never extends the
                 * pending run: emit the run first */
                if( last_length != 0 ) {
                    CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, last_length, last_disp, last_extent );
                    pElemDesc++; nbElems++;
                    last_disp  += last_length;
                    last_length = 0;
                    last_type   = OPAL_DATATYPE_LOOP;
                }
                if( 2 == loop->items ) { /* small loop */
                    if( (1 == elem->count)
                        && (elem->extent == (OPAL_PTRDIFF_TYPE)opal_datatype_basicDatatypes[elem->common.type]->size) ) {
                        /* one item per iteration: a single element whose count
                         * is the iteration count and whose extent is the loop
                         * extent replaces the loop */
                        CREATE_ELEM( pElemDesc, elem->common.type, elem->common.flags & ~OPAL_DATATYPE_FLAG_CONTIGUOUS,
                                     loop->loops, elem->disp, loop->extent );
                        pElemDesc++; nbElems++;
                        pos_desc += loop->items + 1;
                        goto complete_loop;
                    } else if( loop->loops < 3 ) {
                        /* at most two iterations: unroll them */
                        OPAL_PTRDIFF_TYPE elem_displ = elem->disp;
                        for( i = 0; i < loop->loops; i++ ) {
                            CREATE_ELEM( pElemDesc, elem->common.type, elem->common.flags,
                                         elem->count, elem_displ, elem->extent );
                            elem_displ += loop->extent;
                            pElemDesc++; nbElems++;
                        }
                        pos_desc += loop->items + 1;
                        goto complete_loop;
                    }
                }
                /* general case: copy the LOOP entry and descend into its body;
                 * its item count is patched when the END_LOOP is reached */
                CREATE_LOOP_START( pElemDesc, loop->loops, loop->items, loop->extent, loop->common.flags );
                pElemDesc++; nbElems++;
                PUSH_STACK( pStack, stack_pos, nbElems, OPAL_DATATYPE_LOOP, loop->loops, total_disp );
                pos_desc++;
                DDT_DUMP_STACK( pStack, stack_pos, pData->desc.desc, "advance loops" );
            }
        complete_loop:
            total_disp = pStack->disp;  /* update the displacement */
            continue;
        }
        while( pData->desc.desc[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {  /* keep doing it until we reach a non datatype element */
            /* now here we have a basic datatype */
            type = pData->desc.desc[pos_desc].elem.common.type;
            /* true when this element starts where the pending run ends */
            continuity = ((last_disp + last_length * (OPAL_PTRDIFF_TYPE)opal_datatype_basicDatatypes[last_type]->size)
                          == (total_disp + pData->desc.desc[pos_desc].elem.disp));

            /* merge only elements flagged contiguous, adjacent to the pending
             * run, and whose extent equals the size of their basic type */
            if( (pData->desc.desc[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && continuity &&
                (pData->desc.desc[pos_desc].elem.extent == (int32_t)opal_datatype_basicDatatypes[type]->size) ) {
                if( type == last_type ) {
                    last_length += pData->desc.desc[pos_desc].elem.count;
                    last_extent = pData->desc.desc[pos_desc].elem.extent;
                } else {
                    if( last_length == 0 ) {
                        /* nothing pending: the run simply takes this type */
                        last_type = type;
                        last_length = pData->desc.desc[pos_desc].elem.count;
                        last_extent = pData->desc.desc[pos_desc].elem.extent;
                    } else {
                        /* different types: express the merged run in bytes */
                        last_length = last_length * opal_datatype_basicDatatypes[last_type]->size +
                            pData->desc.desc[pos_desc].elem.count * opal_datatype_basicDatatypes[type]->size;
                        last_type = OPAL_DATATYPE_UINT1;
                        last_extent = 1;
                    }
                }
                last_flags &= pData->desc.desc[pos_desc].elem.common.flags;
            } else {
                /* not mergeable: emit the pending run and start a new one
                 * with this element */
                if( last_length != 0 ) {
                    CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, last_length, last_disp, last_extent );
                    pElemDesc++; nbElems++;
                }
                last_disp = total_disp + pData->desc.desc[pos_desc].elem.disp;
                last_length = pData->desc.desc[pos_desc].elem.count;
                last_extent = pData->desc.desc[pos_desc].elem.extent;
                last_type = type;
            }
            pos_desc++;  /* advance to the next data */
        }
    }

    /* NOTE(review): the walk only ends in the END_LOOP branch, which emits the
     * pending run and zeroes last_length, so this looks unreachable - confirm */
    if( last_length != 0 ) {
        CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, last_length, last_disp, last_extent );
        pElemDesc++; nbElems++;
    }
    /* cleanup the stack */
    pTypeDesc->used = nbElems - 1;  /* except the last fake END_LOOP */
    return OPAL_SUCCESS;
}
/*
 * Make sure the element description of pdtBase has room for `needed` more
 * entries, growing it to the next multiple of DT_INCREASE_STACK when it does
 * not.
 *
 * The array is reallocated through a temporary pointer, so a failed realloc()
 * neither leaks the existing description nor replaces it with NULL.
 *
 * Returns OPAL_SUCCESS, or OPAL_ERR_OUT_OF_RESOURCE (pdtBase left unchanged)
 * when the allocation fails.
 */
static int32_t opal_datatype_add_grow_desc( opal_datatype_t* pdtBase, uint32_t needed )
{
    uint32_t newLength = pdtBase->desc.used + needed;
    dt_elem_desc_t* grown;

    if( newLength <= pdtBase->desc.length ) {
        return OPAL_SUCCESS;  /* enough room already */
    }
    newLength = ((newLength / DT_INCREASE_STACK) + 1 ) * DT_INCREASE_STACK;
    grown = (dt_elem_desc_t*)realloc( pdtBase->desc.desc,
                                      sizeof(dt_elem_desc_t) * newLength );
    if( NULL == grown ) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    pdtBase->desc.desc   = grown;
    pdtBase->desc.length = newLength;
    return OPAL_SUCCESS;
}

/*
 * Append `count` repetitions of the datatype pdtAdd to pdtBase.
 *
 * Three groups of fields of pdtBase are updated: the bounds and flags
 * (lb, ub, true_lb, true_ub, align, size, flags), the per-type bookkeeping
 * (bdt_used, btypes, nbElems) and the element description (desc).
 *
 * pdtBase  datatype being extended.
 * pdtAdd   datatype to append; OPAL_DATATYPE_LB and OPAL_DATATYPE_UB are
 *          markers that only move the bounds of pdtBase.
 * count    number of repetitions. Zero is accepted: bounds, alignment and
 *          flags are still updated but no data is appended.
 * disp     displacement applied to the appended elements.
 * extent   extent used for the repetitions; -1 selects the default
 *          pdtAdd->ub - pdtAdd->lb.
 *
 * Returns OPAL_SUCCESS, or OPAL_ERR_OUT_OF_RESOURCE when the description
 * cannot be grown; in that case pdtBase is left untouched.
 */
int32_t opal_datatype_add( opal_datatype_t* pdtBase, const opal_datatype_t* pdtAdd,
                           uint32_t count, OPAL_PTRDIFF_TYPE disp, OPAL_PTRDIFF_TYPE extent )
{
    uint32_t place_needed = 0, i;
    short localFlags = 0;  /* no specific options yet */
    dt_elem_desc_t *pLast, *pLoop = NULL;
    OPAL_PTRDIFF_TYPE lb, ub, true_lb, true_ub, epsilon, old_true_ub;
    int32_t rc;
    int has_data;  /* non-zero when elements are really appended */

    /* The extent should always be positive, so -1 has a special meaning:
     * use the default extent, computed as ub - lb.
     */
    if( extent == -1 ) extent = (pdtAdd->ub - pdtAdd->lb);

    /* Deal with the special markers (OPAL_DATATYPE_LB and OPAL_DATATYPE_UB) */
    if( OPAL_DATATYPE_LB == pdtAdd->id ) {
        pdtBase->bdt_used |= (((uint32_t)1) << OPAL_DATATYPE_LB);
        if( pdtBase->flags & OPAL_DATATYPE_FLAG_USER_LB ) {
            pdtBase->lb = LMIN( pdtBase->lb, disp );
        } else {
            pdtBase->lb = disp;
            pdtBase->flags |= OPAL_DATATYPE_FLAG_USER_LB;
        }
        if( (pdtBase->ub - pdtBase->lb) != (OPAL_PTRDIFF_TYPE)pdtBase->size ) {
            pdtBase->flags &= ~OPAL_DATATYPE_FLAG_NO_GAPS;
        }
        return OPAL_SUCCESS;  /* a marker adds no entry to the description */
    } else if( OPAL_DATATYPE_UB == pdtAdd->id ) {
        pdtBase->bdt_used |= (((uint32_t)1) << OPAL_DATATYPE_UB);
        if( pdtBase->flags & OPAL_DATATYPE_FLAG_USER_UB ) {
            pdtBase->ub = LMAX( pdtBase->ub, disp );
        } else {
            pdtBase->ub = disp;
            pdtBase->flags |= OPAL_DATATYPE_FLAG_USER_UB;
        }
        if( (pdtBase->ub - pdtBase->lb) != (OPAL_PTRDIFF_TYPE)pdtBase->size ) {
            pdtBase->flags &= ~OPAL_DATATYPE_FLAG_NO_GAPS;
        }
        return OPAL_SUCCESS;  /* a marker adds no entry to the description */
    }

    /* Compute the number of entries we need in the datatype description */
    OPAL_DATATYPE_COMPUTE_REQUIRED_ENTRIES( pdtAdd, count, extent, place_needed );

    /* Reserve the room for the new entries before touching any field of
     * pdtBase, so that an allocation failure leaves the datatype exactly as
     * it was. Nothing is reserved when no element will be appended.
     */
    has_data = (0 != count) && (0 != pdtAdd->size);
    if( has_data ) {
        rc = opal_datatype_add_grow_desc( pdtBase, place_needed );
        if( OPAL_SUCCESS != rc ) {
            return rc;
        }
    }

    /*
     * Compute the lower and upper bound of the datatype. We do it in 2 steps.
     * First compute the lb and ub of the new datatype taking in account the
     * count. Then update the lb value depending on the user markers and
     * update the global lb and ub.
     */
    OPAL_DATATYPE_LB_UB_CONT( count, disp, pdtAdd->lb, pdtAdd->ub, extent, lb, ub );

    /* Compute the true_lb and true_ub for the datatype to be added, taking
     * in account the number of repetitions. These values do not include the
     * potential gaps at the beginning and at the end of the datatype.
     */
    true_lb = lb - (pdtAdd->lb - pdtAdd->true_lb);
    true_ub = ub - (pdtAdd->ub - pdtAdd->true_ub);
    if( true_lb > true_ub ) {
        /* keep them ordered; old_true_ub is only a scratch value here */
        old_true_ub = true_lb;
        true_lb = true_ub;
        true_ub = old_true_ub;
    }

    /* The lower bound should be inherited from the parent if and only
     * if the USER has explicitly set it. The resulting lb is the MIN of
     * both lower bounds if and only if both or neither of the datatypes
     * carry the USER_LB flag.
     */
    if( (pdtAdd->flags ^ pdtBase->flags) & OPAL_DATATYPE_FLAG_USER_LB ) {
        if( pdtBase->flags & OPAL_DATATYPE_FLAG_USER_LB ) {
            lb = pdtBase->lb;  /* base type has a user provided lb */
        }
        pdtBase->flags |= OPAL_DATATYPE_FLAG_USER_LB;
    } else {
        /* both of them have the LB flag or both of them dont have it */
        lb = LMIN( pdtBase->lb, lb );
    }

    /* The same applies to the upper bound: when exactly one of the two
     * datatypes carries the USER_UB flag, the user provided value wins.
     */
    if( (pdtBase->flags ^ pdtAdd->flags) & OPAL_DATATYPE_FLAG_USER_UB ) {
        if( pdtBase->flags & OPAL_DATATYPE_FLAG_USER_UB ) {
            ub = pdtBase->ub;
        }
        pdtBase->flags |= OPAL_DATATYPE_FLAG_USER_UB;
    } else {
        /* both of them have the UB flag or both of them dont have it */
        ub = LMAX( pdtBase->ub, ub );
    }
    /* While the true_lb and true_ub have to be ordered to have the true_lb lower
     * than the true_ub, the ub and lb do not have to be ordered. They should be
     * as the user defined them.
     */
    pdtBase->lb = lb;
    pdtBase->ub = ub;

    /* compute the new memory alignment */
    pdtBase->align = IMAX( pdtBase->align, pdtAdd->align );

    /* Now that we have the new ub and the alignment we should update the ub to
     * match the new alignment: add an epsilon that is the least nonnegative
     * increment needed to round the extent up to the next multiple of the
     * alignment. This is done only when there is NO user specified upper bound.
     */
    if( !(pdtBase->flags & OPAL_DATATYPE_FLAG_USER_UB) ) {
        epsilon = (pdtBase->ub - pdtBase->lb) % pdtBase->align;
        if( 0 != epsilon ) {
            pdtBase->ub += (pdtBase->align - epsilon);
        }
    }
    /* now we know it contain some data */
    pdtBase->flags |= OPAL_DATATYPE_FLAG_DATA;

    /*
     * the count == 0 is LEGAL only for MPI_UB and MPI_LB. Therefore we support it
     * here in the upper part of this function. As an extension, the count set to
     * zero can be used to reset the alignment of the data, but not for changing
     * the true_lb and true_ub.
     */
    if( !has_data ) {
        return OPAL_SUCCESS;
    }

    /* Now, once we know everything is fine and there are some bytes in
     * the data-type we can update the size, true_lb and true_ub.
     */
    pdtBase->size += count * pdtAdd->size;
    /* remember where the data of pdtBase ended before this addition; it is
     * compared with the start of the added data in the contiguity test below */
    if( 0 == pdtBase->nbElems ) old_true_ub = disp;
    else                        old_true_ub = pdtBase->true_ub;
    if( 0 != pdtBase->size ) {
        pdtBase->true_lb = LMIN( true_lb, pdtBase->true_lb );
        pdtBase->true_ub = LMAX( true_ub, pdtBase->true_ub );
    } else {
        pdtBase->true_lb = true_lb;
        pdtBase->true_ub = true_ub;
    }

    pdtBase->bdt_used |= pdtAdd->bdt_used;
    /* room for place_needed entries has been reserved above */
    pLast = &(pdtBase->desc.desc[pdtBase->desc.used]);
    /* The condition to be able to use the optimized path here is to be in presence
     * of an predefined contiguous datatype. This part is unable to handle any
     * predefined non contiguous datatypes (like MPI_SHORT_INT).
     */
    if( (pdtAdd->flags & (OPAL_DATATYPE_FLAG_PREDEFINED | OPAL_DATATYPE_FLAG_DATA)) == (OPAL_DATATYPE_FLAG_PREDEFINED | OPAL_DATATYPE_FLAG_DATA) ) {
        pdtBase->btypes[pdtAdd->id] += count;
        if( (extent != (OPAL_PTRDIFF_TYPE)pdtAdd->size) && (count > 1) ) {  /* gaps around the datatype */
            /* LOOP / single element / END_LOOP: three entries */
            localFlags = pdtAdd->flags & ~(OPAL_DATATYPE_FLAG_COMMITED | OPAL_DATATYPE_FLAG_CONTIGUOUS | OPAL_DATATYPE_FLAG_NO_GAPS);
            CREATE_LOOP_START( pLast, count, 2, extent, localFlags );
            pLast++;
            pLast->elem.common.type  = pdtAdd->id;
            pLast->elem.count        = 1;
            pLast->elem.disp         = disp;
            pLast->elem.extent       = pdtAdd->size;
            pLast->elem.common.flags = localFlags | OPAL_DATATYPE_FLAG_CONTIGUOUS;
            pLast++;
            CREATE_LOOP_END( pLast, 2, disp, pdtAdd->size, localFlags );
            pdtBase->desc.used += 3;
            /* NOTE(review): these two assignments overwrite any loop count
             * already accumulated in pdtBase instead of incrementing it,
             * unlike the "+=" used in the other branch - confirm intended */
            pdtBase->btypes[OPAL_DATATYPE_LOOP]     = 1;
            pdtBase->btypes[OPAL_DATATYPE_END_LOOP] = 1;
        } else {
            pLast->elem.common.type = pdtAdd->id;
            pLast->elem.count       = count;
            pLast->elem.disp        = disp;
            pLast->elem.extent      = extent;
            pdtBase->desc.used++;
            pLast->elem.common.flags  = pdtAdd->flags & ~(OPAL_DATATYPE_FLAG_COMMITED);
        }
    } else {
        /* keep trace of the total number of basic datatypes in the datatype definition */
        pdtBase->btypes[OPAL_DATATYPE_LOOP]     += pdtAdd->btypes[OPAL_DATATYPE_LOOP];
        pdtBase->btypes[OPAL_DATATYPE_END_LOOP] += pdtAdd->btypes[OPAL_DATATYPE_END_LOOP];
        pdtBase->btypes[OPAL_DATATYPE_LB]       |= pdtAdd->btypes[OPAL_DATATYPE_LB];
        pdtBase->btypes[OPAL_DATATYPE_UB]       |= pdtAdd->btypes[OPAL_DATATYPE_UB];
        for( i = 4; i < OPAL_DATATYPE_MAX_PREDEFINED; i++ )
            if( pdtAdd->btypes[i] != 0 ) pdtBase->btypes[i] += (count * pdtAdd->btypes[i]);

        if( (1 == pdtAdd->desc.used) && (extent == (pdtAdd->ub - pdtAdd->lb)) &&
            (extent == pdtAdd->desc.desc[0].elem.extent) ) {
            /* single element description with matching extents: the
             * repetitions are folded into the element count */
            pLast->elem        = pdtAdd->desc.desc[0].elem;
            pLast->elem.count *= count;
            pLast->elem.disp  += disp;
            pdtBase->desc.used++;
        } else {
            /* More than one repetition: wrap the copied description between
             * a LOOP and an END_LOOP entry.
             */
            if( count != 1 ) {
                pLoop = pLast;
                CREATE_LOOP_START( pLast, count, pdtAdd->desc.used + 1, extent,
                                   (pdtAdd->flags & ~(OPAL_DATATYPE_FLAG_COMMITED)) );
                pdtBase->btypes[OPAL_DATATYPE_LOOP] += 2;
                pdtBase->desc.used += 2;
                pLast++;
            }

            /* copy the description of pdtAdd, shifting every displacement */
            for( i = 0; i < pdtAdd->desc.used; i++ ) {
                pLast->elem               = pdtAdd->desc.desc[i].elem;
                if( OPAL_DATATYPE_FLAG_DATA & pLast->elem.common.flags )
                    pLast->elem.disp += disp;
                else if( OPAL_DATATYPE_END_LOOP == pLast->elem.common.type ) {
                    pLast->end_loop.first_elem_disp += disp;
                }
                pLast++;
            }
            pdtBase->desc.used += pdtAdd->desc.used;
            if( pLoop != NULL ) {
                int index = GET_FIRST_NON_LOOP( pLoop );
                assert( pLoop[index].elem.common.flags & OPAL_DATATYPE_FLAG_DATA );
                CREATE_LOOP_END( pLast, pdtAdd->desc.used + 1, pLoop[index].elem.disp,
                                 pdtAdd->size, pLoop->loop.common.flags );
            }
        }
        /* should I add some space until the extent of this datatype ? */
    }

    /* Is the data still contiguous ?
     * The only way for the data to be contiguous is to have the true extent
     * equal to its size. In other words to avoid having internal gaps between
     * elements. If any of the data are overlapping then this method will not work.
     */
    localFlags = pdtBase->flags & pdtAdd->flags;
    UNSET_CONTIGUOUS_FLAG(pdtBase->flags);
    if( (localFlags & OPAL_DATATYPE_FLAG_CONTIGUOUS)          /* both types were contiguous */
        && ((disp + pdtAdd->true_lb) == old_true_ub)          /* and there is no gap between them */
        && ( ((OPAL_PTRDIFF_TYPE)pdtAdd->size == extent)      /* and the added type has size == extent */
             || (count < 2)) ) {                              /* or it is repeated at most once */
        SET_CONTIGUOUS_FLAG(pdtBase->flags);
        if( (OPAL_PTRDIFF_TYPE)pdtBase->size == (pdtBase->ub - pdtBase->lb) )
            SET_NO_GAP_FLAG(pdtBase->flags);
    }

    /* If the NO_GAP flag is set the contiguous have to be set too */
    if( pdtBase->flags & OPAL_DATATYPE_FLAG_NO_GAPS ) {
        assert( pdtBase->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS );
    }
    pdtBase->nbElems += (count * pdtAdd->nbElems);

    return OPAL_SUCCESS;
}
예제 #3
0
/* we have 3 differents structures to update:
 * the first is the real representation of the datatype
 * the second is the internal representation using extents
 * the last is the representation used for send operations
 * If the count is ZERO we dont have to add the pdtAdd datatype. But we have to
 * be sure that the pdtBase datatype is correctly initialized with all fields
 * set to ZERO if it's a empty datatype.
 */
int32_t ompi_ddt_add( ompi_datatype_t* pdtBase, const ompi_datatype_t* pdtAdd,
                      uint32_t count, ptrdiff_t disp, ptrdiff_t extent )
{
    uint32_t newLength, place_needed = 0, i;
    short localFlags = 0;  /* no specific options yet */
    dt_elem_desc_t *pLast, *pLoop = NULL;
    ptrdiff_t lb, ub, true_lb, true_ub, epsilon, old_true_ub;

    /* the extent should always be positive. So a negative
     * value here have a special meaning ie. default extent as
     * computed by ub - lb
     */
    if( extent == -1 ) extent = (pdtAdd->ub - pdtAdd->lb);

    if( pdtAdd->flags & DT_FLAG_PREDEFINED ) { /* add a basic datatype */
        /* handle special cases for DT_LB and DT_UB */
        if( pdtAdd == ompi_ddt_basicDatatypes[DT_LB] ) {
            pdtBase->bdt_used |= (((uint64_t)1) << DT_LB);
            if( pdtBase->flags & DT_FLAG_USER_LB ) {
                pdtBase->lb = LMIN( pdtBase->lb, disp );
            } else {
                pdtBase->lb = disp;
                pdtBase->flags |= DT_FLAG_USER_LB;
            }
            if( (pdtBase->ub - pdtBase->lb) != (ptrdiff_t)pdtBase->size ) {
                pdtBase->flags &= ~DT_FLAG_NO_GAPS;
            }
            return OMPI_SUCCESS;
        } else if( pdtAdd == ompi_ddt_basicDatatypes[DT_UB] ) {
            pdtBase->bdt_used |= (((uint64_t)1) << DT_UB);
            if( pdtBase->flags & DT_FLAG_USER_UB ) {
                pdtBase->ub = LMAX( pdtBase->ub, disp );
            } else {
                pdtBase->ub = disp;
                pdtBase->flags |= DT_FLAG_USER_UB;
            }
            if( (pdtBase->ub - pdtBase->lb) != (ptrdiff_t)pdtBase->size ) {
                pdtBase->flags &= ~DT_FLAG_NO_GAPS;
            }
            return OMPI_SUCCESS;
        }
        place_needed = (extent == (ptrdiff_t)pdtAdd->size ? 1 : 3);
    } else {
        place_needed = pdtAdd->desc.used;
        if( count != 1 ) place_needed += 2;  /* for the loop markers */
    }

    /*
     * Compute the lower and upper bound of the datatype. We do it in 2 steps.
     * First compute the lb and ub of the new datatype taking in account the
     * count. Then update the lb value depending on the user markers and
     * update the global lb and ub.
     */
    OMPI_DDT_LB_UB_CONT( count, disp, pdtAdd->lb, pdtAdd->ub, extent, lb, ub );
    /* The true_lb and true_ub take in account the gaps at the begining and the
     * end of the datatype independing on the number of repetitions of the datatype.
     */
    true_lb = lb - (pdtAdd->lb - pdtAdd->true_lb);
    true_ub = ub - (pdtAdd->ub - pdtAdd->true_ub);
    if( true_lb > true_ub ) {
        old_true_ub = true_lb;
        true_lb = true_ub;
        true_ub = old_true_ub;
    }

    /* the lower bound should be inherited from the parent if and only
     * if the USER has explicitly set it. The result lb is the MIN between
     * the all lb + disp if and only if all or nobody flags's contain the LB.
     */
    if( (pdtAdd->flags ^ pdtBase->flags) & DT_FLAG_USER_LB ) {
        if( pdtBase->flags & DT_FLAG_USER_LB ) {
            lb = pdtBase->lb;  /* base type has a user provided lb */
        }
        pdtBase->flags |= DT_FLAG_USER_LB;
    } else {
        /* both of them have the LB flag or both of them dont have it */
        lb = LMIN( pdtBase->lb, lb );
    }

    /* the same apply for the upper bound except for the case where
     * either of them has the flag UB, in which case we should
     * compute the UB including the natural alignement of the data.
     */
    if( (pdtBase->flags ^ pdtAdd->flags) & DT_FLAG_USER_UB ) {
        if( pdtBase->flags & DT_FLAG_USER_UB ) {
            ub = pdtBase->ub;
        }
        pdtBase->flags |= DT_FLAG_USER_UB;
    } else {
        /* both of them have the UB flag or both of them dont have it */
        /* we should compute the extent depending on the alignement */
        ub = LMAX( pdtBase->ub, ub );
    }
    /* While the true_lb and true_ub have to be ordered to have the true_lb lower
     * than the true_ub, the ub and lb does not have to be ordered. They should be
     * as the user define them.
     */
    pdtBase->lb = lb;
    pdtBase->ub = ub;

    if( 0 == pdtBase->nbElems ) old_true_ub = disp;
    else                        old_true_ub = pdtBase->true_ub;
    pdtBase->true_lb = LMIN( true_lb, pdtBase->true_lb );
    pdtBase->true_ub = LMAX( true_ub, pdtBase->true_ub );

    /* compute the new memory alignement */
    pdtBase->align = IMAX( pdtBase->align, pdtAdd->align );
    pdtBase->size += count * pdtAdd->size;

    /* Now that we have the new ub and the alignment we should update the ub to match
     * the new alignement. We have to add an epsilon that is the least nonnegative increment
     * needed to roung the extent to the next multiple of the alignment. This rule
     * apply only if there is user specified upper bound as stated in the MPI
     * standard MPI 1.2 page 71.
     */
    if( !(pdtBase->flags & DT_FLAG_USER_UB) ) {
        epsilon = (pdtBase->ub - pdtBase->lb) % pdtBase->align;
        if( 0 != epsilon ) {
            pdtBase->ub += (pdtBase->align - epsilon);
        }
    }

    /*
     * the count == 0 is LEGAL only for MPI_UB and MPI_LB. I accept it just as a nice way to set
     * the soft UB for a data (without using a real UB marker). This approach can be used to
     * create the subarray and darray datatype. However from the MPI level this function
     * should never be called directly with a count set to 0.
     * Adding a data-type with a size zero is legal but does not have to go through all the
     * stuff below.
     */
    if( (0 == count) || (0 == pdtAdd->size) ) {
        return OMPI_SUCCESS;
    }

    pdtBase->bdt_used |= pdtAdd->bdt_used;
    newLength = pdtBase->desc.used + place_needed;
    if( newLength > pdtBase->desc.length ) {
        newLength = ((newLength / DT_INCREASE_STACK) + 1 ) * DT_INCREASE_STACK;
        pdtBase->desc.desc   = (dt_elem_desc_t*)realloc( pdtBase->desc.desc,
                                                         sizeof(dt_elem_desc_t) * newLength );
        pdtBase->desc.length = newLength;
    }
    pLast = &(pdtBase->desc.desc[pdtBase->desc.used]);
    /* The condition to be able to use the optimized path here is to be in presence
     * of an predefined contiguous datatype. This part is unable to handle any
     * predefined non contiguous datatypes (like MPI_SHORT_INT).
     */
    if( (pdtAdd->flags & (DT_FLAG_PREDEFINED | DT_FLAG_DATA)) == (DT_FLAG_PREDEFINED | DT_FLAG_DATA) ) {
        pdtBase->btypes[pdtAdd->id] += count;
        if( (extent != (ptrdiff_t)pdtAdd->size) && (count > 1) ) {  /* gaps around the datatype */
            localFlags = pdtAdd->flags & ~(DT_FLAG_COMMITED | DT_FLAG_CONTIGUOUS | DT_FLAG_NO_GAPS);
            CREATE_LOOP_START( pLast, count, 2, extent, localFlags );
            pLast++;
            pLast->elem.common.type  = pdtAdd->id;
            pLast->elem.count        = 1;
            pLast->elem.disp         = disp;
            pLast->elem.extent       = pdtAdd->size;
            pLast->elem.common.flags = localFlags | DT_FLAG_CONTIGUOUS;
            pLast++;
            CREATE_LOOP_END( pLast, 2, disp, pdtAdd->size, localFlags );
            pdtBase->desc.used += 3;
            pdtBase->btypes[DT_LOOP]     = 1;
            pdtBase->btypes[DT_END_LOOP] = 1;
        } else {
            pLast->elem.common.type = pdtAdd->id;
            pLast->elem.count       = count;
            pLast->elem.disp        = disp;
            pLast->elem.extent      = extent;
            pdtBase->desc.used++;
            pLast->elem.common.flags  = pdtAdd->flags & ~(DT_FLAG_COMMITED);
        }
    } else {
        /* keep track of the total number of basic datatypes in the datatype definition */
        pdtBase->btypes[DT_LOOP]     += pdtAdd->btypes[DT_LOOP];
        pdtBase->btypes[DT_END_LOOP] += pdtAdd->btypes[DT_END_LOOP];
        pdtBase->btypes[DT_LB]       |= pdtAdd->btypes[DT_LB];
        pdtBase->btypes[DT_UB]       |= pdtAdd->btypes[DT_UB];
        for( i = 4; i < DT_MAX_PREDEFINED; i++ )
            if( pdtAdd->btypes[i] != 0 ) pdtBase->btypes[i] += (count * pdtAdd->btypes[i]);

        if( (1 == pdtAdd->desc.used) && (extent == (pdtAdd->ub - pdtAdd->lb)) &&
            (extent == pdtAdd->desc.desc[0].elem.extent) ){
            pLast->elem        = pdtAdd->desc.desc[0].elem;
            pLast->elem.count *= count;
            pLast->elem.disp  += disp;
            pdtBase->desc.used++;
        } else {
            /* if the extent of the datatype is the same as the extent of the loop
             * description of the datatype then we simply have to update the main loop.
             */
            if( count != 1 ) {
                pLoop = pLast;
                CREATE_LOOP_START( pLast, count, pdtAdd->desc.used + 1, extent,
                                   (pdtAdd->flags & ~(DT_FLAG_COMMITED)) );
                pdtBase->btypes[DT_LOOP] += 2;
                pdtBase->desc.used += 2;
                pLast++;
            }

            for( i = 0; i < pdtAdd->desc.used; i++ ) {
                pLast->elem               = pdtAdd->desc.desc[i].elem;
                if( DT_FLAG_DATA & pLast->elem.common.flags )
                    pLast->elem.disp += disp;
                else if( DT_END_LOOP == pLast->elem.common.type ) {
                    pLast->end_loop.first_elem_disp += disp;
                }
                pLast++;
            }
            pdtBase->desc.used += pdtAdd->desc.used;
            if( pLoop != NULL ) {
                int index = GET_FIRST_NON_LOOP( pLoop );
                assert( pLoop[index].elem.common.flags & DT_FLAG_DATA );
                CREATE_LOOP_END( pLast, pdtAdd->desc.used + 1, pLoop[index].elem.disp,
                                 pdtAdd->size, pLoop->loop.common.flags );
            }
        }
        /* should I add some space until the extent of this datatype ? */
    }

    /* Is the data still contiguous?
     * The only way for the data to be contiguous is to have the true extent
     * equal to its size, in other words to have no internal gaps between
     * elements. If any of the data overlap then this method will not work.
     */
    localFlags = pdtBase->flags & pdtAdd->flags;
    UNSET_CONTIGUOUS_FLAG(pdtBase->flags);
    if( disp != old_true_ub ) { /* is there a gap between the 2 datatypes ? */
        if( disp < old_true_ub ) pdtBase->flags |= DT_FLAG_OVERLAP;
    } else {
        if( (localFlags & DT_FLAG_CONTIGUOUS)        /* both have to be contiguous */
            && ( ((ptrdiff_t)pdtAdd->size == extent) /* the size and the extent of the
                                                      * added type have to match */
                 || (count < 2)) ) {                 /* if the count is bigger than 2 */
            SET_CONTIGUOUS_FLAG(pdtBase->flags);
            if( (ptrdiff_t)pdtBase->size == (pdtBase->ub - pdtBase->lb) )
                SET_NO_GAP_FLAG(pdtBase->flags);
        }
    }
    /* If the NO_GAPS flag is set, the CONTIGUOUS flag has to be set too */
    if( pdtBase->flags & DT_FLAG_NO_GAPS ) {
        assert( pdtBase->flags & DT_FLAG_CONTIGUOUS );
    }
    pdtBase->nbElems += (count * pdtAdd->nbElems);

    return OMPI_SUCCESS;
}