Пример #1
0
/**
 * Expand a large one-dimensional reduction into a two-step reduction.
 *
 * The instruction at bhir.instr_list[pc] is turned into a BH_NONE and four
 * instructions are injected after it:
 *   1. the original opcode applied to the input viewed as a 2-D array of
 *      shape (fold, elements/fold), writing into a temporary,
 *   2. the original opcode applied to that temporary, writing into the
 *      original output,
 *   3. BH_FREE of the temporary,
 *   4. BH_DISCARD of the temporary.
 *
 * @param bhir          Instruction list to rewrite in place.
 * @param pc            Index of the reduction instruction to expand.
 * @param thread_limit  Threshold used both to decide whether expansion is
 *                      worthwhile and as the second argument to find_fold().
 * @return Number of instructions injected (4), or 0 when nothing was changed
 *         because the input is too small or no fold factor >= 2 was found.
 */
int Expander::expand_reduce1d(bh_ir& bhir, int pc, int thread_limit)
{
    // Cache of fold factors. The key holds both arguments handed to
    // find_fold(): keying on the element count alone would return a stale
    // fold when thread_limit differs between calls, and keying on a plain
    // int would narrow bh_index.
    typedef std::pair<bh_index, int> fold_key;
    typedef std::map<fold_key, int>  fold_cache;
    static fold_cache fold_map;

    int start_pc = pc;
    bh_instruction& instr = bhir.instr_list[pc];        // The reduction instruction being expanded

    bh_index elements = bh_nelements(instr.operand[1]);

    // Too few elements relative to the thread limit: leave the instruction alone.
    if (elements * 2 < thread_limit)
        return 0;

    const fold_key key(elements, thread_limit);
    int fold = 0;
    fold_cache::iterator cached = fold_map.find(key);   // single lookup
    if (cached != fold_map.end())
    {
        fold = cached->second;
    } else {
        fold = find_fold(elements, thread_limit);
        fold_map[key] = fold;
    }
    if (fold < 2)
        return 0;

    // Everything needed from `instr` is copied out before the first inject();
    // NOTE(review): inject() presumably inserts into bhir.instr_list, which
    // could invalidate the `instr` reference - confirm before using it later.
    bh_opcode opcode = instr.opcode;
    instr.opcode = BH_NONE;             // Lazy choice... no re-use just NOP it.
    bh_view out = instr.operand[0];     // Grab operands
    bh_view in = instr.operand[1];

    // Re-view the input as a 2-D array of shape (fold, elements/fold):
    // the inner axis keeps the original element stride, the outer axis
    // steps over one whole row.
    in.ndim = 2;
    in.shape[0] = fold;
    in.shape[1] = elements/fold;
    in.stride[1] = in.stride[0];
    in.stride[0] = in.stride[0]*elements/fold;

    // Temporary holding elements/fold partial results.
    bh_view temp = make_temp(in.base->type, elements/fold);
    inject(bhir, ++pc, opcode, temp, in, 0, BH_INT64);      // 2-D input -> temp (constant 0 is presumably the axis)
    inject(bhir, ++pc, opcode, out, temp, 0, BH_INT64);     // temp -> original output
    inject(bhir, ++pc, BH_FREE, temp);
    inject(bhir, ++pc, BH_DISCARD, temp);

    return pc-start_pc;
}
/* Main control of the folding mapping.
        1. This routine only folds the 3 true dimensions. T dimension (if in virtual node mode)
           is handled specifically in the caller of this routine.
        2. finished = perm_next( ndims, perm_array[ndims] )
           gets the next permutation. It returns 1 when there is no next permutation.
           For ndims = 3, the permutation sequence is
                0,1,2 --> 0,2,1 --> 1,0,2 --> 1,2,0 --> 2,0,1 --> 2,1,0 --> finished.
        3. fail = find_fold( dims1[ndims1], dims2[ndims2], fold[3][3] )
           searches for a folding schedule; the schedule is stored in the matrix fold[3][3],
           e.g. fold[i][j] = 3 indicates to unfold dimension i onto dimension j.
           fold[i][i] has no meaning.
           For 3D case as here, there will be at most 2 non-zero, non-diagonal entries.
           Diagonal entries are useless here.
           Furthermore, when the 2 non-zero entries are in the same row, the virtual cartesian is
           unfolded from the row_id dimension onto the other dimensions in physical cartesian.
           When the 2 entries are in the same column, the virtual cartesian is actually
           folded from the physical cartesian.
        4. perform_fold( vir_coord[], phy_coord[], fold[3][3] )
           does the folding following the schedule given by fold[3][3].
 */
/* Try every ordering of d2 until find_fold() accepts one, then apply the fold.
 *
 * Each attempt builds a copy of d2 reordered by the current permutation and
 * hands it to find_fold(). On the first success perform_fold() is called with
 * the original d2, the successful permutation and the schedule that
 * find_fold() filled in. The first three entries of d2 are always read,
 * whatever the value of nd2.
 *
 * Returns 0 when a fold was found and applied, 1 when perm_next() reports
 * that no permutation is left and none was accepted.
 * NOTE(review): c1/c2 are only forwarded to perform_fold(); presumably they
 * are the coordinate arrays matching d1/d2 - confirm against the callers.
 */
static int perm_dims_match( int nd1, int d1[], int c1[], int nd2, int d2[], int c2[] )
{
    int order[3] = {0, 1, 2};                                 /* current permutation of d2 */
    int schedule[3][3] = {{0,0,0}, {0,0,0}, {0,0,0}};         /* filled in by find_fold() */
    int permuted[3];
    int status;
    int exhausted;
    int k;

    do
    {
        /* Reorder the second dimension list according to the permutation. */
        for (k = 0; k < 3; k++)
            permuted[k] = d2[order[k]];

        status = find_fold( nd1, d1, nd2, permuted, schedule );
        if (status == 0)
        {
            /* A schedule exists for this ordering: apply it and report success. */
            perform_fold( nd1, d1, c1, nd2, d2, c2, order, schedule );
            return 0;
        }

        /* Advance to the next ordering; non-zero means there is none left. */
        exhausted = perm_next( nd2, order );
    } while ( !exhausted );

    return 1;
}