/**
 * Expand a one-dimensional reduction into two chained reductions.
 *
 * The input view of the instruction at `pc` is re-shaped into a 2D view of
 * shape (fold, elements/fold). The original opcode is then injected twice:
 * first from the 2D view into a temporary, then from the temporary into the
 * original output. The original instruction is overwritten with BH_NONE, and
 * BH_FREE / BH_DISCARD are injected for the temporary.
 *
 * @param bhir          The IR whose instruction list is modified in place.
 * @param pc            Index of the reduction instruction in bhir.instr_list.
 * @param thread_limit  Threshold forwarded to find_fold; no expansion happens
 *                      when twice the element count is below it.
 * @return 0 when nothing was expanded (too few elements, or fold < 2);
 *         otherwise how far `pc` advanced, one step per injected instruction.
 */
int Expander::expand_reduce1d(bh_ir& bhir, int pc, int thread_limit)
{
    // Memoised find_fold results. The key holds BOTH arguments of find_fold:
    // keying on the element count alone would hand back a stale fold when the
    // same size is later seen with a different thread_limit, and storing the
    // count as plain int would narrow bh_index.
    typedef std::pair<bh_index, int> fold_key;
    static std::map<fold_key, int> fold_map;

    const int start_pc = pc;
    bh_instruction& instr = bhir.instr_list[pc];    // The reduction instruction to expand
    const bh_index elements = bh_nelements(instr.operand[1]);
    if (elements * 2 < thread_limit) {
        return 0;
    }

    // Single lookup; compute and remember the fold only on a miss.
    const fold_key key(elements, thread_limit);
    int fold = 0;
    std::map<fold_key, int>::const_iterator cached = fold_map.find(key);
    if (cached != fold_map.end()) {
        fold = cached->second;
    } else {
        fold = find_fold(elements, thread_limit);
        fold_map[key] = fold;
    }
    if (fold < 2) {
        return 0;
    }

    // NOTE: `instr` refers into bhir.instr_list; everything needed from it is
    // copied out below before any inject() call touches the list.
    const bh_opcode opcode = instr.opcode;
    instr.opcode = BH_NONE;             // Lazy choice... no re-use just NOP it.
    bh_view out = instr.operand[0];     // Grab operands
    bh_view in  = instr.operand[1];

    // Reinterpret the 1D input as (fold, elements/fold): the inner dimension
    // keeps the original stride, the outer one steps over a whole inner row.
    in.ndim      = 2;
    in.shape[0]  = fold;
    in.shape[1]  = elements/fold;
    in.stride[1] = in.stride[0];
    in.stride[0] = in.stride[0]*elements/fold;

    bh_view temp = make_temp(in.base->type, elements/fold);

    // The trailing (0, BH_INT64) is presumably the constant reduction axis --
    // TODO confirm against inject()'s declaration.
    inject(bhir, ++pc, opcode, temp, in, 0, BH_INT64);
    inject(bhir, ++pc, opcode, out, temp, 0, BH_INT64);
    inject(bhir, ++pc, BH_FREE, temp);
    inject(bhir, ++pc, BH_DISCARD, temp);

    return pc-start_pc;
}
/*
 * Main control of the folding mapping.
 *
 * 1. This routine only folds the 3 true dimensions. The T dimension (if in
 *    virtual node mode) is handled specifically in the caller of this routine.
 *
 * 2. finished = perm_next( ndims, perm_array[ndims] ) gets the next
 *    permutation. It returns 1 when there is no next permutation. For
 *    ndims = 3, the permutation sequence is
 *    0,1,2 --> 0,2,1 --> 1,0,2 --> 1,2,0 --> 2,0,1 --> 2,1,0 --> finished.
 *
 * 3. fail = find_fold( dims1[ndims1], dims2[ndims2], fold[3][3] ) searches for
 *    a folding schedule. The folding schedule is stored in the matrix
 *    fold[3][3], e.g. fold[i][j] = 3 indicates to unfold dimension i onto
 *    dimension j. fold[i][i] has no meaning. For the 3D case as here, there
 *    will be at most 2 non-zero, non-diagonal entries. Diagonal entries are
 *    useless here. Furthermore, when the 2 non-zero entries are in the same
 *    row, the virtual cartesian is unfolded from the row_id dimension onto the
 *    other dimensions in the physical cartesian. When the 2 entries are in the
 *    same column, the virtual cartesian is actually folded from the physical
 *    cartesian.
 *
 * 4. perform_fold( vir_coord[], phy_coord[], fold[3][3] ) does the folding
 *    following the schedule given by fold[3][3].
 *
 * Returns 0 once a permutation of d2 admits a folding schedule (after the
 * fold has been performed), or 1 when every permutation has been rejected.
 */
static int perm_dims_match( int nd1, int d1[], int c1[], int nd2, int d2[], int c2[] )
{
    int order[3] = {0, 1, 2};
    int schedule[3][3] = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
    int permuted[3];
    int status, exhausted, k;

    do {
        /* Present d2 to find_fold in the currently permuted order. */
        for (k = 0; k < 3; ++k) {
            permuted[k] = d2[order[k]];
        }

        status = find_fold( nd1, d1, nd2, permuted, schedule );
        if (status == 0) {
            /* A schedule exists for this ordering: apply it and stop. */
            perform_fold( nd1, d1, c1, nd2, d2, c2, order, schedule );
            return 0;
        }

        exhausted = perm_next( nd2, order );
    } while (!exhausted);

    /* No permutation yielded a folding schedule. */
    return 1;
}