예제 #1
0
// Reconcile this add's truncation flag with |ins_|, which must itself be an
// MAdd. When this add is truncated it takes over the truncation state of
// |ins_|; an add that is not truncated is left untouched. Always returns true.
bool
MAdd::updateForReplacement(MDefinition *ins_)
{
    JS_ASSERT(ins_->isAdd());
    MAdd *other = ins_->toAdd();

    // Nothing to reconcile when this add is not truncated.
    if (!isTruncated())
        return true;

    setTruncated(other->isTruncated());
    return true;
}
예제 #2
0
// Appends the expressions for `left_idx * right_bound + right_idx` to |block|
// and returns the variable that holds the sum. Both generated expressions are
// registered for deletion.
MVar *ComparisonStageIR::compute_linear_index(MVar *left_idx, MVar *right_bound, MVar *right_idx, MBlock *block) const {
    // left_idx * right_bound
    MMul *scaled_left = new MMul(left_idx, right_bound);
    scaled_left->register_for_delete();
    block->add_expr(scaled_left);

    // ... + right_idx
    MAdd *linearized = new MAdd(scaled_left->get_result(), right_idx);
    linearized->register_for_delete();
    block->add_expr(linearized);

    MVar *linear_idx = linearized->get_result();
    return linear_idx;
}
예제 #3
0
// Collapse an Int32 add whose linear sum is one variable term plus a non-zero
// constant into a single new add of that term and that constant.
static void AnalyzeAdd(TempAllocator& alloc, MAdd* add) {
  // Only Int32 adds that are not already recovered on bailout and that still
  // have uses are candidates.
  if (add->specialization() != MIRType::Int32) {
    return;
  }
  if (add->isRecoveredOnBailout()) {
    return;
  }
  if (!add->hasUses()) {
    return;
  }

  JitSpew(JitSpew_FLAC, "analyze add: %s%u", add->opName(), add->id());

  SimpleLinearSum linear = ExtractLinearSum(add);
  if (!linear.term || linear.constant == 0) {
    return;
  }

  // Pick the operand that may be the constant: operand 0 if it is a constant,
  // operand 1 otherwise.
  const int constIdx = add->getOperand(0)->isConstant() ? 0 : 1;
  MDefinition* maybeConst = add->getOperand(constIdx);
  if (maybeConst->isConstant()) {
    // Bail out when folding would just reproduce this very add.
    MOZ_ASSERT(maybeConst->toConstant()->type() == MIRType::Int32);
    const bool sameTerm = linear.term == add->getOperand(1 - constIdx);
    if (sameTerm ||
        linear.constant == maybeConst->toConstant()->toInt32()) {
      return;
    }
  }

  MInstruction* foldedConst = MConstant::New(alloc, Int32Value(linear.constant));
  add->block()->insertBefore(add, foldedConst);

  MAdd* folded = MAdd::New(alloc, linear.term, foldedConst, MIRType::Int32,
                           add->truncateKind());

  add->replaceAllLiveUsesWith(folded);
  add->block()->insertBefore(add, folded);
  JitSpew(JitSpew_FLAC, "replaced with: %s%u", folded->opName(), folded->id());
  JitSpew(JitSpew_FLAC, "and constant: %s%u (%d)", foldedConst->opName(),
          foldedConst->id(), linear.constant);

  // The Sink pass has already run, so flag the now-stale nodes as
  // RecoveredOnBailout; DCE will then remove the unused nodes.
  markNodesAsRecoveredOnBailout(add);
}
// Try to fold an Int32 left shift by a constant, together with the chain of
// truncated Int32 adds that consume it, into a single MEffectiveAddress.
//
// Starting at |lsh|, the single-use chain of adds is walked. Constant operands
// are accumulated into |displacement|; at most one non-constant operand is
// remembered as |base|. Two outcomes are possible:
//   - A base was found: the last add of the chain is replaced by
//     MEffectiveAddress(base, index, scale, displacement).
//   - No base was found: if the chain's single consumer is a bitand with an
//     Int32 constant mask that only clears bits the shift already cleared,
//     the bitand is redundant and its uses are redirected to the chain's end.
//
// |alloc| is used to allocate the new MEffectiveAddress node.
static void
AnalyzeLsh(TempAllocator& alloc, MLsh* lsh)
{
    if (lsh->specialization() != MIRType::Int32)
        return;

    if (lsh->isRecoveredOnBailout())
        return;

    MDefinition* index = lsh->lhs();
    MOZ_ASSERT(index->type() == MIRType::Int32);

    // The shift amount must be an Int32 constant that maps onto a Scale.
    MConstant* shiftValue = lsh->rhs()->maybeConstantValue();
    if (!shiftValue)
        return;

    if (shiftValue->type() != MIRType::Int32 || !IsShiftInScaleRange(shiftValue->toInt32()))
        return;

    Scale scale = ShiftToScale(shiftValue->toInt32());

    int32_t displacement = 0;
    MInstruction* last = lsh;
    MDefinition* base = nullptr;
    while (true) {
        // Only follow the chain while the current node has exactly one
        // consumer and that consumer is a truncated Int32 add.
        if (!last->hasOneUse())
            break;

        MUseIterator use = last->usesBegin();
        if (!use->consumer()->isDefinition() || !use->consumer()->toDefinition()->isAdd())
            break;

        MAdd* add = use->consumer()->toDefinition()->toAdd();
        if (add->specialization() != MIRType::Int32 || !add->isTruncated())
            break;

        MDefinition* other = add->getOperand(1 - add->indexOf(*use));

        if (MConstant* otherConst = other->maybeConstantValue()) {
            // The adds are truncated, so the displacement is meant to wrap
            // modulo 2^32. Accumulate through uint32_t: overflowing a signed
            // int32_t addition would be undefined behaviour.
            displacement = int32_t(uint32_t(displacement) + uint32_t(otherConst->toInt32()));
        } else {
            // Only one non-constant operand can serve as the base.
            if (base)
                break;
            base = other;
        }

        last = add;
        if (last->isRecoveredOnBailout())
            return;
    }

    if (!base) {
        // No base: the value is (index << shift) + displacement. If the
        // displacement keeps the low |shift| bits zero, a following mask that
        // only clears those bits has no effect.
        uint32_t elemSize = 1 << ScaleToShift(scale);
        if (displacement % elemSize != 0)
            return;

        if (!last->hasOneUse())
            return;

        MUseIterator use = last->usesBegin();
        if (!use->consumer()->isDefinition() || !use->consumer()->toDefinition()->isBitAnd())
            return;

        MBitAnd* bitAnd = use->consumer()->toDefinition()->toBitAnd();
        if (bitAnd->isRecoveredOnBailout())
            return;

        MDefinition* other = bitAnd->getOperand(1 - bitAnd->indexOf(*use));
        MConstant* otherConst = other->maybeConstantValue();
        if (!otherConst || otherConst->type() != MIRType::Int32)
            return;

        // Every bit the mask clears must already be cleared by the shift.
        uint32_t bitsClearedByShift = elemSize - 1;
        uint32_t bitsClearedByMask = ~uint32_t(otherConst->toInt32());
        if ((bitsClearedByShift & bitsClearedByMask) != bitsClearedByMask)
            return;

        bitAnd->replaceAllUsesWith(last);
        return;
    }

    if (base->isRecoveredOnBailout())
        return;

    MEffectiveAddress* eaddr = MEffectiveAddress::New(alloc, base, index, scale, displacement);
    last->replaceAllUsesWith(eaddr);
    last->block()->insertAfter(last, eaddr);
}
// Fold a constant index increment into the load's offset adjustment, turning
//
//   [AddI]
//   addl       $9, %esi
//   [LoadUnboxedScalar]
//   movsd      0x0(%rbx,%rsi,8), %xmm4
//
// into
//
//   [LoadUnboxedScalar]
//   movsd      0x48(%rbx,%rsi,8), %xmm4
//
// The load's index operand is rewired to the non-constant side of the AddI.
// The AddI itself is flagged as recovered on bailout when it has no live
// definition uses left and can be recovered.
static void
AnalyzeLoadUnboxedScalar(TempAllocator& alloc, MLoadUnboxedScalar* load)
{
    if (load->isRecoveredOnBailout())
        return;

    // Operand 1 must be an add for the fold to apply.
    if (!load->getOperand(1)->isAdd())
        return;

    JitSpew(JitSpew_EAA, "analyze: %s%u", load->opName(), load->id());

    MAdd* add = load->getOperand(1)->toAdd();

    // Only truncated Int32 adds qualify.
    if (add->specialization() != MIRType::Int32)
        return;
    if (!add->hasUses())
        return;
    if (add->truncateKind() != MDefinition::TruncateKind::Truncate)
        return;

    // Split the add into its constant operand and the remaining operand.
    MDefinition* left = add->lhs();
    MDefinition* right = add->rhs();
    MDefinition* constant = nullptr;
    MDefinition* node = nullptr;

    if (left->isConstant()) {
        constant = left;
        node = right;
    } else if (right->isConstant()) {
        constant = right;
        node = left;
    } else {
        return;
    }

    MOZ_ASSERT(constant->type() == MIRType::Int32);

    // New offset = current adjustment + constant * element byte size; give up
    // if either step overflows.
    size_t storageSize = Scalar::byteSize(load->storageType());
    int32_t currentAdjustment = load->offsetAdjustment();
    int32_t scaledConstant = 0;
    if (!SafeMul(constant->maybeConstantValue()->toInt32(), storageSize, &scaledConstant))
        return;

    int32_t offset = 0;
    if (!SafeAdd(currentAdjustment, scaledConstant, &offset))
        return;

    JitSpew(JitSpew_EAA, "set offset: %d + %d = %d on: %s%u", currentAdjustment,
            scaledConstant, offset, load->opName(), load->id());
    load->setOffsetAdjustment(offset);
    load->replaceOperand(1, node);

    const bool addIsStale =
        !add->hasLiveDefUses() && DeadIfUnused(add) && add->canRecoverOnBailout();
    if (addIsStale) {
        JitSpew(JitSpew_EAA, "mark as recovered on bailout: %s%u",
                add->opName(), add->id());
        add->setRecoveredOnBailoutUnchecked();
    }
}
// Try to fold an Int32 left shift by a constant, together with the chain of
// truncated Int32 adds that consume it, into a single MEffectiveAddress.
//
// Starting at |lsh|, the single-use chain of adds is walked. Constant operands
// are accumulated into |displacement|; at most one non-constant operand is
// remembered as |base|. Two outcomes are possible:
//   - A base was found: the last add of the chain is replaced by
//     MEffectiveAddress(base, index, scale, displacement), inserted into
//     |block| right after that add.
//   - No base was found: if the chain's single consumer is a bitand with an
//     Int32 constant mask that only clears bits the shift already cleared,
//     the bitand is redundant and its uses are redirected to the chain's end.
static void
AnalyzeLsh(MBasicBlock *block, MLsh *lsh)
{
    if (lsh->specialization() != MIRType_Int32)
        return;

    MDefinition *index = lsh->lhs();
    JS_ASSERT(index->type() == MIRType_Int32);

    // The shift amount must be an Int32 constant that maps onto a Scale.
    MDefinition *shift = lsh->rhs();
    if (!shift->isConstant())
        return;

    Value shiftValue = shift->toConstant()->value();
    if (!shiftValue.isInt32() || !IsShiftInScaleRange(shiftValue.toInt32()))
        return;

    Scale scale = ShiftToScale(shiftValue.toInt32());

    int32_t displacement = 0;
    MInstruction *last = lsh;
    MDefinition *base = nullptr;
    while (true) {
        // Only follow the chain while the current node has exactly one
        // consumer and that consumer is a truncated Int32 add.
        if (!last->hasOneUse())
            break;

        MUseIterator use = last->usesBegin();
        if (!use->consumer()->isDefinition() || !use->consumer()->toDefinition()->isAdd())
            break;

        MAdd *add = use->consumer()->toDefinition()->toAdd();
        if (add->specialization() != MIRType_Int32 || !add->isTruncated())
            break;

        MDefinition *other = add->getOperand(1 - use->index());

        if (other->isConstant()) {
            // The adds are truncated, so the displacement is meant to wrap
            // modulo 2^32. Accumulate through uint32_t: overflowing a signed
            // int32_t addition would be undefined behaviour.
            displacement = int32_t(uint32_t(displacement) +
                                   uint32_t(other->toConstant()->value().toInt32()));
        } else {
            // Only one non-constant operand can serve as the base.
            if (base)
                break;
            base = other;
        }

        last = add;
    }

    if (!base) {
        // No base: the value is (index << shift) + displacement. If the
        // displacement keeps the low |shift| bits zero, a following mask that
        // only clears those bits has no effect.
        uint32_t elemSize = 1 << ScaleToShift(scale);
        if (displacement % elemSize != 0)
            return;

        if (!last->hasOneUse())
            return;

        MUseIterator use = last->usesBegin();
        if (!use->consumer()->isDefinition() || !use->consumer()->toDefinition()->isBitAnd())
            return;

        MBitAnd *bitAnd = use->consumer()->toDefinition()->toBitAnd();
        MDefinition *other = bitAnd->getOperand(1 - use->index());
        if (!other->isConstant() || !other->toConstant()->value().isInt32())
            return;

        // Every bit the mask clears must already be cleared by the shift.
        uint32_t bitsClearedByShift = elemSize - 1;
        uint32_t bitsClearedByMask = ~uint32_t(other->toConstant()->value().toInt32());
        if ((bitsClearedByShift & bitsClearedByMask) != bitsClearedByMask)
            return;

        bitAnd->replaceAllUsesWith(last);
        return;
    }

    MEffectiveAddress *eaddr = MEffectiveAddress::New(base, index, scale, displacement);
    last->replaceAllUsesWith(eaddr);
    block->insertAfter(last, eaddr);
}
예제 #7
0
// Builds the body of the stage function for a comparison stage.
//
// The generated code is a nested loop over the two inputs that calls the user
// function once per (outer, inner) pair:
//   1. Create the stage/user functions and the "start" block, which initializes
//      the return index and runs the preallocator.
//   2. Build either a plain two-level loop nest (no tiling) or a four-level
//      tiled loop nest (tiling), optionally wrapped in a timer.
//   3. Inside the innermost body: build the user function arguments, call the
//      user function, process its result, and run the per-iteration extras.
//   4. Optionally split the outer loop for progress tracking and/or hand the
//      loop nest over to parallelize_main_loop.
//   5. After the loop: delete fields, run finish_stage, and insert the start
//      block into the stage function.
//
// Loaded stage args used here: [1] is the outer loop bound and [3] is the inner
// loop bound; both are also forwarded to create_user_function_inputs.
void ComparisonStageIR::build_stage() {

    // Tiling cannot be combined with progress tracking.
    assert(!is_tiled() || (is_tiled() && !track_progress()));
    // timer is only allowed for serial loops (just use it to get avg iterations per second or something like that)
    assert(!time_loop() || (time_loop() && !is_parallelized()));

    set_stage_function(create_stage_function());
    set_user_function(create_user_function());
    // stuff before the loop
    // build the return idx
    MVar *loop_start = new MVar(MScalarType::get_long_type()); // don't make a constant b/c it should be updateable
    loop_start->register_for_delete();
    MStatement *set_loop_start = new MStatement(loop_start, MVar::create_constant<long>(0));
    set_loop_start->register_for_delete();
    MStatement *set_result = new MStatement(get_return_idx(), loop_start);
    set_result->register_for_delete();
    set_start_block(new MBlock("start"));
    get_start_block()->register_for_delete();
    get_start_block()->add_expr(set_loop_start);
    get_start_block()->add_expr(set_result);

    // When we don't parallelize, then make the inner loop's index outside of both the loops rather than within
    // the outer loop. This is a hack for llvm because if we have an alloca call within each iteration of the outer loop,
    // we will be "leaking" stack space each time that is called, so moving it outside of the loop prevents that.
    // However, it makes it hard to work with when we then parallelize because the code sees that inner loop index as a
    // free variable that needs to be added to the closure. This is not fun because our index is now a pointer to an index
    // and then we would need to update the index by going through the pointer, etc. Basically, it would cause some hacks on the
    // LLVM side (and unless this becomes something that is needed in the future, I don't want to deal with it).
    // So instead, it is dealt with below. Without parallelization, the inner loop index is initialized outside of the
    // nested loop, and then updated to the correct start right before the inner loop begins execution.
    // When parallelization is turned on, the inner loop index is made INSIDE the outer loop. This is because the
    // parallelized outer loop calls a function every iteration which is the outer loop body, and then within that the
    // inner loop is created. alloca is scoped at the function level, so the inner loop index gets a single alloca
    // in this function call, and then the inner loop is created.
    // This may not be required of other possible back-end languages that we choose, but it will depend on their scoping rules.
    //
    // TL;DR LLVM has function scoping for allocainst, so if we create the inner loop index as so
    // val outer_index...
    // for outer_index...
    //    val inner_index...
    //    for inner_index...
    // every iteration of the outer loop adds space to the stack which isn't released until the function ends. So we want
    // val outer_index...
    // val inner_index...
    // for outer_index...
    //    for inner_index...
    MVar *inner_start = initialize<long>(MScalarType::get_long_type(), 0, get_start_block());

    MBlock *preallocation_block = create_preallocator();
    get_start_block()->add_expr(preallocation_block);

    // The timer object is always created; it is only added to the start block when time_loop() is set.
    MTimer *timer = nullptr;
    timer = new MTimer();
    timer->register_for_delete();

    // Loop skeletons. The *_2 skeletons are only created when tiling; they stay nullptr otherwise.
    MFor *outer_loop_skeleton_1 = nullptr;
    MFor *inner_loop_skeleton_1 = nullptr;
    MFor *outer_loop_skeleton_2 = nullptr;
    MFor *inner_loop_skeleton_2 = nullptr;
    MBlock *inner_loop_body = nullptr;
    // think of all comparisons as being in an NxM matrix where N is the left input and M is the right input.
    // N is the outermost iteration
    // Both tile sizes are hard-coded to 2 here.
    tile_size_N = MVar::create_constant<long>(2);
    tile_size_M = MVar::create_constant<long>(2);

    MVar *final_loop_bound;

    if (!is_tiled() || !is_tileable()) { // No tiling
        // To make sure that the inner loop doesn't get replaced with a different bound if parallelizing, copy
        // the bound to a different variable and use that
        MVar *bound_copy = new MVar(MScalarType::get_long_type());
        bound_copy->register_for_delete();
        MStatement *set_copy = new MStatement(bound_copy, get_stage_function()->/*get_args()*/get_loaded_args()[3]);
        set_copy->register_for_delete();
        get_start_block()->add_expr(set_copy);
        // loop components
        MVar *outer_loop_start = initialize<long>(MScalarType::get_long_type(), 0, get_start_block());
        outer_loop_skeleton_1 =
            create_stage_for_loop(outer_loop_start, MVar::create_constant<long>(1),
                                  get_stage_function()->/*get_args()*/get_loaded_args()[1], false, get_start_block());
        if (is_parallelizable() && is_parallelized()) {
            outer_loop_skeleton_1->set_exec_type(PARALLEL);
        }

        // Where the inner loop starts: at 0 when left_input/right_input is set and commutativity is not forced,
        // otherwise at (outer index + 1), computed inside the outer loop body.
        MVar *_inner_start = nullptr;
        if ((left_input || right_input) && !_force_commutative) {
            _inner_start = initialize<long>(MScalarType::get_long_type(), 0, get_start_block());
        } else {
            MAdd *add = new MAdd(outer_loop_skeleton_1->get_loop_index(),
                                 MVar::create_constant<long>(1));
            outer_loop_skeleton_1->get_body_block()->add_expr(add);
            add->register_for_delete();
            _inner_start = add->get_result();
        }
        // The outer loop goes either directly into the start block or into the timer's block.
        if (!time_loop()) {
            get_start_block()->add_expr(outer_loop_skeleton_1);
        } else {
            get_start_block()->add_expr(timer);
            timer->get_timer_block()->add_expr(outer_loop_skeleton_1);
        }

        // Reset the shared inner loop index to its start value at the top of every outer iteration.
        MStatement *set_inner_start = new MStatement(inner_start, _inner_start);
        set_inner_start->register_for_delete();
        outer_loop_skeleton_1->get_body_block()->add_expr(set_inner_start);
        MBlock *temp_block = new MBlock();
        temp_block->register_for_delete();
        inner_loop_skeleton_1 = create_stage_for_loop(inner_start, MVar::create_constant<long>(1), bound_copy, true,
                                temp_block);
        // TODO hack, need to add the loop index initialization before the outer loop, but we have to add the outer loop before this since
        // the inner_start depends on the outer loop
        get_start_block()->insert_at(temp_block, get_start_block()->get_exprs().size() - 2); // insert right before the outer loop

        // stuff for calling the user function in the loop
        inner_loop_body = inner_loop_skeleton_1->get_body_block();
    } else if (is_tiled() && is_tileable()) { // tiling
        // loop components
        MDiv *_outer_1_bound =
            new MDiv(get_stage_function()->/*get_args()*/get_loaded_args()[1], tile_size_N);
        _outer_1_bound->register_for_delete();
        MDiv *_inner_1_bound =
            new MDiv(get_stage_function()->/*get_args()*/get_loaded_args()[3], tile_size_M);
        _inner_1_bound->register_for_delete();

        // compensate for when the number of elements isn't a multiple of the tile size
        MAdd *outer_1_bound = new MAdd(_outer_1_bound->get_result(), MVar::create_constant<long>(1));
        outer_1_bound->register_for_delete();
        MAdd *inner_1_bound = new MAdd(_inner_1_bound->get_result(), MVar::create_constant<long>(1));
        inner_1_bound->register_for_delete();
        get_start_block()->add_expr(_outer_1_bound);
        get_start_block()->add_expr(_inner_1_bound);
        get_start_block()->add_expr(outer_1_bound);
        get_start_block()->add_expr(inner_1_bound);

        MVar *outer_loop_start_1 = initialize<long>(MScalarType::get_long_type(), 0, get_start_block());
        outer_loop_start_1->override_name("outer_loop_start_1");
        MVar *inner_loop_start_1 = initialize<long>(MScalarType::get_long_type(), 0, get_start_block());
        inner_loop_start_1->override_name("inner_loop_start_1");
        MVar *outer_loop_start_2 = initialize<long>(MScalarType::get_long_type(), 0, get_start_block());
        outer_loop_start_2->override_name("outer_loop_start_2");
        MVar *inner_loop_start_2 = initialize<long>(MScalarType::get_long_type(), 0, get_start_block());
        inner_loop_start_2->override_name("inner_loop_start_2");

        // n = 0 to N/tile_size_N + 1
        outer_loop_skeleton_1 =
            create_stage_for_loop(outer_loop_start_1, MVar::create_constant<long>(1),
                                  outer_1_bound->get_result(), true, get_start_block());
        outer_loop_skeleton_1->override_name("outer_loop_skeleton1");
//
//        if (!time_loop()) {
//            get_start_block()->add_expr(outer_loop_skeleton_1);
//        } else {
//            get_start_block()->add_expr(timer);
//            timer->get_timer_block()->add_expr(outer_loop_skeleton_1);
//        }

        // m = 0 to M/tile_size_M + 1
        inner_loop_skeleton_1 =
            create_stage_for_loop(inner_loop_start_1, MVar::create_constant<long>(1),
                                  inner_1_bound->get_result(), true, get_start_block());
        inner_loop_skeleton_1->override_name("inner_loop_skeleton1");

        // nn = 0 to tile_size_N
        outer_loop_skeleton_2 = create_stage_for_loop(outer_loop_start_2, MVar::create_constant<long>(1),
                                tile_size_N, true, get_start_block());
        outer_loop_skeleton_2->override_name("outer_loop_skeleton2");

        // mm = 0 to tile_size_M
        inner_loop_skeleton_2 = create_stage_for_loop(inner_loop_start_2, MVar::create_constant<long>(1),
                                tile_size_M, true, get_start_block());
        inner_loop_skeleton_2->override_name("inner_loop_skeleton2");

        // The outer loop goes either directly into the start block or into the timer's block.
        if (!time_loop()) {
            get_start_block()->add_expr(outer_loop_skeleton_1);
        } else {
            get_start_block()->add_expr(timer);
            timer->get_timer_block()->add_expr(outer_loop_skeleton_1);
        }


        // Nest the tile loops: inner_1 -> outer_2 -> inner_2. (inner_1 is added to outer_1's body further below.)
        inner_loop_skeleton_1->get_body_block()->add_expr(outer_loop_skeleton_2);
        outer_loop_skeleton_2->get_body_block()->add_expr(inner_loop_skeleton_2);
        inner_loop_body = inner_loop_skeleton_2->get_body_block();

    }

    // Receives the block into which the user call and its post-processing are emitted.
    MBlock *user_arg_block;

    std::vector<MVar *> args = create_user_function_inputs(&user_arg_block, outer_loop_skeleton_1, outer_loop_skeleton_2,
                               inner_loop_skeleton_1, inner_loop_skeleton_2, nullptr, false,
                               nullptr, nullptr, get_stage_function()->/*get_args()*/get_loaded_args()[1],
                               get_stage_function()->/*get_args()*/get_loaded_args()[3]);
    if (!is_tiled() || !is_tileable()) {
        inner_loop_body->add_expr(user_arg_block);
    } // if tiled, this is already added in the create_user_function_inputs
    inner_loop_body = user_arg_block;

    // Remember the position before the user call; the result of apply_buckets is inserted there later.
    int bucket_idx = inner_loop_body->get_exprs().size();

    MFunctionCall *call = call_user_function(get_user_function(), args);
    inner_loop_body->add_expr(call);

    // handle the output of the user call
    MBlock *processed_call = process_user_function_call(call, NULL, false);
    inner_loop_body->add_expr(processed_call);

    // do any other postprocessing needed in the loop before the next iteration
    MBlock *extra = loop_extras();
    inner_loop_body->add_expr(extra);

    if (track_progress() && !is_parallelized()) {
        // still return the original loop bound
        MBlock *temp = new MBlock();
        temp->register_for_delete();
        final_loop_bound = outer_loop_skeleton_1->get_loop_bound();
        outer_loop_skeleton_1->get_body_block()->add_expr(inner_loop_skeleton_1);
        inner_loop_body->insert_at(apply_buckets(args[0], args[1], inner_loop_skeleton_2 ? inner_loop_skeleton_2 : inner_loop_skeleton_1), bucket_idx);
        std::pair<MFor *, MFor *> splits = ProgressTracker::create_progress_tracker(outer_loop_skeleton_1,
                                           inner_loop_skeleton_1,
                                           get_num_tracking_splits(), temp,
                                           true);

        // find the original outer_loop_skeleton_1 in the block and remove it. Then replace with the new one in splits.first
        // NOTE(review): if the loop is not found, idx ends up equal to the number of expressions when remove_at is called -- confirm this cannot happen.
        int idx = 0;
        if (!time_loop()) {
            for (std::vector<MExpr *>::const_iterator iter = get_start_block()->get_exprs().cbegin();
                    iter != get_start_block()->get_exprs().cend(); iter++) {
                if (*iter == outer_loop_skeleton_1) {
                    break;
                }
                idx++;
            }
            get_start_block()->remove_at(idx);
        } else {
            for (std::vector<MExpr *>::const_iterator iter = timer->get_timer_block()->get_exprs().cbegin();
                    iter != timer->get_timer_block()->get_exprs().cend(); iter++) {
                if (*iter == outer_loop_skeleton_1) {
                    break;
                }
                idx++;
            }
            timer->get_timer_block()->remove_at(idx);
        }
        outer_loop_skeleton_1 = splits.first;
        // do the replacement
        // outer_loop_skeleton_1 added to temp block in the progress tracker function
        if (!time_loop()) {
            get_stage_function()->add_body_block(temp);
        } else {
            timer->get_timer_block()->insert_at(temp, idx);
        }
    } else {
        outer_loop_skeleton_1->get_body_block()->add_expr(inner_loop_skeleton_1);
        final_loop_bound = outer_loop_skeleton_1->get_loop_bound();
        inner_loop_body->insert_at(apply_buckets(args[0], args[1], inner_loop_skeleton_2 ? inner_loop_skeleton_2 : inner_loop_skeleton_1), bucket_idx);
    }

    // modify this loop if it needs to be parallelized
    if (is_parallelizable() && is_parallelized()) {
        parallelize_main_loop(get_start_block(), outer_loop_skeleton_1, inner_loop_skeleton_1);
    }
//
//    if (is_tiled() && is_tileable()) {
//        inner_loop_skeleton_1->get_body_block()->add_expr(outer_loop_skeleton_2);
//        outer_loop_skeleton_2->get_body_block()->add_expr(inner_loop_skeleton_2);
//    }

    // postprocessing after the outer loop is done (no postprocessing needed after the inner loop since it just goes back to the outer loop)
    MBlock *after_loop = time_loop() ? timer->get_after_timer_block() : outer_loop_skeleton_1->get_end_block();
    MBlock *finished = finish_stage(nullptr, final_loop_bound);
    MBlock *deletion = delete_fields();
    after_loop->add_expr(deletion);
    after_loop->add_expr(finished);

    get_stage_function()->insert_body_block_at(get_start_block(), 1); // insert before the temp block, which would have been added if doing tracking. Insert after the stage arg loading though.
    // the temp block has the loop now, so it can't come before everything else
}
예제 #8
0
// Builds the argument list for one call of the user function and emits the
// expressions that produce those arguments into the loop bodies.
//
// Parameters:
//   mblock                     out-parameter; on return it points at the block into which the caller should emit
//                              the user function call (the output-index computation below is also appended to it).
//   outer_loop / inner_loop    the outer and inner loops (the tile-level loops when tiling).
//   outer_tiled_inner /
//   inner_tiled_inner          the within-tile loops; only dereferenced when is_tiled() && is_tileable().
//   (unnamed parameters)       unused by this implementation.
//   original_num_inputs_left /
//   original_num_inputs_right  element counts of the left and right inputs.
//
// Returns the user function arguments. In the non-tiled and the tiled commutative paths these are the outer
// element (presumably pushed by get_element in the tiled paths -- it receives &args; confirm), the inner element,
// and, when compareVIO is set, the newly created output element.
//
// Loaded stage args used here: [0] and [2] are indexed as the outer and inner input arrays, [4] is the output array.
//
// TODO once I get the indexing right, I can fix preallocation so that only the correct number of outputs are preallocated, not just N^2
// TODO Can also fix the number output (does that need to be fixed?)
std::vector<MVar *> ComparisonStageIR::create_user_function_inputs(MBlock **mblock, MFor *outer_loop,
        MFor *outer_tiled_inner,
        MFor *inner_loop, MFor *inner_tiled_inner, MVar *,
        bool, MVar *, MVar *,
        MVar *original_num_inputs_left,
        MVar *original_num_inputs_right) {

    // body of the outer MFor passed in is the inner MFor loop
    std::vector<MVar *> stage_args = get_stage_function()->get_loaded_args();//get_args();
    std::vector<MVar *> args;

    // Think of the indices into the two input arrays as coordinates into a matrix. The outer coordinate is for N, i.e. the row number.
    // The inner coordinate is for M, i.e. the column number.
    MVar *final_outer_coordinate;
    MVar *final_inner_coordinate;

    // get the outer and inner input elements
    // if tiled, the computation for the indices is different
    if (is_tiled() && is_tileable()) {
        if ((left_input || right_input) && !_force_commutative) { // N x M
            assert(original_num_inputs_left && original_num_inputs_right); // sanity check
            MVar *n = outer_loop->get_loop_index();
            MVar *m = inner_loop->get_loop_index();
            MVar *nn = outer_tiled_inner->get_loop_index();
            MVar *mm = inner_tiled_inner->get_loop_index();

            // outer = n * tile_size_N + nn
            final_outer_coordinate = get_element(stage_args[0], n, tile_size_N, nn, outer_tiled_inner->get_body_block(),
                                                 inner_loop, &args, original_num_inputs_left, nullptr);

            // inner = m * tile_size_M + mm
            // mblock is handed to get_element here, which is expected to set *mblock for this path.
            final_inner_coordinate = get_element(stage_args[2], m, tile_size_M, mm, inner_tiled_inner->get_body_block(),
                                                 outer_tiled_inner, &args, original_num_inputs_right, mblock);
        } else { // (N^2-N)/2
            assert(original_num_inputs_left && original_num_inputs_right); // sanity check
            MVar *n = outer_loop->get_loop_index();
            MVar *m = inner_loop->get_loop_index();
            MVar *nn = outer_tiled_inner->get_loop_index();
            MVar *mm = inner_tiled_inner->get_loop_index();

            // outer = n * tile_size_N + nn
            final_outer_coordinate = get_element(stage_args[0], n, tile_size_N, nn, outer_tiled_inner->get_body_block(),
                                                 inner_loop, &args, original_num_inputs_left, nullptr); // the outer doesn't change with commutativity

            // this code could almost be handled by get_element, but the conditional is more complex, so I just leave it here for now rather than
            // trying to refactor it.

            // inner = m * tile_size_M + mm
            // Everything below is inserted at the front of inner_tiled_inner's body, in order, using inner_insert_idx
            // as the running insertion position.
            int inner_insert_idx = 0;
            MBlock *linear_inner = new MBlock();
            linear_inner->register_for_delete();
            MVar *inner_idx = compute_linear_index(m, tile_size_M, mm, linear_inner);
            inner_tiled_inner->get_body_block()->insert_at(linear_inner, inner_insert_idx++);
            final_inner_coordinate = inner_idx;

            // check that the inner index is still in range (< M) and that it is less than the outer idx
            // TODO this assumes that the integral value of true is 1. In the future, create an MTrue and MFalse type
            // that allows arithmetic to be done on it. Then I can plug in the actual values when generating the back end code, such as LLVM.

            MSLT *is_inner_in_range = new MSLT(inner_idx, original_num_inputs_right);
            is_inner_in_range->register_for_delete();
            inner_tiled_inner->get_body_block()->insert_at(is_inner_in_range, inner_insert_idx++);
            MSLT *is_less_than_outer = new MSLT(inner_idx, final_outer_coordinate);
            is_less_than_outer->register_for_delete();
            is_less_than_outer->override_name("inner_less_than_outer");
            inner_tiled_inner->get_body_block()->insert_at(is_less_than_outer, inner_insert_idx++);

            // since we don't have a compound conditional type (YET), we get the results of the two SLT calls here.
            // If they sum to 2, then both are true since we assume true == 1. This way, we only need a single if
            // statement checking the value of the addition.
            MCast *is_inner_in_range_long = new MCast(is_inner_in_range->get_result(), MScalarType::get_long_type());
            is_inner_in_range_long->register_for_delete();
            inner_tiled_inner->get_body_block()->insert_at(is_inner_in_range_long, inner_insert_idx++);
            MCast *is_less_than_outer_long = new MCast(is_less_than_outer->get_result(), MScalarType::get_long_type());
            is_less_than_outer_long->register_for_delete();
            inner_tiled_inner->get_body_block()->insert_at(is_less_than_outer_long, inner_insert_idx++);
            MAdd *sum_of_conditionals = new MAdd(is_inner_in_range_long->get_casted(), is_less_than_outer_long->get_casted());
            sum_of_conditionals->register_for_delete();
            inner_tiled_inner->get_body_block()->insert_at(sum_of_conditionals, inner_insert_idx++);

            MEq *is_in_range_and_less_than = new MEq(sum_of_conditionals->get_result(), MVar::create_constant<long>(2));
            is_in_range_and_less_than->register_for_delete();
            inner_tiled_inner->get_body_block()->insert_at(is_in_range_and_less_than, inner_insert_idx++);

            MBlock *inner_is_in_range_and_less_than = new MBlock();
            inner_is_in_range_and_less_than->register_for_delete();
            MBlock *inner_not_in_range_nor_less_than = new MBlock();
            inner_not_in_range_nor_less_than->register_for_delete();
            MBlock *dummy_inner = new MBlock();
            dummy_inner->register_for_delete();
            MIfThenElse *inner_ite = new MIfThenElse(is_in_range_and_less_than->get_result(), inner_is_in_range_and_less_than,
                    inner_not_in_range_nor_less_than, dummy_inner, nullptr);
            inner_ite->register_for_delete();
            inner_tiled_inner->get_body_block()->insert_at(inner_ite, inner_insert_idx++);
            inner_ite->override_name("inner_ite");

            // If in range, get the inner element and then run the rest of the innermost loop body.
            // Everything that was already in inner_tiled_inner's body (i.e. what should only execute when we are
            // in range) is removed from there and moved into the inner_is_in_range_and_less_than block.
            MIndex *get_inner_input = new MIndex(stage_args[2], inner_idx, create_type<MElementType *>(),
                                                 "inner_input_element");
            get_inner_input->register_for_delete();
            inner_is_in_range_and_less_than->add_expr(get_inner_input);
            args.push_back(get_inner_input->get_result());
            inner_is_in_range_and_less_than->add_exprs(inner_tiled_inner->get_body_block()->remove_range(inner_insert_idx++, -1));

            // If out of range, continue to the next iteration of the outer_tiled_inner_loop
            MContinue *to_nn_loop = new MContinue(outer_tiled_inner);
            to_nn_loop->register_for_delete();
            inner_not_in_range_nor_less_than->add_expr(to_nn_loop);

            // The caller emits the user call into the "in range" branch.
            *mblock = inner_is_in_range_and_less_than;
        }
    } else { // the loop indices are already setup by this point depending on whether we are NxM or N^2
        MVar *current_outer_idx = outer_loop->get_loop_index();
        MVar *current_inner_idx = inner_loop->get_loop_index();
        final_outer_coordinate = current_outer_idx;
        final_inner_coordinate = current_inner_idx;
        MIndex *outer_element = new MIndex(stage_args[0], current_outer_idx, create_type<MElementType *>(),
                                           "outer_input_element");
        outer_element->register_for_delete();
        MIndex *inner_element = new MIndex(stage_args[2], current_inner_idx, create_type<MElementType *>(),
                                           "inner_input_element");
        inner_element->register_for_delete();
        outer_loop->get_body_block()->add_expr(outer_element);
        inner_loop->get_body_block()->add_expr(inner_element);
        args.push_back(outer_element->get_result());
        args.push_back(inner_element->get_result());
        // Fresh block for the caller; it is not attached to any loop body here.
        *mblock = new MBlock();
        (*mblock)->register_for_delete();
    }

    // if this has an output, make the output element
    // this doesn't care if we are tiled or not. The equations are the same since we appropriately set the coordinates
    // above based on tiling or not.
    MVar *final_index; // only assigned and read inside the compareVIO branch
    if (compareVIO) {
        // First create "shell" for a new Element* to be passed to the user
        MVar *new_element = new MVar(create_type<MElementType*>(), "output_element");
        new_element->register_for_delete();
        // create the statement that will actually initialize the value
        // compute the current output index
        if ((left_input || right_input) && !_force_commutative) { // N x M
            // equation for linearizing the coordinates is:
            // final_outer_coordinate X original_num_inputs_right + final_inner_coordinate
            MMul *mul = new MMul(final_outer_coordinate, original_num_inputs_right);
            mul->register_for_delete();
            (*mblock)->add_expr(mul);
            MAdd *add = new MAdd(mul->get_result(), final_inner_coordinate);
            add->register_for_delete();
            (*mblock)->add_expr(add);
            final_index = add->get_result();
        } else { // N^2 and/or commutative
            // equation for linearizing the coordinates is:
            // [final_outer_coordinate^2 - final_outer_coordinate]/2 + final_inner_coordinate
            // the division term in this equation tells you how many elements have come before you. Then the addition
            // adds on your position in the current row.
            // It's not straightforward like the NxM version because we are only doing comparisons between elements
            // in the lower triangular part of the matrix (excluding the diagonal), so the linear indices from
            // the NxM version would give non-consecutive indices. This basically takes those indices and compresses
            // them down from 0 to however many comparisons we do.
            MMul *squared = new MMul(final_outer_coordinate, final_outer_coordinate);
            squared->register_for_delete();
            (*mblock)->add_expr(squared);
            MSub *sub = new MSub(squared->get_result(), final_outer_coordinate);
            sub->register_for_delete();
            (*mblock)->add_expr(sub);
            MDiv *div = new MDiv(sub->get_result(), MVar::create_constant<long>(2));
            div->register_for_delete();
            (*mblock)->add_expr(div);
            MAdd *add = new MAdd(div->get_result(), final_inner_coordinate);
            add->register_for_delete();
            (*mblock)->add_expr(add);
            final_index = add->get_result();
        }

        MStatement *set_new_element = new MStatement(new_element, nullptr); // nullptr tells it to create a new value
        set_new_element->register_for_delete();
        set_new_element->add_parameter(final_index); // this is the id of the Element to be created
        (*mblock)->add_expr(set_new_element);
        args.push_back(new_element);

        // now set the Element in the output array
        MStatementIdx *set = new MStatementIdx(stage_args[4], new_element, final_index);
        set->register_for_delete();
        (*mblock)->add_expr(set);
    }

    return args;
}