void fixPartialRegisterStalls(Code& code)
{
    if (!isX86())
        return;

    PhaseScope phaseScope(code, "fixPartialRegisterStalls");

    Vector<BasicBlock*> candidates;

    for (BasicBlock* block : code) {
        for (const Inst& inst : *block) {
            if (hasPartialXmmRegUpdate(inst)) {
                candidates.append(block);
                break;
            }
        }
    }

    // Fortunately, instructions that cause partial register stalls are rare. Return early if no
    // block cares about them.
    if (candidates.isEmpty())
        return;

    // For each block, this provides the distance to the last instruction setting each register
    // on block *entry*.
    IndexMap<BasicBlock, FPDefDistance> lastDefDistance(code.size());

    // Blocks with dirty distance at head.
    IndexSet<BasicBlock> dirty;

    // First, we compute the local distance for each block and push it to the successors.
    for (BasicBlock* block : code) {
        FPDefDistance localDistance;

        unsigned distanceToBlockEnd = block->size();
        for (Inst& inst : *block)
            updateDistances(inst, localDistance, distanceToBlockEnd);

        for (BasicBlock* successor : block->successorBlocks()) {
            if (lastDefDistance[successor].updateFromPrecessor(localDistance))
                dirty.add(successor);
        }
    }

    // Now we propagate the minimums across blocks.
    bool changed;
    do {
        changed = false;

        for (BasicBlock* block : code) {
            if (!dirty.remove(block))
                continue;

            // Little shortcut: if the block is big enough, propagating it won't add any information.
            if (block->size() >= minimumSafeDistance)
                continue;

            unsigned blockSize = block->size();
            FPDefDistance& blockDistance = lastDefDistance[block];
            for (BasicBlock* successor : block->successorBlocks()) {
                if (lastDefDistance[successor].updateFromPrecessor(blockDistance, blockSize)) {
                    dirty.add(successor);
                    changed = true;
                }
            }
        }
    } while (changed);

    // Finally, update each block as needed.
    InsertionSet insertionSet(code);
    for (BasicBlock* block : candidates) {
        unsigned distanceToBlockEnd = block->size();
        FPDefDistance& localDistance = lastDefDistance[block];

        for (unsigned i = 0; i < block->size(); ++i) {
            Inst& inst = block->at(i);

            if (hasPartialXmmRegUpdate(inst)) {
                RegisterSet defs;
                RegisterSet uses;
                inst.forEachTmp([&] (Tmp& tmp, Arg::Role role, Arg::Type) {
                    if (tmp.isFPR()) {
                        if (Arg::isDef(role))
                            defs.set(tmp.fpr());
                        if (Arg::isAnyUse(role))
                            uses.set(tmp.fpr());
                    }
                });
                // We only care about values we define but do not use. Otherwise we have to wait
                // for the value to be resolved anyway.
                defs.exclude(uses);

                defs.forEach([&] (Reg reg) {
                    if (localDistance.distance[MacroAssembler::fpRegisterIndex(reg.fpr())] < minimumSafeDistance)
                        insertionSet.insert(i, MoveZeroToDouble, inst.origin, Tmp(reg));
                });
            }

            updateDistances(inst, localDistance, distanceToBlockEnd);
        }
        insertionSet.execute(block);
    }
}
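// The pass above relies on an FPDefDistance helper that is not shown here. The following is a
// minimal sketch of what it could look like, inferred purely from how the pass uses it: per FP
// register it tracks how many instructions have executed since the register was last defined,
// saturating at minimumSafeDistance. MacroAssembler::numberOfFPRegisters() is assumed to exist
// (and to be constexpr) as the counterpart of MacroAssembler::fpRegisterIndex(); the real struct
// may differ in its internal encoding.
struct FPDefDistance {
    FPDefDistance()
    {
        // Every register starts out "safe": no recent definition is known.
        for (unsigned i = 0; i < MacroAssembler::numberOfFPRegisters(); ++i)
            distance[i] = minimumSafeDistance;
    }

    // Merge a predecessor's distances into this block's entry state, adding constantOffset (the
    // predecessor's instruction count) and clamping at minimumSafeDistance. Returns true if any
    // distance got smaller, which is what drives the dirty-set iteration above.
    bool updateFromPrecessor(const FPDefDistance& precessorDistance, unsigned constantOffset = 0)
    {
        bool changed = false;
        for (unsigned i = 0; i < MacroAssembler::numberOfFPRegisters(); ++i) {
            unsigned newDistance = std::min(precessorDistance.distance[i] + constantOffset, minimumSafeDistance);
            if (newDistance < distance[i]) {
                distance[i] = newDistance;
                changed = true;
            }
        }
        return changed;
    }

    unsigned distance[MacroAssembler::numberOfFPRegisters()];
};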
CollectionChangeBuilder CollectionChangeBuilder::calculate(std::vector<size_t> const& prev_rows,
                                                           std::vector<size_t> const& next_rows,
                                                           std::function<bool (size_t)> row_did_change,
                                                           util::Optional<IndexSet> const& move_candidates)
{
    REALM_ASSERT_DEBUG(!move_candidates || std::is_sorted(begin(next_rows), end(next_rows)));

    CollectionChangeBuilder ret;

    size_t deleted = 0;
    std::vector<RowInfo> old_rows;
    old_rows.reserve(prev_rows.size());
    for (size_t i = 0; i < prev_rows.size(); ++i) {
        if (prev_rows[i] == IndexSet::npos) {
            ++deleted;
            ret.deletions.add(i);
        }
        else
            old_rows.push_back({prev_rows[i], IndexSet::npos, i, i - deleted});
    }
    std::sort(begin(old_rows), end(old_rows),
              [](auto& lft, auto& rgt) { return lft.row_index < rgt.row_index; });

    std::vector<RowInfo> new_rows;
    new_rows.reserve(next_rows.size());
    for (size_t i = 0; i < next_rows.size(); ++i) {
        new_rows.push_back({next_rows[i], IndexSet::npos, i, 0});
    }
    std::sort(begin(new_rows), end(new_rows),
              [](auto& lft, auto& rgt) { return lft.row_index < rgt.row_index; });

    // Don't add rows which were modified to not match the query to `deletions`
    // immediately because the unsorted move logic needs to be able to
    // distinguish them from rows which were outright deleted
    IndexSet removed;

    // Now that our old and new sets of rows are sorted by row index, we can
    // iterate over them and either record old+new TV indices for rows present
    // in both, or mark them as inserted/deleted if they appear only in one
    size_t i = 0, j = 0;
    while (i < old_rows.size() && j < new_rows.size()) {
        auto old_index = old_rows[i];
        auto new_index = new_rows[j];
        if (old_index.row_index == new_index.row_index) {
            new_rows[j].prev_tv_index = old_rows[i].tv_index;
            new_rows[j].shifted_tv_index = old_rows[i].shifted_tv_index;
            ++i;
            ++j;
        }
        else if (old_index.row_index < new_index.row_index) {
            removed.add(old_index.tv_index);
            ++i;
        }
        else {
            ret.insertions.add(new_index.tv_index);
            ++j;
        }
    }

    for (; i < old_rows.size(); ++i)
        removed.add(old_rows[i].tv_index);
    for (; j < new_rows.size(); ++j)
        ret.insertions.add(new_rows[j].tv_index);

    // Filter out the new insertions since we don't need them for any of the
    // further calculations
    new_rows.erase(std::remove_if(begin(new_rows), end(new_rows),
                                  [](auto& row) { return row.prev_tv_index == IndexSet::npos; }),
                   end(new_rows));
    std::sort(begin(new_rows), end(new_rows),
              [](auto& lft, auto& rgt) { return lft.tv_index < rgt.tv_index; });

    for (auto& row : new_rows) {
        if (row_did_change(row.row_index)) {
            ret.modifications.add(row.tv_index);
        }
    }

    if (move_candidates) {
        calculate_moves_unsorted(new_rows, removed, *move_candidates, ret);
    }
    else {
        calculate_moves_sorted(new_rows, ret);
    }
    ret.deletions.add(removed);
    ret.verify();

#ifdef REALM_DEBUG
    { // Verify that applying the calculated change to prev_rows actually produces next_rows
        auto rows = prev_rows;
        auto it = util::make_reverse_iterator(ret.deletions.end());
        auto end = util::make_reverse_iterator(ret.deletions.begin());
        for (; it != end; ++it) {
            rows.erase(rows.begin() + it->first, rows.begin() + it->second);
        }

        for (auto i : ret.insertions.as_indexes()) {
            rows.insert(rows.begin() + i, next_rows[i]);
        }

        REALM_ASSERT(rows == next_rows);
    }
#endif

    return ret;
}
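// A minimal usage sketch for calculate() with hypothetical row numbers. prev_rows maps each old
// table-view index to its row index (IndexSet::npos for rows that were deleted outright), and
// next_rows lists the new row indices in table-view order. util::none is assumed to be available
// for the optional move_candidates argument; passing it selects the sorted move logic.
static void exampleCollectionChangeCalculation()
{
    std::vector<size_t> prev_rows = {10, 20, 30};
    std::vector<size_t> next_rows = {10, 30, 40};

    CollectionChangeBuilder changes = CollectionChangeBuilder::calculate(
        prev_rows, next_rows,
        [](size_t) { return false; }, // pretend no row contents changed
        util::none);

    // Row 20 left the collection at table-view index 1 and row 40 entered at index 2, so:
    //   changes.deletions    == {1}
    //   changes.insertions   == {2}
    //   changes.modifications is empty because row_did_change always returned false.
}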
bool eliminateDeadCode(Code& code)
{
    PhaseScope phaseScope(code, "eliminateDeadCode");

    HashSet<Tmp> liveTmps;
    IndexSet<StackSlot> liveStackSlots;
    bool changed;

    auto isArgLive = [&] (const Arg& arg) -> bool {
        switch (arg.kind()) {
        case Arg::Tmp:
            if (arg.isReg())
                return true;
            return liveTmps.contains(arg.tmp());
        case Arg::Stack:
            if (arg.stackSlot()->isLocked())
                return true;
            return liveStackSlots.contains(arg.stackSlot());
        default:
            return true;
        }
    };

    auto addLiveArg = [&] (const Arg& arg) -> bool {
        switch (arg.kind()) {
        case Arg::Tmp:
            if (arg.isReg())
                return false;
            return liveTmps.add(arg.tmp()).isNewEntry;
        case Arg::Stack:
            if (arg.stackSlot()->isLocked())
                return false;
            return liveStackSlots.add(arg.stackSlot());
        default:
            return false;
        }
    };

    auto isInstLive = [&] (Inst& inst) -> bool {
        if (inst.hasNonArgEffects())
            return true;

        // This instruction should be presumed dead if its Args are all dead.
        bool storesToLive = false;
        inst.forEachArg(
            [&] (Arg& arg, Arg::Role role, Arg::Type, Arg::Width) {
                if (!Arg::isDef(role))
                    return;
                storesToLive |= isArgLive(arg);
            });
        return storesToLive;
    };

    auto handleInst = [&] (Inst& inst) {
        if (!isInstLive(inst))
            return;

        // We get here if the Inst is live. For simplicity we say that a live instruction forces
        // liveness upon everything it mentions.
        for (Arg& arg : inst.args) {
            changed |= addLiveArg(arg);
            arg.forEachTmpFast(
                [&] (Tmp& tmp) {
                    changed |= addLiveArg(tmp);
                });
        }
    };

    auto runForward = [&] () -> bool {
        changed = false;
        for (BasicBlock* block : code) {
            for (Inst& inst : *block)
                handleInst(inst);
        }
        return changed;
    };

    auto runBackward = [&] () -> bool {
        changed = false;
        for (unsigned blockIndex = code.size(); blockIndex--;) {
            BasicBlock* block = code[blockIndex];
            for (unsigned instIndex = block->size(); instIndex--;)
                handleInst(block->at(instIndex));
        }
        return changed;
    };

    for (;;) {
        // Propagating backward is most likely to be profitable.
        if (!runBackward())
            break;
        if (!runBackward())
            break;

        // Occasionally propagating forward greatly reduces the likelihood of pathologies.
        if (!runForward())
            break;
    }

    unsigned removedInstCount = 0;
    for (BasicBlock* block : code) {
        removedInstCount += block->insts().removeAllMatching(
            [&] (Inst& inst) -> bool {
                return !isInstLive(inst);
            });
    }

    return !!removedInstCount;
}
void allocateStack(Code& code)
{
    PhaseScope phaseScope(code, "allocateStack");

    // Perform an escape analysis over stack slots. An escaping stack slot is one that is locked or
    // is explicitly escaped in the code.
    IndexSet<StackSlot> escapingStackSlots;
    for (StackSlot* slot : code.stackSlots()) {
        if (slot->isLocked())
            escapingStackSlots.add(slot);
    }
    for (BasicBlock* block : code) {
        for (Inst& inst : *block) {
            inst.forEachArg(
                [&] (Arg& arg, Arg::Role role, Arg::Type, Arg::Width) {
                    if (role == Arg::UseAddr && arg.isStack())
                        escapingStackSlots.add(arg.stackSlot());
                });
        }
    }

    // Allocate all of the escaped slots in order. This is kind of a crazy algorithm to allow for
    // the possibility of stack slots being assigned frame offsets before we even get here.
    ASSERT(!code.frameSize());
    Vector<StackSlot*> assignedEscapedStackSlots;
    Vector<StackSlot*> escapedStackSlotsWorklist;
    for (StackSlot* slot : code.stackSlots()) {
        if (escapingStackSlots.contains(slot)) {
            if (slot->offsetFromFP())
                assignedEscapedStackSlots.append(slot);
            else
                escapedStackSlotsWorklist.append(slot);
        } else {
            // It would be super strange to have an unlocked stack slot that has an offset already.
            ASSERT(!slot->offsetFromFP());
        }
    }
    // This is a fairly expensive loop, but it's OK because we'll usually only have a handful of
    // escaped stack slots.
    while (!escapedStackSlotsWorklist.isEmpty()) {
        StackSlot* slot = escapedStackSlotsWorklist.takeLast();
        assign(slot, assignedEscapedStackSlots);
        assignedEscapedStackSlots.append(slot);
    }

    // Now we handle the anonymous slots.
    StackSlotLiveness liveness(code);
    IndexMap<StackSlot, HashSet<StackSlot*>> interference(code.stackSlots().size());
    Vector<StackSlot*> slots;

    for (BasicBlock* block : code) {
        StackSlotLiveness::LocalCalc localCalc(liveness, block);

        auto interfere = [&] (Inst& inst) {
            if (verbose)
                dataLog("Interfering: ", WTF::pointerListDump(localCalc.live()), "\n");

            inst.forEachArg(
                [&] (Arg& arg, Arg::Role role, Arg::Type, Arg::Width) {
                    if (!Arg::isDef(role))
                        return;
                    if (!arg.isStack())
                        return;
                    StackSlot* slot = arg.stackSlot();
                    if (slot->kind() != StackSlotKind::Anonymous)
                        return;

                    for (StackSlot* otherSlot : localCalc.live()) {
                        interference[slot].add(otherSlot);
                        interference[otherSlot].add(slot);
                    }
                });
        };

        for (unsigned instIndex = block->size(); instIndex--;) {
            if (verbose)
                dataLog("Analyzing: ", block->at(instIndex), "\n");
            Inst& inst = block->at(instIndex);
            interfere(inst);
            localCalc.execute(instIndex);
        }
        Inst nop;
        interfere(nop);
    }

    if (verbose) {
        for (StackSlot* slot : code.stackSlots())
            dataLog("Interference of ", pointerDump(slot), ": ", pointerListDump(interference[slot]), "\n");
    }

    // Now we assign stack locations. At its heart this algorithm is just first-fit. For each
    // StackSlot we just want to find the offsetFromFP that is closest to zero while ensuring no
    // overlap with any other StackSlot that it interferes with.
    Vector<StackSlot*> otherSlots = assignedEscapedStackSlots;
    for (StackSlot* slot : code.stackSlots()) {
        if (slot->offsetFromFP()) {
            // Already assigned an offset.
            continue;
        }

        HashSet<StackSlot*>& interferingSlots = interference[slot];
        otherSlots.resize(assignedEscapedStackSlots.size());
        otherSlots.resize(assignedEscapedStackSlots.size() + interferingSlots.size());
        unsigned nextIndex = assignedEscapedStackSlots.size();
        for (StackSlot* otherSlot : interferingSlots)
            otherSlots[nextIndex++] = otherSlot;

        assign(slot, otherSlots);
    }

    // Figure out how much stack we're using for stack slots.
    unsigned frameSizeForStackSlots = 0;
    for (StackSlot* slot : code.stackSlots()) {
        frameSizeForStackSlots = std::max(
            frameSizeForStackSlots,
            static_cast<unsigned>(-slot->offsetFromFP()));
    }
    frameSizeForStackSlots = WTF::roundUpToMultipleOf(stackAlignmentBytes(), frameSizeForStackSlots);

    // Now we need to deduce how much argument area we need.
    for (BasicBlock* block : code) {
        for (Inst& inst : *block) {
            for (Arg& arg : inst.args) {
                if (arg.isCallArg()) {
                    // For now, we assume that we use 8 bytes of the call arg. But that's not
                    // such an awesome assumption.
                    // FIXME: https://bugs.webkit.org/show_bug.cgi?id=150454
                    ASSERT(arg.offset() >= 0);
                    code.requestCallArgAreaSize(arg.offset() + 8);
                }
            }
        }
    }

    code.setFrameSize(frameSizeForStackSlots + code.callArgAreaSize());

    // Finally, transform the code to use Addr's instead of StackSlot's. This is a lossless
    // transformation since we can search the StackSlots array to figure out which StackSlot any
    // offset-from-FP refers to.
    for (BasicBlock* block : code) {
        for (Inst& inst : *block) {
            for (Arg& arg : inst.args) {
                switch (arg.kind()) {
                case Arg::Stack:
                    arg = Arg::addr(
                        Tmp(GPRInfo::callFrameRegister),
                        arg.offset() + arg.stackSlot()->offsetFromFP());
                    break;
                case Arg::CallArg:
                    arg = Arg::addr(
                        Tmp(GPRInfo::callFrameRegister),
                        arg.offset() - code.frameSize());
                    break;
                default:
                    break;
                }
            }
        }
    }
}
void allocateStack(Code& code)
{
    PhaseScope phaseScope(code, "allocateStack");

    // Perform an escape analysis over stack slots. An escaping stack slot is one that is locked or
    // is explicitly escaped in the code.
    IndexSet<StackSlot> escapingStackSlots;
    for (StackSlot* slot : code.stackSlots()) {
        if (slot->isLocked())
            escapingStackSlots.add(slot);
    }
    for (BasicBlock* block : code) {
        for (Inst& inst : *block) {
            inst.forEachArg(
                [&] (Arg& arg, Arg::Role role, Arg::Type, Arg::Width) {
                    if (role == Arg::UseAddr && arg.isStack())
                        escapingStackSlots.add(arg.stackSlot());
                });
        }
    }

    // Allocate all of the escaped slots in order. This is kind of a crazy algorithm to allow for
    // the possibility of stack slots being assigned frame offsets before we even get here.
    ASSERT(!code.frameSize());
    Vector<StackSlot*> assignedEscapedStackSlots;
    Vector<StackSlot*> escapedStackSlotsWorklist;
    for (StackSlot* slot : code.stackSlots()) {
        if (escapingStackSlots.contains(slot)) {
            if (slot->offsetFromFP())
                assignedEscapedStackSlots.append(slot);
            else
                escapedStackSlotsWorklist.append(slot);
        } else {
            // It would be super strange to have an unlocked stack slot that has an offset already.
            ASSERT(!slot->offsetFromFP());
        }
    }
    // This is a fairly expensive loop, but it's OK because we'll usually only have a handful of
    // escaped stack slots.
    while (!escapedStackSlotsWorklist.isEmpty()) {
        StackSlot* slot = escapedStackSlotsWorklist.takeLast();
        assign(slot, assignedEscapedStackSlots);
        assignedEscapedStackSlots.append(slot);
    }

    // Now we handle the anonymous slots.
    StackSlotLiveness liveness(code);
    IndexMap<StackSlot, HashSet<StackSlot*>> interference(code.stackSlots().size());
    Vector<StackSlot*> slots;

    for (BasicBlock* block : code) {
        StackSlotLiveness::LocalCalc localCalc(liveness, block);

        auto interfere = [&] (unsigned instIndex) {
            if (verbose)
                dataLog("Interfering: ", WTF::pointerListDump(localCalc.live()), "\n");

            Inst::forEachDef<Arg>(
                block->get(instIndex), block->get(instIndex + 1),
                [&] (Arg& arg, Arg::Role, Arg::Type, Arg::Width) {
                    if (!arg.isStack())
                        return;
                    StackSlot* slot = arg.stackSlot();
                    if (slot->kind() != StackSlotKind::Anonymous)
                        return;

                    for (StackSlot* otherSlot : localCalc.live()) {
                        interference[slot].add(otherSlot);
                        interference[otherSlot].add(slot);
                    }
                });
        };

        for (unsigned instIndex = block->size(); instIndex--;) {
            if (verbose)
                dataLog("Analyzing: ", block->at(instIndex), "\n");

            // Kill dead stores. For simplicity we say that a store is killable if it has only late
            // defs and those late defs are to things that are dead right now. We only do that
            // because that's the only kind of dead stack store we will see here.
            Inst& inst = block->at(instIndex);
            if (!inst.hasNonArgEffects()) {
                bool ok = true;
                inst.forEachArg(
                    [&] (Arg& arg, Arg::Role role, Arg::Type, Arg::Width) {
                        if (Arg::isEarlyDef(role)) {
                            ok = false;
                            return;
                        }
                        if (!Arg::isLateDef(role))
                            return;
                        if (!arg.isStack()) {
                            ok = false;
                            return;
                        }
                        StackSlot* slot = arg.stackSlot();
                        if (slot->kind() != StackSlotKind::Anonymous) {
                            ok = false;
                            return;
                        }

                        if (localCalc.isLive(slot)) {
                            ok = false;
                            return;
                        }
                    });
                if (ok)
                    inst = Inst();
            }

            interfere(instIndex);
            localCalc.execute(instIndex);
        }
        interfere(-1);

        block->insts().removeAllMatching(
            [&] (const Inst& inst) -> bool {
                return !inst;
            });
    }

    if (verbose) {
        for (StackSlot* slot : code.stackSlots())
            dataLog("Interference of ", pointerDump(slot), ": ", pointerListDump(interference[slot]), "\n");
    }

    // Now we assign stack locations. At its heart this algorithm is just first-fit.
    // For each StackSlot we just want to find the offsetFromFP that is closest to zero while
    // ensuring no overlap with any other StackSlot that it interferes with.
    Vector<StackSlot*> otherSlots = assignedEscapedStackSlots;
    for (StackSlot* slot : code.stackSlots()) {
        if (slot->offsetFromFP()) {
            // Already assigned an offset.
            continue;
        }

        HashSet<StackSlot*>& interferingSlots = interference[slot];
        otherSlots.resize(assignedEscapedStackSlots.size());
        otherSlots.resize(assignedEscapedStackSlots.size() + interferingSlots.size());
        unsigned nextIndex = assignedEscapedStackSlots.size();
        for (StackSlot* otherSlot : interferingSlots)
            otherSlots[nextIndex++] = otherSlot;

        assign(slot, otherSlots);
    }

    // Figure out how much stack we're using for stack slots.
    unsigned frameSizeForStackSlots = 0;
    for (StackSlot* slot : code.stackSlots()) {
        frameSizeForStackSlots = std::max(
            frameSizeForStackSlots,
            static_cast<unsigned>(-slot->offsetFromFP()));
    }
    frameSizeForStackSlots = WTF::roundUpToMultipleOf(stackAlignmentBytes(), frameSizeForStackSlots);

    // Now we need to deduce how much argument area we need.
    for (BasicBlock* block : code) {
        for (Inst& inst : *block) {
            for (Arg& arg : inst.args) {
                if (arg.isCallArg()) {
                    // For now, we assume that we use 8 bytes of the call arg. But that's not
                    // such an awesome assumption.
                    // FIXME: https://bugs.webkit.org/show_bug.cgi?id=150454
                    ASSERT(arg.offset() >= 0);
                    code.requestCallArgAreaSize(arg.offset() + 8);
                }
            }
        }
    }

    code.setFrameSize(frameSizeForStackSlots + code.callArgAreaSize());

    // Finally, transform the code to use Addr's instead of StackSlot's. This is a lossless
    // transformation since we can search the StackSlots array to figure out which StackSlot any
    // offset-from-FP refers to.
    // FIXME: This may produce addresses that aren't valid if we end up with a ginormous stack frame.
    // We would have to scavenge for temporaries if this happened. Fortunately, this case will be
    // extremely rare so we can do crazy things when it arises.
    // https://bugs.webkit.org/show_bug.cgi?id=152530
    InsertionSet insertionSet(code);
    for (BasicBlock* block : code) {
        for (unsigned instIndex = 0; instIndex < block->size(); ++instIndex) {
            Inst& inst = block->at(instIndex);
            inst.forEachArg(
                [&] (Arg& arg, Arg::Role role, Arg::Type, Arg::Width width) {
                    auto stackAddr = [&] (int32_t offset) -> Arg {
                        return Arg::stackAddr(offset, code.frameSize(), width);
                    };

                    switch (arg.kind()) {
                    case Arg::Stack: {
                        StackSlot* slot = arg.stackSlot();
                        if (Arg::isZDef(role)
                            && slot->kind() == StackSlotKind::Anonymous
                            && slot->byteSize() > Arg::bytes(width)) {
                            // Currently we only handle this simple case because it's the only one
                            // that arises: ZDef's are only 32-bit right now. So, when we hit these
                            // assertions it means that we need to implement those other kinds of
                            // zero fills.
                            RELEASE_ASSERT(slot->byteSize() == 8);
                            RELEASE_ASSERT(width == Arg::Width32);

                            RELEASE_ASSERT(isValidForm(StoreZero32, Arg::Stack));
                            insertionSet.insert(
                                instIndex + 1, StoreZero32, inst.origin,
                                stackAddr(arg.offset() + 4 + slot->offsetFromFP()));
                        }
                        arg = stackAddr(arg.offset() + slot->offsetFromFP());
                        break;
                    }
                    case Arg::CallArg:
                        arg = stackAddr(arg.offset() - code.frameSize());
                        break;
                    default:
                        break;
                    }
                });
        }
        insertionSet.execute(block);
    }
}
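// Both versions of allocateStack() above lean on an assign() helper that is not shown. The sketch
// below illustrates the first-fit idea it implements, inferred from the call sites: try the offset
// closest to zero, then try packing the slot just below each already-placed slot it interferes
// with. StackSlot::setOffsetFromFP() is an assumed setter, and a real implementation would also
// have to respect the slot's alignment; treat this as an illustration, not the actual phase code.
static bool attemptAssignment(StackSlot* slot, intptr_t offsetFromFP, const Vector<StackSlot*>& otherSlots)
{
    // Slots grow down from the frame pointer, so candidate offsets are negative.
    intptr_t begin = offsetFromFP;
    intptr_t end = begin + static_cast<intptr_t>(slot->byteSize());

    for (StackSlot* otherSlot : otherSlots) {
        if (!otherSlot->offsetFromFP())
            continue;
        intptr_t otherBegin = otherSlot->offsetFromFP();
        intptr_t otherEnd = otherBegin + static_cast<intptr_t>(otherSlot->byteSize());
        if (begin < otherEnd && otherBegin < end)
            return false; // Overlaps an already-placed slot.
    }

    slot->setOffsetFromFP(offsetFromFP);
    return true;
}

static void assign(StackSlot* slot, const Vector<StackSlot*>& otherSlots)
{
    // First-fit over a small set of candidates. The candidate just below the lowest already-placed
    // slot can never overlap anything, so this always terminates with an assignment.
    if (attemptAssignment(slot, -static_cast<intptr_t>(slot->byteSize()), otherSlots))
        return;

    for (StackSlot* otherSlot : otherSlots) {
        if (!otherSlot->offsetFromFP())
            continue;
        if (attemptAssignment(slot, otherSlot->offsetFromFP() - static_cast<intptr_t>(slot->byteSize()), otherSlots))
            return;
    }

    RELEASE_ASSERT_NOT_REACHED();
}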