void StoreVisitor::visitStoreInst(StoreInst &I) {
#ifdef MUT_DEBUG
    errs() << "[DEBUG] Found Store Inst\n";
#endif

    if (!onlyAtomic || I.isAtomic())
        storeInsts.push_back(&I);
}
/// tryAggregating - When scanning forward over instructions, we look for
/// other loads or stores that could be aggregated with this one.
/// Returns the last instruction added (if one was added) since we might have
/// removed some loads or stores and that might invalidate an iterator.
Instruction *AggregateGlobalOpsOpt::tryAggregating(Instruction *StartInst, Value *StartPtr,
    bool DebugThis) {
  if (TD == 0) return 0;

  Module* M = StartInst->getParent()->getParent()->getParent();
  LLVMContext& Context = StartInst->getContext();

  Type* int8Ty = Type::getInt8Ty(Context);
  Type* sizeTy = Type::getInt64Ty(Context);
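  // i8 pointer in the global address space; the global side of the emitted
  // memcpy is cast to this type.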
  Type* globalInt8PtrTy = int8Ty->getPointerTo(globalSpace);
  bool isLoad = isa<LoadInst>(StartInst);
  bool isStore = isa<StoreInst>(StartInst);
  Instruction *lastAddedInsn = NULL;
  Instruction *LastLoadOrStore = NULL;
 
  SmallVector<Instruction*, 8> toRemove;

  // Okay, so we now have a single global load/store. Scan forward to find
  // all subsequent loads/stores at constant offsets from the same pointer.
  // Join these together into ranges, so we can decide whether contiguous blocks
  // are accessed.
  MemOpRanges Ranges(*TD);
 
  // Put the first load/store in since we want to preserve the order.
  Ranges.addInst(0, StartInst);

  BasicBlock::iterator BI = StartInst;
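  // Walk forward from the instruction after StartInst to the block terminator,
  // collecting loads/stores at constant offsets from StartPtr and stopping at
  // anything that may write memory (or read it, when aggregating stores).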
  for (++BI; !isa<TerminatorInst>(BI); ++BI) {

    if( isGlobalLoadOrStore(BI, globalSpace, isLoad, isStore) ) {
      // OK!
    } else {
      // If the instruction is readnone, ignore it, otherwise bail out.  We
      // don't even allow readonly here because we don't want something like:
      // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
      if (BI->mayWriteToMemory())
        break;
      if (isStore && BI->mayReadFromMemory())
        break;
      continue;
    }

    if ( isStore && isa<StoreInst>(BI) ) {
      StoreInst *NextStore = cast<StoreInst>(BI);
      // If this is a store, see if we can merge it in.
      if (!NextStore->isSimple()) break;

      // Check to see if this store is to a constant offset from the start ptr.
      int64_t Offset;
      if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, *TD))
        break;

      Ranges.addStore(Offset, NextStore);
      LastLoadOrStore = NextStore;
    } else {
      LoadInst *NextLoad = cast<LoadInst>(BI);
      if (!NextLoad->isSimple()) break;

      // Check to see if this load is to a constant offset from the start ptr.
      int64_t Offset;
      if (!IsPointerOffset(StartPtr, NextLoad->getPointerOperand(), Offset, *TD))
        break;

      Ranges.addLoad(Offset, NextLoad);
      LastLoadOrStore = NextLoad;
    }
  }

  // If we have no ranges, then we just had a single store with nothing that
  // could be merged in.  This is a very common case of course.
  if (!Ranges.moreThanOneOp())
    return 0;

  // Divide the instructions between StartInst and LastLoadOrStore into
  // addressing, memops, and uses of memops (uses of loads)
  reorderAddressingMemopsUses(StartInst, LastLoadOrStore, DebugThis);

  Instruction* insertBefore = StartInst;
  IRBuilder<> builder(insertBefore);

  // Now that we have full information about ranges, loop over the ranges and
  // emit memcpy's for anything big enough to be worthwhile.
  for (MemOpRanges::const_iterator I = Ranges.begin(), E = Ranges.end();
       I != E; ++I) {
    const MemOpRange &Range = *I;
    Value* oldBaseI = NULL;
    Value* newBaseI = NULL;

    if (Range.TheStores.size() == 1) continue; // Don't bother if there's only one thing...

    builder.SetInsertPoint(insertBefore);

    // Otherwise, we do want to transform this!  Create a new memcpy.
    // Get the starting pointer of the block.
    StartPtr = Range.StartPtr;

    if( DebugThis ) {
      errs() << "base is:";
      StartPtr->dump();
    }

    // Determine alignment
    unsigned Alignment = Range.Alignment;
    if (Alignment == 0) {
      Type *EltType =
        cast<PointerType>(StartPtr->getType())->getElementType();
      Alignment = TD->getABITypeAlignment(EltType);
    }

    Instruction *alloc = NULL;
    Value *globalPtr = NULL;

    // Create temporary alloca space to copy to/from.
    alloc = makeAlloca(int8Ty, "agg.tmp", insertBefore,
                       Range.End-Range.Start, Alignment);
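    // agg.tmp is Range.End-Range.Start bytes of i8, enough to stage the whole
    // contiguous range for a single memcpy.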

    // Generate the old and new base pointers before we output
    // anything else.
    {
      Type* iPtrTy = TD->getIntPtrType(alloc->getType());
      Type* iNewBaseTy = TD->getIntPtrType(alloc->getType());
      oldBaseI = builder.CreatePtrToInt(StartPtr, iPtrTy, "agg.tmp.oldb.i");
      newBaseI = builder.CreatePtrToInt(alloc, iNewBaseTy, "agg.tmp.newb.i");
    }

    // If storing, do the stores we had into our alloca'd region.
    if( isStore ) {
      for (SmallVector<Instruction*, 16>::const_iterator
           SI = Range.TheStores.begin(),
           SE = Range.TheStores.end(); SI != SE; ++SI) {
        StoreInst* oldStore = cast<StoreInst>(*SI);

        if( DebugThis ) {
          errs() << "have store in range:";
          oldStore->dump();
        }

        Value* ptrToAlloc = rebasePointer(oldStore->getPointerOperand(),
                                          StartPtr, alloc, "agg.tmp",
                                          &builder, *TD, oldBaseI, newBaseI);
        // Old store must not be volatile or atomic, or we shouldn't have put
        // it in ranges.
        assert(!(oldStore->isVolatile() || oldStore->isAtomic()));
        StoreInst* newStore =
          builder.CreateStore(oldStore->getValueOperand(), ptrToAlloc);
        newStore->setAlignment(oldStore->getAlignment());
        newStore->takeName(oldStore);
      }
    }

    // Cast the pointer that was loaded/stored through to an i8 pointer if necessary.
    if( StartPtr->getType()->getPointerElementType() == int8Ty ) {
      globalPtr = StartPtr;
    } else {
      globalPtr = builder.CreatePointerCast(StartPtr, globalInt8PtrTy, "agg.cast");
    }

    // Get a Constant* for the length.
    Constant* len = ConstantInt::get(sizeTy, Range.End-Range.Start, false);

    // Now add the memcpy instruction
    unsigned addrSpaceDst,addrSpaceSrc;
    addrSpaceDst = addrSpaceSrc = 0;
    if( isStore ) addrSpaceDst = globalSpace;
    if( isLoad ) addrSpaceSrc = globalSpace;

    Type *types[3];
    types[0] = PointerType::get(int8Ty, addrSpaceDst);
    types[1] = PointerType::get(int8Ty, addrSpaceSrc);
    types[2] = sizeTy;
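    // The memcpy intrinsic is overloaded on the destination pointer, source
    // pointer, and length types; these pick the variant with the right address
    // spaces for a put (store) or get (load).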

    Function *func = Intrinsic::getDeclaration(M, Intrinsic::memcpy, types);

    Value* args[5]; // dst src len alignment isvolatile
    if( isStore ) {
      // it's a store (ie put)
      args[0] = globalPtr;
      args[1] = alloc;
    } else {
      // it's a load (ie get)
      args[0] = alloc;
      args[1] = globalPtr;
    }
    args[2] = len;
    // alignment
    args[3] = ConstantInt::get(Type::getInt32Ty(Context), 0, false);
    // isvolatile
    args[4] = ConstantInt::get(Type::getInt1Ty(Context), 0, false);

    Instruction* aMemCpy = builder.CreateCall(func, args);

    /*
    DEBUG(dbgs() << "Replace ops:\n";
      for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i)
        dbgs() << *Range.TheStores[i] << '\n';
      dbgs() << "With: " << *aMemCpy << '\n');
    */

    if (!Range.TheStores.empty())
      aMemCpy->setDebugLoc(Range.TheStores[0]->getDebugLoc());

    lastAddedInsn = aMemCpy;

    // If loading, load from the memcpy'd region
    if( isLoad ) {
      for (SmallVector<Instruction*, 16>::const_iterator
           SI = Range.TheStores.begin(),
           SE = Range.TheStores.end(); SI != SE; ++SI) {
        LoadInst* oldLoad = cast<LoadInst>(*SI);
        if( DebugThis ) {
          errs() << "have load in range:";
          oldLoad->dump();
        }

        Value* ptrToAlloc = rebasePointer(oldLoad->getPointerOperand(),
                                          StartPtr, alloc, "agg.tmp",
                                          &builder, *TD, oldBaseI, newBaseI);
        // Old load must not be volatile or atomic, or we shouldn't have put
        // it in ranges.
        assert(!(oldLoad->isVolatile() || oldLoad->isAtomic()));
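        // Re-issue the load from the staged alloca copy and forward its value
        // to every user of the original load.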
        LoadInst* newLoad = builder.CreateLoad(ptrToAlloc);
        newLoad->setAlignment(oldLoad->getAlignment());
        oldLoad->replaceAllUsesWith(newLoad);
        newLoad->takeName(oldLoad);
        lastAddedInsn = newLoad;
      }
    }

    // Save old loads/stores for removal
    for (SmallVector<Instruction*, 16>::const_iterator
         SI = Range.TheStores.begin(),
         SE = Range.TheStores.end(); SI != SE; ++SI) {
      Instruction* insn = *SI;
      toRemove.push_back(insn);
    }
  }

  // Zap all the old loads/stores
  for (SmallVector<Instruction*, 8>::const_iterator
       SI = toRemove.begin(),
       SE = toRemove.end(); SI != SE; ++SI) {
    (*SI)->eraseFromParent();
  }

  return lastAddedInsn;
}