/// Fill the first \p Stride entries of \p TransposedMatrix with shuffles built
/// from the first \p Stride entries of \p Vec, using \p VPShuf as the base
/// mask.
///
/// - VecElems == 16: each input is shuffled against an undef vector of its own
///   type with \p VPShuf directly.
/// - otherwise: inputs are combined pairwise with a mask produced by
///   genShuffleBland (defined outside this block; presumably it expands
///   \p VPShuf using the two lane offsets passed to it -- confirm against its
///   definition). For VecElems == 32 these pairwise results are the output;
///   for any other size, adjacent pairwise results are joined with the
///   file-level Concat mask.
///
/// The intermediate buffer holds 8 values, so (VecElems / 16) * Stride must
/// not exceed 16; this is not checked here.
static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
                             ArrayRef<Value *> Vec, ArrayRef<uint32_t> VPShuf,
                             unsigned VecElems, unsigned Stride,
                             IRBuilder<> Builder) {
  // Narrow case: a single shuffle per input, second operand is undef.
  if (VecElems == 16) {
    for (unsigned Idx = 0; Idx != Stride; ++Idx) {
      Value *Src = Vec[Idx];
      TransposedMatrix[Idx] = Builder.CreateShuffleVector(
          Src, UndefValue::get(Src->getType()), VPShuf);
    }
    return;
  }

  Value *Temp[8];
  const unsigned NumSteps = (VecElems / 16) * Stride;

  // Walk the steps two at a time; each pair yields one intermediate shuffle.
  for (unsigned Pair = 0; 2 * Pair < NumSteps; ++Pair) {
    const unsigned First = 2 * Pair;
    const unsigned Second = First + 1;

    // Offsets advance by 16 each time the step index wraps past Stride.
    const unsigned LowOffset = (First / Stride) * 16;
    const unsigned HighOffset = Second / Stride * 16;

    // A fresh (empty) mask buffer for every pair.
    SmallVector<uint32_t, 32> OptimizeShuf;
    genShuffleBland(VT, VPShuf, OptimizeShuf, LowOffset, HighOffset);

    Value *LHS = Vec[First % Stride];
    Value *RHS = Vec[Second % Stride];
    Temp[Pair] = Builder.CreateShuffleVector(LHS, RHS, OptimizeShuf);
  }

  // With 32 elements the pairwise results are already the final vectors.
  if (VecElems == 32) {
    std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
    return;
  }

  // Any other size: join each adjacent pair of intermediates with Concat.
  for (unsigned Idx = 0; Idx != Stride; ++Idx) {
    Value *Lo = Temp[2 * Idx];
    Value *Hi = Temp[2 * Idx + 1];
    TransposedMatrix[Idx] = Builder.CreateShuffleVector(Lo, Hi, Concat);
  }
}
/// Populate \p Vec[0..2] from the instructions in \p InVec.
///
/// - VecElems == 16: the first three entries of \p InVec are copied through
///   unchanged.
/// - otherwise: \p InVec is consumed in groups of six; within each group,
///   entry i is shuffled with entry i + 3 using the first 32 indices of the
///   file-level Concat mask, producing three values per group that are stored
///   at Vec[3 * group + i].
/// - For sizes other than 16 and 32, a final pass shuffles Vec[i] with
///   Vec[i + 3] using the whole Concat mask and stores the result in Vec[i].
///
/// \p Vec must have room for 3 * (VecElems / 32) entries (at least 6 when the
/// final pass runs); this is the caller's responsibility.
static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
                            unsigned VecElems, IRBuilder<> Builder) {
  const unsigned NumOutputs = 3;

  // Nothing to concatenate: forward the inputs as-is.
  if (VecElems == 16) {
    for (unsigned Idx = 0; Idx != NumOutputs; ++Idx)
      Vec[Idx] = InVec[Idx];
    return;
  }

  // First level: merge entry i with entry i + 3 inside every group of six.
  const unsigned NumGroups = VecElems / 32;
  for (unsigned Group = 0; Group < NumGroups; ++Group) {
    const unsigned InBase = Group * 6;
    const unsigned OutBase = Group * 3;
    for (unsigned Idx = 0; Idx != NumOutputs; ++Idx) {
      Vec[Idx + OutBase] = Builder.CreateShuffleVector(
          InVec[InBase + Idx], InVec[InBase + Idx + 3],
          makeArrayRef(Concat, 32));
    }
  }

  // A single group already yields the final three vectors.
  if (VecElems == 32)
    return;

  // Second level: merge the two groups' results pairwise into Vec[0..2].
  for (unsigned Idx = 0; Idx != NumOutputs; ++Idx) {
    Value *Lo = Vec[Idx];
    Value *Hi = Vec[Idx + 3];
    Vec[Idx] = Builder.CreateShuffleVector(Lo, Hi, Concat);
  }
}