/// Emits IR that gathers the sign (most significant) bit of every fw-bit field of `a` into an integer mask.
///
/// Two configurations are lowered directly onto the SSE MOVMSKPS intrinsic:
///   - fw == 32 with a 128-bit block: `a` is reinterpreted as a vector of floats and passed to the intrinsic.
///   - fw == 64 with a 256-bit block: the high 32-bit half of every 64-bit field is first packed into a
///     4 x float vector, whose sign bits are then extracted with the same intrinsic.
/// Every other combination is delegated to IDISA_Builder::hsimd_signmask.
///
/// @param fw  field width in bits
/// @param a   the block value whose per-field sign bits are collected
/// @return    the value produced by the intrinsic call, or by the generic implementation
Value * IDISA_SSE_Builder::hsimd_signmask(unsigned fw, Value * a) {
    // Test the block width up front so that no unused bitcast or intrinsic declaration is emitted when
    // the fast path does not apply.
    if ((fw == 32) && (mBitBlockWidth == 128)) {
        Value * signmask_f32func = Intrinsic::getDeclaration(mMod, Intrinsic::x86_sse_movmsk_ps);
        Type * bitBlock_f32type = VectorType::get(getFloatTy(), mBitBlockWidth/32);
        Value * a_as_ps = CreateBitCast(a, bitBlock_f32type);
        return CreateCall(signmask_f32func, a_as_ps);
    }
    if ((fw == 64) && (mBitBlockWidth == 256)) {
        Type * bitBlock_f32type = VectorType::get(getFloatTy(), mBitBlockWidth/32);
        Value * a_as_ps = CreateBitCast(a, bitBlock_f32type);
        // Select the odd-numbered 32-bit lanes, i.e. the upper half of each 64-bit field.
        const unsigned fieldCount = mBitBlockWidth/fw;
        std::vector<Constant*> Idxs;
        Idxs.reserve(fieldCount);
        for (unsigned i = 0; i < fieldCount; i++) {
            Idxs.push_back(getInt32(2*i+1));
        }
        Value * packh = CreateShuffleVector(a_as_ps, UndefValue::get(bitBlock_f32type), ConstantVector::get(Idxs));
        Type * halfBlock_f32type = VectorType::get(getFloatTy(), mBitBlockWidth/64);
        Value * pack_as_ps = CreateBitCast(packh, halfBlock_f32type);
        Value * signmask_f32func = Intrinsic::getDeclaration(mMod, Intrinsic::x86_sse_movmsk_ps);
        return CreateCall(signmask_f32func, pack_as_ps);
    }
    // Otherwise use default logic.
    return IDISA_Builder::hsimd_signmask(fw, a);
}
/// Emits IR that gathers the sign (most significant) bit of every fw-bit field of `a` into an integer mask,
/// using SSE2 intrinsics where possible.
///
/// Handled cases:
///   - 128-bit block, fw == 64: MOVMSKPD on `a` reinterpreted as a vector of doubles.
///   - 128-bit block, fw == 8:  PMOVMSKB on `a` viewed as a vector of bytes.
///   - fw >= 8 and more than 4 but at most 16 fields: the most significant byte of each field is packed
///     into the low lanes of a 16-byte vector (remaining lanes are zero) and PMOVMSKB is applied.
/// Every other combination is delegated to IDISA_SSE_Builder::hsimd_signmask.
///
/// @param fw  field width in bits
/// @param a   the block value whose per-field sign bits are collected
/// @return    the value produced by the intrinsic call, or by the fallback implementation
Value * IDISA_SSE2_Builder::hsimd_signmask(unsigned fw, Value * a) {
    if (mBitBlockWidth == 128) {
        if (fw == 64) {
            Value * signmask_f64func = Intrinsic::getDeclaration(mMod, Intrinsic::x86_sse2_movmsk_pd);
            Type * bitBlock_f64type = VectorType::get(getDoubleTy(), mBitBlockWidth/64);
            Value * a_as_pd = CreateBitCast(a, bitBlock_f64type);
            return CreateCall(signmask_f64func, a_as_pd);
        }
        if (fw == 8) {
            Value * pmovmskb_func = Intrinsic::getDeclaration(mMod, Intrinsic::x86_sse2_pmovmskb_128);
            return CreateCall(pmovmskb_func, fwCast(8, a));
        }
    }
    const unsigned fieldCount = mBitBlockWidth/fw;
    // Fields narrower than a byte have no "high byte" to select (fw/8 would be 0), so they must not take
    // the byte-shuffle path below.
    if ((fw >= 8) && (fieldCount > 4) && (fieldCount <= 16)) {
        Value * pmovmskb_func = Intrinsic::getDeclaration(mMod, Intrinsic::x86_sse2_pmovmskb_128);
        const unsigned fieldBytes = fw/8;
        const unsigned hiByte = fieldBytes - 1;
        // Index of the first lane of the second shuffle operand, which is all zeroes.
        const unsigned zeroLane = mBitBlockWidth/8;
        std::vector<Constant*> Idxs;
        Idxs.reserve(16);
        // Low lanes: the most significant byte of each field.
        for (unsigned i = 0; i < fieldCount; i++) {
            Idxs.push_back(getInt32(fieldBytes*i+hiByte));
        }
        // Remaining lanes: zero bytes, so that they contribute 0 bits to the mask.
        for (unsigned i = fieldCount; i < 16; i++) {
            Idxs.push_back(getInt32(zeroLane));
        }
        Value * packh = CreateShuffleVector(fwCast(8, a), fwCast(8, allZeroes()), ConstantVector::get(Idxs));
        return CreateCall(pmovmskb_func, packh);
    }
    // Otherwise use default SSE logic.
    return IDISA_SSE_Builder::hsimd_signmask(fw, a);
}
/**
 * @brief Generates the body of the projection function for the given scan query
 *
 * This function does not copy any data itself: it emits code through the Create* helpers (presumably the
 * inherited LLVM IRBuilder interface - confirm against the class declaration). The `// ->` comments show the
 * C++ equivalent of the code being generated. The generated code reads the element at position `idx` of the
 * column map page `page` and writes the projected fields into the buffer `dest`.
 *
 * The generated code consists of up to three sections, each emitted only if the destination record needs it:
 *  1. Header: for every destination field that is not declared NOT NULL, one byte is copied from the source
 *     header column to the destination header.
 *  2. Fixed size fields: each value is loaded from its source column and stored at the field's offset in the
 *     destination.
 *  3. Variable size fields: the destination offset table is written and the heap data is copied. Source
 *     fields that directly follow each other are copied with a single memcpy.
 *
 * The generated function returns the final destination heap offset: the static size of the destination record
 * if there are no variable size fields, otherwise the end offset of the last copied variable size field.
 *
 * @param query Scan query providing the destination record layout (record()) and, through
 *              projectionBegin(), the source field index for every destination field
 */
void LLVMColumnMapProjectionBuilder::build(ScanQuery* query) {
    auto& srcRecord = mContext.record();
    auto& destRecord = query->record();

    // -> auto mainPage = reinterpret_cast<const ColumnMapMainPage*>(page);
    auto mainPage = CreateBitCast(getParam(page), mMainPageStructTy->getPointerTo());

    // -> auto count = static_cast<uint64_t>(mainPage->count);
    auto count = CreateInBoundsGEP(mainPage, { getInt64(0), getInt32(0) });
    count = CreateZExt(CreateAlignedLoad(count, 4u), getInt64Ty());

    // -> auto index = static_cast<uint64_t>(idx);
    auto index = CreateZExt(getParam(idx), getInt64Ty());

    // Section 1: copy the null bytes of all nullable destination fields
    if (destRecord.headerSize() != 0u) {
        // -> auto headerOffset = static_cast<uint64_t>(mainPage->headerOffset);
        auto headerOffset = CreateInBoundsGEP(mainPage, { getInt64(0), getInt32(1) });
        headerOffset = CreateZExt(CreateAlignedLoad(headerOffset, 4u), getInt64Ty());

        // -> auto headerData = page + headerOffset + idx;
        auto headerData = CreateAdd(headerOffset, index);
        headerData = CreateInBoundsGEP(getParam(page), headerData);

        // This section uses its own projection iterator covering all destination fields; the sections below
        // start again from the beginning of the projection
        auto i = query->projectionBegin();
        for (decltype(destRecord.fieldCount()) destFieldIdx = 0u; destFieldIdx < destRecord.fieldCount();
                ++i, ++destFieldIdx) {
            auto srcFieldIdx = *i;
            auto& srcMeta = srcRecord.getFieldMeta(srcFieldIdx);
            auto& destMeta = destRecord.getFieldMeta(destFieldIdx);
            auto& field = destMeta.field;
            // Fields declared NOT NULL have no null byte to copy
            if (field.isNotNull()) {
                continue;
            }

            // The null bytes of one field are addressed with a stride of count between fields
            // -> auto srcData = headerData + page->count * srcNullIdx
            auto srcData = headerData;
            if (srcMeta.nullIdx != 0) {
                srcData = CreateInBoundsGEP(headerData, createConstMul(count, srcMeta.nullIdx));
            }

            // -> auto nullValue = *srcData;
            auto nullValue = CreateAlignedLoad(srcData, 1u);

            // -> auto destData = dest + destNullIdx;
            auto destData = getParam(dest);
            if (destMeta.nullIdx != 0) {
                destData = CreateInBoundsGEP(destData, getInt64(destMeta.nullIdx));
            }

            // -> *destData = nullValue;
            CreateAlignedStore(nullValue, destData, 1u);
        }
    }

    // Projection iterator shared by the fixed size and the variable size section: after the fixed size loop
    // it points to the first variable size field of the projection
    auto i = query->projectionBegin();

    // Section 2: copy the fixed size fields
    if (destRecord.fixedSizeFieldCount() != 0) {
        // -> auto fixedOffset = static_cast<uint64_t>(mainPage->fixedOffset);
        auto fixedOffset = CreateInBoundsGEP(mainPage, { getInt64(0), getInt32(2) });
        fixedOffset = CreateZExt(CreateAlignedLoad(fixedOffset, 4u), getInt64Ty());

        // -> auto fixedData = page + fixedOffset;
        auto fixedData = CreateInBoundsGEP(getParam(page), fixedOffset);

        for (decltype(destRecord.fixedSizeFieldCount()) destFieldIdx = 0u;
                destFieldIdx < destRecord.fixedSizeFieldCount(); ++i, ++destFieldIdx) {
            auto srcFieldIdx = *i;
            auto& srcMeta = mContext.fixedMetaData()[srcFieldIdx];
            auto& destMeta = destRecord.getFieldMeta(destFieldIdx);
            auto& field = destMeta.field;
            LOG_ASSERT(field.isFixedSized(), "Field must be fixed size");

            auto fieldAlignment = field.alignOf();
            auto fieldPtrType = getFieldPtrTy(field.type());

            // The start of the source column is count * srcMeta.offset bytes behind fixedData, the element
            // is then selected by indexing the typed column pointer with index
            // -> auto srcData = reinterpret_cast<const T*>(fixedData + page->count * srcMeta.offset) + index;
            auto srcData = fixedData;
            if (srcMeta.offset != 0) {
                srcData = CreateInBoundsGEP(srcData, createConstMul(count, srcMeta.offset));
            }
            srcData = CreateBitCast(srcData, fieldPtrType);
            srcData = CreateInBoundsGEP(srcData, index);

            // -> auto value = *srcData;
            auto value = CreateAlignedLoad(srcData, fieldAlignment);

            // -> auto destData = reinterpret_cast<T*>(dest + destMeta.offset);
            auto destData = getParam(dest);
            if (destMeta.offset != 0) {
                destData = CreateInBoundsGEP(destData, getInt64(destMeta.offset));
            }
            destData = CreateBitCast(destData, fieldPtrType);

            // -> *destData = value;
            CreateAlignedStore(value, destData, fieldAlignment);
        }
    }

    // The heap of the destination record starts directly behind its static part
    // -> auto destHeapOffset = destRecord.staticSize();
    llvm::Value* destHeapOffset = getInt32(destRecord.staticSize());

    // Section 3: write the offset table of the variable size fields and copy their heap data
    if (destRecord.varSizeFieldCount() != 0) {
        // Source field the srcData pointer currently refers to (tracked at build time, starting with the
        // first variable size field of the source record)
        auto srcFieldIdx = srcRecord.fixedSizeFieldCount();
        // Number of destination offsets written after the initial one
        decltype(destRecord.varSizeFieldCount()) destFieldIdx = 0;

        // -> auto variableOffset = static_cast<uint64_t>(mainPage->variableOffset);
        auto variableOffset = CreateInBoundsGEP(mainPage, { getInt64(0), getInt32(3) });
        variableOffset = CreateZExt(CreateAlignedLoad(variableOffset, 4u), getInt64Ty());

        // -> auto variableData = reinterpret_cast<const ColumnMapHeapEntry*>(page + variableOffset) + idx;
        auto variableData = CreateInBoundsGEP(getParam(page), variableOffset);
        variableData = CreateBitCast(variableData, mHeapEntryStructTy->getPointerTo());
        variableData = CreateInBoundsGEP(variableData, index);

        // -> auto srcData = variableData;
        auto srcData = variableData;

        // -> auto destData = reinterpret_cast<uint32_t*>(dest + destRecord.variableOffset());
        auto destData = getParam(dest);
        if (destRecord.variableOffset() != 0) {
            destData = CreateInBoundsGEP(destData, getInt64(destRecord.variableOffset()));
        }
        destData = CreateBitCast(destData, getInt32PtrTy());

        // The first entry of the destination offset table is the start of the destination heap
        // -> *destData = destHeapOffset;
        CreateAlignedStore(destHeapOffset, destData, 4u);

        // Every iteration of the outer loop handles one run of projected fields that directly follow each
        // other in the source record and copies the heap data of the whole run with a single memcpy
        do {
            // Advance the source pointer to the first field of the run if it does not point there already
            if (*i != srcFieldIdx) {
                auto step = *i - srcFieldIdx;
                // -> srcData += count * (*i - srcFieldIdx);
                srcData = CreateInBoundsGEP(srcData, createConstMul(count, step));
                srcFieldIdx = *i;
            }

            // -> auto srcHeapOffset = srcData->offset;
            auto srcHeapOffset = CreateInBoundsGEP(srcData, { getInt64(0), getInt32(0) });
            srcHeapOffset = CreateAlignedLoad(srcHeapOffset, 8u);

            // Difference between the position of the run in the source page and in the destination record,
            // subtracting it translates a source heap offset into a destination heap offset
            // -> auto offsetCorrection = srcHeapOffset - destHeapOffset;
            auto offsetCorrection = CreateSub(srcHeapOffset, destHeapOffset);

            // End offset (in the destination) of the field handled last, assigned in every inner iteration
            llvm::Value* offset;
            do {
                ++i;

                // Step to offset of the following field (or to the last field of the previous element) to get the end
                // offset
                ++srcFieldIdx;
                if (srcFieldIdx == srcRecord.fieldCount()) {
                    // -> srcData = variableData - 1;
                    srcData = CreateGEP(variableData, getInt64(-1));
                } else {
                    // -> srcData += count;
                    srcData = CreateInBoundsGEP(srcData, count);
                }

                // -> auto offset = srcData->offset - offsetCorrection;
                offset = CreateInBoundsGEP(srcData, { getInt64(0), getInt32(0) });
                offset = CreateAlignedLoad(offset, 8u);
                offset = CreateSub(offset, offsetCorrection);

                // -> ++destData;
                ++destFieldIdx;
                destData = CreateInBoundsGEP(destData, getInt64(1));

                // -> *destData = offset;
                CreateAlignedStore(offset, destData, 4u);

                // Continue the run as long as the next projected field is the next source field; the count
                // check comes first so that i is only dereferenced while fields remain
            } while (destFieldIdx < destRecord.varSizeFieldCount() && *i == srcFieldIdx);

            // -> auto srcHeap = page + static_cast<uint64_t>(srcHeapOffset);
            auto srcHeap = CreateInBoundsGEP(getParam(page), CreateZExt(srcHeapOffset, getInt64Ty()));

            // -> auto destHeap = dest + static_cast<uint64_t>(destHeapOffset);
            auto destHeap = CreateInBoundsGEP(getParam(dest), CreateZExt(destHeapOffset, getInt64Ty()));

            // -> auto length = offset - destHeapOffset;
            auto length = CreateSub(offset, destHeapOffset);

            // -> memcpy(destHeap, srcHeap, length);
            CreateMemCpy(destHeap, srcHeap, length, 1u);

            // The next run is appended directly behind the data copied so far
            // -> destHeapOffset = offset;
            destHeapOffset = offset;
        } while (destFieldIdx < destRecord.varSizeFieldCount());
    }

    // -> return destHeapOffset;
    CreateRet(destHeapOffset);
}