Example #1
0
void GGLAssembler::build_blending(
                        component_t& temp,      // incomming fragment / output
                        const pixel_t& pixel,   // framebuffer
                        int component,
                        Scratch& regs)
{
   if (!mInfo[component].blend)
        return;
        
    int fs = component==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
    int fd = component==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
    if (fs==GGL_SRC_ALPHA_SATURATE && component==GGLFormat::ALPHA)
        fs = GGL_ONE;
    const int blending = blending_codes(fs, fd);
    if (!temp.size()) {
        // here, blending will produce something which doesn't depend on
        // that component (eg: GL_ZERO:GL_*), so the register has not been
        // allocated yet. Will never be used as a source.
        temp = component_t(regs.obtain(), CORRUPTIBLE);
    }

    // we are doing real blending...
    // fb:          extracted dst
    // fragment:    extracted src
    // temp:        component_t(fragment) and result

    // scoped register allocator
    Scratch scratches(registerFile());
    comment("blending");

    // we can optimize these cases a bit...
    // (1) saturation is not needed
    // (2) we can use only one multiply instead of 2
    // (3) we can reduce the register pressure
    //      R = S*f + D*(1-f) = (S-D)*f + D
    //      R = S*(1-f) + D*f = (D-S)*f + S

    const bool same_factor_opt1 =
        (fs==GGL_DST_COLOR && fd==GGL_ONE_MINUS_DST_COLOR) ||
        (fs==GGL_SRC_COLOR && fd==GGL_ONE_MINUS_SRC_COLOR) ||
        (fs==GGL_DST_ALPHA && fd==GGL_ONE_MINUS_DST_ALPHA) ||
        (fs==GGL_SRC_ALPHA && fd==GGL_ONE_MINUS_SRC_ALPHA);

    const bool same_factor_opt2 =
        (fs==GGL_ONE_MINUS_DST_COLOR && fd==GGL_DST_COLOR) ||
        (fs==GGL_ONE_MINUS_SRC_COLOR && fd==GGL_SRC_COLOR) || 
        (fs==GGL_ONE_MINUS_DST_ALPHA && fd==GGL_DST_ALPHA) ||
        (fs==GGL_ONE_MINUS_SRC_ALPHA && fd==GGL_SRC_ALPHA);


    // XXX: we could also optimize these cases:
    // R = S*f + D*f = (S+D)*f
    // R = S*(1-f) + D*(1-f) = (S+D)*(1-f)
    // R = S*D + D*S = 2*S*D


    // see if we need to extract 'component' from the destination (fb)
    integer_t fb;
    if (blending & (BLEND_DST|FACTOR_DST)) { 
        fb.setTo(scratches.obtain(), 32); 
        extract(fb, pixel, component);
        if (mDithering) {
            // XXX: maybe what we should do instead, is simply
            // expand fb -or- fragment to the larger of the two
            if (fb.size() < temp.size()) {
                // for now we expand 'fb' to min(fragment, 8)
                int new_size = temp.size() < 8 ? temp.size() : 8;
                expand(fb, fb, new_size);
            }
        }
    }


    // convert input fragment to integer_t
    if (temp.l && (temp.flags & CORRUPTIBLE)) {
        MOV(AL, 0, temp.reg, reg_imm(temp.reg, LSR, temp.l));
        temp.h -= temp.l;
        temp.l = 0;
    }
    integer_t fragment(temp.reg, temp.size(), temp.flags);

    // if not done yet, convert input fragment to integer_t
    if (temp.l) {
        // here we know temp is not CORRUPTIBLE
        fragment.reg = scratches.obtain();
        MOV(AL, 0, fragment.reg, reg_imm(temp.reg, LSR, temp.l));
        fragment.flags |= CORRUPTIBLE;
    }

    if (!(temp.flags & CORRUPTIBLE)) {
        // temp is not corruptible, but since it's the destination it
        // will be modified, so we need to allocate a new register.
        temp.reg = regs.obtain();
        temp.flags &= ~CORRUPTIBLE;
        fragment.flags &= ~CORRUPTIBLE;
    }

    if ((blending & BLEND_SRC) && !same_factor_opt1) {
        // source (fragment) is needed for the blending stage
        // so it's not CORRUPTIBLE (unless we're doing same_factor_opt1)
        fragment.flags &= ~CORRUPTIBLE;
    }


    if (same_factor_opt1) {
        //  R = S*f + D*(1-f) = (S-D)*f + D
        integer_t factor;
        build_blend_factor(factor, fs, 
                component, pixel, fragment, fb, scratches);
        // fb is always corruptible from this point
        fb.flags |= CORRUPTIBLE;
        build_blendFOneMinusF(temp, factor, fragment, fb);
    } else if (same_factor_opt2) {
        //  R = S*(1-f) + D*f = (D-S)*f + S
        integer_t factor;
        // fb is always corrruptible here
        fb.flags |= CORRUPTIBLE;
        build_blend_factor(factor, fd,
                component, pixel, fragment, fb, scratches);
        build_blendOneMinusFF(temp, factor, fragment, fb);
    } else {
        integer_t src_factor;
        integer_t dst_factor;

        // if destination (fb) is not needed for the blending stage, 
        // then it can be marked as CORRUPTIBLE
        if (!(blending & BLEND_DST)) {
            fb.flags |= CORRUPTIBLE;
        }

        // XXX: try to mark some registers as CORRUPTIBLE
        // in most case we could make those corruptible
        // when we're processing the last component
        // but not always, for instance
        //    when fragment is constant and not reloaded
        //    when fb is needed for logic-ops or masking
        //    when a register is aliased (for instance with mAlphaSource)

        // blend away...
        if (fs==GGL_ZERO) {
            if (fd==GGL_ZERO) {         // R = 0
                // already taken care of
            } else if (fd==GGL_ONE) {   // R = D
                // already taken care of
            } else {                    // R = D*fd
                // compute fd
                build_blend_factor(dst_factor, fd,
                        component, pixel, fragment, fb, scratches);
                mul_factor(temp, fb, dst_factor);
            }
        } else if (fs==GGL_ONE) {
            if (fd==GGL_ZERO) {         // R = S
                // NOP, taken care of
            } else if (fd==GGL_ONE) {   // R = S + D
                component_add(temp, fb, fragment); // args order matters
                component_sat(temp);
            } else {                    // R = S + D*fd
                // compute fd
                build_blend_factor(dst_factor, fd,
                        component, pixel, fragment, fb, scratches);
                mul_factor_add(temp, fb, dst_factor, component_t(fragment));
                component_sat(temp);
            }
        } else {
            // compute fs
            build_blend_factor(src_factor, fs, 
                    component, pixel, fragment, fb, scratches);
            if (fd==GGL_ZERO) {         // R = S*fs
                mul_factor(temp, fragment, src_factor);
            } else if (fd==GGL_ONE) {   // R = S*fs + D
                mul_factor_add(temp, fragment, src_factor, component_t(fb));
                component_sat(temp);
            } else {                    // R = S*fs + D*fd
                mul_factor(temp, fragment, src_factor);
                if (scratches.isUsed(src_factor.reg))
                    scratches.recycle(src_factor.reg);
                // compute fd
                build_blend_factor(dst_factor, fd,
                        component, pixel, fragment, fb, scratches);
                mul_factor_add(temp, fb, dst_factor, temp);
                if (!same_factor_opt1 && !same_factor_opt2) {
                    component_sat(temp);
                }
            }
        }
    }

    // now we can be corrupted (it's the dest)
    temp.flags |= CORRUPTIBLE;
}
int GGLAssembler::scanline_core(const needs_t& needs, context_t const* c)
{
    int64_t duration = ggl_system_time();

    mBlendFactorCached = 0;
    mBlending = 0;
    mMasking = 0;
    mAA        = GGL_READ_NEEDS(P_AA, needs.p);
    mDithering = GGL_READ_NEEDS(P_DITHER, needs.p);
    mAlphaTest = GGL_READ_NEEDS(P_ALPHA_TEST, needs.p) + GGL_NEVER;
    mDepthTest = GGL_READ_NEEDS(P_DEPTH_TEST, needs.p) + GGL_NEVER;
    mFog       = GGL_READ_NEEDS(P_FOG, needs.p) != 0;
    mSmooth    = GGL_READ_NEEDS(SHADE, needs.n) != 0;
    mBuilderContext.needs = needs;
    mBuilderContext.c = c;
    mBuilderContext.Rctx = reserveReg(R0); // context always in R0
    mCbFormat = c->formats[ GGL_READ_NEEDS(CB_FORMAT, needs.n) ];

    // ------------------------------------------------------------------------

    decodeLogicOpNeeds(needs);

    decodeTMUNeeds(needs, c);

    mBlendSrc  = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRC, needs.n));
    mBlendDst  = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DST, needs.n));
    mBlendSrcA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRCA, needs.n));
    mBlendDstA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DSTA, needs.n));

    if (!mCbFormat.c[GGLFormat::ALPHA].h) {
        if ((mBlendSrc == GGL_ONE_MINUS_DST_ALPHA) ||
            (mBlendSrc == GGL_DST_ALPHA)) {
            mBlendSrc = GGL_ONE;
        }
        if ((mBlendSrcA == GGL_ONE_MINUS_DST_ALPHA) ||
            (mBlendSrcA == GGL_DST_ALPHA)) {
            mBlendSrcA = GGL_ONE;
        }
        if ((mBlendDst == GGL_ONE_MINUS_DST_ALPHA) ||
            (mBlendDst == GGL_DST_ALPHA)) {
            mBlendDst = GGL_ONE;
        }
        if ((mBlendDstA == GGL_ONE_MINUS_DST_ALPHA) ||
            (mBlendDstA == GGL_DST_ALPHA)) {
            mBlendDstA = GGL_ONE;
        }
    }

    // if we need the framebuffer, read it now
    const int blending =    blending_codes(mBlendSrc, mBlendDst) |
                            blending_codes(mBlendSrcA, mBlendDstA);

    // XXX: handle special cases, destination not modified...
    if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
        (mBlendDst==GGL_ONE) && (mBlendDstA==GGL_ONE)) {
        // Destination unmodified (beware of logic ops)
    } else if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
        (mBlendDst==GGL_ZERO) && (mBlendDstA==GGL_ZERO)) {
        // Destination is zero (beware of logic ops)
    }
    
    int fbComponents = 0;
    const int masking = GGL_READ_NEEDS(MASK_ARGB, needs.n);
    for (int i=0 ; i<4 ; i++) {
        const int mask = 1<<i;
        component_info_t& info = mInfo[i];
        int fs = i==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
        int fd = i==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
        if (fs==GGL_SRC_ALPHA_SATURATE && i==GGLFormat::ALPHA)
            fs = GGL_ONE;
        info.masked =   !!(masking & mask);
        info.inDest =   !info.masked && mCbFormat.c[i].h && 
                        ((mLogicOp & LOGIC_OP_SRC) || (!mLogicOp));
        if (mCbFormat.components >= GGL_LUMINANCE &&
                (i==GGLFormat::GREEN || i==GGLFormat::BLUE)) {
            info.inDest = false;
        }
        info.needed =   (i==GGLFormat::ALPHA) && 
                        (isAlphaSourceNeeded() || mAlphaTest != GGL_ALWAYS);
        info.replaced = !!(mTextureMachine.replaced & mask);
        info.iterated = (!info.replaced && (info.inDest || info.needed)); 
        info.smooth =   mSmooth && info.iterated;
        info.fog =      mFog && info.inDest && (i != GGLFormat::ALPHA);
        info.blend =    (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO));

        mBlending |= (info.blend ? mask : 0);
        mMasking |= (mCbFormat.c[i].h && info.masked) ? mask : 0;
        fbComponents |= mCbFormat.c[i].h ? mask : 0;
    }

    mAllMasked = (mMasking == fbComponents);
    if (mAllMasked) {
        mDithering = 0;
    }
    
    fragment_parts_t parts;

    // ------------------------------------------------------------------------
    prolog();
    // ------------------------------------------------------------------------

    build_scanline_prolog(parts, needs);

    if (registerFile().status())
        return registerFile().status();

    // ------------------------------------------------------------------------
    label("fragment_loop");
    // ------------------------------------------------------------------------
    {
        Scratch regs(registerFile());

        if (mDithering) {
            // update the dither index.
            MOV(AL, 0, parts.count.reg,
                    reg_imm(parts.count.reg, ROR, GGL_DITHER_ORDER_SHIFT));
            ADD(AL, 0, parts.count.reg, parts.count.reg,
                    imm( 1 << (32 - GGL_DITHER_ORDER_SHIFT)));
            MOV(AL, 0, parts.count.reg,
                    reg_imm(parts.count.reg, ROR, 32 - GGL_DITHER_ORDER_SHIFT));
        }

        // XXX: could we do an early alpha-test here in some cases?
        // It would probaly be used only with smooth-alpha and no texture
        // (or no alpha component in the texture).

        // Early z-test
        if (mAlphaTest==GGL_ALWAYS) {
            build_depth_test(parts, Z_TEST|Z_WRITE);
        } else {
            // we cannot do the z-write here, because
            // it might be killed by the alpha-test later
            build_depth_test(parts, Z_TEST);
        }

        { // texture coordinates
            Scratch scratches(registerFile());

            // texel generation
            build_textures(parts, regs);
            if (registerFile().status())
                return registerFile().status();
        }

        if ((blending & (FACTOR_DST|BLEND_DST)) || 
                (mMasking && !mAllMasked) ||
                (mLogicOp & LOGIC_OP_DST)) 
        {
            // blending / logic_op / masking need the framebuffer
            mDstPixel.setTo(regs.obtain(), &mCbFormat);

            // load the framebuffer pixel
            comment("fetch color-buffer");
            load(parts.cbPtr, mDstPixel);
        }

        if (registerFile().status())
            return registerFile().status();

        pixel_t pixel;
        int directTex = mTextureMachine.directTexture;
        if (directTex | parts.packed) {
            // note: we can't have both here
            // iterated color or direct texture
            pixel = directTex ? parts.texel[directTex-1] : parts.iterated;
            pixel.flags &= ~CORRUPTIBLE;
        } else {
            if (mDithering) {
                const int ctxtReg = mBuilderContext.Rctx;
                const int mask = GGL_DITHER_SIZE-1;
                parts.dither = reg_t(regs.obtain());
                AND(AL, 0, parts.dither.reg, parts.count.reg, imm(mask));
                ADDR_ADD(AL, 0, parts.dither.reg, ctxtReg, parts.dither.reg);
                LDRB(AL, parts.dither.reg, parts.dither.reg,
                        immed12_pre(GGL_OFFSETOF(ditherMatrix)));
            }
        
            // allocate a register for the resulting pixel
            pixel.setTo(regs.obtain(), &mCbFormat, FIRST);

            build_component(pixel, parts, GGLFormat::ALPHA,    regs);

            if (mAlphaTest!=GGL_ALWAYS) {
                // only handle the z-write part here. We know z-test
                // was successful, as well as alpha-test.
                build_depth_test(parts, Z_WRITE);
            }

            build_component(pixel, parts, GGLFormat::RED,      regs);
            build_component(pixel, parts, GGLFormat::GREEN,    regs);
            build_component(pixel, parts, GGLFormat::BLUE,     regs);

            pixel.flags |= CORRUPTIBLE;
        }

        if (registerFile().status())
            return registerFile().status();
        
        if (pixel.reg == -1) {
            // be defensive here. if we're here it's probably
            // that this whole fragment is a no-op.
            pixel = mDstPixel;
        }
        
        if (!mAllMasked) {
            // logic operation
            build_logic_op(pixel, regs);
    
            // masking
            build_masking(pixel, regs); 
    
            comment("store");
            store(parts.cbPtr, pixel, WRITE_BACK);
        }
    }

    if (registerFile().status())
        return registerFile().status();

    // update the iterated color...
    if (parts.reload != 3) {
        build_smooth_shade(parts);
    }

    // update iterated z
    build_iterate_z(parts);

    // update iterated fog
    build_iterate_f(parts);

    SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
    B(PL, "fragment_loop");
    label("epilog");
    epilog(registerFile().touched());

    if ((mAlphaTest!=GGL_ALWAYS) || (mDepthTest!=GGL_ALWAYS)) {
        if (mDepthTest!=GGL_ALWAYS) {
            label("discard_before_textures");
            build_iterate_texture_coordinates(parts);
        }
        label("discard_after_textures");
        build_smooth_shade(parts);
        build_iterate_z(parts);
        build_iterate_f(parts);
        if (!mAllMasked) {
            ADDR_ADD(AL, 0, parts.cbPtr.reg, parts.cbPtr.reg, imm(parts.cbPtr.size>>3));
        }
        SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
        B(PL, "fragment_loop");
        epilog(registerFile().touched());
    }