/**
 * Compute a single output pixel (x, y) of a 2D convolution on 4-byte-per-pixel data.
 *
 * The input sample for kernel cell (col, row) is taken at
 * (x - targetX + col, y - targetY + row); the kernel itself is read mirrored in
 * both directions (cell orderX-col-1, orderY-row-1 of a row-major orderX-wide array).
 *
 * The flags PREMULTIPLIED, PRESERVE_ALPHA, X_LOWER, X_UPPER, Y_LOWER and Y_UPPER are
 * not declared in this block (presumably compile-time template flags -- confirm):
 *  - X_/Y_LOWER and X_/Y_UPPER trim the kernel window at the matching image edge;
 *  - PREMULTIPLIED selects whether colour samples are weighted by the kernel only,
 *    or by the kernel times the sample's alpha;
 *  - PRESERVE_ALPHA copies the input alpha of the pixel instead of convolving it.
 *
 * @param x,y              coordinates of the output pixel
 * @param out_data         destination buffer, 4 bytes per pixel, width pixels per row
 * @param in_data          source buffer with the same layout
 * @param width,height     image dimensions in pixels
 * @param kernel           orderX*orderY kernel weights, row-major
 * @param orderX,orderY    kernel dimensions
 * @param targetX,targetY  kernel cell that is aligned with the output pixel
 * @param bias             offset added to the result (scaled for premultiplied data)
 */
static inline void convolve2D_XY(unsigned int const x, unsigned int const y, unsigned char *const out_data, unsigned char const *const in_data, unsigned int const width, unsigned int const height, double const *const kernel, unsigned int const orderX, unsigned int const orderY, unsigned int const targetX, unsigned int const targetY, double const bias) {
    // Range of kernel rows/columns that fall inside the image.
    // To stay clear of signed/unsigned trouble the lower bounds need y<=targetY
    // (resp. x<=targetX) whenever the *_LOWER flag is set, and the upper bounds need
    // height+targetY-y<=orderY (resp. width+targetX-x<=orderX) whenever *_UPPER is set.
    unsigned int const rowFirst = Y_LOWER ? targetY-y : 0;
    unsigned int const rowLast  = Y_UPPER ? height+targetY-y : orderY;
    unsigned int const colFirst = X_LOWER ? targetX-x : 0;
    unsigned int const colLast  = X_UPPER ? width+targetX-x : orderX;

    double sumR = 0;
    double sumG = 0;
    double sumB = 0;
    double sumA = 0;

    for (unsigned int row = rowFirst; row < rowLast; ++row) {
        // Parts of the source and kernel indices that do not depend on the column.
        unsigned int const srcRowBase    = x - targetX + width*(y - targetY + row);
        unsigned int const kernelRowBase = orderX*(orderY-row-1);

        for (unsigned int col = colFirst; col < colLast; ++col) {
            unsigned int const src = 4*( srcRowBase + col );
            double const weight    = kernel[orderX-col-1 + kernelRowBase];
            double const alphaTerm = in_data[src+3] * weight;
            // Non-premultiplied colours are weighted by their alpha as well.
            double const colourWeight = PREMULTIPLIED ? weight : alphaTerm;

            sumR += in_data[src+0] * colourWeight;
            sumG += in_data[src+1] * colourWeight;
            sumB += in_data[src+2] * colourWeight;
            sumA += alphaTerm;
        }
    }

    unsigned int const dst = 4*( x + width*y );

    // Alpha is settled first because the colour channels depend on it.
    unsigned char alpha;
    if (PRESERVE_ALPHA) {
        alpha = in_data[dst+3];
    } else if (PREMULTIPLIED) {
        alpha = CLAMP_D_TO_U8(sumA + 255*bias);
    } else {
        alpha = CLAMP_D_TO_U8(sumA + bias);
    }
    out_data[dst+3] = alpha;

    if (PREMULTIPLIED) {
        // CLAMP includes rounding!
        out_data[dst+0] = CLAMP_D_TO_U8_ALPHA(sumR + alpha*bias, alpha);
        out_data[dst+1] = CLAMP_D_TO_U8_ALPHA(sumG + alpha*bias, alpha);
        out_data[dst+2] = CLAMP_D_TO_U8_ALPHA(sumB + alpha*bias, alpha);
    } else if (alpha==0) {
        // Fully transparent: the colour cannot be un-premultiplied.
        // TODO: Is there a more sensible value that can be used here?
        out_data[dst+0] = 0;
        out_data[dst+1] = 0;
        out_data[dst+2] = 0;
    } else {
        // The sums were weighted by alpha, so divide it out again. CLAMP includes rounding!
        out_data[dst+0] = CLAMP_D_TO_U8(sumR / alpha + bias);
        out_data[dst+1] = CLAMP_D_TO_U8(sumG / alpha + bias);
        out_data[dst+2] = CLAMP_D_TO_U8(sumB / alpha + bias);
    }
}
    /**
     * Evaluate the noise function at point p and return it as a packed pixel.
     *
     * The point is scaled by _baseFreq and then sampled for _octaves octaves; every
     * octave doubles the coordinates and halves the weight of its contribution.
     * With _fractalnoise set the signed noise is summed and the total s is mapped to
     * (s*255 + 255)/2 per channel; otherwise absolute values are summed and the total
     * is scaled by 255. The colour channels are then passed through premul_alpha()
     * with the alpha channel and packed by ASSEMBLE_ARGB32.
     *
     * @param p  sample position (X and Y are read through Geom::X / Geom::Y)
     * @return   the packed pixel produced by ASSEMBLE_ARGB32
     */
    G_GNUC_PURE
    guint32 turbulencePixel(Geom::Point const &p) const {
        // Local copies of the stitching parameters: they are rescaled for every octave.
        int stitchX = _wrapx;
        int stitchY = _wrapy;
        int stitchW = _wrapw;
        int stitchH = _wraph;

        // Accumulated noise, channel numbering: R=0, G=1, B=2, A=3
        double sum[4] = { 0.0, 0.0, 0.0, 0.0 };
        double fx = p[Geom::X] * _baseFreq[Geom::X];
        double fy = p[Geom::Y] * _baseFreq[Geom::Y];
        double ratio = 1.0;

        for (int octave = 0; octave < _octaves; ++octave) {
            // Split each coordinate into a lattice cell and the offsets from both
            // cell borders.
            double const tx = fx + PerlinOffset;
            double const cellX = floor(tx);
            double const rx0 = tx - cellX;
            double const rx1 = rx0 - 1.0;
            int bx0 = static_cast<int>(cellX);
            int bx1 = bx0 + 1;

            double const ty = fy + PerlinOffset;
            double const cellY = floor(ty);
            double const ry0 = ty - cellY;
            double const ry1 = ry0 - 1.0;
            int by0 = static_cast<int>(cellY);
            int by1 = by0 + 1;

            if (_stitchTiles) {
                // Fold lattice coordinates beyond the stitch border back by one tile.
                if (bx0 >= stitchX) bx0 -= stitchW;
                if (bx1 >= stitchX) bx1 -= stitchW;
                if (by0 >= stitchY) by0 -= stitchH;
                if (by1 >= stitchY) by1 -= stitchH;
            }
            // Restrict the indices with BMask (presumably lattice size minus one -- confirm).
            bx0 &= BMask;
            bx1 &= BMask;
            by0 &= BMask;
            by1 &= BMask;

            // Look up the gradient indices of the four cell corners.
            int const selX0 = _latticeSelector[bx0];
            int const selX1 = _latticeSelector[bx1];
            int const corner00 = _latticeSelector[selX0 + by0];
            int const corner01 = _latticeSelector[selX0 + by1];
            int const corner10 = _latticeSelector[selX1 + by0];
            int const corner11 = _latticeSelector[selX1 + by1];

            double const sx = _scurve(rx0);
            double const sy = _scurve(ry0);

            for (int k = 0; k < 4; ++k) {
                double const *g00 = _gradient[corner00][k];
                double const *g10 = _gradient[corner10][k];
                double const lower = _lerp(sx, rx0 * g00[0] + ry0 * g00[1],
                                               rx1 * g10[0] + ry0 * g10[1]);
                double const *g01 = _gradient[corner01][k];
                double const *g11 = _gradient[corner11][k];
                double const upper = _lerp(sx, rx0 * g01[0] + ry1 * g01[1],
                                               rx1 * g11[0] + ry1 * g11[1]);
                double const noise = _lerp(sy, lower, upper);

                // Fractal noise keeps the sign, turbulence sums magnitudes.
                sum[k] += (_fractalnoise ? noise : fabs(noise)) / ratio;
            }

            // Next octave: twice the frequency, half the weight.
            fx *= 2;
            fy *= 2;
            ratio *= 2;

            if (_stitchTiles) {
                // Update stitch values. Subtracting PerlinOffset before the multiplication
                // and adding it afterward simplifies to subtracting it once.
                stitchW *= 2;
                stitchH *= 2;
                stitchX = stitchX*2 - PerlinOffset;
                stitchY = stitchY*2 - PerlinOffset;
            }
        }

        guint32 r, g, b, a;
        if (_fractalnoise) {
            r = CLAMP_D_TO_U8((sum[0]*255.0 + 255.0) / 2);
            g = CLAMP_D_TO_U8((sum[1]*255.0 + 255.0) / 2);
            b = CLAMP_D_TO_U8((sum[2]*255.0 + 255.0) / 2);
            a = CLAMP_D_TO_U8((sum[3]*255.0 + 255.0) / 2);
        } else {
            r = CLAMP_D_TO_U8(sum[0]*255.0);
            g = CLAMP_D_TO_U8(sum[1]*255.0);
            b = CLAMP_D_TO_U8(sum[2]*255.0);
            a = CLAMP_D_TO_U8(sum[3]*255.0);
        }

        r = premul_alpha(r, a);
        g = premul_alpha(g, a);
        b = premul_alpha(b, a);
        ASSEMBLE_ARGB32(pxout, a,r,g,b);
        return pxout;
    }
/**
 * Render the feSpecularLighting primitive into a new pixblock.
 *
 * For every pixel of the input the surface normal is computed from the input
 * image, the halfway vector between the light direction and NR::EYE_VECTOR is
 * formed, and COMPUTE_INTER derives the intensity `inter` from them, the specular
 * constant and the specular exponent. The intensity is multiplied with the light
 * colour components for R, G and B; the output alpha is the maximum of the three
 * colour bytes.
 *
 * @param slot   filter slot the input (_input) is read from and the result
 *               (_output) is stored into
 * @param units  supplies the matrix from primitive units to pixblock coordinates
 * @return 1 if the input image is missing, 0 otherwise
 */
int FilterSpecularLighting::render(FilterSlot &slot, FilterUnits const &units) {
    NRPixBlock *in = slot.get(_input);
    if (!in) {
        g_warning("Missing source image for feSpecularLighting (in=%d)", _input);
        return 1;
    }

    // Handed to the slot at the end of the function and never deleted here
    // (presumably the slot takes ownership -- confirm).
    NRPixBlock *out = new NRPixBlock;

    int const w = in->area.x1 - in->area.x0;
    int const h = in->area.y1 - in->area.y0;
    int const x0 = in->area.x0;
    int const y0 = in->area.y0;
    int const pixel_count = w*h; // loop bound, computed once
    //As long as FilterRes and kernel unit is not supported we hardcode the
    //default value
    int dx = 1; //TODO setup
    int dy = 1; //TODO setup
    //surface scale
    Geom::Matrix trans = units.get_matrix_primitiveunits2pb();
    gdouble ss = surfaceScale * trans[0];
    gdouble ks = specularConstant; //specular lighting constant
    // L: vector to the light, N: surface normal, LC: light colour components,
    // H: normalized sum of L and the eye vector
    NR::Fvector L, N, LC, H;
    gdouble inter;

    // Output covers the same area as the input, 4 bytes per pixel.
    nr_pixblock_setup_fast(out, NR_PIXBLOCK_MODE_R8G8B8A8N,
            in->area.x0, in->area.y0, in->area.x1, in->area.y1,
            true);
    unsigned char *data_i = NR_PIXBLOCK_PX (in);
    unsigned char *data_o = NR_PIXBLOCK_PX (out);

    // The light helper objects only live for the duration of their case block, so
    // they are plain automatic objects rather than new/delete pairs.
    switch (light_type) {
        case DISTANT_LIGHT:
            {
            //the light vector (and therefore H and LC) is constant
            DistantLight dl(light.distant, lighting_color);
            dl.light_vector(L);
            dl.light_components(LC);
            NR::normalized_sum(H, L, NR::EYE_VECTOR);
            for (int i = 0; i < pixel_count; i++) {
                NR::compute_surface_normal(N, ss, in, i / w, i % w, dx, dy);
                COMPUTE_INTER(inter, N, H, ks, specularExponent);

                unsigned char *px = data_o + 4*i;
                px[0] = CLAMP_D_TO_U8(inter * LC[LIGHT_RED]); // CLAMP includes rounding!
                px[1] = CLAMP_D_TO_U8(inter * LC[LIGHT_GREEN]);
                px[2] = CLAMP_D_TO_U8(inter * LC[LIGHT_BLUE]);
                px[3] = MAX(MAX(px[0], px[1]), px[2]);
            }
            }
            break;
        case POINT_LIGHT:
            {
            //TODO we need a reference to the filter to determine primitiveUnits
            //if objectBoundingBox is used, use a different matrix for light_vector
            // UPDATE: trans is now correct matrix from primitiveUnits to
            // pixblock coordinates
            PointLight pl(light.point, lighting_color, trans);
            pl.light_components(LC);
            for (int i = 0; i < pixel_count; i++) {
                NR::compute_surface_normal(N, ss, in, i / w, i % w, dx, dy);
                // The light vector depends on the pixel position; the third
                // coordinate is the input alpha scaled by the surface scale.
                pl.light_vector(L,
                        i % w + x0,
                        i / w + y0,
                        ss * (double) data_i[4*i+3]/ 255);
                NR::normalized_sum(H, L, NR::EYE_VECTOR);
                COMPUTE_INTER(inter, N, H, ks, specularExponent);

                unsigned char *px = data_o + 4*i;
                px[0] = CLAMP_D_TO_U8(inter * LC[LIGHT_RED]);
                px[1] = CLAMP_D_TO_U8(inter * LC[LIGHT_GREEN]);
                px[2] = CLAMP_D_TO_U8(inter * LC[LIGHT_BLUE]);
                px[3] = MAX(MAX(px[0], px[1]), px[2]);
            }
            }
            break;
        case SPOT_LIGHT:
            {
            //TODO we need a reference to the filter to determine primitiveUnits
            //if objectBoundingBox is used, use a different matrix for light_vector
            // UPDATE: trans is now correct matrix from primitiveUnits to
            // pixblock coordinates
            SpotLight sl(light.spot, lighting_color, trans);
            for (int i = 0; i < pixel_count; i++) {
                NR::compute_surface_normal(N, ss, in, i / w, i % w, dx, dy);
                sl.light_vector(L,
                    i % w + x0,
                    i / w + y0,
                    ss * (double) data_i[4*i+3]/ 255);
                // For a spot light the colour components depend on the light vector too.
                sl.light_components(LC, L);
                NR::normalized_sum(H, L, NR::EYE_VECTOR);
                COMPUTE_INTER(inter, N, H, ks, specularExponent);

                unsigned char *px = data_o + 4*i;
                px[0] = CLAMP_D_TO_U8(inter * LC[LIGHT_RED]);
                px[1] = CLAMP_D_TO_U8(inter * LC[LIGHT_GREEN]);
                px[2] = CLAMP_D_TO_U8(inter * LC[LIGHT_BLUE]);
                px[3] = MAX(MAX(px[0], px[1]), px[2]);
            }
            }
            break;
        //no light or unknown light source: no pixel is written
        case NO_LIGHT:
        default:
            if (light_type != NO_LIGHT)
                g_warning("unknown light source %d", light_type);
            break;
    }

    // Every branch above marks the output as non-empty, so do it in one place.
    out->empty = FALSE;

    //finishing
    slot.set(_output, out);
    //nr_pixblock_release(in);
    //delete in;
    return 0;
}