Beispiel #1
0
/*
 * Apply the 5x5 integer kernel s->matrix[plane] to one 8-bit plane of `in`
 * and write the result to the same plane of `out`.
 *
 * Five line buffers inside s->buffer (spaced s->bstride apart) are used as a
 * ring holding source rows y-2 .. y+2. The kernel reads two pixels to the
 * left and right of every column, so it relies on line_copy8() (called with a
 * margin of 2) having made those positions readable.
 * Rows above the top are taken from rows 2 and 1; below the bottom the source
 * pointer walks back up one row per output row.
 *
 * Each sum is scaled by s->rdiv[plane], offset by s->bias[plane], rounded by
 * adding 0.5 and truncating, then clipped with av_clip_uint8().
 */
static void filter_5x5(ConvolutionContext *s, AVFrame *in, AVFrame *out, int plane)
{
    const int src_step  = in->linesize[plane];
    const int dst_step  = out->linesize[plane];
    const int line_step = s->bstride;
    const int rows      = s->planeheight[plane];
    const int cols      = s->planewidth[plane];
    const int *coeff    = s->matrix[plane];
    const float scale   = s->rdiv[plane];
    const float offset  = s->bias[plane];
    const uint8_t *in_row = in->data[plane];
    uint8_t *out_row      = out->data[plane];
    uint8_t *const ring_first = s->buffer + 16;
    uint8_t *const ring_last  = ring_first + 4 * line_step;
    uint8_t *line[5];
    int k, row, col;

    for (k = 0; k < 5; k++)
        line[k] = ring_first + k * line_step;

    /* Prime the ring: rows -2 and -1 are stood in for by rows 2 and 1. */
    line_copy8(line[0], in_row + 2 * src_step, cols, 2);
    line_copy8(line[1], in_row + src_step, cols, 2);
    line_copy8(line[2], in_row, cols, 2);
    in_row += src_step;
    line_copy8(line[3], in_row, cols, 2);

    for (row = 0; row < rows; row++) {
        uint8_t *recycled;

        /* Advance to row + 2, or walk backwards once past the last row. */
        if (row < rows - 2)
            in_row += src_step;
        else
            in_row -= src_step;
        line_copy8(line[4], in_row, cols, 2);

        for (col = 0; col < cols; col++) {
            int acc = 0;
            int tap = 0;

            for (k = 0; k < 5; k++) {
                const uint8_t *px = line[k] + col - 2;
                int dx;

                for (dx = 0; dx < 5; dx++, tap++)
                    acc += px[dx] * coeff[tap];
            }
            acc = (int)(acc * scale + offset + 0.5f);
            out_row[col] = av_clip_uint8(acc);
        }

        /* Rotate the ring; the oldest slot becomes the next bottom line. */
        recycled = (line[4] == ring_last) ? ring_first : line[4] + line_step;
        for (k = 0; k < 4; k++)
            line[k] = line[k + 1];
        line[4] = recycled;

        out_row += dst_step;
    }
}
Beispiel #2
0
/*
 * Filter one input frame and send the result downstream.
 *
 * For every plane with a non-zero threshold, s->filter() is invoked once per
 * row with the centre line and pointers to its eight neighbours; planes with a
 * zero threshold are copied unchanged. The input frame is always consumed
 * (freed) by this function.
 *
 * Returns the result of ff_filter_frame(), AVERROR(ENOMEM) if no output buffer
 * could be obtained, or the error from av_frame_copy_props().
 */
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
    AVFilterContext *ctx = inlink->dst;
    AVFilterLink *outlink = ctx->outputs[0];
    NContext *s = ctx->priv;
    AVFrame *out;
    int plane, y, ret;

    out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
    if (!out) {
        av_frame_free(&in);
        return AVERROR(ENOMEM);
    }

    /* Copying frame properties can fail; do not emit a frame without them. */
    ret = av_frame_copy_props(out, in);
    if (ret < 0) {
        av_frame_free(&out);
        av_frame_free(&in);
        return ret;
    }

    for (plane = 0; plane < s->nb_planes; plane++) {
        const int threshold = s->threshold[plane];

        if (threshold) {
            const uint8_t *src = in->data[plane];
            uint8_t *dst = out->data[plane];
            int stride = in->linesize[plane];
            int height = s->planeheight[plane];
            int width  = s->planewidth[plane];
            /*
             * Three line buffers used as a ring (previous / current / next row).
             * NOTE(review): the lines are spaced exactly planewidth[0] apart while
             * the filter reads one pixel left and right of each line; confirm
             * that the buffer layout leaves room for those margins without
             * adjacent lines overlapping.
             */
            uint8_t *p0 = s->buffer + 16;
            uint8_t *p1 = p0 + s->planewidth[0];
            uint8_t *p2 = p1 + s->planewidth[0];
            uint8_t *orig = p0, *end = p2;

            /* The row above the first row is stood in for by row 1. */
            line_copy8(p0, src + stride, width, 1);
            line_copy8(p1, src, width, 1);

            for (y = 0; y < height; y++) {
                /* Eight neighbours of the centre line p1 (centre itself omitted). */
                const uint8_t *coordinates[] = { p0 - 1, p0, p0 + 1,
                                                 p1 - 1,     p1 + 1,
                                                 p2 - 1, p2, p2 + 1};
                /* Step to the next row, or back one row after the last row. */
                src += stride * (y < height - 1 ? 1 : -1);
                line_copy8(p2, src, width, 1);

                s->filter(dst, p1, width, threshold, coordinates, s->coordinates);

                /* Rotate the ring of line buffers. */
                p0 = p1;
                p1 = p2;
                p2 = (p2 == end) ? orig: p2 + s->planewidth[0];
                dst += out->linesize[plane];
            }
        } else {
            av_image_copy_plane(out->data[plane], out->linesize[plane],
                                in->data[plane], in->linesize[plane],
                                s->planewidth[plane], s->planeheight[plane]);
        }
    }

    av_frame_free(&in);
    return ff_filter_frame(outlink, out);
}
Beispiel #3
0
/*
 * Slice worker: Sobel gradient magnitude for rows
 * [height * jobnr / nb_jobs, height * (jobnr + 1) / nb_jobs) of one plane.
 *
 * `arg` is a ThreadData carrying the input frame, output frame and plane.
 * Three line buffers taken from s->bptrs[jobnr] form a ring holding the rows
 * above, at and below the current row. The magnitude sqrt(gy^2 + gx^2) is
 * multiplied by s->scale, offset by s->delta and clipped with av_clip_uint8().
 * Always returns 0.
 */
static int filter_sobel(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
    ConvolutionContext *s = ctx->priv;
    ThreadData *td = arg;
    AVFrame *in = td->in;
    AVFrame *out = td->out;
    const int plane = td->plane;
    const int in_pitch   = in->linesize[plane];
    const int out_pitch  = out->linesize[plane];
    const int ring_pitch = s->bstride;
    const int rows = s->planeheight[plane];
    const int cols = s->planewidth[plane];
    const int first_row = (rows * jobnr) / nb_jobs;
    const int stop_row  = (rows * (jobnr+1)) / nb_jobs;
    const float scale = s->scale;
    const float delta = s->delta;
    const uint8_t *in_row = in->data[plane] + first_row * in_pitch;
    uint8_t *out_row = out->data[plane] + first_row * out_pitch;
    uint8_t *const ring_first = s->bptrs[jobnr] + 16;
    uint8_t *const ring_last  = ring_first + 2 * ring_pitch;
    uint8_t *above  = ring_first;
    uint8_t *centre = ring_first + ring_pitch;
    uint8_t *below  = ring_last;
    int row, col;

    /* Row above the slice: the real previous row, or row 1 at the plane top. */
    if (first_row == 0)
        line_copy8(above, in_row + in_pitch, cols, 1);
    else
        line_copy8(above, in_row - in_pitch, cols, 1);
    line_copy8(centre, in_row, cols, 1);

    for (row = first_row; row < stop_row; row++) {
        uint8_t *recycled;

        /* Fetch the next row; after the last plane row step back instead. */
        if (row < rows - 1)
            in_row += in_pitch;
        else
            in_row -= in_pitch;
        line_copy8(below, in_row, cols, 1);

        for (col = 0; col < cols; col++) {
            const int tl = above[col - 1],  tc = above[col], tr = above[col + 1];
            const int ml = centre[col - 1],                  mr = centre[col + 1];
            const int bl = below[col - 1],  bc = below[col], br = below[col + 1];
            /* Vertical and horizontal Sobel responses. */
            const int gy = (bl + 2 * bc + br) - (tl + 2 * tc + tr);
            const int gx = (tr + 2 * mr + br) - (tl + 2 * ml + bl);

            out_row[col] = av_clip_uint8(sqrt(gy*gy + gx*gx) * scale + delta);
        }

        /* Rotate the ring of line buffers. */
        recycled = (below == ring_last) ? ring_first : below + ring_pitch;
        above  = centre;
        centre = below;
        below  = recycled;
        out_row += out_pitch;
    }

    return 0;
}
Beispiel #4
0
/*
 * Slice worker: 3x3 integer convolution of one plane for rows
 * [height * jobnr / nb_jobs, height * (jobnr + 1) / nb_jobs).
 *
 * `arg` is a ThreadData carrying the input frame, output frame and plane.
 * A ring of three line buffers from s->bptrs[jobnr] holds the rows above, at
 * and below the current row. Each weighted sum is multiplied by
 * s->rdiv[plane], offset by s->bias[plane], rounded by adding 0.5 and
 * truncating, then clipped with av_clip_uint8(). Always returns 0.
 */
static int filter_3x3(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
    ConvolutionContext *s = ctx->priv;
    ThreadData *td = arg;
    AVFrame *in = td->in;
    AVFrame *out = td->out;
    const int plane = td->plane;
    const int in_pitch   = in->linesize[plane];
    const int out_pitch  = out->linesize[plane];
    const int ring_pitch = s->bstride;
    const int rows = s->planeheight[plane];
    const int cols = s->planewidth[plane];
    const int first_row = (rows * jobnr) / nb_jobs;
    const int stop_row  = (rows * (jobnr+1)) / nb_jobs;
    const int *k = s->matrix[plane];
    const float gain   = s->rdiv[plane];
    const float offset = s->bias[plane];
    const uint8_t *in_row = in->data[plane] + first_row * in_pitch;
    uint8_t *out_row = out->data[plane] + first_row * out_pitch;
    uint8_t *const ring_first = s->bptrs[jobnr] + 16;
    uint8_t *const ring_last  = ring_first + 2 * ring_pitch;
    uint8_t *above  = ring_first;
    uint8_t *centre = ring_first + ring_pitch;
    uint8_t *below  = ring_last;
    int row, col;

    /* Row above the slice: the real previous row, or row 1 at the plane top. */
    line_copy8(above, first_row == 0 ? in_row + in_pitch : in_row - in_pitch, cols, 1);
    line_copy8(centre, in_row, cols, 1);

    for (row = first_row; row < stop_row; row++) {
        uint8_t *recycled;

        /* Fetch the next row; after the last plane row step back instead. */
        in_row += (row < rows - 1) ? in_pitch : -in_pitch;
        line_copy8(below, in_row, cols, 1);

        for (col = 0; col < cols; col++) {
            const uint8_t *a = above  + col - 1;
            const uint8_t *c = centre + col - 1;
            const uint8_t *b = below  + col - 1;
            int acc;

            acc  = a[0] * k[0] + a[1] * k[1] + a[2] * k[2];
            acc += c[0] * k[3] + c[1] * k[4] + c[2] * k[5];
            acc += b[0] * k[6] + b[1] * k[7] + b[2] * k[8];

            acc = (int)(acc * gain + offset + 0.5f);
            out_row[col] = av_clip_uint8(acc);
        }

        /* Rotate the ring of line buffers. */
        recycled = (below == ring_last) ? ring_first : below + ring_pitch;
        above  = centre;
        centre = below;
        below  = recycled;
        out_row += out_pitch;
    }

    return 0;
}
Beispiel #5
0
/*
 * SSE2 kernel for one 8-bit plane: each output pixel is the average of the
 * eight pointers listed by COORDINATES (macro defined elsewhere; presumably
 * the 3x3 neighbours of the centre line), never lower than the source pixel
 * and never more than `th` above it.
 *
 *   buff    scratch memory holding three line buffers spaced `bstride` apart
 *   width   pixels per row; processed 16 at a time, so loads/stores may run
 *           past `width` up to the next multiple of 16
 *   stride  row pitch used for both srcp and dstp
 *   th      maximum allowed increase; values above 255 are saturated to 255
 *
 * dstp + x and the centre line are accessed with aligned loads/stores.
 */
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
               uint8_t *dstp, const uint8_t *srcp, int th)
{
    /* Ring of three line buffers: previous, current and next row. */
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *orig = p0, *end = p2;

    /* The row above the first row is stood in for by row 1. */
    line_copy8(p0, srcp + stride, width, 1);
    line_copy8(p1, srcp, width, 1);

    /*
     * Saturate instead of truncating: a plain (uint8_t) cast would turn e.g.
     * 256 into 0 and silently disable the filter.
     */
    uint8_t threshold = th > 255 ? 255 : (uint8_t)th;

    __m128i zero = _mm_setzero_si128();
    __m128i xth = _mm_set1_epi8((int8_t)threshold);

    for (int y = 0; y < height; y++) {
        /* Step to the next row, or back one row after the last row. */
        srcp += stride * (y < height - 1 ? 1 : -1);
        line_copy8(p2, srcp, width, 1);
        uint8_t *coordinates[] = COORDINATES;
        for (int x = 0; x < width; x += 16) {
            __m128i sumlo = zero;
            __m128i sumhi = zero;

            /* Accumulate the eight neighbours as 16-bit lanes. */
            for (int i = 0; i < 8; i++) {
                __m128i target = _mm_loadu_si128((__m128i *)(coordinates[i] + x));
                sumlo  = _mm_add_epi16(sumlo, _mm_unpacklo_epi8(target, zero));
                sumhi  = _mm_add_epi16(sumhi, _mm_unpackhi_epi8(target, zero));
            }

            /* Divide by 8 and narrow back to bytes. */
            sumlo = _mm_srai_epi16(sumlo, 3);
            sumhi = _mm_srai_epi16(sumhi, 3);
            sumlo = _mm_packus_epi16(sumlo, sumhi);

            __m128i src = _mm_load_si128((__m128i *)(p1 + x));
            __m128i limit = _mm_adds_epu8(src, xth);

            /* Clamp the average into [src, src + threshold]. */
            sumlo = _mm_max_epu8(sumlo, src);
            sumlo = _mm_min_epu8(sumlo, limit);

            _mm_store_si128((__m128i *)(dstp + x), sumlo);
        }
        dstp += stride;
        /* Rotate the ring of line buffers. */
        p0 = p1;
        p1 = p2;
        p2 = (p2 == end) ? orig : p2 + bstride;
    }
}
Beispiel #6
0
/*
 * Convolve one 8-bit plane of `in` with the 3x3 integer kernel
 * s->matrix[plane] and store the result in the same plane of `out`.
 *
 * Three line buffers in s->buffer (spaced s->bstride apart) form a ring
 * holding rows y-1, y and y+1. The row above the top is stood in for by
 * row 1; after the last row the source pointer steps back one row.
 * Each sum is multiplied by s->rdiv[plane], offset by s->bias[plane],
 * rounded by adding 0.5 and truncating, then clipped with av_clip_uint8().
 */
static void filter_3x3(ConvolutionContext *s, AVFrame *in, AVFrame *out, int plane)
{
    const int src_step  = in->linesize[plane];
    const int dst_step  = out->linesize[plane];
    const int line_step = s->bstride;
    const int rows      = s->planeheight[plane];
    const int cols      = s->planewidth[plane];
    const int *coeff    = s->matrix[plane];
    const float scale   = s->rdiv[plane];
    const float offset  = s->bias[plane];
    const uint8_t *in_row = in->data[plane];
    uint8_t *out_row      = out->data[plane];
    uint8_t *const ring_first = s->buffer + 16;
    uint8_t *const ring_last  = ring_first + 2 * line_step;
    uint8_t *line[3];
    int row, col;

    line[0] = ring_first;
    line[1] = ring_first + line_step;
    line[2] = ring_last;

    line_copy8(line[0], in_row + src_step, cols, 1);
    line_copy8(line[1], in_row, cols, 1);

    for (row = 0; row < rows; row++) {
        uint8_t *recycled;

        if (row < rows - 1)
            in_row += src_step;
        else
            in_row -= src_step;
        line_copy8(line[2], in_row, cols, 1);

        for (col = 0; col < cols; col++) {
            int acc = 0;
            int r, c;

            /* Taps are visited row by row, left to right, matching coeff[]. */
            for (r = 0; r < 3; r++)
                for (c = 0; c < 3; c++)
                    acc += line[r][col + c - 1] * coeff[3 * r + c];

            acc = (int)(acc * scale + offset + 0.5f);
            out_row[col] = av_clip_uint8(acc);
        }

        /* Rotate the ring; the oldest slot receives the next row. */
        recycled = (line[2] == ring_last) ? ring_first : line[2] + line_step;
        line[0] = line[1];
        line[1] = line[2];
        line[2] = recycled;

        out_row += dst_step;
    }
}
Beispiel #7
0
/*
 * SSE2 kernel for one 8-bit plane: each output pixel is the minimum of the
 * source pixel and those of its eight 3x3 neighbours whose enable[] entry is
 * non-zero, but never more than `th` below the source pixel.
 *
 *   buff    scratch memory holding three line buffers spaced `bstride` apart
 *   width   pixels per row; processed 16 at a time, so loads/stores may run
 *           past `width` up to the next multiple of 16
 *   stride  row pitch used for both srcp and dstp
 *   th      maximum allowed decrease; values above 255 are saturated to 255
 *   enable  eight flags, ordered top-left, top, top-right, left, right,
 *           bottom-left, bottom, bottom-right
 *
 * NOTE(review): the line above the first row is initialised from row 0 itself,
 * while past the last row the source pointer steps back one row; confirm that
 * this asymmetric border handling is intended.
 */
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
               uint8_t *dstp, const uint8_t *srcp, int th, int *enable)
{
    uint8_t *const ring_first = buff + 16;
    uint8_t *const ring_last  = ring_first + 2 * bstride;
    uint8_t *above  = ring_first;
    uint8_t *centre = ring_first + bstride;
    uint8_t *below  = ring_last;
    const uint8_t clamped_th = th > 255 ? 255 : (uint8_t)th;

    line_copy8(above, srcp, width, 1);
    line_copy8(centre, srcp, width, 1);

    const __m128i max_drop = _mm_set1_epi8((int8_t)clamped_th);

    for (int row = 0; row < height; row++) {
        /* Fetch the next row; after the last row step back instead. */
        if (row < height - 1) {
            srcp += stride;
        } else {
            srcp -= stride;
        }
        line_copy8(below, srcp, width, 1);

        uint8_t *neighbour[8];
        neighbour[0] = above - 1;
        neighbour[1] = above;
        neighbour[2] = above + 1;
        neighbour[3] = centre - 1;
        neighbour[4] = centre + 1;
        neighbour[5] = below - 1;
        neighbour[6] = below;
        neighbour[7] = below + 1;

        for (int col = 0; col < width; col += 16) {
            const __m128i cur = _mm_load_si128((__m128i *)(centre + col));
            __m128i lowest = cur;

            for (int k = 0; k < 8; k++) {
                if (!enable[k]) {
                    continue;
                }
                lowest = _mm_min_epu8(
                    _mm_loadu_si128((__m128i *)(neighbour[k] + col)), lowest);
            }

            /* Do not let the result fall below cur - th (saturating at 0). */
            lowest = _mm_max_epu8(lowest, _mm_subs_epu8(cur, max_drop));
            _mm_store_si128((__m128i *)(dstp + col), lowest);
        }

        dstp += stride;

        /* Rotate the ring of line buffers. */
        uint8_t *recycled = (below == ring_last) ? ring_first : below + bstride;
        above  = centre;
        centre = below;
        below  = recycled;
    }
}
Beispiel #8
0
/*
 * Scalar edge detector for one 8-bit plane.
 *
 * For every pixel the 3x3 Sobel responses gx and gy are combined into
 * round(sqrt(gx^2 + gy^2)), shifted right by eh->rshift and thresholded:
 * values >= min(eh->max, 255) become 255, then values <= min(eh->min, 255)
 * become 0.
 *
 *   buff    scratch memory holding three line buffers spaced `bstride` apart
 *   stride  row pitch used for both srcp and dstp
 *   plane_max  not used by this function
 */
static void VS_CC
proc_8bit(uint8_t *buff, int bstride, int width, int height, int stride,
          uint8_t *dstp, const uint8_t *srcp, edge_t *eh, uint16_t plane_max)
{
    uint8_t *const ring_first = buff + 16;
    uint8_t *const ring_last  = ring_first + 2 * bstride;
    uint8_t *above  = ring_first;
    uint8_t *centre = ring_first + bstride;
    uint8_t *below  = ring_last;

    /* The row above the first row is stood in for by row 1. */
    line_copy8(above, srcp + stride, width, 1);
    line_copy8(centre, srcp, width, 1);

    const int lo = min_int(eh->min, 255);
    const int hi = min_int(eh->max, 255);

    for (int row = 0; row < height; row++) {
        /* Fetch the next row; after the last row step back instead. */
        if (row < height - 1) {
            srcp += stride;
        } else {
            srcp -= stride;
        }
        line_copy8(below, srcp, width, 1);

        for (int col = 0; col < width; col++) {
            const int l = col - 1;
            const int r = col + 1;
            const int gx = (above[r] - above[l])
                         + 2 * (centre[r] - centre[l])
                         + (below[r] - below[l]);
            const int gy = (below[l] - above[l])
                         + 2 * (below[col] - above[col])
                         + (below[r] - above[r]);
            int mag = (int)(sqrtf(gx * gx + gy * gy) + 0.5f);

            mag >>= eh->rshift;
            /* Upper threshold first, then lower, so the lower one wins. */
            mag = (mag >= hi) ? 255 : mag;
            dstp[col] = (mag <= lo) ? 0 : mag;
        }

        dstp += stride;

        /* Rotate the ring of line buffers. */
        uint8_t *recycled = (below == ring_last) ? ring_first : below + bstride;
        above  = centre;
        centre = below;
        below  = recycled;
    }
}
Beispiel #9
0
/*
 * Map a possibly out-of-range row index onto a valid row of a plane with
 * `height` rows by reflecting about the first/last row without repeating the
 * edge row (-1 -> 1, height -> height - 2). A final clamp keeps the result
 * non-negative for planes too short for a single reflection.
 */
static inline int conv7x7_mirror_row(int row, int height)
{
    if (row < 0)
        row = -row;
    if (row >= height)
        row = 2 * (height - 1) - row;
    if (row < 0)
        row = 0;
    return row;
}

/*
 * Slice worker: 7x7 integer convolution of one plane for rows
 * [height * jobnr / nb_jobs, height * (jobnr + 1) / nb_jobs).
 *
 * `arg` is a ThreadData carrying the input frame, output frame and plane.
 * Seven line buffers from s->bptrs[jobnr] form a ring holding source rows
 * y-3 .. y+3. Every source row is selected through conv7x7_mirror_row(), so
 * slices that begin within three rows of the top or end within three rows of
 * the bottom get correctly reflected borders and never read outside the plane.
 * Each sum is multiplied by s->rdiv[plane], offset by s->bias[plane], rounded
 * by adding 0.5 and truncating, then clipped with av_clip_uint8().
 * Always returns 0.
 */
static int filter_7x7(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
    ConvolutionContext *s = ctx->priv;
    ThreadData *td = arg;
    AVFrame *in = td->in;
    AVFrame *out = td->out;
    const int plane = td->plane;
    const int stride = in->linesize[plane];
    const int bstride = s->bstride;
    const int height = s->planeheight[plane];
    const int width  = s->planewidth[plane];
    const int slice_start = (height * jobnr) / nb_jobs;
    const int slice_end = (height * (jobnr+1)) / nb_jobs;
    const uint8_t *base = in->data[plane];
    uint8_t *dst = out->data[plane] + slice_start * out->linesize[plane];
    uint8_t *p0 = s->bptrs[jobnr] + 32;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *p3 = p2 + bstride;
    uint8_t *p4 = p3 + bstride;
    uint8_t *p5 = p4 + bstride;
    uint8_t *p6 = p5 + bstride;
    uint8_t *orig = p0, *end = p6;
    const int *matrix = s->matrix[plane];
    float rdiv = s->rdiv[plane];
    float bias = s->bias[plane];
    int y, x, i;

    /* Prime the ring with rows slice_start-3 .. slice_start+2 (reflected). */
    line_copy8(p0, base + conv7x7_mirror_row(slice_start - 3, height) * stride, width, 3);
    line_copy8(p1, base + conv7x7_mirror_row(slice_start - 2, height) * stride, width, 3);
    line_copy8(p2, base + conv7x7_mirror_row(slice_start - 1, height) * stride, width, 3);
    line_copy8(p3, base + slice_start * stride, width, 3);
    line_copy8(p4, base + conv7x7_mirror_row(slice_start + 1, height) * stride, width, 3);
    line_copy8(p5, base + conv7x7_mirror_row(slice_start + 2, height) * stride, width, 3);

    for (y = slice_start; y < slice_end; y++) {
        /* Tap pointers in kernel order: row by row, left to right. */
        uint8_t *array[] = {
            p0 - 3, p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2, p0 + 3,
            p1 - 3, p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2, p1 + 3,
            p2 - 3, p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2, p2 + 3,
            p3 - 3, p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2, p3 + 3,
            p4 - 3, p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2, p4 + 3,
            p5 - 3, p5 - 2, p5 - 1, p5, p5 + 1, p5 + 2, p5 + 3,
            p6 - 3, p6 - 2, p6 - 1, p6, p6 + 1, p6 + 2, p6 + 3,
        };

        /* Bring in row y + 3, reflected at the bottom of the plane. */
        line_copy8(p6, base + conv7x7_mirror_row(y + 3, height) * stride, width, 3);

        for (x = 0; x < width; x++) {
            int sum = 0;

            for (i = 0; i < 49; i++) {
                sum += *(array[i] + x) * matrix[i];
            }
            sum = (int)(sum * rdiv + bias + 0.5f);
            dst[x] = av_clip_uint8(sum);
        }

        /* Rotate the ring; the oldest slot receives the next row. */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = p5;
        p5 = p6;
        p6 = (p6 == end) ? orig: p6 + bstride;
        dst += out->linesize[plane];
    }

    return 0;
}
/*
 * SSE2 kernel: 5-tap vertical convolution followed by a 5-tap horizontal
 * convolution on one 8-bit plane, 16 pixels per iteration.
 *
 *   ch      supplies the tap weights m_v[5] / m_h[5], the scale factors
 *           rdiv_v / rdiv_h, the bias and the saturate flag
 *   buff    scratch memory holding five line buffers spaced `bstride` apart
 *   stride  row pitch used for both srcp and dstp
 *
 * Pass j == 0 filters the column p0..p4 with m_v and stores the result to
 * dstp; pass j == 1 then filters with m_h, adds the bias and overwrites dstp.
 * When ch->saturate is zero, negative intermediate results are replaced by
 * their absolute value after each pass; otherwise they saturate to 0 when
 * packed to bytes.
 */
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(convolution_hv_t *ch, uint8_t *buff, int bstride, int width,
               int height, int stride, uint8_t *dstp, const uint8_t *srcp)
{
    /* Ring of five line buffers holding source rows y-2 .. y+2. */
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *p3 = p2 + bstride;
    uint8_t *p4 = p3 + bstride;
    uint8_t *orig = p0, *end = p4;

    /* Rows -2 and -1 are stood in for by rows 2 and 1. */
    line_copy8(p0, srcp + 2 * stride, width, 2);
    line_copy8(p1, srcp + stride, width, 2);
    line_copy8(p2, srcp, width, 2);
    srcp += stride;
    line_copy8(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    /* all1: every bit set; one: the value 1 in each 16-bit lane. */
    __m128i all1 = _mm_cmpeq_epi32(zero, zero);
    __m128i one = _mm_srli_epi16(all1, 15);
    __m128 rdiv_h = _mm_set1_ps((float)ch->rdiv_h);
    __m128 rdiv_v = _mm_set1_ps((float)ch->rdiv_v);
    __m128 bias = _mm_set1_ps((float)ch->bias);
    
    /*
     * Each 32-bit lane holds the weight in its low 16 bits and 0 in its high
     * 16 bits, so _mm_madd_epi16 against a zero-extended pixel yields
     * pixel * weight as a 32-bit value.
     */
    __m128i matrix_h[5];
    __m128i matrix_v[5];
    for (int i = 0; i < 5; i++) {
        matrix_h[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m_h[i]), zero);
        matrix_v[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m_v[i]), zero);
    }

    for (int y = 0; y < height; y++) {
        /* Step to row y + 2, or walk back one row once past the last row. */
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy8(p4, srcp, width, 2);

        for (int x = 0; x < width; x += 16) {
            /*
             * Entries 0-4: vertical taps. Entries 5-9: horizontal taps; note
             * that the centre tap reads dstp + x (the vertical result stored
             * by pass 0) while the other four read the unfiltered line p2.
             */
            uint8_t *array[] = {
                p0 + x, p1 + x, p2 + x, p3 + x, p4 + x,
                p2 + x - 2, p2 + x - 1, dstp + x, p2 + x + 1, p2 + x + 2
            };

            for (int j = 0; j < 2; j++) {
                __m128i *matrix = j == 0 ? matrix_v : matrix_h;
                /* Four vectors of four 32-bit sums cover the 16 pixels. */
                __m128i sum[4];
                sum[0] = _mm_setzero_si128();
                sum[1] = _mm_setzero_si128();
                sum[2] = _mm_setzero_si128();
                sum[3] = _mm_setzero_si128();

                for (int i = 0; i < 5; i++) {
                    __m128i xmm0, xmm1, xmm2;

                    /* Widen 16 bytes to 4 x 4 32-bit lanes and accumulate. */
                    xmm0 = _mm_loadu_si128((__m128i *)array[i + j * 5]);
                    xmm2 = _mm_unpackhi_epi8(xmm0, zero);
                    xmm0 = _mm_unpacklo_epi8(xmm0, zero);

                    xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                    xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                    sum[0] = _mm_add_epi32(sum[0], _mm_madd_epi16(xmm0, matrix[i]));
                    sum[1] = _mm_add_epi32(sum[1], _mm_madd_epi16(xmm1, matrix[i]));

                    xmm1 = _mm_unpackhi_epi16(xmm2, zero);
                    xmm0 = _mm_unpacklo_epi16(xmm2, zero);
                    sum[2] = _mm_add_epi32(sum[2], _mm_madd_epi16(xmm0, matrix[i]));
                    sum[3] = _mm_add_epi32(sum[3], _mm_madd_epi16(xmm1, matrix[i]));
                }

                /* Scale in float; the bias is applied on the second pass only. */
                for (int i = 0; i < 4; i++) {
                    __m128 sumfp = _mm_cvtepi32_ps(sum[i]);
                    sumfp = _mm_mul_ps(sumfp, j == 0 ? rdiv_v : rdiv_h);
                    if (j == 1) {
                        sumfp = _mm_add_ps(sumfp, bias);
                    }
                    /* Conversion truncates toward zero. */
                    sum[i] = _mm_cvttps_epi32(sumfp);
                }

                /* Narrow to signed 16-bit lanes with saturation. */
                sum[0] = _mm_packs_epi32(sum[0], sum[1]);
                sum[1] = _mm_packs_epi32(sum[2], sum[3]);

                if (!ch->saturate) {
                    /* Absolute value: negate (~v + 1) the lanes that are < 0. */
                    for (int i = 0; i < 2; i++) {
                        __m128i mask = _mm_cmplt_epi16(sum[i], zero);
                        __m128i temp = _mm_add_epi16(one, _mm_xor_si128(sum[i], all1));
                        temp = _mm_and_si128(temp, mask);
                        sum[i] = _mm_andnot_si128(mask, sum[i]);
                        sum[i] = _mm_or_si128(sum[i], temp);
                    }
                }

                /* Narrow to unsigned bytes with saturation to [0, 255]. */
                sum[0] = _mm_packus_epi16(sum[0], sum[1]);

                _mm_store_si128((__m128i *)(dstp + x), sum[0]);
            }
        }
        dstp += stride;
        /* Rotate the ring of line buffers. */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
Beispiel #11
0
/*
 * SSE2 edge detector for one 8-bit plane, 16 pixels per iteration.
 *
 * A horizontal response is built from the pixels at x-2, x-1, x+1, x+2 of the
 * current row and a vertical response from the pixels at x in rows y-2, y-1,
 * y+1, y+2, weighted by the tables ar_mulx / ar_muly (defined elsewhere).
 * The two absolute responses are combined as
 *     (max * 15 >> 4) + (min * 15 >> 5)
 * which appears to be a cheap approximation of the gradient magnitude,
 * shifted right by eh->rshift and thresholded: values >= min(eh->max, 255)
 * become 255, then values <= min(eh->min, 255) become 0.
 *
 *   buff    scratch memory holding five line buffers spaced `bstride` apart
 *   stride  row pitch used for both srcp and dstp
 *   plane_max  not used by this function
 *
 * The vertical taps and dstp + x use aligned loads/stores.
 */
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
               uint8_t *dstp, const uint8_t *srcp, edge_t *eh,
               uint16_t plane_max)
{
    /* Ring of five line buffers holding source rows y-2 .. y+2. */
    uint8_t* p0 = buff + 16;
    uint8_t* p1 = p0 + bstride;
    uint8_t* p2 = p1 + bstride;
    uint8_t* p3 = p2 + bstride;
    uint8_t* p4 = p3 + bstride;
    uint8_t* orig = p0;
    uint8_t* end = p4;

    /* Rows -2 and -1 are stood in for by rows 2 and 1. */
    line_copy8(p0, srcp + 2 * stride, width, 2);
    line_copy8(p1, srcp + stride, width, 2);
    line_copy8(p2, srcp, width, 2);
    srcp += stride;
    line_copy8(p3, srcp, width, 2);

    /* Thresholds saturated to the 8-bit range. */
    uint8_t th_min = eh->min > 0xFF ? 0xFF : (uint8_t)eh->min;
    uint8_t th_max = eh->max > 0xFF ? 0xFF : (uint8_t)eh->max;

    __m128i zero = _mm_setzero_si128();
    /* Multiplier 15 used by the magnitude approximation below. */
    __m128i ab = _mm_set1_epi16(15);
    __m128i max = _mm_set1_epi8((int8_t)th_max);
    __m128i min = _mm_set1_epi8((int8_t)th_min);

    for (int y = 0; y < height; y++) {
        /* Step to row y + 2, or walk back one row once past the last row. */
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy8(p4, srcp, width, 2);
        /* Horizontal taps on the current row; vertical taps on the other rows. */
        uint8_t* posh[] = {p2 - 2, p2 - 1, p2 + 1, p2 + 2};
        uint8_t* posv[] = {p0, p1, p3, p4};

        for (int x = 0; x < width; x += 16) {
            /* [0] covers pixels 0-7, [1] covers pixels 8-15, as 16-bit lanes. */
            __m128i sumx[2] = {zero, zero};
            __m128i sumy[2] = {zero, zero};

            for (int i = 0; i < 4; i++) {
                __m128i xmm0, xmm1, xmul;
                xmul = _mm_load_si128((__m128i *)ar_mulx[i]);
                xmm0 = _mm_loadu_si128((__m128i *)(posh[i] + x));
                xmm1 = _mm_unpackhi_epi8(xmm0, zero);
                xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                sumx[0] = _mm_add_epi16(sumx[0], _mm_mullo_epi16(xmm0, xmul));
                sumx[1] = _mm_add_epi16(sumx[1], _mm_mullo_epi16(xmm1, xmul));

                xmul = _mm_load_si128((__m128i *)ar_muly[i]);
                xmm0 = _mm_load_si128((__m128i *)(posv[i] + x));
                xmm1 = _mm_unpackhi_epi8(xmm0, zero);
                xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                sumy[0] = _mm_add_epi16(sumy[0], _mm_mullo_epi16(xmm0, xmul));
                sumy[1] = _mm_add_epi16(sumy[1], _mm_mullo_epi16(xmm1, xmul));
            }

            for (int i = 0; i < 2; i++) {
                __m128i xmax, xmin, mull, mulh;
                sumx[i] = mm_abs_epi16(sumx[i]);
                sumy[i] = mm_abs_epi16(sumy[i]);
                xmax = _mm_max_epi16(sumx[i], sumy[i]);
                xmin = _mm_min_epi16(sumx[i], sumy[i]);

                /*
                 * larger * 15 / 16, computed in 32 bits; mm_cast_epi32 is
                 * defined elsewhere and presumably narrows the two 32-bit
                 * halves back into one vector of 16-bit lanes.
                 */
                mull = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpacklo_epi16(xmax, zero)), 4);
                mulh = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpackhi_epi16(xmax, zero)), 4);
                xmax = mm_cast_epi32(mull, mulh);

                /* smaller * 15 / 32 */
                mull = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpacklo_epi16(xmin, zero)), 5);
                mulh = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpackhi_epi16(xmin, zero)), 5);
                xmin = mm_cast_epi32(mull, mulh);

                sumx[i] = _mm_adds_epu16(xmax, xmin);
                sumx[i] = _mm_srli_epi16(sumx[i], eh->rshift);
            }

            /* Narrow to bytes with saturation to [0, 255]. */
            __m128i out = _mm_packus_epi16(sumx[0], sumx[1]);
            /* Lanes with out >= th_max are forced to 0xFF. */
            __m128i temp = _mm_min_epu8(out, max);
            temp = _mm_cmpeq_epi8(temp, max);
            out = _mm_or_si128(temp, out);

            /* Lanes with out <= th_min are forced to 0. */
            temp = _mm_max_epu8(out, min);
            temp = _mm_cmpeq_epi8(temp, min);
            out = _mm_andnot_si128(temp, out);

            _mm_store_si128((__m128i*)(dstp + x), out);
        }
        dstp += stride;
        /* Rotate the ring of line buffers. */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
Beispiel #12
0
/*
 * SSE2 kernel: 5x5 integer convolution of one 8-bit plane, 16 pixels per
 * iteration.
 *
 *   ch      supplies the 25 weights m[] (row by row, left to right), the
 *           scale factor rdiv, the bias and the saturate flag
 *   buff    scratch memory holding five line buffers spaced `bstride` apart
 *   width   pixels per row; loads/stores may run past `width` up to the next
 *           multiple of 16
 *   stride  row pitch used for both srcp and dstp
 *
 * Each weighted sum is multiplied by rdiv and offset by bias in float. When
 * ch->saturate is zero, mm_abs_ps() (defined elsewhere) is applied first.
 * The value is then truncated toward zero and saturated to [0, 255].
 */
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width,
               int height, int stride, uint8_t *dstp, const uint8_t *srcp)
{
    uint8_t *const ring_first = buff + 16;
    uint8_t *const ring_last  = ring_first + 4 * bstride;
    uint8_t *line[5];
    for (int r = 0; r < 5; r++) {
        line[r] = ring_first + r * bstride;
    }

    /* Rows -2 and -1 are stood in for by rows 2 and 1. */
    line_copy8(line[0], srcp + 2 * stride, width, 2);
    line_copy8(line[1], srcp + stride, width, 2);
    line_copy8(line[2], srcp, width, 2);
    srcp += stride;
    line_copy8(line[3], srcp, width, 2);

    const __m128i zero = _mm_setzero_si128();
    const __m128 scale = _mm_set1_ps((float)ch->rdiv);
    const __m128 offset = _mm_set1_ps((float)ch->bias);

    /*
     * Each 32-bit lane holds the weight in its low half and 0 in its high
     * half, so _mm_madd_epi16 against a zero-extended pixel gives
     * pixel * weight.
     */
    __m128i coeff[25];
    for (int t = 0; t < 25; t++) {
        coeff[t] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m[t]), zero);
    }

    for (int row = 0; row < height; row++) {
        /* Step to row + 2, or walk back one row once past the last row. */
        if (row < height - 2) {
            srcp += stride;
        } else {
            srcp -= stride;
        }
        line_copy8(line[4], srcp, width, 2);

        for (int col = 0; col < width; col += 16) {
            /* Four vectors of four 32-bit sums cover the 16 pixels. */
            __m128i acc[4] = { zero, zero, zero, zero };

            for (int t = 0; t < 25; t++) {
                /* Tap t sits on line t / 5 at horizontal offset t % 5 - 2. */
                const uint8_t *tap = line[t / 5] + (t % 5) - 2 + col;
                const __m128i px = _mm_loadu_si128((const __m128i *)tap);
                const __m128i lo16 = _mm_unpacklo_epi8(px, zero);
                const __m128i hi16 = _mm_unpackhi_epi8(px, zero);

                acc[0] = _mm_add_epi32(acc[0],
                             _mm_madd_epi16(_mm_unpacklo_epi16(lo16, zero), coeff[t]));
                acc[1] = _mm_add_epi32(acc[1],
                             _mm_madd_epi16(_mm_unpackhi_epi16(lo16, zero), coeff[t]));
                acc[2] = _mm_add_epi32(acc[2],
                             _mm_madd_epi16(_mm_unpacklo_epi16(hi16, zero), coeff[t]));
                acc[3] = _mm_add_epi32(acc[3],
                             _mm_madd_epi16(_mm_unpackhi_epi16(hi16, zero), coeff[t]));
            }

            for (int q = 0; q < 4; q++) {
                __m128 v = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(acc[q]), scale), offset);
                if (!ch->saturate) {
                    v = mm_abs_ps(v);
                }
                acc[q] = _mm_cvttps_epi32(v);
            }

            /* 32 -> 16 (signed saturation) -> 8 bits (unsigned saturation). */
            const __m128i lo = _mm_packs_epi32(acc[0], acc[1]);
            const __m128i hi = _mm_packs_epi32(acc[2], acc[3]);
            _mm_store_si128((__m128i *)(dstp + col), _mm_packus_epi16(lo, hi));
        }

        dstp += stride;

        /* Rotate the ring; the oldest slot receives the next row. */
        uint8_t *recycled = (line[4] == ring_last) ? ring_first : line[4] + bstride;
        line[0] = line[1];
        line[1] = line[2];
        line[2] = line[3];
        line[3] = line[4];
        line[4] = recycled;
    }
}