/**
 * Run the plane's 5x5 integer kernel over one 8-bit plane.
 *
 * Five source rows live in a ring carved out of s->buffer (first row at
 * offset 16, consecutive rows s->bstride bytes apart). Rows above the first
 * and below the last plane row are taken from the reflected position.
 * line_copy8() is defined elsewhere; it is expected to copy `width` pixels and
 * fill 2 margin pixels on each side, since taps at x - 2 .. x + 2 are read.
 *
 * @param s     context providing kernel, rdiv, bias, plane sizes and scratch
 * @param in    source frame
 * @param out   destination frame
 * @param plane plane index to process
 */
static void filter_5x5(ConvolutionContext *s, AVFrame *in, AVFrame *out, int plane)
{
    const int src_pitch  = in->linesize[plane];
    const int dst_pitch  = out->linesize[plane];
    const int ring_pitch = s->bstride;
    const int rows_total = s->planeheight[plane];
    const int cols_total = s->planewidth[plane];
    const int *kernel    = s->matrix[plane];
    float rdiv = s->rdiv[plane];
    float bias = s->bias[plane];
    const uint8_t *feed = in->data[plane];
    uint8_t *dst_row = out->data[plane];
    uint8_t *ring_first = s->buffer + 16;
    uint8_t *ring_last  = ring_first + 4 * ring_pitch;
    uint8_t *row[5];
    int r, c, ky, kx;

    for (r = 0; r < 5; r++)
        row[r] = ring_first + r * ring_pitch;

    /* Prime the window for output row 0: rows -2 and -1 are reflections. */
    line_copy8(row[0], feed + 2 * src_pitch, cols_total, 2);
    line_copy8(row[1], feed + src_pitch, cols_total, 2);
    line_copy8(row[2], feed, cols_total, 2);
    feed += src_pitch;
    line_copy8(row[3], feed, cols_total, 2);

    for (r = 0; r < rows_total; r++) {
        uint8_t *incoming = row[4];

        /* Walk forward while rows remain, then walk back to reflect. */
        if (r < rows_total - 2)
            feed += src_pitch;
        else
            feed -= src_pitch;
        line_copy8(incoming, feed, cols_total, 2);

        for (c = 0; c < cols_total; c++) {
            int sum = 0;

            for (ky = 0; ky < 5; ky++) {
                const uint8_t *tap = row[ky] + c - 2;

                for (kx = 0; kx < 5; kx++)
                    sum += tap[kx] * kernel[5 * ky + kx];
            }
            sum = (int)(sum * rdiv + bias + 0.5f);
            dst_row[c] = av_clip_uint8(sum);
        }

        /* Slide the window down one row; the oldest slot is recycled. */
        row[0] = row[1];
        row[1] = row[2];
        row[2] = row[3];
        row[3] = incoming;
        row[4] = (incoming == ring_last) ? ring_first : incoming + ring_pitch;
        dst_row += dst_pitch;
    }
}
/**
 * Filter one input frame into a newly allocated output frame.
 *
 * Planes whose threshold is zero are copied unchanged. Every other plane is
 * processed row by row through s->filter, which receives the centre row and
 * pointers to its eight neighbours taken from a three-row ring in s->buffer.
 * Rows outside the plane are taken from the reflected position.
 *
 * Takes ownership of `in` and frees it on every path.
 *
 * @param inlink input link the frame arrived on
 * @param in     input frame
 * @return result of ff_filter_frame(), or AVERROR(ENOMEM) if no output frame
 *         could be allocated
 */
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
    AVFilterContext *ctx = inlink->dst;
    AVFilterLink *outlink = ctx->outputs[0];
    NContext *s = ctx->priv;
    /*
     * Each ring row holds `width` pixels plus one margin pixel on each side
     * (row[-1] and row[width] are read through the neighbour pointers). The
     * rows used to be spaced exactly planewidth[0] apart, so for the
     * full-width plane the margins of one row overwrote the edge pixels of
     * its neighbours. Space the rows so that they cannot overlap.
     * NOTE(review): this needs s->buffer to hold at least
     * 3 * (planewidth[0] + 2) + 16 bytes -- confirm against the allocation.
     */
    const int bstride = s->planewidth[0] + 2;
    AVFrame *out;
    int plane, y;

    out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
    if (!out) {
        av_frame_free(&in);
        return AVERROR(ENOMEM);
    }
    av_frame_copy_props(out, in);

    for (plane = 0; plane < s->nb_planes; plane++) {
        const int threshold = s->threshold[plane];

        if (threshold) {
            const uint8_t *src = in->data[plane];
            uint8_t *dst = out->data[plane];
            int stride = in->linesize[plane];
            int height = s->planeheight[plane];
            int width  = s->planewidth[plane];
            uint8_t *p0 = s->buffer + 16;
            uint8_t *p1 = p0 + bstride;
            uint8_t *p2 = p1 + bstride;
            uint8_t *orig = p0, *end = p2;

            /* Row -1 is the reflection of row 1. */
            line_copy8(p0, src + stride, width, 1);
            line_copy8(p1, src, width, 1);

            for (y = 0; y < height; y++) {
                const uint8_t *coordinates[] = { p0 - 1, p0, p0 + 1,
                                                 p1 - 1,     p1 + 1,
                                                 p2 - 1, p2, p2 + 1};

                /* Step back on the last row so row `height` reflects. */
                src += stride * (y < height - 1 ? 1 : -1);
                line_copy8(p2, src, width, 1);

                s->filter(dst, p1, width, threshold, coordinates, s->coordinates);

                /* Rotate the ring; the oldest row slot is reused next. */
                p0 = p1;
                p1 = p2;
                p2 = (p2 == end) ? orig: p2 + bstride;
                dst += out->linesize[plane];
            }
        } else {
            av_image_copy_plane(out->data[plane], out->linesize[plane],
                                in->data[plane], in->linesize[plane],
                                s->planewidth[plane], s->planeheight[plane]);
        }
    }

    av_frame_free(&in);
    return ff_filter_frame(outlink, out);
}
/**
 * Slice worker: gradient magnitude of one 8-bit plane from two 3x3 kernels.
 *
 * Processes rows [height * jobnr / nb_jobs, height * (jobnr + 1) / nb_jobs)
 * using the per-job three-row ring at s->bptrs[jobnr]. Each output pixel is
 * sqrt(a^2 + b^2) * scale + delta, clipped to 8 bits, where a is the
 * vertical and b the horizontal difference. Rows outside the plane are taken
 * from the reflected position.
 *
 * @param ctx     filter context (priv is a ConvolutionContext)
 * @param arg     ThreadData carrying in/out frames and the plane index
 * @param jobnr   index of this job
 * @param nb_jobs total number of jobs
 * @return 0
 */
static int filter_sobel(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
    ConvolutionContext *s = ctx->priv;
    ThreadData *td = arg;
    AVFrame *in = td->in;
    AVFrame *out = td->out;
    const int plane = td->plane;
    const int src_pitch  = in->linesize[plane];
    const int dst_pitch  = out->linesize[plane];
    const int ring_pitch = s->bstride;
    const int height = s->planeheight[plane];
    const int width  = s->planewidth[plane];
    const int first_row = (height * jobnr) / nb_jobs;
    const int end_row   = (height * (jobnr+1)) / nb_jobs;
    const float scale = s->scale;
    const float delta = s->delta;
    const uint8_t *src = in->data[plane] + first_row * src_pitch;
    uint8_t *dst = out->data[plane] + first_row * dst_pitch;
    uint8_t *ring_head = s->bptrs[jobnr] + 16;
    uint8_t *ring_tail = ring_head + 2 * ring_pitch;
    uint8_t *above = ring_head;
    uint8_t *here  = ring_head + ring_pitch;
    uint8_t *below = ring_tail;
    int row, col;

    /* The row above the slice: reflected at the plane top, real otherwise. */
    if (first_row == 0)
        line_copy8(above, src + src_pitch, width, 1);
    else
        line_copy8(above, src - src_pitch, width, 1);
    line_copy8(here, src, width, 1);

    for (row = first_row; row < end_row; row++) {
        uint8_t *recycled;

        if (row < height - 1)
            src += src_pitch;
        else
            src -= src_pitch;
        line_copy8(below, src, width, 1);

        for (col = 0; col < width; col++) {
            int suma = (below[col - 1] + 2 * below[col] + below[col + 1]) -
                       (above[col - 1] + 2 * above[col] + above[col + 1]);
            int sumb = (above[col + 1] + 2 * here[col + 1] + below[col + 1]) -
                       (above[col - 1] + 2 * here[col - 1] + below[col - 1]);

            dst[col] = av_clip_uint8(sqrt(suma*suma + sumb*sumb) * scale + delta);
        }

        /* Rotate the ring: the slot of the oldest row receives the next one. */
        recycled = (below == ring_tail) ? ring_head : below + ring_pitch;
        above = here;
        here  = below;
        below = recycled;
        dst += dst_pitch;
    }

    return 0;
}
/**
 * Slice worker: apply the plane's 3x3 integer kernel to an 8-bit plane.
 *
 * Processes rows [height * jobnr / nb_jobs, height * (jobnr + 1) / nb_jobs)
 * with the per-job three-row ring at s->bptrs[jobnr]. The weighted sum is
 * scaled by rdiv, offset by bias, rounded by adding 0.5 and truncating, then
 * clipped to 8 bits. Rows outside the plane come from the reflected position.
 *
 * @param ctx     filter context (priv is a ConvolutionContext)
 * @param arg     ThreadData carrying in/out frames and the plane index
 * @param jobnr   index of this job
 * @param nb_jobs total number of jobs
 * @return 0
 */
static int filter_3x3(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
    ConvolutionContext *s = ctx->priv;
    ThreadData *td = arg;
    AVFrame *in = td->in;
    AVFrame *out = td->out;
    const int plane = td->plane;
    const int src_pitch  = in->linesize[plane];
    const int dst_pitch  = out->linesize[plane];
    const int ring_pitch = s->bstride;
    const int height = s->planeheight[plane];
    const int width  = s->planewidth[plane];
    const int first_row = (height * jobnr) / nb_jobs;
    const int end_row   = (height * (jobnr+1)) / nb_jobs;
    const int *k = s->matrix[plane];
    const float rdiv = s->rdiv[plane];
    const float bias = s->bias[plane];
    const uint8_t *src = in->data[plane] + first_row * src_pitch;
    uint8_t *dst = out->data[plane] + first_row * dst_pitch;
    uint8_t *ring_head = s->bptrs[jobnr] + 16;
    uint8_t *ring_tail = ring_head + 2 * ring_pitch;
    uint8_t *top = ring_head;
    uint8_t *mid = ring_head + ring_pitch;
    uint8_t *bot = ring_tail;
    int row, col;

    /* The row above the slice: reflected at the plane top, real otherwise. */
    if (first_row == 0)
        line_copy8(top, src + src_pitch, width, 1);
    else
        line_copy8(top, src - src_pitch, width, 1);
    line_copy8(mid, src, width, 1);

    for (row = first_row; row < end_row; row++) {
        uint8_t *recycled;

        if (row < height - 1)
            src += src_pitch;
        else
            src -= src_pitch;
        line_copy8(bot, src, width, 1);

        for (col = 0; col < width; col++) {
            const uint8_t *a = top + col - 1;
            const uint8_t *b = mid + col - 1;
            const uint8_t *c = bot + col - 1;
            int sum = a[0] * k[0] + a[1] * k[1] + a[2] * k[2] +
                      b[0] * k[3] + b[1] * k[4] + b[2] * k[5] +
                      c[0] * k[6] + c[1] * k[7] + c[2] * k[8];

            sum = (int)(sum * rdiv + bias + 0.5f);
            dst[col] = av_clip_uint8(sum);
        }

        /* Rotate the ring: the slot of the oldest row receives the next one. */
        recycled = (bot == ring_tail) ? ring_head : bot + ring_pitch;
        top = mid;
        mid = bot;
        bot = recycled;
        dst += dst_pitch;
    }

    return 0;
}
/*
 * SSE2 kernel for one 8-bit plane, 16 pixels per step: each output pixel is
 * the average of eight neighbour samples, raised to at least the source
 * pixel and limited to source + th (saturating at 255).
 *
 * buff    scratch memory holding a three-row ring; rows start at buff + 16
 *         and are bstride bytes apart. p1 + x is read with an aligned load,
 *         so buff and bstride must keep those addresses 16-byte aligned.
 * width   pixels per row; full 16-byte vectors are stored, so the last store
 *         of a row can extend past `width` (presumably covered by row
 *         padding -- confirm against the caller).
 * height  rows in the plane
 * stride  byte pitch used for both srcp and dstp
 * dstp    destination plane; written with aligned 16-byte stores
 * srcp    source plane
 * th      maximum increase over the source pixel; values above 255 are
 *         treated as 255
 *
 * Rows outside the plane are taken from the reflected position. COORDINATES
 * is a macro defined elsewhere; presumably it expands to the eight neighbour
 * pointers built from p0/p1/p2 -- confirm.
 */
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp, int th)
{
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *orig = p0, *end = p2;

    /* Row -1 is the reflection of row 1. */
    line_copy8(p0, srcp + stride, width, 1);
    line_copy8(p1, srcp, width, 1);

    /*
     * Saturate instead of truncating: a plain (uint8_t) cast wraps, so a
     * threshold of 256 would silently become 0.
     */
    uint8_t threshold = th > 255 ? 255 : (uint8_t)th;

    __m128i zero = _mm_setzero_si128();
    __m128i xth = _mm_set1_epi8((int8_t)threshold);

    for (int y = 0; y < height; y++) {
        /* Step back on the last row so row `height` reflects. */
        srcp += stride * (y < height - 1 ? 1 : -1);
        line_copy8(p2, srcp, width, 1);
        uint8_t *coordinates[] = COORDINATES;

        for (int x = 0; x < width; x += 16) {
            __m128i sumlo = zero;
            __m128i sumhi = zero;

            /* Widen to 16 bits so eight 8-bit samples can be summed. */
            for (int i = 0; i < 8; i++) {
                __m128i target = _mm_loadu_si128((__m128i *)(coordinates[i] + x));
                sumlo = _mm_add_epi16(sumlo, _mm_unpacklo_epi8(target, zero));
                sumhi = _mm_add_epi16(sumhi, _mm_unpackhi_epi8(target, zero));
            }

            /* Divide by eight and narrow back to bytes. */
            sumlo = _mm_srai_epi16(sumlo, 3);
            sumhi = _mm_srai_epi16(sumhi, 3);
            sumlo = _mm_packus_epi16(sumlo, sumhi);

            __m128i src = _mm_load_si128((__m128i *)(p1 + x));
            __m128i limit = _mm_adds_epu8(src, xth);
            sumlo = _mm_max_epu8(sumlo, src);
            sumlo = _mm_min_epu8(sumlo, limit);

            _mm_store_si128((__m128i *)(dstp + x), sumlo);
        }

        dstp += stride;
        /* Rotate the ring; the oldest row slot is reused next. */
        p0 = p1;
        p1 = p2;
        p2 = (p2 == end) ? orig : p2 + bstride;
    }
}
/**
 * Apply the plane's 3x3 integer kernel to a whole 8-bit plane.
 *
 * Three source rows are kept in a ring inside s->buffer (first row at offset
 * 16, rows s->bstride bytes apart). The weighted sum is scaled by rdiv,
 * offset by bias, rounded by adding 0.5 and truncating, then clipped to
 * 8 bits. Rows outside the plane come from the reflected position.
 *
 * @param s     context providing kernel, rdiv, bias, plane sizes and scratch
 * @param in    source frame
 * @param out   destination frame
 * @param plane plane index to process
 */
static void filter_3x3(ConvolutionContext *s, AVFrame *in, AVFrame *out, int plane)
{
    const int src_pitch  = in->linesize[plane];
    const int dst_pitch  = out->linesize[plane];
    const int ring_pitch = s->bstride;
    const int rows_total = s->planeheight[plane];
    const int cols_total = s->planewidth[plane];
    const int *kernel = s->matrix[plane];
    const float rdiv = s->rdiv[plane];
    const float bias = s->bias[plane];
    const uint8_t *feed = in->data[plane];
    uint8_t *dst_row = out->data[plane];
    uint8_t *ring_first = s->buffer + 16;
    uint8_t *ring_last  = ring_first + 2 * ring_pitch;
    uint8_t *row[3];
    int r, c, ky, kx;

    row[0] = ring_first;
    row[1] = ring_first + ring_pitch;
    row[2] = ring_last;

    /* Row -1 is the reflection of row 1. */
    line_copy8(row[0], feed + src_pitch, cols_total, 1);
    line_copy8(row[1], feed, cols_total, 1);

    for (r = 0; r < rows_total; r++) {
        uint8_t *incoming = row[2];

        /* Walk forward while rows remain, then step back to reflect. */
        if (r < rows_total - 1)
            feed += src_pitch;
        else
            feed -= src_pitch;
        line_copy8(incoming, feed, cols_total, 1);

        for (c = 0; c < cols_total; c++) {
            int sum = 0;

            for (ky = 0; ky < 3; ky++) {
                const uint8_t *tap = row[ky] + c - 1;

                for (kx = 0; kx < 3; kx++)
                    sum += tap[kx] * kernel[3 * ky + kx];
            }
            sum = (int)(sum * rdiv + bias + 0.5f);
            dst_row[c] = av_clip_uint8(sum);
        }

        /* Slide the window down one row; the oldest slot is recycled. */
        row[0] = row[1];
        row[1] = incoming;
        row[2] = (incoming == ring_last) ? ring_first : incoming + ring_pitch;
        dst_row += dst_pitch;
    }
}
/*
 * SSE2 kernel for one 8-bit plane, 16 pixels per step: each output pixel is
 * the minimum of the source pixel and its enabled neighbours, but never less
 * than source - th (saturating at 0).
 *
 * buff    scratch memory holding a three-row ring; rows start at buff + 16
 *         and are bstride bytes apart. p1 + x is read with an aligned load,
 *         so buff and bstride must keep those addresses 16-byte aligned.
 * width   pixels per row; full 16-byte vectors are stored, so the last store
 *         of a row can extend past `width` (presumably covered by row
 *         padding -- confirm against the caller).
 * height  rows in the plane
 * stride  byte pitch used for both srcp and dstp
 * dstp    destination plane; written with aligned 16-byte stores
 * srcp    source plane
 * th      maximum decrease below the source pixel; values above 255 are
 *         treated as 255
 * enable  eight flags, one per neighbour in the order
 *         top-left, top, top-right, left, right, bottom-left, bottom,
 *         bottom-right; a zero flag excludes that neighbour
 */
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp, int th, int *enable)
{
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *orig = p0, *end = p2;

    uint8_t threshold = th > 255 ? 255 : (uint8_t)th;

    /*
     * Row -1 is the reflection of row 1. The bottom border below already
     * reflects (it steps back one row after the last), so seeding the top
     * from row 0 instead made the two borders behave differently whenever
     * only some neighbours are enabled.
     */
    line_copy8(p0, srcp + stride, width, 1);
    line_copy8(p1, srcp, width, 1);

    __m128i xth = _mm_set1_epi8((int8_t)threshold);

    for (int y = 0; y < height; y++) {
        /* Step back on the last row so row `height` reflects. */
        srcp += stride * (y < height - 1 ? 1 : -1);
        line_copy8(p2, srcp, width, 1);
        uint8_t *coordinates[] = {p0 - 1, p0, p0 + 1,
                                  p1 - 1,     p1 + 1,
                                  p2 - 1, p2, p2 + 1};

        for (int x = 0; x < width; x += 16) {
            __m128i src = _mm_load_si128((__m128i *)(p1 + x));
            __m128i min = src;

            for (int i = 0; i < 8; i++) {
                if (enable[i]) {
                    __m128i target = _mm_loadu_si128((__m128i *)(coordinates[i] + x));
                    min = _mm_min_epu8(target, min);
                }
            }

            /* Do not drop more than `threshold` below the source pixel. */
            __m128i limit = _mm_subs_epu8(src, xth);
            min = _mm_max_epu8(min, limit);

            _mm_store_si128((__m128i *)(dstp + x), min);
        }

        dstp += stride;
        /* Rotate the ring; the oldest row slot is reused next. */
        p0 = p1;
        p1 = p2;
        p2 = (p2 == end) ? orig : p2 + bstride;
    }
}
/*
 * Scalar edge detector for one 8-bit plane.
 *
 * For every pixel the horizontal and vertical 3x3 differences are combined
 * as round(sqrt(gx^2 + gy^2)), shifted right by eh->rshift, then
 * thresholded: values >= min(eh->max, 255) become 255 and values
 * <= min(eh->min, 255) become 0 (the second test is applied after the first).
 *
 * buff      scratch memory holding a three-row ring; rows start at buff + 16
 *           and are bstride bytes apart
 * width     pixels per row
 * height    rows in the plane
 * stride    byte pitch used for both srcp and dstp
 * dstp      destination plane
 * srcp      source plane
 * eh        thresholds (min, max) and the right shift (rshift)
 * plane_max not used by this function
 *
 * Rows outside the plane are taken from the reflected position.
 */
static void VS_CC proc_8bit(uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp, edge_t *eh, uint16_t plane_max)
{
    uint8_t *ring_first = buff + 16;
    uint8_t *ring_last  = ring_first + 2 * bstride;
    uint8_t *up   = ring_first;
    uint8_t *mid  = ring_first + bstride;
    uint8_t *down = ring_last;

    /* Row -1 is the reflection of row 1. */
    line_copy8(up, srcp + stride, width, 1);
    line_copy8(mid, srcp, width, 1);

    int th_min = min_int(eh->min, 255);
    int th_max = min_int(eh->max, 255);

    for (int row = 0; row < height; row++) {
        /* Walk forward while rows remain, then step back to reflect. */
        if (row < height - 1)
            srcp += stride;
        else
            srcp -= stride;
        line_copy8(down, srcp, width, 1);

        for (int col = 0; col < width; col++) {
            const int tl = up[col - 1],   tc = up[col],   tr = up[col + 1];
            const int ml = mid[col - 1],                  mr = mid[col + 1];
            const int bl = down[col - 1], bc = down[col], br = down[col + 1];
            int gx = (tr + 2 * mr + br) - (tl + 2 * ml + bl);
            int gy = (bl + 2 * bc + br) - (tl + 2 * tc + tr);
            int g = (int)(sqrtf(gx * gx + gy * gy) + 0.5f);

            g >>= eh->rshift;
            if (g >= th_max) {
                g = 255;
            }
            if (g <= th_min) {
                g = 0;
            }
            dstp[col] = g;
        }

        dstp += stride;

        /* Rotate the ring: the slot of the oldest row receives the next one. */
        uint8_t *recycled = (down == ring_last) ? ring_first : down + bstride;
        up   = mid;
        mid  = down;
        down = recycled;
    }
}
/*
 * Map row index y onto [0, height - 1] by reflecting around the first and
 * last rows (the border row itself is not repeated). Indices that are still
 * out of range on planes shorter than the kernel radius are clamped.
 * Requires height >= 1.
 */
static inline int filter_7x7_reflect_row(int y, int height)
{
    if (y < 0)
        y = -y;
    if (y >= height)
        y = 2 * (height - 1) - y;
    if (y < 0)
        y = 0;
    if (y >= height)
        y = height - 1;
    return y;
}

/**
 * Slice worker: apply the plane's 7x7 integer kernel to an 8-bit plane.
 *
 * Processes rows [height * jobnr / nb_jobs, height * (jobnr + 1) / nb_jobs)
 * with the per-job seven-row ring at s->bptrs[jobnr] (first row at offset
 * 32, rows s->bstride bytes apart). The weighted sum is scaled by rdiv,
 * offset by bias, rounded by adding 0.5 and truncating, then clipped to
 * 8 bits.
 *
 * Every window row is fetched by its absolute, reflected row index. The
 * previous relative stepping picked the wrong rows for slices starting at
 * row 1 or 2, and read past the end of the plane for slices starting within
 * two rows of the bottom.
 *
 * @param ctx     filter context (priv is a ConvolutionContext)
 * @param arg     ThreadData carrying in/out frames and the plane index
 * @param jobnr   index of this job
 * @param nb_jobs total number of jobs
 * @return 0
 */
static int filter_7x7(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
    ConvolutionContext *s = ctx->priv;
    ThreadData *td = arg;
    AVFrame *in = td->in;
    AVFrame *out = td->out;
    const int plane = td->plane;
    const int stride = in->linesize[plane];
    const int bstride = s->bstride;
    const int height = s->planeheight[plane];
    const int width = s->planewidth[plane];
    const int slice_start = (height * jobnr) / nb_jobs;
    const int slice_end = (height * (jobnr+1)) / nb_jobs;
    const uint8_t *base = in->data[plane];
    uint8_t *dst = out->data[plane] + slice_start * out->linesize[plane];
    uint8_t *p0 = s->bptrs[jobnr] + 32;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *p3 = p2 + bstride;
    uint8_t *p4 = p3 + bstride;
    uint8_t *p5 = p4 + bstride;
    uint8_t *p6 = p5 + bstride;
    uint8_t *orig = p0, *end = p6;
    const int *matrix = s->matrix[plane];
    float rdiv = s->rdiv[plane];
    float bias = s->bias[plane];
    int y, x, i;

    /* Nothing to do (and no rows to preload) when the slice is empty. */
    if (slice_start >= slice_end)
        return 0;

    /* Preload rows slice_start - 3 .. slice_start + 2 of the window. */
    line_copy8(p0, base + filter_7x7_reflect_row(slice_start - 3, height) * stride, width, 3);
    line_copy8(p1, base + filter_7x7_reflect_row(slice_start - 2, height) * stride, width, 3);
    line_copy8(p2, base + filter_7x7_reflect_row(slice_start - 1, height) * stride, width, 3);
    line_copy8(p3, base + filter_7x7_reflect_row(slice_start,     height) * stride, width, 3);
    line_copy8(p4, base + filter_7x7_reflect_row(slice_start + 1, height) * stride, width, 3);
    line_copy8(p5, base + filter_7x7_reflect_row(slice_start + 2, height) * stride, width, 3);

    for (y = slice_start; y < slice_end; y++) {
        uint8_t *array[] = {
            p0 - 3, p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2, p0 + 3,
            p1 - 3, p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2, p1 + 3,
            p2 - 3, p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2, p2 + 3,
            p3 - 3, p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2, p3 + 3,
            p4 - 3, p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2, p4 + 3,
            p5 - 3, p5 - 2, p5 - 1, p5, p5 + 1, p5 + 2, p5 + 3,
            p6 - 3, p6 - 2, p6 - 1, p6, p6 + 1, p6 + 2, p6 + 3,
        };

        /* Bring in the bottom row of the window for output row y. */
        line_copy8(p6, base + filter_7x7_reflect_row(y + 3, height) * stride, width, 3);

        for (x = 0; x < width; x++) {
            int sum = 0;

            for (i = 0; i < 49; i++) {
                sum += *(array[i] + x) * matrix[i];
            }
            sum = (int)(sum * rdiv + bias + 0.5f);
            dst[x] = av_clip_uint8(sum);
        }

        /* Rotate the ring; the oldest row slot is reused next. */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = p5;
        p5 = p6;
        p6 = (p6 == end) ? orig: p6 + bstride;
        dst += out->linesize[plane];
    }

    return 0;
}
/*
 * SSE2 kernel for one 8-bit plane, 16 pixels per step: a 5-tap vertical pass
 * (coefficients ch->m_v, scale ch->rdiv_v) followed by a 5-tap horizontal
 * pass (coefficients ch->m_h, scale ch->rdiv_h, plus ch->bias).
 *
 * ch      coefficients, scales, bias and the saturate flag
 * buff    scratch memory holding a five-row ring; rows start at buff + 16
 *         and are bstride bytes apart
 * width   pixels per row; the x loop advances in whole 16-byte vectors
 * height  rows in the plane
 * stride  byte pitch used for both srcp and dstp
 * dstp    destination plane; written with aligned 16-byte stores
 * srcp    source plane
 *
 * Rows outside the plane are taken from the reflected position.
 */
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(convolution_hv_t *ch, uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp)
{
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *p3 = p2 + bstride;
    uint8_t *p4 = p3 + bstride;
    uint8_t *orig = p0, *end = p4;

    /* Prime the window for output row 0: rows -2 and -1 are reflections. */
    line_copy8(p0, srcp + 2 * stride, width, 2);
    line_copy8(p1, srcp + stride, width, 2);
    line_copy8(p2, srcp, width, 2);
    srcp += stride;
    line_copy8(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128i all1 = _mm_cmpeq_epi32(zero, zero);
    /* 0xFFFF >> 15 leaves the value 1 in every 16-bit lane. */
    __m128i one = _mm_srli_epi16(all1, 15);
    __m128 rdiv_h = _mm_set1_ps((float)ch->rdiv_h);
    __m128 rdiv_v = _mm_set1_ps((float)ch->rdiv_v);
    __m128 bias = _mm_set1_ps((float)ch->bias);
    __m128i matrix_h[5];
    __m128i matrix_v[5];

    /*
     * Each coefficient is interleaved with zero so that every 32-bit lane
     * holds (coef, 0); _mm_madd_epi16 against (pixel, 0) then yields
     * pixel * coef as a 32-bit value.
     */
    for (int i = 0; i < 5; i++) {
        matrix_h[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m_h[i]), zero);
        matrix_v[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m_v[i]), zero);
    }

    for (int y = 0; y < height; y++) {
        /* Walk forward while rows remain, then step back to reflect. */
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy8(p4, srcp, width, 2);

        for (int x = 0; x < width; x += 16) {
            /*
             * Entries 0-4 are the vertical taps, entries 5-9 the horizontal
             * taps.
             * NOTE(review): the centre horizontal tap reads dstp + x, i.e.
             * the vertical result stored by the j == 0 pass just below,
             * while the other four horizontal taps read the unfiltered row
             * p2 -- confirm this mix is intended.
             */
            uint8_t *array[] = {
                p0 + x, p1 + x, p2 + x, p3 + x, p4 + x,
                p2 + x - 2, p2 + x - 1, dstp + x, p2 + x + 1, p2 + x + 2
            };

            /* j == 0: vertical pass, j == 1: horizontal pass. */
            for (int j = 0; j < 2; j++) {
                __m128i *matrix = j == 0 ? matrix_v : matrix_h;
                __m128i sum[4];
                sum[0] = _mm_setzero_si128();
                sum[1] = _mm_setzero_si128();
                sum[2] = _mm_setzero_si128();
                sum[3] = _mm_setzero_si128();

                for (int i = 0; i < 5; i++) {
                    __m128i xmm0, xmm1, xmm2;

                    /* Widen 16 bytes into four vectors of 32-bit lanes. */
                    xmm0 = _mm_loadu_si128((__m128i *)array[i + j * 5]);
                    xmm2 = _mm_unpackhi_epi8(xmm0, zero);
                    xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                    xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                    xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                    sum[0] = _mm_add_epi32(sum[0], _mm_madd_epi16(xmm0, matrix[i]));
                    sum[1] = _mm_add_epi32(sum[1], _mm_madd_epi16(xmm1, matrix[i]));
                    xmm1 = _mm_unpackhi_epi16(xmm2, zero);
                    xmm0 = _mm_unpacklo_epi16(xmm2, zero);
                    sum[2] = _mm_add_epi32(sum[2], _mm_madd_epi16(xmm0, matrix[i]));
                    sum[3] = _mm_add_epi32(sum[3], _mm_madd_epi16(xmm1, matrix[i]));
                }

                /* Scale in float; the bias is added in the horizontal pass only. */
                for (int i = 0; i < 4; i++) {
                    __m128 sumfp = _mm_cvtepi32_ps(sum[i]);
                    sumfp = _mm_mul_ps(sumfp, j == 0 ? rdiv_v : rdiv_h);
                    if (j == 1) {
                        sumfp = _mm_add_ps(sumfp, bias);
                    }
                    /* Truncating conversion back to 32-bit integers. */
                    sum[i] = _mm_cvttps_epi32(sumfp);
                }

                /* Narrow to signed 16-bit lanes with saturation. */
                sum[0] = _mm_packs_epi32(sum[0], sum[1]);
                sum[1] = _mm_packs_epi32(sum[2], sum[3]);

                /*
                 * Without saturation, negative lanes are replaced by their
                 * negation ((v ^ 0xFFFF) + 1), i.e. the absolute value.
                 */
                if (!ch->saturate) {
                    for (int i = 0; i < 2; i++) {
                        __m128i mask = _mm_cmplt_epi16(sum[i], zero);
                        __m128i temp = _mm_add_epi16(one, _mm_xor_si128(sum[i], all1));
                        temp = _mm_and_si128(temp, mask);
                        sum[i] = _mm_andnot_si128(mask, sum[i]);
                        sum[i] = _mm_or_si128(sum[i], temp);
                    }
                }

                /* Narrow to unsigned bytes (values below 0 become 0). */
                sum[0] = _mm_packus_epi16(sum[0], sum[1]);

                _mm_store_si128((__m128i *)(dstp + x), sum[0]);
            }
        }

        dstp += stride;
        /* Rotate the ring; the oldest row slot is reused next. */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
/*
 * SSE2 edge detector for one 8-bit plane, 16 pixels per step.
 *
 * A horizontal response is built from the four samples at x - 2, x - 1,
 * x + 1, x + 2 of the centre row and a vertical response from the four rows
 * around it, weighted by the tables ar_mulx / ar_muly (defined elsewhere).
 * The two absolute responses are combined as
 *     (15 * max) >> 4  +  (15 * min) >> 5
 * (presumably an approximation of the gradient magnitude -- confirm),
 * shifted right by eh->rshift, then thresholded against eh->max / eh->min.
 *
 * buff      scratch memory holding a five-row ring; rows start at buff + 16
 *           and are bstride bytes apart. The rows are read with aligned
 *           loads, so buff and bstride must keep them 16-byte aligned.
 * width     pixels per row; the x loop advances in whole 16-byte vectors
 * height    rows in the plane
 * stride    byte pitch used for both srcp and dstp
 * dstp      destination plane; written with aligned 16-byte stores
 * srcp      source plane
 * eh        thresholds (min, max) and the right shift (rshift)
 * plane_max not used by this function
 *
 * Rows outside the plane are taken from the reflected position.
 */
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp, edge_t *eh, uint16_t plane_max)
{
    uint8_t* p0 = buff + 16;
    uint8_t* p1 = p0 + bstride;
    uint8_t* p2 = p1 + bstride;
    uint8_t* p3 = p2 + bstride;
    uint8_t* p4 = p3 + bstride;
    uint8_t* orig = p0;
    uint8_t* end = p4;

    /* Prime the window for output row 0: rows -2 and -1 are reflections. */
    line_copy8(p0, srcp + 2 * stride, width, 2);
    line_copy8(p1, srcp + stride, width, 2);
    line_copy8(p2, srcp, width, 2);
    srcp += stride;
    line_copy8(p3, srcp, width, 2);

    /* Thresholds above 255 are treated as 255. */
    uint8_t th_min = eh->min > 0xFF ? 0xFF : (uint8_t)eh->min;
    uint8_t th_max = eh->max > 0xFF ? 0xFF : (uint8_t)eh->max;

    __m128i zero = _mm_setzero_si128();
    __m128i ab = _mm_set1_epi16(15);
    __m128i max = _mm_set1_epi8((int8_t)th_max);
    __m128i min = _mm_set1_epi8((int8_t)th_min);

    for (int y = 0; y < height; y++) {
        /* Walk forward while rows remain, then step back to reflect. */
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy8(p4, srcp, width, 2);
        uint8_t* posh[] = {p2 - 2, p2 - 1, p2 + 1, p2 + 2};
        uint8_t* posv[] = {p0, p1, p3, p4};

        for (int x = 0; x < width; x += 16) {
            /* [0] holds the low eight pixels, [1] the high eight. */
            __m128i sumx[2] = {zero, zero};
            __m128i sumy[2] = {zero, zero};

            for (int i = 0; i < 4; i++) {
                __m128i xmm0, xmm1, xmul;

                /* Horizontal tap i: widen to 16 bits and weight. */
                xmul = _mm_load_si128((__m128i *)ar_mulx[i]);
                xmm0 = _mm_loadu_si128((__m128i *)(posh[i] + x));
                xmm1 = _mm_unpackhi_epi8(xmm0, zero);
                xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                sumx[0] = _mm_add_epi16(sumx[0], _mm_mullo_epi16(xmm0, xmul));
                sumx[1] = _mm_add_epi16(sumx[1], _mm_mullo_epi16(xmm1, xmul));

                /* Vertical tap i: same, from the row above/below. */
                xmul = _mm_load_si128((__m128i *)ar_muly[i]);
                xmm0 = _mm_load_si128((__m128i *)(posv[i] + x));
                xmm1 = _mm_unpackhi_epi8(xmm0, zero);
                xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                sumy[0] = _mm_add_epi16(sumy[0], _mm_mullo_epi16(xmm0, xmul));
                sumy[1] = _mm_add_epi16(sumy[1], _mm_mullo_epi16(xmm1, xmul));
            }

            for (int i = 0; i < 2; i++) {
                __m128i xmax, xmin, mull, mulh;

                /* mm_abs_epi16 is defined elsewhere; presumably |v| per lane. */
                sumx[i] = mm_abs_epi16(sumx[i]);
                sumy[i] = mm_abs_epi16(sumy[i]);
                xmax = _mm_max_epi16(sumx[i], sumy[i]);
                xmin = _mm_min_epi16(sumx[i], sumy[i]);

                /*
                 * Interleaving with zero makes madd compute 15 * v per
                 * 32-bit lane: max is scaled by 15/16, min by 15/32.
                 * mm_cast_epi32 is defined elsewhere; presumably it narrows
                 * the two 32-bit vectors back into one vector of 16-bit
                 * lanes -- confirm.
                 */
                mull = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpacklo_epi16(xmax, zero)), 4);
                mulh = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpackhi_epi16(xmax, zero)), 4);
                xmax = mm_cast_epi32(mull, mulh);

                mull = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpacklo_epi16(xmin, zero)), 5);
                mulh = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpackhi_epi16(xmin, zero)), 5);
                xmin = mm_cast_epi32(mull, mulh);

                sumx[i] = _mm_adds_epu16(xmax, xmin);
                sumx[i] = _mm_srli_epi16(sumx[i], eh->rshift);
            }

            /* Narrow to bytes with unsigned saturation. */
            __m128i out = _mm_packus_epi16(sumx[0], sumx[1]);

            /* Lanes with out >= th_max are forced to 0xFF. */
            __m128i temp = _mm_min_epu8(out, max);
            temp = _mm_cmpeq_epi8(temp, max);
            out = _mm_or_si128(temp, out);

            /* Lanes with out <= th_min are forced to 0. */
            temp = _mm_max_epu8(out, min);
            temp = _mm_cmpeq_epi8(temp, min);
            out = _mm_andnot_si128(temp, out);

            _mm_store_si128((__m128i*)(dstp + x), out);
        }

        dstp += stride;
        /* Rotate the ring; the oldest row slot is reused next. */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
/*
 * SSE2 5x5 convolution of one 8-bit plane, 16 pixels per step.
 *
 * The 25 coefficients ch->m are applied to the 5x5 neighbourhood, the sum is
 * multiplied by ch->rdiv and offset by ch->bias in float, converted back
 * with truncation and narrowed to bytes with saturation.
 *
 * ch      coefficients, scale, bias and the saturate flag
 * buff    scratch memory holding a five-row ring; rows start at buff + 16
 *         and are bstride bytes apart
 * width   pixels per row; the x loop advances in whole 16-byte vectors
 * height  rows in the plane
 * stride  byte pitch used for both srcp and dstp
 * dstp    destination plane; written with aligned 16-byte stores
 * srcp    source plane
 *
 * Rows outside the plane are taken from the reflected position.
 */
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp)
{
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *p3 = p2 + bstride;
    uint8_t *p4 = p3 + bstride;
    uint8_t *orig = p0, *end = p4;

    /* Prime the window for output row 0: rows -2 and -1 are reflections. */
    line_copy8(p0, srcp + 2 * stride , width, 2);
    line_copy8(p1, srcp + stride, width, 2);
    line_copy8(p2, srcp, width, 2);
    srcp += stride;
    line_copy8(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128 rdiv = _mm_set1_ps((float)ch->rdiv);
    __m128 bias = _mm_set1_ps((float)ch->bias);
    __m128i matrix[25];

    /*
     * Each coefficient is interleaved with zero so that every 32-bit lane
     * holds (coef, 0); _mm_madd_epi16 against (pixel, 0) then yields
     * pixel * coef as a 32-bit value.
     */
    for (int i = 0; i < 25; i++) {
        matrix[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m[i]), zero);
    }

    for (int y = 0; y < height; y++) {
        /* Walk forward while rows remain, then step back to reflect. */
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy8(p4, srcp, width, 2);
        /* Tap origins in row-major kernel order. */
        uint8_t *array[] = {
            p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2,
            p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2,
            p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2,
            p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2,
            p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2
        };

        for (int x = 0; x < width; x += 16) {
            /* Four vectors of four 32-bit sums cover the 16 pixels. */
            __m128i sum[4] = { zero, zero, zero, zero };

            for (int i = 0; i < 25; i++) {
                __m128i xmm0, xmm1, xmm2;

                /* Widen 16 bytes into four vectors of 32-bit lanes. */
                xmm0 = _mm_loadu_si128((__m128i *)(array[i] + x));
                xmm2 = _mm_unpackhi_epi8(xmm0, zero);
                xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                sum[0] = _mm_add_epi32(sum[0], _mm_madd_epi16(xmm0, matrix[i]));
                sum[1] = _mm_add_epi32(sum[1], _mm_madd_epi16(xmm1, matrix[i]));
                xmm1 = _mm_unpackhi_epi16(xmm2, zero);
                xmm0 = _mm_unpacklo_epi16(xmm2, zero);
                sum[2] = _mm_add_epi32(sum[2], _mm_madd_epi16(xmm0, matrix[i]));
                sum[3] = _mm_add_epi32(sum[3], _mm_madd_epi16(xmm1, matrix[i]));
            }

            for (int i = 0; i < 4; i++) {
                __m128 sumfp = _mm_cvtepi32_ps(sum[i]);
                sumfp = _mm_mul_ps(sumfp, rdiv);
                sumfp = _mm_add_ps(sumfp, bias);
                /* mm_abs_ps is defined elsewhere; presumably |v| per lane. */
                if (!ch->saturate) {
                    sumfp = mm_abs_ps(sumfp);
                }
                /* Truncating conversion back to 32-bit integers. */
                sum[i] = _mm_cvttps_epi32(sumfp);
            }

            /* Narrow 32 -> 16 (signed) -> 8 (unsigned), saturating each time. */
            sum[0] = _mm_packs_epi32(sum[0], sum[1]);
            sum[1] = _mm_packs_epi32(sum[2], sum[3]);
            sum[0] = _mm_packus_epi16(sum[0], sum[1]);

            _mm_store_si128((__m128i *)(dstp + x), sum[0]);
        }

        dstp += stride;
        /* Rotate the ring; the oldest row slot is reused next. */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}