/**
 * Sobel edge detector for one slice of a 16-bit plane.
 *
 * Each job processes rows [slice_start, slice_end). A three-line ring
 * buffer of edge-padded rows (p0 = above, p1 = current, p2 = below) is
 * used so picture borders are mirrored instead of read out of bounds.
 *
 * Fix: the gradient squares are now computed in float. suma/sumb can
 * reach +-4*65535 for 16-bit input, so the int products suma*suma and
 * sumb*sumb could exceed INT_MAX — signed integer overflow is undefined
 * behavior in C.
 */
static int filter16_sobel(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
    ConvolutionContext *s = ctx->priv;
    ThreadData *td = arg;
    AVFrame *in = td->in;
    AVFrame *out = td->out;
    const int plane = td->plane;
    const int peak = (1 << s->depth) - 1;
    const int stride = in->linesize[plane] / 2;   /* byte stride -> uint16_t units */
    const int bstride = s->bstride;
    const int height = s->planeheight[plane];
    const int width = s->planewidth[plane];
    const int slice_start = (height * jobnr) / nb_jobs;
    const int slice_end = (height * (jobnr+1)) / nb_jobs;
    const uint16_t *src = (const uint16_t *)in->data[plane] + slice_start * stride;
    uint16_t *dst = (uint16_t *)out->data[plane] + slice_start * (out->linesize[plane] / 2);
    const float scale = s->scale;
    const float delta = s->delta;
    uint16_t *p0 = (uint16_t *)s->bptrs[jobnr] + 16;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *orig = p0, *end = p2;
    int y, x;

    /* prime the ring: "row above" mirrors downward at the top edge */
    line_copy16(p0, src + stride * (slice_start == 0 ? 1 : -1), width, 1);
    line_copy16(p1, src, width, 1);

    for (y = slice_start; y < slice_end; y++) {
        /* fetch the next row; the last row mirrors upward */
        src += stride * (y < height - 1 ? 1 : -1);
        line_copy16(p2, src, width, 1);

        for (x = 0; x < width; x++) {
            /* vertical gradient (top row negative, bottom row positive) */
            int suma = p0[x - 1] * -1 + p0[x] * -2 + p0[x + 1] * -1 +
                       p2[x - 1] *  1 + p2[x] *  2 + p2[x + 1] *  1;
            /* horizontal gradient (left column negative, right positive) */
            int sumb = p0[x - 1] * -1 + p0[x + 1] *  1 +
                       p1[x - 1] * -2 + p1[x + 1] *  2 +
                       p2[x - 1] * -1 + p2[x + 1] *  1;

            /* square in float to avoid signed int overflow at high depths */
            dst[x] = av_clip(sqrtf((float)suma * suma + (float)sumb * sumb) * scale + delta, 0, peak);
        }

        /* rotate the three-line ring buffer */
        p0 = p1;
        p1 = p2;
        p2 = (p2 == end) ? orig : p2 + bstride;
        dst += out->linesize[plane] / 2;
    }

    return 0;
}
/**
 * Generic 3x3 convolution for one slice of a 16-bit plane.
 *
 * Each job handles rows [slice_start, slice_end). Rows are staged through
 * a three-line ring buffer of edge-padded copies (row_top / row_mid /
 * row_bot), so border pixels are mirrored rather than read out of bounds.
 * The 9 matrix taps are applied, the sum scaled by rdiv, offset by bias,
 * rounded, and clipped to the plane's value range.
 */
static int filter16_3x3(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
    ConvolutionContext *s = ctx->priv;
    ThreadData *td = arg;
    AVFrame *in  = td->in;
    AVFrame *out = td->out;
    const int plane   = td->plane;
    const int peak    = (1 << s->depth) - 1;
    const int stride  = in->linesize[plane] / 2;   /* byte stride -> uint16_t units */
    const int bstride = s->bstride;
    const int height  = s->planeheight[plane];
    const int width   = s->planewidth[plane];
    const int slice_start = (height * jobnr) / nb_jobs;
    const int slice_end   = (height * (jobnr + 1)) / nb_jobs;
    const uint16_t *src = (const uint16_t *)in->data[plane] + slice_start * stride;
    uint16_t *dst = (uint16_t *)out->data[plane] + slice_start * (out->linesize[plane] / 2);
    uint16_t *row_top = (uint16_t *)s->bptrs[jobnr] + 16;
    uint16_t *row_mid = row_top + bstride;
    uint16_t *row_bot = row_mid + bstride;
    uint16_t *ring_first = row_top, *ring_last = row_bot;
    const int *matrix = s->matrix[plane];
    const float rdiv = s->rdiv[plane];
    const float bias = s->bias[plane];
    int y, x;

    /* prime the ring: "row above" mirrors downward at the top edge */
    line_copy16(row_top, src + stride * (slice_start == 0 ? 1 : -1), width, 1);
    line_copy16(row_mid, src, width, 1);

    for (y = slice_start; y < slice_end; y++) {
        /* fetch the next row; the last row mirrors upward */
        src += stride * (y < height - 1 ? 1 : -1);
        line_copy16(row_bot, src, width, 1);

        for (x = 0; x < width; x++) {
            int acc;

            /* taps in row-major matrix order */
            acc  = row_top[x - 1] * matrix[0];
            acc += row_top[x]     * matrix[1];
            acc += row_top[x + 1] * matrix[2];
            acc += row_mid[x - 1] * matrix[3];
            acc += row_mid[x]     * matrix[4];
            acc += row_mid[x + 1] * matrix[5];
            acc += row_bot[x - 1] * matrix[6];
            acc += row_bot[x]     * matrix[7];
            acc += row_bot[x + 1] * matrix[8];

            /* scale, bias, round to nearest, clip */
            acc = (int)(acc * rdiv + bias + 0.5f);
            dst[x] = av_clip(acc, 0, peak);
        }

        /* rotate the three-line ring buffer */
        row_top = row_mid;
        row_mid = row_bot;
        row_bot = (row_bot == ring_last) ? ring_first : row_bot + bstride;
        dst += out->linesize[plane] / 2;
    }

    return 0;
}
/*
 * Inflate-style 16-bit filter (SSE2): for each pixel, sums the eight
 * surrounding pixels, averages them (sum >> 3), and takes that average
 * when it is larger than the centre — but never lets the result exceed
 * centre + th (saturating add).
 *
 * buff    scratch area holding three padded line buffers (ring-rotated)
 * bstride scratch line stride in bytes (halved below for uint16_t units)
 * stride  source/dest stride in bytes (halved below)
 * th      maximum allowed growth above the centre pixel
 *
 * NOTE(review): COORDINATES is a macro defined elsewhere; presumably it
 * expands to the 8 neighbour pointers {p0-1, p0, p0+1, p1-1, p1+1,
 * p2-1, p2, p2+1} — confirm against its definition.
 */
static void GF_FUNC_ALIGN VS_CC proc_16bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *d, const uint8_t *s, int th)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;    /* byte strides -> uint16_t element strides */
    bstride /= 2;

    /* three-line ring buffer: p0 = row above, p1 = current, p2 = row below */
    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *orig = p0, *end = p2;

    /* prime the ring: the first "row above" mirrors row 1 */
    line_copy16(p0, srcp + stride, width, 1);
    line_copy16(p1, srcp, width, 1);

    int16_t threshold = (int16_t)th;
    __m128i zero = _mm_setzero_si128();
    __m128i xth = _mm_set1_epi16(threshold);

    for (int y = 0; y < height; y++) {
        /* fetch the next row; the last row mirrors upward */
        srcp += stride * (y < height - 1 ? 1 : -1);
        line_copy16(p2, srcp, width, 1);

        uint16_t *coordinates[] = COORDINATES;

        for (int x = 0; x < width; x += 8) {
            /* accumulate the 8 neighbours as 32-bit lanes (lo/hi halves) */
            __m128i sumlo = zero;
            __m128i sumhi = zero;

            for (int i = 0; i < 8; i++) {
                __m128i target = _mm_loadu_si128((__m128i *)(coordinates[i] + x));
                sumlo = _mm_add_epi32(sumlo, _mm_unpacklo_epi16(target, zero));
                sumhi = _mm_add_epi32(sumhi, _mm_unpackhi_epi16(target, zero));
            }

            /* divide by 8 and repack to eight 16-bit lanes */
            sumlo = _mm_srai_epi32(sumlo, 3);
            sumhi = _mm_srai_epi32(sumhi, 3);
            sumlo = mm_cast_epi32(sumlo, sumhi);

            /* clamp the average into [src, src + th] */
            __m128i src = _mm_load_si128((__m128i *)(p1 + x));
            __m128i limit = _mm_adds_epu16(src, xth);
            sumlo = MM_MAX_EPU16(sumlo, src);
            sumlo = MM_MIN_EPU16(sumlo, limit);
            _mm_store_si128((__m128i *)(dstp + x), sumlo);
        }

        dstp += stride;
        /* rotate the three-line ring buffer */
        p0 = p1;
        p1 = p2;
        p2 = (p2 == end) ? orig : p2 + bstride;
    }
}
/*
 * Configurable 3x3 minimum (erosion) for 16-bit pixels (SSE2).
 * For each pixel, takes the minimum over the centre and the enabled
 * neighbours, then limits how far the value may fall: the result is
 * never below src - th (saturating subtract).
 *
 * enable  8 flags selecting which neighbours participate, in the order
 *         {top-left, top, top-right, left, right,
 *          bottom-left, bottom, bottom-right}
 */
static void GF_FUNC_ALIGN VS_CC proc_16bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *d, const uint8_t *s, int th, int *enable)
{
    stride /= 2;    /* byte strides -> uint16_t element strides */
    bstride /= 2;
    uint16_t *dstp = (uint16_t *)d;
    const uint16_t *srcp = (uint16_t *)s;

    /* three-line ring buffer: p0 = row above, p1 = current, p2 = row below */
    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *orig = p0, *end = p2;
    uint16_t threshold = (uint16_t)th;

    /* NOTE(review): unlike the sibling filters, the first "row above" is a
     * copy of row 0 itself rather than a mirror of row 1 — confirm this
     * edge handling is intended. */
    line_copy16(p0, srcp, width, 1);
    line_copy16(p1, srcp, width, 1);

    __m128i xth = _mm_set1_epi16((int16_t)threshold);

    for (int y = 0; y < height; y++) {
        /* fetch the next row; the last row mirrors upward */
        srcp += stride * (y < height - 1 ? 1 : -1);
        line_copy16(p2, srcp, width, 1);

        /* the 8 neighbour pointers, matching enable[] order */
        uint16_t *coordinates[] = {p0 - 1, p0, p0 + 1, p1 - 1, p1 + 1, p2 - 1, p2, p2 + 1};

        for (int x = 0; x < width; x += 8) {
            __m128i src = _mm_load_si128((__m128i *)(p1 + x));
            __m128i min = src;

            for (int i = 0; i < 8; i++) {
                if (enable[i]) {
                    __m128i target = _mm_loadu_si128((__m128i *)(coordinates[i] + x));
                    min = MM_MIN_EPU16(min, target);
                }
            }

            /* floor the result at src - th */
            __m128i limit = _mm_subs_epu16(src, xth);
            min = MM_MAX_EPU16(min, limit);
            _mm_store_si128((__m128i *)(dstp + x), min);
        }

        dstp += stride;
        /* rotate the three-line ring buffer */
        p0 = p1;
        p1 = p2;
        p2 = (p2 == end) ? orig : p2 + bstride;
    }
}
/**
 * Scalar Sobel edge detection for 16-bit pixels.
 *
 * Computes the gradient magnitude sqrt(gx^2 + gy^2) per pixel, shifts it
 * right by eh->rshift, then applies thresholds: values >= th_max are set
 * to plane_max, values <= th_min are set to 0. Rows are staged through a
 * three-line ring buffer of edge-padded copies.
 *
 * Fix: the squares are accumulated in float. gx/gy can reach +-4*65535
 * for 16-bit input, so the int products gx*gx and gy*gy could exceed
 * INT_MAX — signed integer overflow is undefined behavior in C.
 */
static void VS_CC proc_16bit(uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *d, const uint8_t *s, edge_t *eh, uint16_t plane_max)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;    /* byte strides -> uint16_t element strides */
    bstride /= 2;

    /* three-line ring buffer: p0 = row above, p1 = current, p2 = row below */
    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *orig = p0, *end = p2;

    /* prime the ring: the first "row above" mirrors row 1 */
    line_copy16(p0, srcp + stride, width, 1);
    line_copy16(p1, srcp, width, 1);

    /* thresholds may not exceed the plane's maximum value */
    int th_min = min_int(eh->min, plane_max);
    int th_max = min_int(eh->max, plane_max);

    for (int y = 0; y < height; y++) {
        /* fetch the next row; the last row mirrors upward */
        srcp += stride * (y < height - 1 ? 1 : -1);
        line_copy16(p2, srcp, width, 1);

        for (int x = 0; x < width; x++) {
            /* horizontal gradient */
            int gx = -p0[x - 1] + p0[x + 1]
                     - 2 * p1[x - 1] + 2 * p1[x + 1]
                     - p2[x - 1] + p2[x + 1];
            /* vertical gradient */
            int gy = -p0[x - 1] - 2 * p0[x] - p0[x + 1]
                     + p2[x - 1] + 2 * p2[x] + p2[x + 1];
            /* square in float to avoid signed int overflow on large gradients */
            int g = (int)(sqrtf((float)gx * gx + (float)gy * gy) + 0.5f);

            g = g >> eh->rshift;
            if (g >= th_max) {
                g = plane_max;
            }
            if (g <= th_min) {
                g = 0;
            }
            dstp[x] = g;
        }

        dstp += stride;
        /* rotate the three-line ring buffer */
        p0 = p1;
        p1 = p2;
        p2 = (p2 == end) ? orig : p2 + bstride;
    }
}
/**
 * 7x7 convolution for one slice of a 16-bit plane.
 *
 * Each job handles rows [slice_start, slice_end). Rows are staged through
 * a seven-line ring buffer of edge-padded copies (p0..p6, p3 = current),
 * so picture borders are mirrored instead of read out of bounds.
 *
 * Fix: the tap-accumulation loop ran only `i < 25` (copied from the 5x5
 * filter) although the pointer table and the matrix hold 7*7 = 49
 * entries, so just over half the kernel was applied. It now sums all 49
 * taps.
 */
static int filter16_7x7(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
    ConvolutionContext *s = ctx->priv;
    ThreadData *td = arg;
    AVFrame *in = td->in;
    AVFrame *out = td->out;
    const int plane = td->plane;
    const int peak = (1 << s->depth) - 1;
    const int stride = in->linesize[plane] / 2;   /* byte stride -> uint16_t units */
    const int bstride = s->bstride;
    const int height = s->planeheight[plane];
    const int width = s->planewidth[plane];
    const int slice_start = (height * jobnr) / nb_jobs;
    const int slice_end = (height * (jobnr+1)) / nb_jobs;
    const uint16_t *src = (const uint16_t *)in->data[plane] + slice_start * stride;
    uint16_t *dst = (uint16_t *)out->data[plane] + slice_start * (out->linesize[plane] / 2);
    uint16_t *p0 = (uint16_t *)s->bptrs[jobnr] + 32;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *p3 = p2 + bstride;
    uint16_t *p4 = p3 + bstride;
    uint16_t *p5 = p4 + bstride;
    uint16_t *p6 = p5 + bstride;
    uint16_t *orig = p0, *end = p6;
    const int *matrix = s->matrix[plane];
    float rdiv = s->rdiv[plane];
    float bias = s->bias[plane];
    int y, x, i;

    /* prime six of the seven rows; rows above the slice mirror downward */
    line_copy16(p0, src + 3 * stride * (slice_start < 3 ? 1 : -1), width, 3);
    line_copy16(p1, src + 2 * stride * (slice_start < 2 ? 1 : -1), width, 3);
    line_copy16(p2, src + stride * (slice_start == 0 ? 1 : -1), width, 3);
    line_copy16(p3, src, width, 3);
    src += stride;
    line_copy16(p4, src, width, 3);
    src += stride;
    line_copy16(p5, src, width, 3);

    for (y = slice_start; y < slice_end; y++) {
        /* 49 tap pointers in row-major matrix order; p6 is filled below,
         * before any pointer in this table is dereferenced */
        uint16_t *array[] = {
            p0 - 3, p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2, p0 + 3,
            p1 - 3, p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2, p1 + 3,
            p2 - 3, p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2, p2 + 3,
            p3 - 3, p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2, p3 + 3,
            p4 - 3, p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2, p4 + 3,
            p5 - 3, p5 - 2, p5 - 1, p5, p5 + 1, p5 + 2, p5 + 3,
            p6 - 3, p6 - 2, p6 - 1, p6, p6 + 1, p6 + 2, p6 + 3,
        };

        /* fetch the row three below; near the bottom it mirrors upward */
        src += stride * (y < height - 3 ? 1 : -1);
        line_copy16(p6, src, width, 3);

        for (x = 0; x < width; x++) {
            int sum = 0;

            /* all 7*7 = 49 taps (was 25 — see function comment) */
            for (i = 0; i < 49; i++) {
                sum += *(array[i] + x) * matrix[i];
            }

            /* scale, bias, round to nearest, clip */
            sum = (int)(sum * rdiv + bias + 0.5f);
            dst[x] = av_clip(sum, 0, peak);
        }

        /* rotate the seven-line ring buffer */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = p5;
        p5 = p6;
        p6 = (p6 == end) ? orig : p6 + bstride;
        dst += out->linesize[plane] / 2;
    }

    return 0;
}
/*
 * Separable horizontal+vertical 5-tap convolution for 16-bit pixels
 * (SSE2). Per 8-pixel chunk it runs two passes: j == 0 applies the five
 * vertical taps (m_v) down the column and stores the intermediate result
 * into dstp; j == 1 then applies the five horizontal taps (m_h), reading
 * its centre tap back from dstp (the vertical result) and the outer taps
 * from the current source row p2. Bias is added only on the final
 * (horizontal) pass.
 *
 * Coefficients are stored as magnitude (matrix_h/matrix_v) plus a sign
 * flag (sign_h/sign_v) so the 16x16->32-bit multiply can be built from
 * _mm_mullo_epi16/_mm_mulhi_epu16 on unsigned magnitudes, with negation
 * applied afterwards in 32-bit lanes.
 */
static void GF_FUNC_ALIGN VS_CC proc_16bit_sse2(convolution_hv_t *ch, uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *d, const uint8_t *s)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;    /* byte strides -> uint16_t element strides */
    bstride /= 2;

    /* five-line ring buffer: p2 is the current row */
    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *p3 = p2 + bstride;
    uint16_t *p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    /* prime four of the five rows; rows above mirror downward */
    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128i all1 = _mm_cmpeq_epi32(zero, zero);   /* all bits set */
    __m128i one = _mm_srli_epi32(all1, 31);       /* 32-bit lanes of 1 */
    __m128 rdiv_h = _mm_set1_ps((float)ch->rdiv_h);
    __m128 rdiv_v = _mm_set1_ps((float)ch->rdiv_v);
    __m128 bias = _mm_set1_ps((float)ch->bias);

    /* split each coefficient into unsigned magnitude + sign flag */
    __m128i matrix_h[5];
    __m128i matrix_v[5];
    int sign_h[5];
    int sign_v[5];
    for (int i = 0; i < 5; i++) {
        sign_h[i] = ch->m_h[i] < 0 ? 1 : 0;
        sign_v[i] = ch->m_v[i] < 0 ? 1 : 0;
        uint16_t val = sign_h[i] ? (uint16_t)(ch->m_h[i] * -1) : (uint16_t)ch->m_h[i];
        matrix_h[i] = _mm_set1_epi16((int16_t)val);
        val = sign_v[i] ? (uint16_t)(ch->m_v[i] * -1) : (uint16_t)ch->m_v[i];
        matrix_v[i] = _mm_set1_epi16((int16_t)val);
    }

    for (int y = 0; y < height; y++) {
        /* fetch the row two below; near the bottom it mirrors upward */
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);

        for (int x = 0; x < width; x += 8) {
            /* [0..4]: vertical taps (column through p0..p4);
             * [5..9]: horizontal taps — note the centre is dstp + x,
             * i.e. the vertical-pass result written by j == 0 */
            uint16_t *array[] = {
                p0 + x, p1 + x, p2 + x, p3 + x, p4 + x,
                p2 + x - 2, p2 + x - 1, dstp + x, p2 + x + 1, p2 + x + 2
            };

            for (int j = 0; j < 2; j++) {
                __m128i *matrix = j == 0 ? matrix_v : matrix_h;
                int *sign = j == 0 ? sign_v : sign_h;
                __m128 rdiv = j == 0 ? rdiv_v : rdiv_h;
                __m128i sum[2];
                sum[0] = _mm_setzero_si128();
                sum[1] = _mm_setzero_si128();

                for (int i = 0; i < 5; i++) {
                    __m128i xmm0, xmm1, xmm2;
                    xmm0 = _mm_loadu_si128((__m128i *)array[i + j * 5]);
                    /* 16x16 -> 32-bit products: interleave lo/hi halves */
                    xmm1 = _mm_mullo_epi16(xmm0, matrix[i]);
                    xmm0 = _mm_mulhi_epu16(xmm0, matrix[i]);
                    xmm2 = _mm_unpacklo_epi16(xmm1, xmm0);
                    xmm0 = _mm_unpackhi_epi16(xmm1, xmm0);
                    if (sign[i]) {
                        /* negate in 32-bit lanes (two's complement) */
                        xmm2 = _mm_add_epi32(one, _mm_xor_si128(xmm2, all1));
                        xmm0 = _mm_add_epi32(one, _mm_xor_si128(xmm0, all1));
                    }
                    sum[0] = _mm_add_epi32(sum[0], xmm2);
                    sum[1] = _mm_add_epi32(sum[1], xmm0);
                }

                for (int i = 0; i < 2; i++) {
                    __m128 sumfp;
                    __m128i mask, temp;
                    sumfp = _mm_cvtepi32_ps(sum[i]);
                    sumfp = _mm_mul_ps(sumfp, rdiv);
                    if (j == 1) {
                        /* bias is applied only on the final pass */
                        sumfp = _mm_add_ps(sumfp, bias);
                    }
                    sum[i] = _mm_cvttps_epi32(sumfp);
                    /* clamp the upper end to 0xFFFF */
                    temp = _mm_srli_epi32(all1, 16);
                    mask = _mm_cmplt_epi32(sum[i], temp);
                    sum[i] = _mm_or_si128(_mm_and_si128(sum[i], mask), _mm_andnot_si128(mask, temp));
                    mask = _mm_cmpgt_epi32(sum[i], zero);
                    if (ch->saturate) {
                        /* saturate mode: negative results become 0 */
                        sum[i] = _mm_and_si128(mask, sum[i]);
                    } else {
                        /* otherwise take the absolute value of negatives */
                        temp = _mm_add_epi32(one, _mm_xor_si128(sum[i], all1));
                        sum[i] = _mm_or_si128(_mm_and_si128(mask, sum[i]), _mm_andnot_si128(mask, temp));
                    }
                }

                /* repack 2x4 32-bit lanes into eight 16-bit pixels and store;
                 * the j == 0 store feeds the j == 1 centre tap */
                sum[0] = mm_cast_epi32(sum[0], sum[1]);
                _mm_store_si128((__m128i *)(dstp + x), sum[0]);
            }
        }

        dstp += stride;
        /* rotate the five-line ring buffer */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
/*
 * Sobel-style edge detection for 16-bit pixels (SSE2).
 * Horizontal/vertical gradients are accumulated in float using the
 * coefficient tables ar_mulxf / ar_mulyf (defined elsewhere). The
 * gradient magnitude is approximated as alpha*max(|gx|,|gy|) +
 * beta*min(|gx|,|gy|) — the classic alpha-max-plus-beta-min
 * approximation of sqrt(gx^2 + gy^2) — then shifted right by
 * eh->rshift and thresholded: lanes >= eh->max are forced to 0xFFFF,
 * lanes <= eh->min are forced to 0.
 *
 * NOTE(review): unlike the scalar version, plane_max is unused here —
 * results clamp to 0xFFFF and strong edges become 0xFFFF regardless of
 * the plane's actual maximum; confirm this is intended for < 16-bit
 * input.
 */
static void GF_FUNC_ALIGN VS_CC proc_16bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *d, const uint8_t *s, edge_t *eh, uint16_t plane_max)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;    /* byte strides -> uint16_t element strides */
    bstride /= 2;

    /* five-line ring buffer: p2 is the current row */
    uint16_t* p0 = (uint16_t *)buff + 8;
    uint16_t* p1 = p0 + bstride;
    uint16_t* p2 = p1 + bstride;
    uint16_t* p3 = p2 + bstride;
    uint16_t* p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    /* prime four of the five rows; rows above mirror downward */
    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    /* alpha-max + beta-min magnitude approximation constants */
    __m128 alpha = _mm_set1_ps((float)0.96043387);
    __m128 beta = _mm_set1_ps((float)0.39782473);
    __m128i pmax = _mm_set1_epi32(0xFFFF);
    __m128i min = _mm_set1_epi16((int16_t)eh->min);
    __m128i max = _mm_set1_epi16((int16_t)eh->max);

    for (int y = 0; y < height; y++) {
        /* fetch the row two below; near the bottom it mirrors upward */
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);

        /* horizontal taps (same row) and vertical taps (same column) */
        uint16_t* posh[] = {p2 - 2, p2 - 1, p2 + 1, p2 + 2};
        uint16_t* posv[] = {p0, p1, p3, p4};

        for (int x = 0; x < width; x += 8) {
            __m128 sumx[2] = {(__m128)zero, (__m128)zero};
            __m128 sumy[2] = {(__m128)zero, (__m128)zero};

            for (int i = 0; i < 4; i++) {
                /* horizontal gradient contribution */
                __m128 xmul = _mm_load_ps(ar_mulxf[i]);
                __m128i xmm0 = _mm_loadu_si128((__m128i *)(posh[i] + x));
                __m128i xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                sumx[0] = _mm_add_ps(sumx[0], _mm_mul_ps(_mm_cvtepi32_ps(xmm0), xmul));
                sumx[1] = _mm_add_ps(sumx[1], _mm_mul_ps(_mm_cvtepi32_ps(xmm1), xmul));

                /* vertical gradient contribution */
                xmul = _mm_load_ps(ar_mulyf[i]);
                xmm0 = _mm_load_si128((__m128i *)(posv[i] + x));
                xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                sumy[0] = _mm_add_ps(sumy[0], _mm_mul_ps(_mm_cvtepi32_ps(xmm0), xmul));
                sumy[1] = _mm_add_ps(sumy[1], _mm_mul_ps(_mm_cvtepi32_ps(xmm1), xmul));
            }

            __m128i out[2];
            for (int i = 0; i < 2; i++) {
                /* |gx|, |gy|, then alpha*max + beta*min ~= magnitude */
                sumx[i] = mm_abs_ps(sumx[i]);
                sumy[i] = mm_abs_ps(sumy[i]);
                __m128 t0 = _mm_max_ps(sumx[i], sumy[i]);
                __m128 t1 = _mm_min_ps(sumx[i], sumy[i]);
                t0 = _mm_add_ps(_mm_mul_ps(alpha, t0), _mm_mul_ps(beta, t1));
                out[i] = _mm_srli_epi32(_mm_cvtps_epi32(t0), eh->rshift);
                out[i] = mm_min_epi32(out[i], pmax);
            }

            /* repack to 16-bit, then threshold:
             * lanes >= eh->max -> 0xFFFF, lanes <= eh->min -> 0 */
            out[0] = mm_cast_epi32(out[0], out[1]);
            out[1] = MM_MIN_EPU16(out[0], max);
            out[1] = _mm_cmpeq_epi16(out[1], max);
            out[0] = _mm_or_si128(out[1], out[0]);
            out[1] = MM_MAX_EPU16(out[0], min);
            out[1] = _mm_cmpeq_epi16(out[1], min);
            out[0] = _mm_andnot_si128(out[1], out[0]);
            _mm_store_si128((__m128i *)(dstp + x), out[0]);
        }

        dstp += stride;
        /* rotate the five-line ring buffer */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
/*
 * 5x5 convolution for 16-bit pixels (SSE2), accumulated in float.
 * Each output pixel is sum(src[tap] * m[tap]) * rdiv + bias, with the
 * result clamped to [0, 0xFFFF]; when saturate is off, negative sums
 * take their absolute value instead of being clipped to 0. Rows are
 * staged through a five-line ring buffer of edge-padded copies
 * (p2 = current row).
 */
static void GF_FUNC_ALIGN VS_CC proc_16bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *d, const uint8_t *s)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;    /* byte strides -> uint16_t element strides */
    bstride /= 2;

    /* five-line ring buffer */
    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *p3 = p2 + bstride;
    uint16_t *p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    /* prime four of the five rows; rows above mirror downward */
    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128 rdiv = _mm_set1_ps((float)ch->rdiv);
    __m128 bias = _mm_set1_ps((float)ch->bias);
    __m128i max = _mm_set1_epi32(0xFFFF);

    /* broadcast each of the 25 coefficients as a float vector */
    __m128 matrix[25];
    for (int i = 0; i < 25; i++) {
        matrix[i] = _mm_set1_ps((float)ch->m[i]);
    }

    for (int y = 0; y < height; y++) {
        /* fetch the row two below; near the bottom it mirrors upward */
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);

        /* 25 tap pointers in row-major matrix order */
        uint16_t *array[] = {
            p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2,
            p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2,
            p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2,
            p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2,
            p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2
        };

        for (int x = 0; x < width; x += 8) {
            /* accumulate 8 pixels as 2x4 float lanes */
            __m128 sum[2] = {(__m128)zero, (__m128)zero};

            for (int i = 0; i < 25; i++) {
                __m128i xmm0 = _mm_loadu_si128((__m128i *)(array[i] + x));
                __m128 xmm1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm0, zero));
                __m128 xmm2 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm0, zero));
                xmm1 = _mm_mul_ps(xmm1, matrix[i]);
                xmm2 = _mm_mul_ps(xmm2, matrix[i]);
                sum[0] = _mm_add_ps(sum[0], xmm1);
                sum[1] = _mm_add_ps(sum[1], xmm2);
            }

            __m128i sumi[2];
            for (int i = 0; i < 2; i++) {
                sum[i] = _mm_mul_ps(sum[i], rdiv);
                sum[i] = _mm_add_ps(sum[i], bias);
                if (!ch->saturate) {
                    /* non-saturate mode keeps magnitude of negative sums */
                    sum[i] = mm_abs_ps(sum[i]);
                }
                sumi[i] = _mm_cvtps_epi32(sum[i]);
                sumi[i] = mm_min_epi32(sumi[i], max);   /* upper clamp */
                __m128i mask = _mm_cmpgt_epi32(sumi[i], zero);
                sumi[i] = _mm_and_si128(sumi[i], mask); /* negatives -> 0 */
            }

            /* repack 2x4 32-bit lanes into eight 16-bit pixels */
            sumi[0] = mm_cast_epi32(sumi[0], sumi[1]);
            _mm_store_si128((__m128i *)(dstp + x), sumi[0]);
        }

        dstp += stride;
        /* rotate the five-line ring buffer */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}