/* Undo the PNG Paeth filter, in place, for one row of 4-byte pixels (SSE2).
 *
 * row_info: row description; only row_info->rowbytes is read here.
 * row:      filtered bytes of the current row, overwritten with the result.
 * prev:     bytes of the previous row; read only.
 *
 * Paeth tries to predict pixel d using the pixel to the left of it, a,
 * and two pixels from the previous row, b and c:
 *   prev: c b
 *   row:  a d
 * The Paeth function predicts d to be whichever of a, b, or c is nearest to
 * p = a + b - c.
 *
 * The first pixel has no left context, and so uses an Up filter, p = b.
 * This works naturally with the main loop's p = a + b - c if a and c are
 * forced to zero.  Here b and d start out as zero, and they become c and a
 * respectively at the top of the first iteration.
 */
void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
   png_const_bytep prev)
{
   png_debug(1, "in png_read_filter_row_paeth4_sse2");

   const __m128i zero = _mm_setzero_si128();
   __m128i c, b = zero, a, d = zero;

   /* Count 4-byte groups in size_t rather than narrowing rowbytes to int, so
    * a very large row length cannot turn into a negative count and skip the
    * loop.  A trailing partial group (rowbytes not a multiple of 4, which is
    * not expected for 4-byte pixels) is still processed as a whole group,
    * exactly as before.
    */
   const size_t rowbytes = row_info->rowbytes;
   const size_t groups = rowbytes / 4 + (rowbytes % 4 != 0);

   for (size_t i = 0; i < groups; i++)
   {
      /* It's easiest to do this math (particularly, deal with pc) with
       * 16-bit intermediates, so each byte is widened by interleaving it
       * with a zero byte.
       */
      c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
      a = d; d = _mm_unpacklo_epi8(load4(row ), zero);

      /* (p-a) == (a+b-c - a) == (b-c) */
      __m128i pa = _mm_sub_epi16(b, c);

      /* (p-b) == (a+b-c - b) == (a-c) */
      __m128i pb = _mm_sub_epi16(a, c);

      /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
      __m128i pc = _mm_add_epi16(pa, pb);

      pa = abs_i16(pa);  /* |p-a| */
      pb = abs_i16(pb);  /* |p-b| */
      pc = abs_i16(pc);  /* |p-c| */

      __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

      /* Paeth breaks ties favoring a over b over c. */
      __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
                        if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
                                                                    c));

      /* Note `_epi8`: the addition must wrap modulo 256 in each byte. */
      d = _mm_add_epi8(d, nearest);
      store4(row, _mm_packus_epi16(d, d));

      prev += 4;
      row  += 4;
   }
}
// Undo the PNG Paeth filter, in place, for one row of bpp-byte pixels (SSE2).
//
//   row_info: row description; only row_info->rowbytes is read here.
//   row:      filtered bytes of the current row, overwritten with the result.
//   prev:     bytes of the previous row; read only.
//
// NOTE(review): bpp is not declared in this block; presumably a compile-time
// bytes-per-pixel constant (e.g. a template parameter) supplied by the
// surrounding code -- confirm.
void sk_paeth_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {
    // Paeth tries to predict pixel d using the pixel to the left of it, a,
    // and two pixels from the previous row, b and c:
    //   prev: c b
    //   row:  a d
    // The Paeth function predicts d to be whichever of a, b, or c is nearest to p=a+b-c.

    // The first pixel has no left context, and so uses an Up filter, p = b.
    // This works naturally with our main loop's p = a+b-c if we force a and c to zero.
    // Here we zero b and d, which become c and a respectively at the start of the loop.
    const __m128i zero = _mm_setzero_si128();
    __m128i c, b = zero,
            a, d = zero;

    // Count whole pixels in size_t rather than narrowing rowbytes to int, so a very
    // large row length cannot turn into a negative count and skip the loop.  A trailing
    // partial pixel (rowbytes not a multiple of bpp, which is not expected) is still
    // processed as a whole pixel, exactly as before.
    const size_t rowbytes = row_info->rowbytes;
    const size_t step     = static_cast<size_t>(bpp);
    const size_t pixels   = rowbytes / step + (rowbytes % step != 0);

    for (size_t i = 0; i < pixels; i++) {
        // It's easiest to do this math (particularly, deal with pc) with 16-bit intermediates.
        c = b; b = _mm_unpacklo_epi8(load<bpp>(prev), zero);
        a = d; d = _mm_unpacklo_epi8(load<bpp>(row ), zero);

        __m128i pa = _mm_sub_epi16(b,c),   // (p-a) == (a+b-c - a) == (b-c)
                pb = _mm_sub_epi16(a,c),   // (p-b) == (a+b-c - b) == (a-c)
                pc = _mm_add_epi16(pa,pb); // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c)

        pa = abs_i16(pa);  // |p-a|
        pb = abs_i16(pb);  // |p-b|
        pc = abs_i16(pc);  // |p-c|

        __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

        // Paeth breaks ties favoring a over b over c.
        __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
                                                                      c));

        // Note `_epi8`: the addition must wrap modulo 256 in each byte.
        d = _mm_add_epi8(d, nearest);
        store<bpp>(row, _mm_packus_epi16(d,d));

        prev += bpp;
        row  += bpp;
    }
}