// Transpose back and store static WEBP_INLINE void Store16x4(const __m128i* const p1, const __m128i* const p0, const __m128i* const q0, const __m128i* const q1, uint8_t* r0, uint8_t* r8, int stride) { __m128i t1, p1_s, p0_s, q0_s, q1_s; // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 t1 = *p0; p0_s = _mm_unpacklo_epi8(*p1, t1); p1_s = _mm_unpackhi_epi8(*p1, t1); // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 t1 = *q0; q0_s = _mm_unpacklo_epi8(t1, *q1); q1_s = _mm_unpackhi_epi8(t1, *q1); // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 t1 = p0_s; p0_s = _mm_unpacklo_epi16(t1, q0_s); q0_s = _mm_unpackhi_epi16(t1, q0_s); // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 t1 = p1_s; p1_s = _mm_unpacklo_epi16(t1, q1_s); q1_s = _mm_unpackhi_epi16(t1, q1_s); Store4x4(&p0_s, r0, stride); r0 += 4 * stride; Store4x4(&q0_s, r0, stride); Store4x4(&p1_s, r8, stride); r8 += 4 * stride; Store4x4(&q1_s, r8, stride); }
// Transpose back and store static WEBP_INLINE void Store16x4(uint8_t* r0, uint8_t* r8, int stride, __m128i* p1, __m128i* p0, __m128i* q0, __m128i* q1) { __m128i t1; // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 t1 = *p0; *p0 = _mm_unpacklo_epi8(*p1, t1); *p1 = _mm_unpackhi_epi8(*p1, t1); // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 t1 = *q0; *q0 = _mm_unpacklo_epi8(t1, *q1); *q1 = _mm_unpackhi_epi8(t1, *q1); // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 t1 = *p0; *p0 = _mm_unpacklo_epi16(t1, *q0); *q0 = _mm_unpackhi_epi16(t1, *q0); // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 t1 = *p1; *p1 = _mm_unpacklo_epi16(t1, *q1); *q1 = _mm_unpackhi_epi16(t1, *q1); Store4x4(p0, r0, stride); r0 += 4 * stride; Store4x4(q0, r0, stride); Store4x4(p1, r8, stride); r8 += 4 * stride; Store4x4(q1, r8, stride); }