// Compiler codegen test: checks that _mm_sub_epi64 lowers to an IR
// 'sub <2 x i64>' and ultimately to the PSUBQ instruction.
// The DAG-LABEL/DAG/ASM-LABEL/ASM comments below are FileCheck
// directives consumed by the test harness — do not reword them.
__m128i test_mm_sub_epi64(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_sub_epi64
  // DAG: sub <2 x i64>
  //
  // ASM-LABEL: test_mm_sub_epi64
  // ASM: psubq
  return _mm_sub_epi64(A, B);
}
/* * mixed endian increment, low 64bits stored in hi word to be compatible * with _icm's BSWAP. */ static inline __m128i nextc(__m128i x) { const __m128i ONE = _mm_setr_epi32(0, 0, 1, 0); const __m128i ZERO = _mm_setzero_si128(); x = _mm_add_epi64(x, ONE); __m128i t = _mm_cmpeq_epi64(x, ZERO); t = _mm_unpackhi_epi64(t, ZERO); x = _mm_sub_epi64(x, t); return x; }
/*
 * Finalize an ECHO hash computation: pad the buffered message tail, append
 * the 16-bit hash size and the 64-bit processed-bit count, run the final
 * compression call(s), and copy the chaining state out as the digest.
 *
 * Trailer layout in the last 18 bytes of the final block:
 *   [uBlockLength-18, -16)  uHashSize       (unsigned short store)
 *   [uBlockLength-16,  -8)  processed_bits  (DataLength store)
 *   [uBlockLength- 8,  -0)  zero            (DataLength store)
 *
 * NOTE(review): the casted stores into the byte buffer assume unaligned
 * access is tolerated and a little-endian target — confirm for ports.
 */
HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
{
	__m128i remainingbits;

	/* Account for the bytes still sitting in the buffer. */
	state->processed_bits += state->uBufferBytes * 8;
	remainingbits = _mm_set_epi32(0, 0, 0, state->uBufferBytes * 8);

	/* Pad with 0x80 */
	state->buffer[state->uBufferBytes++] = 0x80;

	/* Enough buffer space for padding in this block? */
	if((state->uBlockLength - state->uBufferBytes) >= 18)
	{
		/* Pad with zeros up to the start of the 18-byte trailer. */
		memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18));

		/* Hash size */
		*((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize;

		/* Processed bits */
		*((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits;
		*((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0;

		/* Last block contains message bits? */
		if(state->uBufferBytes == 1)
		{
			/* Only the 0x80 pad byte is in this block: zero the counter k
			 * before applying the const1536 offset. */
			state->k = _mm_xor_si128(state->k, state->k);
			state->k = _mm_sub_epi64(state->k, state->const1536);
		}
		else
		{
			/* Credit the real message bits in this block to the counter. */
			state->k = _mm_add_epi64(state->k, remainingbits);
			state->k = _mm_sub_epi64(state->k, state->const1536);
		}

		/* Compress */
		Compress(state, state->buffer, 1);
	}
	else
	{
		/* Not enough room for the trailer: zero-fill and compress this
		 * block, then build one extra all-padding block. */
		memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - state->uBufferBytes);

		state->k = _mm_add_epi64(state->k, remainingbits);
		state->k = _mm_sub_epi64(state->k, state->const1536);
		Compress(state, state->buffer, 1);

		/* Last block */
		memset(state->buffer, 0, state->uBlockLength - 18);

		/* Hash size */
		*((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize;

		/* Processed bits */
		*((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits;
		*((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0;

		/* Compress the last block; it carries no message bits, so k is
		 * zeroed first, matching the uBufferBytes == 1 case above. */
		state->k = _mm_xor_si128(state->k, state->k);
		state->k = _mm_sub_epi64(state->k, state->const1536);
		Compress(state, state->buffer, 1);
	}

	/* Store the hash value: two 128-bit words always, four when the
	 * configured hash size is 512 bits. */
	_mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]);
	_mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]);

	if(state->uHashSize == 512)
	{
		_mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]);
		_mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]);
	}

	return SUCCESS;
}
vec128i vOpen = _mm_set1_epi64(open); vec128i vGap = _mm_set1_epi64(gap); vec128i vZero = _mm_set1_epi64(0); vec128i vNegInf0 = _mm_insert_epi64(vZero, NEG_LIMIT, 1); vec128i vOne = _mm_set1_epi64(1); vec128i vN = _mm_set1_epi64(N); vec128i vGapN = _mm_set1_epi64(gap*N); vec128i vNegOne = _mm_set1_epi64(-1); vec128i vI = _mm_set_epi64(0,1); vec128i vJreset = _mm_set_epi64(0,-1); vec128i vMaxH = vNegInf; vec128i vMaxM = vNegInf; vec128i vMaxS = vNegInf; vec128i vMaxL = vNegInf; vec128i vILimit = _mm_set1_epi64(s1Len); vec128i vILimit1 = _mm_sub_epi64(vILimit, vOne); vec128i vJLimit = _mm_set1_epi64(s2Len); vec128i vJLimit1 = _mm_sub_epi64(vJLimit, vOne); vec128i vIBoundary = _mm_set_epi64( -open-0*gap, -open-1*gap); /* convert _s1 from char to int in range 0-23 */ for (i=0; i<s1Len; ++i) { s1[i] = matrix->mapper[(unsigned char)_s1[i]]; } /* pad back of s1 with dummy values */ for (i=s1Len; i<s1Len_PAD; ++i) { s1[i] = 0; /* point to first matrix row because we don't care */ }
/*
 * Per-lane 64-bit subtraction: returns s1 - s2 in each of the two
 * 64-bit lanes (compiles to PSUBQ).
 *
 * Fix: the original declared no return type, relying on implicit int —
 * removed in C99 and a type mismatch for a function returning __m128i.
 */
__m128i
test (__m128i s1, __m128i s2)
{
  return _mm_sub_epi64 (s1, s2);
}
__m128i vNegInf = _mm_set1_epi64x(NEG_INF); __m128i vOpen = _mm_set1_epi64x(open); __m128i vGap = _mm_set1_epi64x(gap); __m128i vZero = _mm_set1_epi64x(0); __m128i vOne = _mm_set1_epi64x(1); __m128i vN = _mm_set1_epi64x(N); __m128i vGapN = _mm_set1_epi64x(gap*N); __m128i vNegOne = _mm_set1_epi64x(-1); __m128i vI = _mm_set_epi64x(0,1); __m128i vJreset = _mm_set_epi64x(0,-1); __m128i vMaxScore = vNegInf; __m128i vMaxMatch = vNegInf; __m128i vMaxSimilar = vNegInf; __m128i vMaxLength = vNegInf; __m128i vILimit = _mm_set1_epi64x(s1Len); __m128i vILimit1 = _mm_sub_epi64(vILimit, vOne); __m128i vJLimit = _mm_set1_epi64x(s2Len); __m128i vJLimit1 = _mm_sub_epi64(vJLimit, vOne); __m128i vIBoundary = _mm_set_epi64x( -open-0*gap, -open-1*gap); /* convert _s1 from char to int in range 0-23 */ for (i=0; i<s1Len; ++i) { s1[i] = matrix->mapper[(unsigned char)_s1[i]]; } /* pad back of s1 with dummy values */ for (i=s1Len; i<s1Len_PAD; ++i) { s1[i] = 0; /* point to first matrix row because we don't care */ }
// SSE2 export of one destination row for the "shrink" (downscaling) case.
// Two sub-cases: when yscale != 0 the fractional next-row contribution
// (frow * yscale) is split off, the remainder (irow - frac) is emitted,
// and frac is written back to irow as the new accumulator start; when
// yscale == 0 irow is emitted directly and cleared.  The vector loop
// handles 8 samples per iteration; a scalar loop finishes the tail.
static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
  int x_out;
  uint8_t* const dst = wrk->dst;
  rescaler_t* const irow = wrk->irow;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  const rescaler_t* const frow = wrk->frow;
  // y_accum <= 0 here (asserted below), so -y_accum is the non-negative
  // remaining vertical fraction.
  const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
  assert(!WebPRescalerOutputDone(wrk));
  assert(wrk->y_accum <= 0);
  assert(!wrk->y_expand);
  if (yscale) {
    const int scale_xy = wrk->fxy_scale;
    // 32-bit scales placed in the low half of each 64-bit lane for the
    // widening multiplies performed inside LoadDispatchAndMult.
    const __m128i mult_xy = _mm_set_epi32(0, scale_xy, 0, scale_xy);
    const __m128i mult_y = _mm_set_epi32(0, yscale, 0, yscale);
    const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
    for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
      __m128i A0, A1, A2, A3, B0, B1, B2, B3;
      // A* = irow values (unscaled); B* = frow values * yscale.
      LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3);
      LoadDispatchAndMult(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
      {
        // frac = (frow[x] * yscale + ROUNDER) >> WEBP_RESCALER_RFIX,
        // computed in each 64-bit lane.
        const __m128i C0 = _mm_add_epi64(B0, rounder);
        const __m128i C1 = _mm_add_epi64(B1, rounder);
        const __m128i C2 = _mm_add_epi64(B2, rounder);
        const __m128i C3 = _mm_add_epi64(B3, rounder);
        const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX);   // = frac
        const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX);
        const __m128i D2 = _mm_srli_epi64(C2, WEBP_RESCALER_RFIX);
        const __m128i D3 = _mm_srli_epi64(C3, WEBP_RESCALER_RFIX);
        const __m128i E0 = _mm_sub_epi64(A0, D0);   // irow[x] - frac
        const __m128i E1 = _mm_sub_epi64(A1, D1);
        const __m128i E2 = _mm_sub_epi64(A2, D2);
        const __m128i E3 = _mm_sub_epi64(A3, D3);
        // Repack the eight 32-bit frac results from the 64-bit lanes of
        // D0..D3 into two 128-bit stores back to irow; the interleave
        // mirrors the dispatch done by LoadDispatchAndMult (defined
        // elsewhere in this file).
        const __m128i F2 = _mm_slli_epi64(D2, 32);
        const __m128i F3 = _mm_slli_epi64(D3, 32);
        const __m128i G0 = _mm_or_si128(D0, F2);
        const __m128i G1 = _mm_or_si128(D1, F3);
        _mm_storeu_si128((__m128i*)(irow + x_out + 0), G0);
        _mm_storeu_si128((__m128i*)(irow + x_out + 4), G1);
        ProcessRow(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
      }
    }
    // Scalar tail: same math as the vector loop, one sample at a time.
    // NOTE(review): upstream libwebp later computed frac with
    // MULT_FIX_FLOOR to guarantee irow[x] - frac >= 0; confirm against
    // the MULT_FIX definition in this tree.
    for (; x_out < x_out_max; ++x_out) {
      const uint32_t frac = (int)MULT_FIX(frow[x_out], yscale);
      const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
      assert(v >= 0 && v <= 255);
      dst[x_out] = v;
      irow[x_out] = frac;   // new fractional start
    }
  } else {
    // No vertical fraction left: emit irow scaled by fxy_scale and reset
    // the accumulator row to zero for the next pass.
    const uint32_t scale = wrk->fxy_scale;
    const __m128i mult = _mm_set_epi32(0, scale, 0, scale);
    const __m128i zero = _mm_setzero_si128();
    for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
      __m128i A0, A1, A2, A3;
      LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3);
      _mm_storeu_si128((__m128i*)(irow + x_out + 0), zero);
      _mm_storeu_si128((__m128i*)(irow + x_out + 4), zero);
      ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
    }
    // Scalar tail for the remaining samples.
    for (; x_out < x_out_max; ++x_out) {
      const int v = (int)MULT_FIX(irow[x_out], scale);
      assert(v >= 0 && v <= 255);
      dst[x_out] = v;
      irow[x_out] = 0;
    }
  }
}