/* ------------------------------------------------------------------------- */ int test_sign16s_func(void) { INT16 ALIGN(src[65535]), ALIGN(d1[65535]); #ifdef WITH_SSE2 INT16 ALIGN(d2[65535]); int i; #endif int failed = 0; char testStr[256]; /* Test when we can reach 16-byte alignment */ testStr[0] = '\0'; get_random_data(src, sizeof(src)); general_sign_16s(src+1, d1+1, 65535); #ifdef WITH_SSE2 if (IsProcessorFeaturePresentEx(PF_EX_SSSE3)) { strcat(testStr, " SSSE3"); ssse3_sign_16s(src+1, d2+1, 65535); for (i=1; i<65535; ++i) { if (d1[i] != d2[i]) { printf("SIGN16s-SSE-aligned FAIL[%d] of %d: want %d, got %d\n", i, src[i], d1[i], d2[i]); ++failed; } } } #endif /* i386 */ /* Test when we cannot reach 16-byte alignment */ get_random_data(src, sizeof(src)); general_sign_16s(src+1, d1+2, 65535); #ifdef WITH_SSE2 if (IsProcessorFeaturePresentEx(PF_EX_SSSE3)) { ssse3_sign_16s(src+1, d2+2, 65535); for (i=2; i<65535; ++i) { if (d1[i] != d2[i]) { printf("SIGN16s-SSE-unaligned FAIL[%d] of %d: want %d, got %d\n", i, src[i-1], d1[i], d2[i]); ++failed; } } } #endif /* i386 */ if (!failed) printf("All sign16s tests passed (%s).\n", testStr); return (failed > 0) ? FAILURE : SUCCESS; }
/* ------------------------------------------------------------------------- */ int test_sign16s_func(void) { INT16 ALIGN(src[65535]), ALIGN(d1[65535]), ALIGN(d2[65535]); int failed = 0; int i; UINT32 pflags = primitives_get_flags(primitives_get()); char testStr[256]; /* Test when we can reach 16-byte alignment */ testStr[0] = '\0'; get_random_data(src, sizeof(src)); general_sign_16s(src+1, d1+1, 65535); #ifdef _M_IX86_AMD64 if (pflags & PRIM_X86_SSSE3_AVAILABLE) { strcat(testStr, " SSSE3"); ssse3_sign_16s(src+1, d2+1, 65535); for (i=1; i<65535; ++i) { if (d1[i] != d2[i]) { printf("SIGN16s-SSE-aligned FAIL[%d] of %d: want %d, got %d\n", i, src[i], d1[i], d2[i]); ++failed; } } } #endif /* i386 */ /* Test when we cannot reach 16-byte alignment */ get_random_data(src, sizeof(src)); general_sign_16s(src+1, d1+2, 65535); #ifdef _M_IX86_AMD64 if (pflags & PRIM_X86_SSSE3_AVAILABLE) { ssse3_sign_16s(src+1, d2+2, 65535); for (i=2; i<65535; ++i) { if (d1[i] != d2[i]) { printf("SIGN16s-SSE-unaligned FAIL[%d] of %d: want %d, got %d\n", i, src[i-1], d1[i], d2[i]); ++failed; } } } #endif /* i386 */ if (!failed) printf("All sign16s tests passed (%s).\n", testStr); return (failed > 0) ? FAILURE : SUCCESS; }
if (d1[i] != d2[i])
			{
				printf("SIGN16s-SSE-unaligned FAIL[%d] of %d: want %d, got %d\n", i, src[i-1], d1[i], d2[i]);
				++failed;
			}
		}
	}
#endif /* i386 */
	if (!failed) printf("All sign16s tests passed (%s).\n", testStr);
	return (failed > 0) ? FAILURE : SUCCESS;
}

/* ------------------------------------------------------------------------- */
/*
 * NOTE(review): STD_SPEED_TEST is defined outside this view.  It presumably
 * generates the sign16s_speed_test() function called below, pairing
 * general_sign_16s() with ssse3_sign_16s() (the latter tied to
 * PRIM_X86_SSSE3_AVAILABLE) -- confirm against the macro definition.
 */
STD_SPEED_TEST(sign16s_speed_test, INT16, INT16, dst=dst,
	TRUE, general_sign_16s(src1, dst, size),
	TRUE, ssse3_sign_16s(src1, dst, size), PRIM_X86_SSSE3_AVAILABLE,
	FALSE, dst=dst, 0,
	FALSE, dst=dst);

/*
 * Speed test driver for the sign16s primitive.
 *
 * Fills the source buffer with random data and calls sign16s_speed_test()
 * twice: once labelled "aligned" with src as declared, and once labelled
 * "unaligned" with the source advanced by one element.  The destination
 * buffer is the same in both runs.
 *
 * Always returns SUCCESS.
 */
int test_sign16s_speed(void)
{
	/* Three elements larger than MAX_TEST_SIZE; presumably headroom for the
	 * src+1 run below -- TODO confirm against test_sizes. */
	INT16 ALIGN(src[MAX_TEST_SIZE+3]), ALIGN(dst[MAX_TEST_SIZE+3]);
	get_random_data(src, sizeof(src));
	sign16s_speed_test("sign16s", "aligned", src, NULL, 0, dst,
		test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME);
	/* Same run with the source shifted by one INT16. */
	sign16s_speed_test("sign16s", "unaligned", src+1, NULL, 0, dst,
		test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME);
	return SUCCESS;
}
/* ------------------------------------------------------------------------- */
/*
 * SSSE3 version of the 16-bit sign primitive.
 *
 * For every input element the output receives -1, 0 or +1 depending on
 * whether the input is negative, zero or positive.
 *
 * pSrc - input elements
 * pDst - output elements
 * len  - number of INT16 elements to process
 *
 * Inputs shorter than 16 elements, or with a destination at an odd address
 * (which can never become 16-byte aligned by stepping whole elements), are
 * delegated to general_sign_16s() and its status is returned.  Otherwise the
 * function returns PRIMITIVES_SUCCESS.
 */
pstatus_t ssse3_sign_16s(
	const INT16 *pSrc,
	INT16 *pDst,
	INT32 len)
{
	const INT16 *in = (const INT16 *) pSrc;
	INT16 *out = (INT16 *) pDst;
	size_t chunks;
	__m128i ones;

	if ((len < 16) || (((ULONG_PTR) pDst & 0x01) != 0))
	{
		return general_sign_16s(pSrc, pDst, len);
	}

	/* Scalar head: advance until the destination is 16-byte aligned. */
	while (((ULONG_PTR) out & 0x0f) != 0)
	{
		const INT16 value = *in++;
		*out++ = (INT16) ((value > 0) - (value < 0));
		if (--len == 0) return PRIMITIVES_SUCCESS;
	}

	/* _mm_sign_epi16(ones, x) yields -1/0/+1 per lane according to x. */
	ones = _mm_set1_epi16(0x0001U);

	/* Main body: 32 elements (four vectors) per iteration. */
	chunks = (size_t) (len >> 5);
	len -= (INT32) (chunks << 5);

	if (((ULONG_PTR) in & 0x0f) != 0)
	{
		/* Source is not 16-byte aligned: use unaligned loads. */
		for (; chunks != 0; --chunks)
		{
			const __m128i v0 = _mm_lddqu_si128((__m128i *) (in));
			const __m128i v1 = _mm_lddqu_si128((__m128i *) (in + 8));
			const __m128i v2 = _mm_lddqu_si128((__m128i *) (in + 16));
			const __m128i v3 = _mm_lddqu_si128((__m128i *) (in + 24));
			in += 32;

			_mm_store_si128((__m128i *) (out),      _mm_sign_epi16(ones, v0));
			_mm_store_si128((__m128i *) (out + 8),  _mm_sign_epi16(ones, v1));
			_mm_store_si128((__m128i *) (out + 16), _mm_sign_epi16(ones, v2));
			_mm_store_si128((__m128i *) (out + 24), _mm_sign_epi16(ones, v3));
			out += 32;
		}
	}
	else
	{
		/* Source and destination are both 16-byte aligned. */
		for (; chunks != 0; --chunks)
		{
			const __m128i v0 = _mm_load_si128((__m128i *) (in));
			const __m128i v1 = _mm_load_si128((__m128i *) (in + 8));
			const __m128i v2 = _mm_load_si128((__m128i *) (in + 16));
			const __m128i v3 = _mm_load_si128((__m128i *) (in + 24));
			in += 32;

			_mm_store_si128((__m128i *) (out),      _mm_sign_epi16(ones, v0));
			_mm_store_si128((__m128i *) (out + 8),  _mm_sign_epi16(ones, v1));
			_mm_store_si128((__m128i *) (out + 16), _mm_sign_epi16(ones, v2));
			_mm_store_si128((__m128i *) (out + 24), _mm_sign_epi16(ones, v3));
			out += 32;
		}
	}

	/* Remaining whole vectors: 8 elements per iteration. */
	chunks = (size_t) (len >> 3);
	len -= (INT32) (chunks << 3);

	for (; chunks != 0; --chunks)
	{
		const __m128i v = LOAD_SI128(in);
		in += 8;
		_mm_store_si128((__m128i *) out, _mm_sign_epi16(ones, v));
		out += 8;
	}

	/* Scalar tail: fewer than 8 elements are left. */
	for (; len > 0; --len)
	{
		const INT16 value = *in++;
		*out++ = (INT16) ((value > 0) - (value < 0));
	}

	return PRIMITIVES_SUCCESS;
}