/* Shuffle a block using the AVX2 kernels where possible, falling back to the
   generic routines for anything they cannot handle. This can never fail.

   bytesoftype: size in bytes of one element.
   blocksize:   size in bytes of the block to shuffle.
   _src:        input buffer.
   _dest:       output buffer receiving the shuffled data. */
void
shuffle_avx2(const size_t bytesoftype, const size_t blocksize,
             const uint8_t* const _src, uint8_t* const _dest) {
  /* Number of bytes consumed by one pass of a vectorized kernel. */
  const size_t chunk_bytes = bytesoftype * sizeof(__m256i);

  /* A block shorter than a single chunk cannot be vectorized at all;
     hand the whole thing to the generic implementation. */
  if (blocksize < chunk_bytes) {
    shuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* Round the block size down to a whole number of chunks. That leading
     portion goes through the vectorized kernel; whatever is left over at
     the end is finished by the non-vectorized code below. */
  const size_t leftover_bytes = blocksize % chunk_bytes;
  const size_t vector_bytes = blocksize - leftover_bytes;
  const size_t vector_elems = vector_bytes / bytesoftype;
  const size_t all_elems = blocksize / bytesoftype;

  /* Dispatch to the kernel specialized for this element size. */
  if (bytesoftype == 2) {
    shuffle2_avx2(_dest, _src, vector_elems, all_elems);
  }
  else if (bytesoftype == 4) {
    shuffle4_avx2(_dest, _src, vector_elems, all_elems);
  }
  else if (bytesoftype == 8) {
    shuffle8_avx2(_dest, _src, vector_elems, all_elems);
  }
  else if (bytesoftype == 16) {
    shuffle16_avx2(_dest, _src, vector_elems, all_elems);
  }
  else if (bytesoftype > sizeof(__m128i)) {
    /* Element sizes above 16 bytes use the tiled AVX2 shuffle. */
    shuffle16_tiled_avx2(_dest, _src, vector_elems, all_elems, bytesoftype);
  }
  else {
    /* No optimized kernel for this element size. The generic routine
       processes the entire buffer, so nothing remains to be done. */
    shuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* Finish any trailing bytes the vectorized kernel could not cover. */
  if (vector_bytes < blocksize) {
    shuffle_generic_inline(bytesoftype, vector_bytes, blocksize, _src, _dest);
  }
}
/** Roundtrip test for the SSE2-accelerated shuffle/unshuffle.

    Fills a buffer of type_size * num_elements bytes with random data,
    shuffles it, unshuffles it again and checks that the result is identical
    to the input.

    type_size:        size in bytes of one element.
    num_elements:     number of elements in the test buffer.
    buffer_alignment: alignment passed to blosc_test_malloc for every buffer.
    test_type:        which implementations to pair up:
                        0 = shuffle_sse2    / unshuffle_sse2
                        1 = shuffle_generic / unshuffle_sse2
                        2 = shuffle_sse2    / unshuffle_generic

    Returns EXIT_SUCCESS when the round-tripped data matches the original,
    EXIT_FAILURE when it does not or when test_type is invalid. When
    SHUFFLE_SSE2_ENABLED is not defined the test is a no-op that returns
    EXIT_SUCCESS. */
static int test_shuffle_roundtrip_sse2(size_t type_size, size_t num_elements,
  size_t buffer_alignment, int test_type)
{
#if defined(SHUFFLE_SSE2_ENABLED)
  size_t buffer_size = type_size * num_elements;
  int exit_code = EXIT_SUCCESS;

  /* Allocate memory for the test. */
  void* original = blosc_test_malloc(buffer_alignment, buffer_size);
  void* shuffled = blosc_test_malloc(buffer_alignment, buffer_size);
  void* unshuffled = blosc_test_malloc(buffer_alignment, buffer_size);

  /* Fill the input data buffer with random values. */
  blosc_test_fill_random(original, buffer_size);

  /* Shuffle/unshuffle, selecting the implementations based on the test type. */
  switch (test_type) {
    case 0:
      /* sse2/sse2 */
      shuffle_sse2(type_size, buffer_size, original, shuffled);
      unshuffle_sse2(type_size, buffer_size, shuffled, unshuffled);
      break;
    case 1:
      /* generic/sse2 */
      shuffle_generic(type_size, buffer_size, original, shuffled);
      unshuffle_sse2(type_size, buffer_size, shuffled, unshuffled);
      break;
    case 2:
      /* sse2/generic */
      shuffle_sse2(type_size, buffer_size, original, shuffled);
      unshuffle_generic(type_size, buffer_size, shuffled, unshuffled);
      break;
    default:
      fprintf(stderr, "Invalid test type specified (%d).", test_type);
      /* Do not return here: fall through to the shared cleanup below so the
         buffers allocated above are released on this path too. */
      exit_code = EXIT_FAILURE;
      break;
  }

  /* The round-tripped data matches the original data when the result of
     memcmp is 0. Skip the comparison if the test type was invalid, since
     nothing was written to the output buffer in that case. */
  if (exit_code == EXIT_SUCCESS) {
    exit_code = memcmp(original, unshuffled, buffer_size) ?
      EXIT_FAILURE : EXIT_SUCCESS;
  }

  /* Free allocated memory (single exit point for every path). */
  blosc_test_free(original);
  blosc_test_free(shuffled);
  blosc_test_free(unshuffled);

  return exit_code;
#else
  return EXIT_SUCCESS;
#endif /* defined(SHUFFLE_SSE2_ENABLED) */
}