Ejemplo n.º 1
0
/* Unshuffle a block of `blocksize` bytes from `_src` into `_dest`,
   where `bytesoftype` is the size in bytes of one element.

   The leading part of the block whose length is a multiple of
   bytesoftype * sizeof(__m256i) is handed to an AVX2 kernel; whatever
   is left over at the end is finished by unshuffle_generic_inline().
   Blocks shorter than one such chunk, and type sizes that have no AVX2
   kernel, are passed in full to unshuffle_generic().

   The function returns no status. */
void
unshuffle_avx2(const size_t bytesoftype, const size_t blocksize,
               const uint8_t* const _src, uint8_t* const _dest) {
  /* Smallest amount of data the vector kernels can work on. */
  const size_t chunk_bytes = bytesoftype * sizeof(__m256i);

  /* Not even one full chunk: nothing to vectorize. */
  if (blocksize < chunk_bytes) {
    unshuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* Split the block into a vectorizable head (a whole number of
     chunks) and a tail that has to be processed without AVX2. */
  const size_t tail_bytes = blocksize % chunk_bytes;
  const size_t head_bytes = blocksize - tail_bytes;

  const size_t head_elements = head_bytes / bytesoftype;
  const size_t all_elements = blocksize / bytesoftype;

  /* Pick the kernel that matches the element size. */
  if (bytesoftype == 2) {
    unshuffle2_avx2(_dest, _src, head_elements, all_elements);
  }
  else if (bytesoftype == 4) {
    unshuffle4_avx2(_dest, _src, head_elements, all_elements);
  }
  else if (bytesoftype == 8) {
    unshuffle8_avx2(_dest, _src, head_elements, all_elements);
  }
  else if (bytesoftype == 16) {
    unshuffle16_avx2(_dest, _src, head_elements, all_elements);
  }
  else if (bytesoftype > sizeof(__m128i)) {
    /* Element sizes above 16 bytes go through the tiled AVX2 kernel. */
    unshuffle16_tiled_avx2(_dest, _src, head_elements, all_elements, bytesoftype);
  }
  else {
    /* No AVX2 kernel for this element size. The generic routine
       processes the entire block, tail included, so stop here. */
    unshuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* Finish the bytes past the vectorized head, if there are any. */
  if (tail_bytes != 0) {
    unshuffle_generic_inline(bytesoftype, head_bytes, blocksize, _src, _dest);
  }
}
/** Roundtrip tests for the SSE2-accelerated shuffle/unshuffle.
 *
 * Fills a buffer of type_size * num_elements bytes with random data,
 * shuffles it, unshuffles the result and compares the outcome with the
 * original bytes. All three buffers are obtained from
 * blosc_test_malloc() with buffer_alignment as its first argument.
 *
 * test_type selects which implementations are paired:
 *   0 - shuffle_sse2    followed by unshuffle_sse2
 *   1 - shuffle_generic followed by unshuffle_sse2
 *   2 - shuffle_sse2    followed by unshuffle_generic
 *
 * Returns EXIT_SUCCESS when the round-tripped data equals the original,
 * and EXIT_FAILURE on a mismatch or when test_type is not one of the
 * values above. When SHUFFLE_SSE2_ENABLED is not defined, nothing is
 * tested and EXIT_SUCCESS is returned.
 */
static int test_shuffle_roundtrip_sse2(size_t type_size, size_t num_elements,
  size_t buffer_alignment, int test_type)
{
#if defined(SHUFFLE_SSE2_ENABLED)
  size_t buffer_size = type_size * num_elements;
  int exit_code = EXIT_SUCCESS;

  /* Allocate memory for the test. */
  void* original = blosc_test_malloc(buffer_alignment, buffer_size);
  void* shuffled = blosc_test_malloc(buffer_alignment, buffer_size);
  void* unshuffled = blosc_test_malloc(buffer_alignment, buffer_size);

  /* Fill the input data buffer with random values. */
  blosc_test_fill_random(original, buffer_size);

  /* Shuffle/unshuffle, selecting the implementations based on the test type. */
  switch(test_type)
  {
    case 0:
      /* sse2/sse2 */
      shuffle_sse2(type_size, buffer_size, original, shuffled);
      unshuffle_sse2(type_size, buffer_size, shuffled, unshuffled);
      break;
    case 1:
      /* generic/sse2 */
      shuffle_generic(type_size, buffer_size, original, shuffled);
      unshuffle_sse2(type_size, buffer_size, shuffled, unshuffled);
      break;
    case 2:
      /* sse2/generic */
      shuffle_sse2(type_size, buffer_size, original, shuffled);
      unshuffle_generic(type_size, buffer_size, shuffled, unshuffled);
      break;
    default:
      fprintf(stderr, "Invalid test type specified (%d).", test_type);
      /* Record the failure instead of returning right away, so that
         the buffers allocated above are still released below. */
      exit_code = EXIT_FAILURE;
      break;
  }

  /* The round-tripped data matches the original data when the
     result of memcmp is 0. Skip the comparison if the test type was
     invalid, since nothing was written to the output buffers. */
  if (exit_code == EXIT_SUCCESS &&
      memcmp(original, unshuffled, buffer_size) != 0) {
    exit_code = EXIT_FAILURE;
  }

  /* Free allocated memory. */
  blosc_test_free(original);
  blosc_test_free(shuffled);
  blosc_test_free(unshuffled);

  return exit_code;
#else
  return EXIT_SUCCESS;
#endif /* defined(SHUFFLE_SSE2_ENABLED) */
}