/**
 * Round-trip self-test for the SIMD packing routines.
 *
 * For gaps 1, 3, 9, ..., 387420489 the input is filled with k * gap
 * (reduced modulo 2^32) and every block of SIMDBlockSize integers is
 *   1. packed with simdpackwithoutmask and unpacked with simdunpack,
 *   2. packed with simdpackwithoutmaskd1 and unpacked with simdunpackd1,
 * and the unpacked block is compared against the input.
 *
 * @return 0 on success, -1 if a working buffer cannot be allocated,
 *         -2 if the plain round trip mismatches, -3 if the differential
 *         (d1) round trip mismatches.
 */
int test() {
    const int N = 5000 * SIMDBlockSize;
    uint32_t gap;
    int status = 0;
    __m128i *buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
    uint32_t *datain = malloc(N * sizeof(uint32_t));
    uint32_t *backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));

    if (buffer == NULL || datain == NULL || backbuffer == NULL) {
        printf("memory allocation failed\n");
        status = -1;
        goto cleanup;
    }

    for (gap = 1; gap <= 387420489; gap *= 3) {
        int k;
        /* Last value of the previous block; carried from block to block so
         * that the differential coder is exercised with a non-zero offset. */
        uint32_t offset = 0;

        printf(" gap = %u \n", (unsigned) gap);
        /* Unsigned arithmetic: the product wraps modulo 2^32 for large gaps
         * instead of overflowing a signed int. */
        for (k = 0; k < N; ++k)
            datain[k] = (uint32_t) k * gap;

        for (k = 0; k * SIMDBlockSize < N; ++k) {
            uint32_t *block = datain + k * SIMDBlockSize;
            int j;
            uint32_t b, b1;

            /* First part works for general arrays (sorted or unsorted). */
            /* we compute the bit width */
            b = maxbits(block);
            /* we read SIMDBlockSize integers at "block" and write b 128-bit
             * vectors at "buffer" */
            simdpackwithoutmask(block, buffer, b);
            /* we read back b 128-bit vectors at "buffer" and write
             * SIMDBlockSize integers at "backbuffer" */
            simdunpack(buffer, backbuffer, b);
            for (j = 0; j < SIMDBlockSize; ++j) {
                if (backbuffer[j] != block[j]) {
                    printf("bug in simdpack\n");
                    status = -2;
                    goto cleanup;
                }
            }

            /* Next part uses differential coding relative to "offset". */
            /* we compute the bit width of the deltas */
            b1 = simdmaxbitsd1(offset, block);
            /* we read SIMDBlockSize integers at "block" and write b1 128-bit
             * vectors at "buffer" */
            simdpackwithoutmaskd1(offset, block, buffer, b1);
            /* we read back b1 128-bit vectors at "buffer" and write
             * SIMDBlockSize integers at "backbuffer" */
            simdunpackd1(offset, buffer, backbuffer, b1);
            for (j = 0; j < SIMDBlockSize; ++j) {
                if (backbuffer[j] != block[j]) {
                    printf("bug in simdpack d1\n");
                    status = -3;
                    goto cleanup;
                }
            }
            offset = block[SIMDBlockSize - 1];
        }
    }
    printf("Code looks good.\n");

cleanup:
    /* free(NULL) is a no-op, so this is safe on every path. */
    free(buffer);
    free(datain);
    free(backbuffer);
    return status;
}
/* Another illustration ... */ void simple_demo() { size_t REPEAT = 10, gap; size_t N = 1000 * SIMDBlockSize;/* SIMDBlockSize is 128 */ uint32_t * datain = malloc(N * sizeof(uint32_t)); size_t compsize; clock_t start, end; uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize); /* output buffer */ uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); printf("== simple demo\n"); for (gap = 1; gap <= 243; gap *= 3) { size_t k, repeat; uint32_t offset = 0; uint32_t bogus = 0; double numberofseconds; printf("\n"); printf(" gap = %lu \n", (unsigned long) gap); datain[0] = 0; for (k = 1; k < N; ++k) datain[k] = datain[k-1] + ( rand() % (gap + 1) ); compsize = compress(datain,N,buffer); printf("compression ratio = %f \n", (N * sizeof(uint32_t))/ (compsize * 1.0 )); start = clock(); for(repeat = 0; repeat < REPEAT; ++repeat) { uint8_t * decbuffer = buffer; for (k = 0; k * SIMDBlockSize < N; ++k) { uint8_t b = *decbuffer++; simdunpackd1(offset, (__m128i *) decbuffer, backbuffer, b); /* do something here with backbuffer */ bogus += backbuffer[3]; decbuffer += b * sizeof(__m128i); offset = backbuffer[SIMDBlockSize - 1]; } } end = clock(); numberofseconds = (end-start)/(double)CLOCKS_PER_SEC; printf("decoding speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0)); start = clock(); for(repeat = 0; repeat < REPEAT; ++repeat) { uint8_t * decbuffer = buffer; for (k = 0; k * SIMDBlockSize < N; ++k) { memcpy(backbuffer,decbuffer+k*SIMDBlockSize,SIMDBlockSize*sizeof(uint32_t)); bogus += backbuffer[3] - backbuffer[100]; } } end = clock(); numberofseconds = (end-start)/(double)CLOCKS_PER_SEC; printf("memcpy speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0)); printf("ignore me %i \n",bogus); printf("All tests are in CPU cache. Avoid out-of-cache decoding in applications.\n"); } free(buffer); free(datain); free(backbuffer); }
int test_simdpackedsearch_advanced() { uint32_t buffer[128]; uint32_t backbuffer[128]; uint32_t out[128]; uint32_t result = 0; uint32_t b, i; uint32_t init = 0; __m128i initial = _mm_set1_epi32(init); /* this test creates delta encoded buffers with different bits, then * performs lower bound searches for each key */ for (b = 0; b <= 32; b++) { uint32_t prev = init; /* initialize the buffer */ for (i = 0; i < 128; i++) { buffer[i] = ((uint32_t)(1431655765 * i + 0xFFFFFFFF)) ; if(b < 32) buffer[i] %= (1<<b); } qsort(buffer,128, sizeof(uint32_t), uint32_cmp); for (i = 0; i < 128; i++) { buffer[i] = buffer[i] + prev; prev = buffer[i]; } for (i = 1; i < 128; i++) { if(buffer[i] < buffer[i-1] ) buffer[i] = buffer[i-1]; } assert(simdmaxbitsd1(init, buffer)<=b); for (i = 0; i < 128; i++) { out[i] = 0; /* memset would do too */ } /* delta-encode to 'i' bits */ simdpackwithoutmaskd1(init, buffer, (__m128i *)out, b); simdunpackd1(init, (__m128i *)out, backbuffer, b); for (i = 0; i < 128; i++) { assert(buffer[i] == backbuffer[i]); } printf("advanced simdsearchd1: %d bits\n", b); for (i = 0; i < 128; i++) { int pos; initial = _mm_set1_epi32(init); pos = simdsearchd1(&initial, (__m128i *)out, b, buffer[i], &result); assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128, buffer[i], &result)); assert(buffer[pos] == buffer[i]); if(pos > 0) assert(buffer[pos - 1] < buffer[i]); assert(result == buffer[i]); } for (i = 0; i < 128; i++) { int pos; if(buffer[i] == 0) continue; initial = _mm_set1_epi32(init); pos = simdsearchd1(&initial, (__m128i *)out, b, buffer[i] - 1, &result); assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128, buffer[i] - 1, &result)); assert(buffer[pos] >= buffer[i] - 1); if(pos > 0) assert(buffer[pos - 1] < buffer[i] - 1); assert(result == buffer[pos]); } for (i = 0; i < 128; i++) { int pos; if (buffer[i] + 1 == 0) continue; initial = _mm_set1_epi32(init); pos = simdsearchd1(&initial, (__m128i *) out, b, buffer[i] + 1, &result); 
assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128, buffer[i] + 1, &result)); if(pos == 128) { assert(buffer[i] == buffer[127]); } else { assert(buffer[pos] >= buffer[i] + 1); if (pos > 0) assert(buffer[pos - 1] < buffer[i] + 1); assert(result == buffer[pos]); } } } printf("advanced simdsearchd1: ok\n"); return 0; }
/**
 * Unpacks consecutive full blocks of 128 integers with simdunpackd1.
 *
 * @param in    packed input; each block occupies 4 * b 32-bit words
 * @param n     number of integers to produce; only the full blocks of 128
 *              are decoded, a trailing partial block is left untouched
 * @param b     bit width handed to simdunpackd1 for every block
 * @param start initial value handed to simdunpackd1 for every block
 * @param out   destination for the decoded integers
 * @return pointer just past the last input word consumed
 *
 * NOTE(review): "start" is passed unchanged to every block rather than being
 * advanced to the last decoded value -- confirm the matching packer encodes
 * each block against the same start.
 */
unsigned char *simdunpackn1(uint32_t *in, uint32_t n, uint32_t b, uint32_t start, uint32_t *out) {
    uint32_t remaining;

    /* Count remaining elements instead of comparing "out + 128" against the
     * end pointer, which would form a pointer beyond the output array when
     * fewer than 128 elements are left. */
    for (remaining = n; remaining >= 128; remaining -= 128) {
        simdunpackd1(start, (__m128i *)in, out, b);
        out += 128;
        in += 4 * b;
    }
    return (unsigned char *)in;
}