/*
 * Benchmark driver for the despace routines.
 *
 * An optional first command-line argument is parsed as an integer byte offset
 * that is added to both heap buffers before they are used, so the routines
 * can be timed on buffers that start at a chosen offset from what malloc
 * returned. The offset must be non-negative.
 *
 * Returns EXIT_SUCCESS after all timings ran, EXIT_FAILURE on a bad offset
 * or an allocation failure.
 */
int main(int argc, char ** argv) {
  const int N = 1024 * 32;
  int alignoffset = 0;
  if(argc>1)  {
    alignoffset = atoi(argv[1]);
    printf("alignment offset = %d \n", alignoffset);
  }
  // A negative offset would make buffer/tmpbuffer point before the start of
  // their allocations, so every access below would be out of bounds.
  if(alignoffset < 0) {
    fprintf(stderr, "alignment offset must be non-negative\n");
    return EXIT_FAILURE;
  }
  // Sizes are summed as size_t so a large offset cannot overflow int.
  const size_t allocsize = (size_t)N + (size_t)alignoffset;
  char *origbuffer = malloc(allocsize);
  char *origtmpbuffer = malloc(allocsize);
  if(origbuffer == NULL || origtmpbuffer == NULL) {
    fprintf(stderr, "memory allocation failed\n");
    free(origbuffer);
    free(origtmpbuffer);
    return EXIT_FAILURE;
  }
  // The original pointers are kept so they can be handed back to free().
  char *buffer = origbuffer + alignoffset;
  char *tmpbuffer = origtmpbuffer + alignoffset;
  // The lowest set bit of the address gives the largest power of two that
  // divides it.
  printf("pointer alignment = %d bytes \n", 1<< __builtin_ctzll((uintptr_t)(const void *)(buffer)));

  int repeat = 100;
  size_t howmanywhite = fillwithtext(buffer, N);

  // memcpy over the same number of bytes is timed first, before the despace
  // variants.
  BEST_TIME_NOCHECK_NOPRE(memcpy(tmpbuffer,buffer,N),
                   repeat, N);
  printf("\n");
  // Each despace variant is given a refill expression that reassigns
  // howmanywhite, and an expected value of N - howmanywhite.
  BEST_TIME(despace(buffer, N), N - howmanywhite,
                  howmanywhite = fillwithtext(buffer, N), repeat, N);
  BEST_TIME(neon_despace(buffer, N), N - howmanywhite,
                  howmanywhite = fillwithtext(buffer, N), repeat, N);
  BEST_TIME(neon_despace_branchless(buffer, N), N - howmanywhite,
                  howmanywhite = fillwithtext(buffer, N), repeat, N);
  BEST_TIME(neontbl_despace(buffer, N), N - howmanywhite,
                  howmanywhite = fillwithtext(buffer, N), repeat, N);
  free(origbuffer);
  free(origtmpbuffer);
  return EXIT_SUCCESS;
}
/*
 * Times each shuffle_pcg* variant on an array of `size` 32-bit values.
 *
 * A copy of the freshly created array is kept untouched. Before the first
 * timing and after every timed shuffle, sortAndCompare is called with the
 * working array and that copy; a nonzero result ends the run early.
 *
 * Both arrays are released on every exit path, including the early ones.
 */
void demo(int size) {
    printf("Shuffling arrays of size %d \n",size);
    printf("Time reported in number of cycles per array element.\n");
    printf("Tests assume that array is in cache as much as possible.\n");
    int repeat = 500;
    uint32_t * testvalues = create_random_array(size);
    uint32_t * pristinecopy = malloc(size * sizeof(uint32_t));
    if(testvalues == NULL || pristinecopy == NULL) {
        fprintf(stderr, "demo: memory allocation failed\n");
        goto cleanup;
    }
    memcpy(pristinecopy,testvalues,sizeof(uint32_t) * size);
    if(sortAndCompare(testvalues, pristinecopy, size)!=0) goto cleanup;

    BEST_TIME(shuffle_pcg(testvalues,size), array_cache_prefetch(testvalues,size), repeat, size);
    if(sortAndCompare(testvalues, pristinecopy, size)!=0) goto cleanup;

    BEST_TIME(shuffle_pcg_go(testvalues,size), array_cache_prefetch(testvalues,size), repeat, size);
    if(sortAndCompare(testvalues, pristinecopy, size)!=0) goto cleanup;

    BEST_TIME(shuffle_pcg_java(testvalues,size), array_cache_prefetch(testvalues,size), repeat, size);
    if(sortAndCompare(testvalues, pristinecopy, size)!=0) goto cleanup;

    BEST_TIME(shuffle_pcg_divisionless(testvalues,size), array_cache_prefetch(testvalues,size), repeat, size);
    if(sortAndCompare(testvalues, pristinecopy, size)!=0) goto cleanup;

    BEST_TIME(shuffle_pcg_divisionless_with_slight_bias(testvalues,size), array_cache_prefetch(testvalues,size), repeat, size);
    if(sortAndCompare(testvalues, pristinecopy, size)!=0) goto cleanup;

    // The trailing blank line is printed only when every check passed.
    printf("\n");

cleanup:
    // free(NULL) is a no-op, so this is safe after a failed allocation too.
    free(testvalues);
    free(pristinecopy);
}
/*
 * Driver for the mod-23 routines.
 *
 * First computes a reference value with the plain % operator and passes it as
 * the expected result when timing sumofmod23 and fastsumofmod23. Then compares
 * mod23 and fastmod23 over a counter sweep and prints, in hexadecimal, every
 * input on which they disagree.
 */
int main() {
  uint64_t sumofmods = 0;
  const uint64_t maxval = 1000000;
  // Reference computation; each term depends on the running sum.
  for(uint64_t k = 0; k < maxval; ++k) sumofmods += ( k + sumofmods )  % 23;
  const int repeat = 5;
  BEST_TIME(sumofmod23(maxval), sumofmods, repeat, maxval) ;
  BEST_TIME(fastsumofmod23(maxval), sumofmods, repeat, maxval) ;



  // x becomes 0 again only after wrapping past UINT64_MAX, so this loop
  // visits every nonzero 64-bit value before it stops.
  for(uint64_t x = 1; x !=0; x++) {
    // x is 64 bits wide: it is printed with %llx and a matching cast, since
    // %x expects an unsigned int.
    if(mod23(x) != fastmod23(x)) printf("%llx\n",(unsigned long long)x);
  }
  return 0;
}
/*
 * Times the scalar and vector shift routines on one heap array of N 32-bit
 * values, using a fixed shift amount of 3.
 *
 * randominit(array, N) is called once up front and is also handed to every
 * BEST_TIME invocation as its setup expression. If the array cannot be
 * allocated, a message is printed and nothing is timed.
 */
void array_shifting() {
  printf("[array shifting]\n");
  int repeat = 5;
  size_t N = 1000;
  uint32_t *array = malloc(N * sizeof(uint32_t));
  if (array == NULL) {
    // randominit and the shift routines would otherwise be handed a null
    // pointer.
    fprintf(stderr, "array_shifting: memory allocation failed\n");
    return;
  }
  randominit(array, N);
  const int shiftamount = 3;
  BEST_TIME(scalarshift(array, N, shiftamount), 0, randominit(array, N), repeat,
      N, true);
  BEST_TIME(vectorshift(array, N, shiftamount), 0, randominit(array, N), repeat,
      N, true);
  BEST_TIME(vectorshift_unrolled(array, N, shiftamount), 0, randominit(array, N), repeat,
      N, true);

  BEST_TIME(varvectorshift(array, N, shiftamount), 0, randominit(array, N), repeat,
      N, true);
  BEST_TIME(varvectorshift_unrolled(array, N, shiftamount), 0, randominit(array, N), repeat,
      N, true);

  BEST_TIME(variablevectorshift(array, N, shiftamount), 0, randominit(array, N), repeat,
      N, true);
  BEST_TIME(variablevectorshift_unrolled(array, N, shiftamount), 0, randominit(array, N), repeat,
      N, true);
  free(array);
}
/*
 * Times modsum and fastsum (and, when built with AVX2, vectorsum and
 * maskedvectorsum) on an array of N values from rand() and 500 access values
 * from rand().
 *
 * Each routine is called once beforehand and its own result is used as the
 * expected value for its timing run. The AVX2 routines are called only when N
 * is a multiple of 4 (vectorsum) or 8 (maskedvectorsum).
 *
 * If an allocation fails, a message is printed and nothing is timed.
 */
void demo(uint32_t N) {
  // N is unsigned, so it is printed with %u; %d would show values above
  // INT_MAX as negative.
  printf("N = %u\n", (unsigned)N);
  uint32_t * z = malloc(N * sizeof(uint32_t));
  uint32_t nmbr = 500;
  uint32_t * accesses = malloc(nmbr * sizeof(uint32_t));
  // malloc(0) is allowed to return NULL, so a null z only counts as a failure
  // when N > 0.
  if((z == NULL && N > 0) || accesses == NULL) {
    fprintf(stderr, "demo: memory allocation failed\n");
    free(z);
    free(accesses);
    return;
  }
  for(uint32_t i = 0 ; i < N; ++i) z[i] = rand(); // some rand. number
  for(uint32_t i = 0 ; i < nmbr; ++i) accesses[i] = rand(); // some rand. number
  uint32_t expected1 = modsum(z,N,accesses,nmbr);
  uint32_t expected2 = fastsum(z,N,accesses,nmbr);
  BEST_TIME(modsum(z,N,accesses,nmbr), expected1, 1000, nmbr);
  BEST_TIME(fastsum(z,N,accesses,nmbr), expected2, 1000, nmbr);
#ifdef __AVX2__
  // The reference calls sit under the same N conditions as the timed calls,
  // so neither routine is invoked for an N its timing run is skipped for.
  // The braces also keep the macro expansion inside the if.
  if(N % 4 == 0) {
    uint32_t expected3 = vectorsum(z,N,accesses,nmbr);
    BEST_TIME(vectorsum(z,N,accesses,nmbr), expected3, 1000, nmbr);
  }
  if(N % 8 == 0) {
    uint32_t expected4 = maskedvectorsum(z,N,accesses,nmbr);
    BEST_TIME(maskedvectorsum(z,N,accesses,nmbr), expected4, 1000, nmbr);
  }
#endif
  free(z);
  free(accesses);
}
/*
 * Times is_ascii and the validate_utf8 family on a heap buffer of N bytes.
 *
 * populate(data, N) is passed to every BEST_TIME invocation as its setup
 * expression, and `true` is the expected result for every routine.
 *
 * If the buffer cannot be allocated, a message is printed and nothing is
 * timed.
 */
void demo(size_t N) {
  printf("string size = %zu \n", N);
  char *data = (char *)malloc(N);
  // malloc(0) is allowed to return NULL, so a null result only counts as a
  // failure when N > 0.
  if (data == NULL && N > 0) {
    fprintf(stderr, "demo: memory allocation failed\n");
    return;
  }
  bool expected = true; // it is all ascii?
  int repeat = 5;
  printf("We are feeding ascii so it is always going to be ok.\n");
  BEST_TIME(is_ascii(data, N), expected,populate(data,N) , repeat, N, true);

  BEST_TIME(validate_utf8(data, N), expected,populate(data,N) , repeat, N, true);
  BEST_TIME(validate_utf8_branchless(data, N), expected,populate(data,N) , repeat, N, true);
  BEST_TIME(validate_utf8_double(data, N), expected,populate(data,N) , repeat, N, true);

  BEST_TIME(shiftless_validate_utf8(data, N), expected,populate(data,N) , repeat, N, true);
  BEST_TIME(shiftless_validate_utf8_branchless(data, N), expected,populate(data,N) , repeat, N, true);
  BEST_TIME(shiftless_validate_utf8_double(data, N), expected,populate(data,N) , repeat, N, true);


  BEST_TIME(validate_utf8_sse_nocheating(data, N), expected,populate(data,N) , repeat, N, true);
  BEST_TIME(validate_utf8_sse(data, N), expected,populate(data,N) , repeat, N, true);
  free(data);
}
int main() {
    int repeat = 500;
    int size = TESTSIZE;
    tellmeall();
    printf("array container benchmarks\n");
    array_container_t* B = array_container_create();
    BEST_TIME(add_test(B), 0, repeat, size);
    int answer = contains_test(B);
    size = 1 << 16;
    BEST_TIME(contains_test(B), answer, repeat, size);

    size = (1 << 16) / 3;
    BEST_TIME(remove_test(B), 0, repeat, size);
    array_container_free(B);

    for (int howmany = 32; howmany <= (1 << 16); howmany *= 8) {
        array_container_t* Bt = array_container_create();
        for (int j = 0; j < howmany; ++j) {
            array_container_add(Bt, (uint16_t)pcg32_random());
        }
        size_t nbrtestvalues = 1024;
        uint16_t* testvalues = malloc(nbrtestvalues * sizeof(uint16_t));
        printf("\n number of values in container = %d\n", Bt->cardinality);
        int card = array_container_cardinality(Bt);
        uint32_t* out = malloc(sizeof(uint32_t) * (unsigned long)card);
        BEST_TIME(array_container_to_uint32_array(out, Bt, 1234), card, repeat,
                  card);
        free(out);
        BEST_TIME_PRE_ARRAY(Bt, array_container_contains, array_cache_prefetch,
                            testvalues, nbrtestvalues);
        BEST_TIME_PRE_ARRAY(Bt, array_container_contains, array_cache_flush,
                            testvalues, nbrtestvalues);
        free(testvalues);
        array_container_free(Bt);
    }
    printf("\n");

    array_container_t* B1 = array_container_create();
    for (int x = 0; x < 1 << 16; x += 3) {
        array_container_add(B1, (uint16_t)x);
    }
    array_container_t* B2 = array_container_create();
    for (int x = 0; x < 1 << 16; x += 5) {
        array_container_add(B2, (uint16_t)x);
    }
    int32_t inputsize = B1->cardinality + B2->cardinality;
    array_container_t* BO = array_container_create();
    printf("\nUnion and intersections...\n");
    printf("\nNote:\n");
    printf(
        "union times are expressed in cycles per number of input elements "
        "(both arrays)\n");
    printf(
        "intersection times are expressed in cycles per number of output "
        "elements\n\n");
    printf("==intersection and union test 1 \n");
    printf("input 1 cardinality = %d, input 2 cardinality = %d \n",
           B1->cardinality, B2->cardinality);
    answer = union_test(B1, B2, BO);
    printf("union cardinality = %d \n", answer);
    printf("B1 card = %d B2 card = %d \n", B1->cardinality, B2->cardinality);
    BEST_TIME(union_test(B1, B2, BO), answer, repeat, inputsize);
    answer = intersection_test(B1, B2, BO);
    printf("intersection cardinality = %d \n", answer);
    BEST_TIME(intersection_test(B1, B2, BO), answer, repeat, answer);
    printf("==intersection and union test 2 \n");
    array_container_clear(B1);
    array_container_clear(B2);
    for (int x = 0; x < 1 << 16; x += 16) {
        array_container_add(B1, (uint16_t)x);
    }
    for (int x = 1; x < 1 << 16; x += x) {
        array_container_add(B2, (uint16_t)x);
    }
    printf("input 1 cardinality = %d, input 2 cardinality = %d \n",
           B1->cardinality, B2->cardinality);
    answer = union_test(B1, B2, BO);
    printf("union cardinality = %d \n", answer);
    printf("B1 card = %d B2 card = %d \n", B1->cardinality, B2->cardinality);
    BEST_TIME(union_test(B1, B2, BO), answer, repeat, inputsize);
    answer = intersection_test(B1, B2, BO);
    printf("intersection cardinality = %d \n", answer);
    BEST_TIME(intersection_test(B1, B2, BO), answer, repeat, answer);

    array_container_free(B1);
    array_container_free(B2);
    array_container_free(BO);
    return 0;
}
/*
 * Benchmark driver for bitset containers.
 *
 * Runs, in order:
 *   1. set / get / cardinality / unset timings on one container;
 *   2. for target cardinalities 4096, 8192, ..., 65536, a to-uint32-array
 *      timing and two BEST_TIME_PRE_ARRAY lookup timings over 1024 probe
 *      values;
 *   3. OR and AND timings (with and without cardinality) over a pair of
 *      containers holding multiples of 3 and multiples of 5;
 *   4. a timing of get_cardinality_through_conversion_to_array.
 *
 * Returns 0 on success, EXIT_FAILURE if a scratch buffer cannot be allocated.
 */
int main() {
    int repeat = 500;
    int size = (1 << 16) / 3;
    tellmeall();
    printf("bitset container benchmarks\n");
    bitset_container_t* B = bitset_container_create();
    BEST_TIME(set_test(B), 0, repeat, size);
    int answer = get_test(B);
    size = 1 << 16;
    BEST_TIME(get_test(B), answer, repeat, size);
    BEST_TIME(bitset_container_cardinality(B), answer, repeat, 1);
    BEST_TIME(bitset_container_compute_cardinality(B), answer, repeat,
              BITSET_CONTAINER_SIZE_IN_WORDS);

    size = (1 << 16) / 3;
    BEST_TIME(unset_test(B), 0, repeat, size);
    bitset_container_free(B);

    for (int howmany = 4096; howmany <= (1 << 16); howmany *= 2) {
        bitset_container_t* Bt = bitset_container_create();
        // Keep inserting random 16-bit values until the container reports
        // the target cardinality.
        while (bitset_container_cardinality(Bt) < howmany) {
            bitset_container_set(Bt, (uint16_t)pcg32_random());
        }
        size_t nbrtestvalues = 1024;
        uint16_t* testvalues = malloc(nbrtestvalues * sizeof(uint16_t));
        printf("\n number of values in container = %d\n",
               bitset_container_cardinality(Bt));
        int card = bitset_container_cardinality(Bt);
        // The output buffer is allocated with 32 bytes beyond card elements.
        uint32_t* out = malloc(sizeof(uint32_t) * (unsigned)card + 32);
        if (testvalues == NULL || out == NULL) {
            fprintf(stderr, "memory allocation failed\n");
            free(out);
            free(testvalues);
            bitset_container_free(Bt);
            return EXIT_FAILURE;
        }
        // Give the probe array defined contents; otherwise the macro below
        // would be handed uninitialized heap memory.
        for (size_t j = 0; j < nbrtestvalues; ++j) {
            testvalues[j] = (uint16_t)pcg32_random();
        }
        BEST_TIME(bitset_container_to_uint32_array(out, Bt, 1234), card, repeat,
                  card);
        free(out);
        BEST_TIME_PRE_ARRAY(Bt, bitset_container_get, bitset_cache_prefetch,
                            testvalues, nbrtestvalues);
        BEST_TIME_PRE_ARRAY(Bt, bitset_container_get, bitset_cache_flush,
                            testvalues, nbrtestvalues);
        free(testvalues);
        bitset_container_free(Bt);
    }
    printf("\n");

    bitset_container_t* B1 = bitset_container_create();
    for (int x = 0; x < 1 << 16; x += 3) {
        bitset_container_set(B1, (uint16_t)x);
    }
    bitset_container_t* B2 = bitset_container_create();
    for (int x = 0; x < 1 << 16; x += 5) {
        bitset_container_set(B2, (uint16_t)x);
    }
    bitset_container_t* BO = bitset_container_create();
    // The _nocard variants are timed with -1 as their expected value; the
    // cardinality used for the following checks is then computed from BO.
    BEST_TIME(bitset_container_or_nocard(B1, B2, BO), -1, repeat,
              BITSET_CONTAINER_SIZE_IN_WORDS);
    answer = bitset_container_compute_cardinality(BO);
    BEST_TIME(bitset_container_or(B1, B2, BO), answer, repeat,
              BITSET_CONTAINER_SIZE_IN_WORDS);
    BEST_TIME(bitset_container_cardinality(BO), answer, repeat, 1);
    BEST_TIME(bitset_container_compute_cardinality(BO), answer, repeat,
              BITSET_CONTAINER_SIZE_IN_WORDS);
    BEST_TIME(bitset_container_and_nocard(B1, B2, BO), -1, repeat,
              BITSET_CONTAINER_SIZE_IN_WORDS);
    answer = bitset_container_compute_cardinality(BO);
    BEST_TIME(bitset_container_and(B1, B2, BO), answer, repeat,
              BITSET_CONTAINER_SIZE_IN_WORDS);
    BEST_TIME(bitset_container_cardinality(BO), answer, repeat, 1);
    BEST_TIME(bitset_container_compute_cardinality(BO), answer, repeat,
              BITSET_CONTAINER_SIZE_IN_WORDS);

    // next we are going to benchmark conversion from bitset to array (an
    // important step)
    bitset_container_clear(B1);
    for (int k = 0; k < 4096; ++k) {
        bitset_container_set(B1, (uint16_t)ranged_random(1 << 16));
    }
    answer = get_cardinality_through_conversion_to_array(B1);
    BEST_TIME(get_cardinality_through_conversion_to_array(B1), answer, repeat,
              BITSET_CONTAINER_SIZE_IN_WORDS);

    bitset_container_free(BO);
    bitset_container_free(B1);
    bitset_container_free(B2);
    return 0;
}