int
main(int argc, char **argv) {
    const char * mode = NULL;

    SHF_ASSERT(NULL != setlocale(LC_NUMERIC, ""), "setlocale(): %u: ", errno);

    if (argc >= 2) {
        SHF_ASSERT((0 == memcmp(argv[1], SHF_CONST_STR_AND_SIZE("c2js")))
        ||         (0 == memcmp(argv[1], SHF_CONST_STR_AND_SIZE("c2c" )))
        ||         (0 == memcmp(argv[1], SHF_CONST_STR_AND_SIZE("4c"  ))), "ERROR: please supply an argument; c2js, c2py, c2c, or 4c; got: '%s'", argv[1]);
    }

         if (argc > 1 && 0 == memcmp(argv[1], SHF_CONST_STR_AND_SIZE("c2js"))) { plan_tests( 6); mode = strdup(argv[1]); }
    else if (argc > 1 && 0 == memcmp(argv[1], SHF_CONST_STR_AND_SIZE("c2py"))) { plan_tests( 5); mode = strdup(argv[1]); }
    else if (argc > 1 && 0 == memcmp(argv[1], SHF_CONST_STR_AND_SIZE("c2c" ))) { plan_tests( 9); mode = strdup(argv[1]); }
    else if (argc > 1 && 0 == memcmp(argv[1], SHF_CONST_STR_AND_SIZE("4c"  ))) { plan_tests( 7); mode = strdup(argv[1]); }
    else                                                                       { plan_tests(10); mode = "c2c"          ; } /* default if no arguments */

    pid_t pid = getpid();
    SHF_DEBUG("pid %u started; mode is '%s'\n", pid, mode);

    if (0 == memcmp(mode, SHF_CONST_STR_AND_SIZE("c2js"))) { /* just for fun, test C to C call speed; useful for comparing to V8 to C call speed */
        double test_start_time = shf_get_time_in_seconds();
        double test_iterations = 0;
        do {
            test_iterations += test_dummy();
        } while (test_iterations < 10000000);
        double test_elapsed_time = shf_get_time_in_seconds() - test_start_time;
        ok(1, "   c2*: called  expected number to dummy function  // estimate %'.0f calls per second", test_iterations / test_elapsed_time);
    }

    char  test_shf_name[256];
    char  test_shf_folder[] = "/dev/shm";
    SHF_SNPRINTF(1, test_shf_name, "test-%05u-ipc-queue", pid);

    SHF      * shf;
    uint32_t   uid;
    uint32_t   test_keys = 100000;

    if (0 == memcmp(mode, SHF_CONST_STR_AND_SIZE("4c"))) {
        SHF_ASSERT(argc == 3, "ERROR: please supply arguments; 4c <name of shf>");

              shf_debug_verbosity_less();
              shf_init                ();
        shf = shf_attach_existing     (test_shf_folder, argv[2]); ok(NULL != shf, "    4c: shf_attach_existing() works for existing file as expected");
              shf_log_attach_existing (shf                     );

        shf_debug_verbosity_more(); SHF_DEBUG("'4c' mode; behaving as client\n"); shf_debug_verbosity_less();

        char     * test_q_items_addr  = shf_q_get(shf); SHF_UNUSE(test_q_items_addr); /* todo: this test doesn't actually manipulate the item itself */
        uint32_t   test_qid_free      = shf_q_get_name(shf, SHF_CONST_STR_AND_SIZE("qid-free"));
        uint32_t   test_qid_a2b       = shf_q_get_name(shf, SHF_CONST_STR_AND_SIZE("qid-a2b" ));
        uint32_t   test_qid_b2a       = shf_q_get_name(shf, SHF_CONST_STR_AND_SIZE("qid-b2a" ));
        ok(        test_qid_free     != SHF_QID_NONE, "    4c: shf_q_get_name('qid-free') returned qid as expected");
        ok(        test_qid_a2b      != SHF_QID_NONE, "    4c: shf_q_get_name('qid-a2b' ) returned qid as expected");
        ok(        test_qid_b2a      != SHF_QID_NONE, "    4c: shf_q_get_name('qid-b2a' ) returned qid as expected");

        shf_race_start(shf, SHF_CONST_STR_AND_SIZE("test-q-race-line"), 2);
        shf_debug_verbosity_more(); SHF_DEBUG("testing process b IPC queue a2b --> b2a speed\n"); shf_debug_verbosity_less();
        {
            uint32_t test_pull_items = 0;
            double   test_start_time = shf_get_time_in_seconds();
            shf_qiid = SHF_QIID_NONE;
            while(1) {
                while(SHF_QIID_NONE != shf_q_push_head_pull_tail(shf, test_qid_b2a, shf_qiid, test_qid_a2b)) {
                                       test_pull_items ++;
                }
                if (test_pull_items >= 1000000) { goto FINISH_LINE_4C; }
            }
FINISH_LINE_4C:;
            double test_elapsed_time = shf_get_time_in_seconds() - test_start_time;
            ok(1, "    4c: moved   expected number of new queue items // estimate %'.0f q items per second with contention", test_pull_items / test_elapsed_time);
        }

        {
            shf_debug_verbosity_more(); SHF_DEBUG("testing process b IPC lock speed\n"); shf_debug_verbosity_less();

                                    SHF_MAKE_HASH       ("lock");
            SHF_LOCK * lock  =      shf_get_key_val_addr(shf   );
            ok(        lock != NULL                             , "    4c: got lock value address as expected");

            shf_race_start(shf, SHF_CONST_STR_AND_SIZE("test-lock-race-line"), 2);
            double test_start_time = shf_get_time_in_seconds();
            double test_lock_iterations = 0;
            do {
                SHF_LOCK_WRITER(lock);
                SHF_UNLOCK_WRITER(lock);
                test_lock_iterations ++;
            } while (test_lock_iterations < 2000000);
            double test_elapsed_time = shf_get_time_in_seconds() - test_start_time;
            ok(1, "    4c: rw lock expected number of times           // estimate %'.0f locks per second; with contention", test_lock_iterations / test_elapsed_time);
        }

        shf_debug_verbosity_more(); SHF_DEBUG("ending child\n"); shf_debug_verbosity_less();

        shf_detach(shf);

        goto EARLY_OUT;
    } /* 4c */

          shf_debug_verbosity_less();
          shf_init                ();
          shf_set_data_need_factor(1);
    shf = shf_attach              (test_shf_folder, test_shf_name, 1 /* delete upon process exit */); ok(NULL != shf, "   c2*: shf_attach()          works for non-existing file as expected");
          shf_log_thread_new      (shf, 0 /* use default log buffer size */, STDOUT_FILENO);

    {
        shf_race_init(shf, SHF_CONST_STR_AND_SIZE("test-q-race-line"   ));
        shf_race_init(shf, SHF_CONST_STR_AND_SIZE("test-lock-race-line"));

        if (0 == memcmp(mode, SHF_CONST_STR_AND_SIZE("c2c"))) {
                      SHF_MAKE_HASH  (                     "lock");
               uid =  shf_put_key_val(shf, NULL, sizeof(SHF_LOCK));
            ok(uid != SHF_UID_NONE                                , "   c2*: put lock in value as expected");
        }
    }

    uint32_t test_qs          = 3;
    uint32_t test_q_items     = 100000;
    uint32_t test_q_item_size = 4096;
    ok(      NULL            != shf_q_new     (shf, test_qs, test_q_items, test_q_item_size, 1000), "   c2*: shf_q_new() returned as expected");
    uint32_t test_qid_free    = shf_q_new_name(shf, SHF_CONST_STR_AND_SIZE("qid-free")           );
    uint32_t test_qid_a2b     = shf_q_new_name(shf, SHF_CONST_STR_AND_SIZE("qid-a2b" )           );
    uint32_t test_qid_b2a     = shf_q_new_name(shf, SHF_CONST_STR_AND_SIZE("qid-b2a" )           );

    {
        double   test_start_time = shf_get_time_in_seconds();
        uint32_t test_pull_items = 0;
        while(SHF_QIID_NONE != shf_q_pull_tail(shf, test_qid_free          )) {
                               shf_q_push_head(shf, test_qid_a2b , shf_qiid);
                               test_pull_items ++;
        }
        double test_elapsed_time = shf_get_time_in_seconds() - test_start_time;
        ok(test_q_items == test_pull_items, "   c2*: moved   expected number of new queue items // estimate %'.0f q items per second without contention", test_keys / test_elapsed_time);
    }

    pid_t child_pid = 0;
    if      (0 == memcmp(mode, SHF_CONST_STR_AND_SIZE("c2js"))) { child_pid = shf_exec_child(shf_backticks("which node || which nodejs"), shf_backticks("which node || which nodejs"), "TestIpcQueue.js", test_shf_name); }
    else if (0 == memcmp(mode, SHF_CONST_STR_AND_SIZE("c2py"))) { child_pid = shf_exec_child(shf_backticks("which python"              ), "python"                                   , "TestIpcQueue.py", test_shf_name); }
    else if (0 == memcmp(mode, SHF_CONST_STR_AND_SIZE("c2c" ))) { child_pid = shf_exec_child(shf_backticks("which test.q.shf.t"        ), "test.q.shf.t"                             , "4c"             , test_shf_name); }
    else                                                        { SHF_ASSERT(0, "ERROR: should never get here!"); }

    shf_race_start(shf, SHF_CONST_STR_AND_SIZE("test-q-race-line"), 2);
    shf_debug_verbosity_more(); SHF_DEBUG("testing process a IPC queue b2a --> a2b speed\n"); shf_debug_verbosity_less();
    {
        double   test_start_time = shf_get_time_in_seconds();
        uint32_t test_pull_items = 0;
        shf_qiid = SHF_QIID_NONE;
        while (1) {
            while(SHF_QIID_NONE != shf_q_push_head_pull_tail(shf, test_qid_a2b, shf_qiid, test_qid_b2a)) {
                                   test_pull_items ++;
                if (1000000     == test_pull_items) { shf_q_push_head_pull_tail(shf, test_qid_a2b, shf_qiid, test_qid_b2a); goto FINISH_LINE_C2; }
            }
            if ((0 == memcmp(mode, SHF_CONST_STR_AND_SIZE("c2js")))
            ||  (0 == memcmp(mode, SHF_CONST_STR_AND_SIZE("c2py")))) { /* the rw spin locks are fair but don't create unnecessary contention for javascript or python client */
                usleep(1000); /* 1/1000th of a second */
            }
        }
FINISH_LINE_C2:;
        double test_elapsed_time = shf_get_time_in_seconds() - test_start_time;
        usleep(3000); /* hack: wait 3/1000th of a second so that the oks do not conflict */
        ok(1, "   c2*: moved   expected number of new queue items // estimate %'.0f q items per second with contention", test_pull_items / test_elapsed_time);
    }

    if (0 == memcmp(mode, SHF_CONST_STR_AND_SIZE("c2c"))) {
        shf_debug_verbosity_more(); SHF_DEBUG("testing process a IPC lock speed\n"); shf_debug_verbosity_less();

                                SHF_MAKE_HASH       ("lock");
        SHF_LOCK * lock  =      shf_get_key_val_addr(shf   );
        ok(        lock != NULL                             , "   c2*: got lock value address as expected");

        shf_race_start(shf, SHF_CONST_STR_AND_SIZE("test-lock-race-line"), 2);
        double test_start_time = shf_get_time_in_seconds();
        double test_lock_iterations = 0;
        do {
            SHF_LOCK_WRITER(lock);
            SHF_UNLOCK_WRITER(lock);
            test_lock_iterations ++;
        } while (test_lock_iterations < 2000000);
        double test_elapsed_time = shf_get_time_in_seconds() - test_start_time;
        ok(1, "   c2*: rw lock expected number of times           // estimate %'.0f locks per second; with contention", test_lock_iterations / test_elapsed_time);

        SHF_LOCK lock_without_contention;
        memset(&lock_without_contention, 0, sizeof(lock_without_contention));
        test_start_time = shf_get_time_in_seconds();
        test_lock_iterations = 0;
        do {
            SHF_LOCK_WRITER(&lock_without_contention);
            SHF_UNLOCK_WRITER(&lock_without_contention);
            test_lock_iterations ++;
        } while (test_lock_iterations < 2000000);
        test_elapsed_time = shf_get_time_in_seconds() - test_start_time;
        ok(1, "   c2*: rw lock expected number of times           // estimate %'.0f locks per second; without contention", test_lock_iterations / test_elapsed_time);

        test_start_time = shf_get_time_in_seconds();
        test_lock_iterations = 0;
        do {
            test_lock_iterations ++;
        } while (test_lock_iterations < 2000000);
        test_elapsed_time = shf_get_time_in_seconds() - test_start_time;
        ok(1, "   c2*: rw lock expected number of times           // estimate %'.0f locks per second; without lock, just loop", test_lock_iterations / test_elapsed_time);
    }

    shf_debug_verbosity_more();

    int status;
    waitpid(child_pid, &status, 0);

    ok(1, "   c2*: test still alive");

    SHF_DEBUG("ending parent\n");

    SHF_PLAIN("test: shf size before deletion: %s\n", shf_del(shf));

EARLY_OUT:;

    return exit_status();
} /* main() */
int main(void)
{
    plan_tests(1);

    if (getenv("SHF_PERFORMANCE_TEST_ENABLE") && atoi(getenv("SHF_PERFORMANCE_TEST_ENABLE"))) {
    }
    else {
        fprintf(stderr, "NOTE: prefix make with SHF_PERFORMANCE_TEST_ENABLE=1 ?\n");
        goto EARLY_EXIT;
    }

    uint32_t cpu_count_desired = getenv("SHF_PERFORMANCE_TEST_CPUS") ? atoi(getenv("SHF_PERFORMANCE_TEST_CPUS")) : 0;
    uint32_t test_keys_desired = getenv("SHF_PERFORMANCE_TEST_KEYS") ? atoi(getenv("SHF_PERFORMANCE_TEST_KEYS")) : 0;

    TEST_INIT();

#define TEST_MAX_PROCESSES (16)

#ifdef TEST_LMDB
             uint32_t   test_keys_default = 100 * 1000000; /* assume enough RAM is available for LMDB */
#else
             uint64_t   vfs_available_md  = shf_get_vfs_available(shf) / 1024 / 1024;
             uint32_t   test_keys_10m     = vfs_available_md / 436 * 10; /* 10M keys is about 436MB */
             uint32_t   test_keys_default = test_keys_10m > 100 ? 100 * 1000000 : test_keys_10m * 1000000; SHF_ASSERT(test_keys_default > 0, "ERROR: only %luMB available on /dev/shm but 10M keys takes at least 436MB for SharedHashFile", vfs_available_md);
#endif
             uint32_t   test_keys         = test_keys_desired ? test_keys_desired : test_keys_default;
             uint32_t   cpu_count         = cpu_count_desired ? cpu_count_desired : test_get_cpu_count();
             uint32_t   process;
             uint32_t   processes = cpu_count > TEST_MAX_PROCESSES ? TEST_MAX_PROCESSES : cpu_count;
             uint32_t   counts_old[TEST_MAX_PROCESSES] = { 0 };
    volatile uint32_t * put_counts = mmap(NULL, SHF_MOD_PAGE(TEST_MAX_PROCESSES*sizeof(uint32_t)), PROT_READ|PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED | MAP_NORESERVE, -1, 0); SHF_ASSERT(MAP_FAILED != put_counts, "mmap(): %u: ", errno);
    volatile uint32_t * get_counts = mmap(NULL, SHF_MOD_PAGE(TEST_MAX_PROCESSES*sizeof(uint32_t)), PROT_READ|PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED | MAP_NORESERVE, -1, 0); SHF_ASSERT(MAP_FAILED != get_counts, "mmap(): %u: ", errno);
    volatile uint32_t * mix_counts = mmap(NULL, SHF_MOD_PAGE(TEST_MAX_PROCESSES*sizeof(uint32_t)), PROT_READ|PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED | MAP_NORESERVE, -1, 0); SHF_ASSERT(MAP_FAILED != mix_counts, "mmap(): %u: ", errno);
    volatile uint64_t * start_line = mmap(NULL, SHF_MOD_PAGE(                 3*sizeof(uint64_t)), PROT_READ|PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED | MAP_NORESERVE, -1, 0); SHF_ASSERT(MAP_FAILED != mix_counts, "mmap(): %u: ", errno);
    SHF_ASSERT(sizeof(uint64_t) == sizeof(long), "INTERNAL: expected sizeof(uint64_t) == sizeof(long), but got %lu == %lu", sizeof(uint64_t), sizeof(long));
    start_line[0] = 0;
    start_line[1] = 0;
    start_line[2] = 0;
    for (process = 0; process < processes; process++) {
        pid_t fork_pid = fork();
        if (fork_pid == 0) {     /*child*/
            SHF_DEBUG("test process #%u with pid %5u\n", process, getpid());
            {
                long previous_long_value;
                SHF_UNUSE(previous_long_value);

                previous_long_value = InterlockedExchangeAdd((long volatile *) &start_line[0], 1);
                while (processes != start_line[0]) { SHF_YIELD(); }
                TEST_INIT_CHILD();
                for (uint32_t i = 0; i < (1 + (test_keys / processes)); i++) {
                    uint32_t key = test_keys / processes * process + i;
                    put_counts[process] ++;
                    TEST_PUT();
                }
                TEST_PUT_POST();
                TEST_MIX_PRE();
                usleep(2000000); /* 2 seconds */
                previous_long_value = InterlockedExchangeAdd((long volatile *) &start_line[1], 1);
                while (processes != start_line[1]) { SHF_YIELD(); }
                for (uint32_t i = 0; i < (1 + (test_keys / processes)); i++) {
                    uint32_t key = test_keys / processes * process + i;
                    TEST_MIX();
                }
                TEST_MIX_POST();
                TEST_GET_PRE();
                usleep(2000000); /* 2 seconds */
                previous_long_value = InterlockedExchangeAdd((long volatile *) &start_line[2], 1);
                while (processes != start_line[2]) { SHF_YIELD(); }
                for (uint32_t i = 0; i < (1 + (test_keys / processes)); i++) {
                    uint32_t key = test_keys / processes * process + i;
                    TEST_GET();
                }
                TEST_GET_POST();
                TEST_FINI();
                exit(0);
            }
            break;
        }
        else if (fork_pid > 0) { /*parent*/
            /* loop again */
        }
    }

    /* parent monitors & reports on forked children */
    uint32_t seconds            = 0;
    uint32_t key_total;
    uint32_t key_total_old      = 0;
    uint64_t tabs_mmaps_old     = 0;
    uint64_t tabs_mremaps_old   = 0;
    uint64_t tabs_shrunk_old    = 0;
    uint64_t tabs_parted_old    = 0;
    uint32_t message            = 0;
    const char * message_text = "PUT";
#ifdef SHF_DEBUG_VERSION
    uint64_t lock_conflicts_old = 0;
#endif
    char graph_100[] = "----------------------------------------------------------------------------------------------------";
    fprintf(stderr, "perf testing: " TEST_WHAT "\n");
    fprintf(stderr, "running tests on: via command: '%s'\n",               "cat /proc/cpuinfo | egrep 'model name' | head -n 1" );
    fprintf(stderr, "running tests on: `%s`\n"             , shf_backticks("cat /proc/cpuinfo | egrep 'model name' | head -n 1"));
    do {
        if (0 == seconds % 50) {
#ifdef SHF_DEBUG_VERSION
            fprintf(stderr, "-LOCKC ");
#endif
            fprintf(stderr, "-OP MMAP REMAP SHRK PART TOTAL ------PERCENT OPERATIONS PER PROCESS PER SECOND -OPS\n");
#ifdef SHF_DEBUG_VERSION
            fprintf(stderr, "------ ");
#endif
            fprintf(stderr, "--- -k/s --k/s --/s --/s M-OPS 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 -M/s\n");
        }
        seconds ++;
        // todo: add % system CPU time to per second summary line; why does put require so much system?
#ifdef SHF_DEBUG_VERSION
        {
            uint64_t lock_conflicts = 0;
            for (uint32_t win = 0; win < SHF_WINS_PER_SHF; win++) {
                lock_conflicts += shf->shf_mmap->wins[win].lock.conflicts;
            }
            fprintf(stderr, "%6lu ", lock_conflicts - lock_conflicts_old);
            lock_conflicts_old = lock_conflicts;
        }
#endif
        fprintf(stderr, "%s", message_text);
        {
            uint64_t tabs_mmaps   = 0;
            uint64_t tabs_mremaps = 0;
            uint64_t tabs_shrunk  = 0;
            uint64_t tabs_parted  = 0;
#ifdef TEST_SHF
            for (uint32_t win = 0; win < SHF_WINS_PER_SHF; win++) {
                tabs_mmaps   += shf->shf_mmap->wins[win].tabs_mmaps  ;
                tabs_mremaps += shf->shf_mmap->wins[win].tabs_mremaps;
                tabs_shrunk  += shf->shf_mmap->wins[win].tabs_shrunk ;
                tabs_parted  += shf->shf_mmap->wins[win].tabs_parted ;
            }
#endif
            fprintf(stderr, "%5.1f %5.1f %4llu %4llu",
                (tabs_mmaps   - tabs_mmaps_old  ) / 1000.0,
                (tabs_mremaps - tabs_mremaps_old) / 1000.0,
                (tabs_shrunk  - tabs_shrunk_old )         ,
                (tabs_parted  - tabs_parted_old )         );
            tabs_mmaps_old   = tabs_mmaps  ;
            tabs_mremaps_old = tabs_mremaps;
            tabs_shrunk_old  = tabs_shrunk ;
            tabs_parted_old  = tabs_parted ;
        }
        {
            key_total = 0;
            uint32_t key_total_this_second = 0;
            for (process = 0; process < TEST_MAX_PROCESSES; process++) {
                key_total             += put_counts[process] + get_counts[process] + mix_counts[process];
                key_total_this_second += put_counts[process] + get_counts[process] + mix_counts[process] - counts_old[process];
            }
            fprintf(stderr, " %5.1f", key_total / 1000.0 / 1000.0);
            for (process = 0; process < TEST_MAX_PROCESSES; process++) {
                fprintf(stderr, "%3.0f", (put_counts[process] + get_counts[process] + mix_counts[process] - counts_old[process]) * 100.0 / (0 == key_total_this_second ? 1 : key_total_this_second));
                counts_old[process] = put_counts[process] + get_counts[process] + mix_counts[process];
            }
            uint32_t key_total_per_second = key_total - key_total_old;
            fprintf(stderr, "%5.1f %s\n", key_total_per_second / 1000.0 / 1000.0, &graph_100[100 - (key_total_per_second / 750000)]);
            if      (0 == message && key_total >= (1 * test_keys)) { message ++; message_text = "MIX"; }
            else if (1 == message && key_total >= (2 * test_keys)) { message ++; message_text = "GET"; }
            key_total_old = key_total;
        }
        usleep(1000000); /* one second */
    } while (key_total < (3 * test_keys));
    fprintf(stderr, "* MIX is 2%% (%u) del/put, 98%% (%u) get\n", test_keys * 2 / 100, test_keys * 98 / 100);

    TEST_FINI_MASTER();

EARLY_EXIT:;

    ok(1, "test still alive");
    return exit_status();
} /* main() */
static void
shf_monitor_delete_shf(void)
{
    char du_rm_folder[256];
    SHF_SNPRINTF(1, du_rm_folder, "du -h -d 0 %s ; rm -rf %s/", shf_monitor_path_name, shf_monitor_path_name);
    fprintf(stderr, "shf.monitor: detected pid %u gone; deleted shf; shf size before deletion: %s\n", shf_monitor_parent_pid, shf_backticks(du_rm_folder));
} /* shf_monitor_delete_shf() */