int main(int argc, char *argv[])
{
  int i, j;
  uint64_t *in, *out;
  struct timespec start, stop;
  int fluffed;

  if(posix_memalign((void **)&in,  CACHE_ALIGNMENT,   N_BYTES_PER_BLOCK)
  || posix_memalign((void **)&out, CACHE_ALIGNMENT, 2*N_BYTES_PER_BLOCK)) {
    printf("cannot allocate memory\n");
    return 1;
  }

  //printf("in  = %p\n", in);
  //printf("out = %p\n", out);

#if 0
  in[0] = htobe64(0x0123456789abcdef);
  in[1] = htobe64(0xfedcba9876543210);

  uint8_t * cin = (uint8_t *)in;
  for(i=0; i<2; i++) {
    for(j=0; j<8; j++) {
      printf("%02x ", cin[8*i+j]);
    }
    printf("\n");
  }
  printf("\n");

  paper_fluff(in, out);

  uint8_t * cout = (uint8_t *)out;
  for(i=0; i<4; i++) {
    for(j=0; j<8; j++) {
      printf("%02x ", cout[8*i+j]);
    }
    printf("\n");
  }

  return 0;
}
static void *run(hashpipe_thread_args_t * args)
{
    // Local aliases to shorten access to args fields
    // Our input buffer is a paper_input_databuf
    // Our output buffer is a paper_gpu_input_databuf
    paper_input_databuf_t *db_in = (paper_input_databuf_t *)args->ibuf;
    paper_gpu_input_databuf_t *db_out = (paper_gpu_input_databuf_t *)args->obuf;
    hashpipe_status_t st = args->st;
    const char * status_key = args->thread_desc->skey;

#ifdef DEBUG_SEMS
    fprintf(stderr, "s/tid %lu/                      FLUFf/\n", pthread_self());
#endif

    // Init status variables
    hashpipe_status_lock_safe(&st);
    hputi8(st.buf, "FLUFMCNT", 0);
    hashpipe_status_unlock_safe(&st);

    /* Loop */
    int rv;
    int curblock_in=0;
    int curblock_out=0;
    float gbps, min_gbps;

    struct timespec start, finish;

    while (run_threads()) {

        // Note waiting status,
        // query integrating status
        // and, if armed, start count
        hashpipe_status_lock_safe(&st);
        hputs(st.buf, status_key, "waiting");
        hashpipe_status_unlock_safe(&st);

        // Wait for new input block to be filled
        while ((rv=paper_input_databuf_wait_filled(db_in, curblock_in)) != HASHPIPE_OK) {
            if (rv==HASHPIPE_TIMEOUT) {
                hashpipe_status_lock_safe(&st);
                hputs(st.buf, status_key, "blocked_in");
                hashpipe_status_unlock_safe(&st);
                continue;
            } else {
                hashpipe_error(__FUNCTION__, "error waiting for filled databuf");
                pthread_exit(NULL);
                break;
            }
        }

        // Wait for new gpu_input block (our output block) to be free
        while ((rv=paper_gpu_input_databuf_wait_free(db_out, curblock_out)) != HASHPIPE_OK) {
            if (rv==HASHPIPE_TIMEOUT) {
                hashpipe_status_lock_safe(&st);
                hputs(st.buf, status_key, "blocked gpu input");
                hashpipe_status_unlock_safe(&st);
                continue;
            } else {
                hashpipe_error(__FUNCTION__, "error waiting for free databuf");
                pthread_exit(NULL);
                break;
            }
        }

        // Got a new data block, update status
        hashpipe_status_lock_safe(&st);
        hputs(st.buf, status_key, "fluffing");
        hputi4(st.buf, "FLUFBKIN", curblock_in);
        hputu8(st.buf, "FLUFMCNT", db_in->block[curblock_in].header.mcnt);
        hashpipe_status_unlock_safe(&st);

        // Copy header and call fluff function
        clock_gettime(CLOCK_MONOTONIC, &start);

        memcpy(&db_out->block[curblock_out].header, &db_in->block[curblock_in].header, sizeof(paper_input_header_t));

        paper_fluff(db_in->block[curblock_in].data, db_out->block[curblock_out].data);

        clock_gettime(CLOCK_MONOTONIC, &finish);

        // Note processing time
        hashpipe_status_lock_safe(&st);
        // Bits per fluff / ns per fluff = Gbps
        hgetr4(st.buf, "FLUFMING", &min_gbps);
        gbps = (float)(8*N_BYTES_PER_BLOCK)/ELAPSED_NS(start,finish);
        hputr4(st.buf, "FLUFGBPS", gbps);
        if(min_gbps == 0 || gbps < min_gbps) {
          hputr4(st.buf, "FLUFMING", gbps);
        }
        hashpipe_status_unlock_safe(&st);

        // Mark input block as free and advance
        paper_input_databuf_set_free(db_in, curblock_in);
        curblock_in = (curblock_in + 1) % db_in->header.n_block;

        // Mark output block as full and advance
        paper_gpu_input_databuf_set_filled(db_out, curblock_out);
        curblock_out = (curblock_out + 1) % db_out->header.n_block;

        /* Check for cancel */
        pthread_testcancel();
    }

    // Thread success!
    return NULL;
}
  }

  return 0;
}
#else



  printf("N_CHAN_PER_PACKET=%u\n", N_CHAN_PER_PACKET);
  printf("N_TIME_PER_PACKET=%u\n", N_TIME_PER_PACKET);
  printf("N_WORDS_PER_PACKET=%lu\n", N_WORDS_PER_PACKET);
  printf("N_PACKETS_PER_BLOCK=%u\n", N_PACKETS_PER_BLOCK);
  printf("N_BYTES_PER_BLOCK=%u\n", N_BYTES_PER_BLOCK);

#ifdef DEBUG_FLUFF
  fluffed = paper_fluff(in, out);
#else
  for(j=0; j<4; j++) {
    clock_gettime(CLOCK_MONOTONIC, &start);
    for(i=0; i<TEST_ITERATIONS; i++) {
      fluffed = paper_fluff(in, out);
    }
    clock_gettime(CLOCK_MONOTONIC, &stop);
    printf("fluffed %d words in %.6f ms (%.3f us per packet, %.3f Gbps)\n",
        fluffed, ELAPSED_NS(start, stop)/1e6/TEST_ITERATIONS,
        ELAPSED_NS(start, stop)/1e3/TEST_ITERATIONS/N_PACKETS_PER_BLOCK,
        (float)(fluffed*TEST_ITERATIONS*8*sizeof(uint64_t))/ELAPSED_NS(start, stop));
  }
#endif // DEBUG_FLUFF_GEN

  return 0;
static void *run(void * _args)
{
    // Cast _args
    struct guppi_thread_args *args = (struct guppi_thread_args *)_args;

#ifdef DEBUG_SEMS
    fprintf(stderr, "s/tid %lu/                      FLUFf/\n", pthread_self());
#endif

    THREAD_RUN_BEGIN(args);

    THREAD_RUN_SET_AFFINITY_PRIORITY(args);

    /* Attach to status shared mem area */
    THREAD_RUN_ATTACH_STATUS(args->instance_id, st);

    /* Attach to paper_input_databuf */
    THREAD_RUN_ATTACH_DATABUF(args->instance_id,
        paper_input_databuf, db_in, args->input_buffer);

    /* Attach to paper_gpu_input_databuf */
    THREAD_RUN_ATTACH_DATABUF(args->instance_id,
        paper_gpu_input_databuf, db_out, args->output_buffer);

    // Init status variables
    guppi_status_lock_safe(&st);
    hputi8(st.buf, "FLUFMCNT", 0);
    guppi_status_unlock_safe(&st);

    /* Loop */
    int rv;
    int curblock_in=0;
    int curblock_out=0;

    struct timespec start, finish;

    while (run_threads) {

        // Note waiting status,
        // query integrating status
        // and, if armed, start count
        guppi_status_lock_safe(&st);
        hputs(st.buf, STATUS_KEY, "waiting");
        guppi_status_unlock_safe(&st);

        // Wait for new input block to be filled
        while ((rv=paper_input_databuf_wait_filled(db_in, curblock_in)) != GUPPI_OK) {
            if (rv==GUPPI_TIMEOUT) {
                guppi_status_lock_safe(&st);
                hputs(st.buf, STATUS_KEY, "blocked_in");
                guppi_status_unlock_safe(&st);
                continue;
            } else {
                guppi_error(__FUNCTION__, "error waiting for filled databuf");
                run_threads=0;
                pthread_exit(NULL);
                break;
            }
        }

        // Wait for new gpu_input block (our output block) to be free
        while ((rv=paper_gpu_input_databuf_wait_free(db_out, curblock_out)) != GUPPI_OK) {
            if (rv==GUPPI_TIMEOUT) {
                guppi_status_lock_safe(&st);
                hputs(st.buf, STATUS_KEY, "blocked gpu input");
                guppi_status_unlock_safe(&st);
                continue;
            } else {
                guppi_error(__FUNCTION__, "error waiting for free databuf");
                run_threads=0;
                pthread_exit(NULL);
                break;
            }
        }

        // Got a new data block, update status
        guppi_status_lock_safe(&st);
        hputs(st.buf, STATUS_KEY, "fluffing");
        hputi4(st.buf, "FLUFBKIN", curblock_in);
        hputu8(st.buf, "FLUFMCNT", db_in->block[curblock_in].header.mcnt);
        guppi_status_unlock_safe(&st);

        // Copy header and call fluff function
        clock_gettime(CLOCK_MONOTONIC, &start);

        memcpy(&db_out->block[curblock_out].header, &db_in->block[curblock_in].header, sizeof(paper_input_header_t));

        paper_fluff(db_in->block[curblock_in].data, db_out->block[curblock_out].data);

        clock_gettime(CLOCK_MONOTONIC, &finish);

        // Note processing time
        guppi_status_lock_safe(&st);
        // Bits per fluff / ns per fluff = Gbps
        hputr4(st.buf, "FLUFGBPS", (float)(8*N_BYTES_PER_BLOCK)/ELAPSED_NS(start,finish));
        guppi_status_unlock_safe(&st);

        // Mark input block as free and advance
        paper_input_databuf_set_free(db_in, curblock_in);
        curblock_in = (curblock_in + 1) % db_in->header.n_block;

        // Mark output block as full and advance
        paper_gpu_input_databuf_set_filled(db_out, curblock_out);
        curblock_out = (curblock_out + 1) % db_out->header.n_block;

        /* Check for cancel */
        pthread_testcancel();
    }
    run_threads=0;

    // Have to close all pushes
    THREAD_RUN_DETACH_DATAUF;
    THREAD_RUN_DETACH_DATAUF;
    THREAD_RUN_DETACH_STATUS;
    THREAD_RUN_END;

    // Thread success!
    return NULL;
}