int main(int argc, char *argv[]) {

    int rv;
    struct vegas_status s;

    rv = vegas_status_attach(&s);
    if (rv!=VEGAS_OK) {
        fprintf(stderr, "Error connecting to shared mem.\n");
        perror(NULL);
        exit(1);
    }

    vegas_status_lock(&s);

    /* Loop over cmd line to fill in params */
    static struct option long_opts[] = {
        {"key",    1, NULL, 'k'},
        {"get",    1, NULL, 'g'},
        {"string", 1, NULL, 's'},
        {"float",  1, NULL, 'f'},
        {"double", 1, NULL, 'd'},
        {"int",    1, NULL, 'i'},
        {"quiet",  0, NULL, 'q'},
        {"clear",  0, NULL, 'C'},
        {"del",    0, NULL, 'D'},
        {0,0,0,0}
    };
    int opt,opti;
    char *key=NULL;
    float flttmp;
    double dbltmp = 0.0; /* initialized in case --get finds no matching key */
    int inttmp;
    int quiet=0, clear=0;
    while ((opt=getopt_long(argc,argv,"k:g:s:f:d:i:qCD",long_opts,&opti))!=-1) {
        switch (opt) {
            case 'k':
                key = optarg;
                break;
            case 'g':
                hgetr8(s.buf, optarg, &dbltmp);
                printf("%g\n", dbltmp);
                break;
            case 's':
                if (key) 
                    hputs(s.buf, key, optarg);
                break;
            case 'f':
                flttmp = atof(optarg);
                if (key) 
                    hputr4(s.buf, key, flttmp);
                break;
            case 'd':
                dbltmp = atof(optarg);
                if (key) 
                    hputr8(s.buf, key, dbltmp);
                break;
            case 'i':
                inttmp = atoi(optarg);
                if (key) 
                    hputi4(s.buf, key, inttmp);
                break;
            case 'D':
                if (key)
                    hdel(s.buf, key);
                break;
            case 'C':
                clear=1;
                break;
            case 'q':
                quiet=1;
                break;
            default:
                break;
        }
    }

    /* If not quiet, print out buffer */
    if (!quiet) { 
        printf("%s\n", s.buf);
    }

    vegas_status_unlock(&s);

    if (clear) 
        vegas_status_clear(&s);

    exit(0);
}
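/*
 * Minimal sketch (not part of the original tool) of the status-buffer card
 * model the hput/hget calls above rely on: the buffer is a text block of
 * 80-character "KEY = value" records, and each hput/hget call writes or
 * reads one of them.  The header name "fitshead.h" and the demo keys are
 * assumptions for illustration only.
 */
#include <stdio.h>
#include "fitshead.h"   /* hputs, hputr8, hgetr8, hdel, ... (assumed header) */

void demo_status_cards(char *buf)
{
    double bw = 0.0;

    hputs(buf,  "SRC_NAME", "J0534+2200"); /* write a string card            */
    hputr8(buf, "OBSBW", 800.0);           /* write a double card            */

    if (hgetr8(buf, "OBSBW", &bw))         /* hget* return non-zero if found */
        printf("OBSBW = %g\n", bw);

    hdel(buf, "SRC_NAME");                 /* delete a card                  */
}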
int main(int argc, char *argv[]) {

    int instance_id = 0;
    hashpipe_status_t *s;

    /* Loop over cmd line to fill in params */
    static struct option long_opts[] = {
        {"help",   0, NULL, 'h'},
        {"shmkey", 1, NULL, 'K'},
        {"key",    1, NULL, 'k'},
        {"get",    1, NULL, 'g'},
        {"string", 1, NULL, 's'},
        {"float",  1, NULL, 'f'},
        {"double", 1, NULL, 'd'},
        {"int",    1, NULL, 'i'},
        {"verbose",  0, NULL, 'v'},
        {"clear",  0, NULL, 'C'},
        {"del",    0, NULL, 'D'},
        {"query",  1, NULL, 'Q'},
        {"instance", 1, NULL, 'I'},
        {0,0,0,0}
    };
    int opt,opti;
    char *key=NULL;
    char value[81] = {0};   /* zeroed in case --query finds no matching key */
    float flttmp;
    double dbltmp = 0.0;    /* initialized in case --get finds no matching key */
    int inttmp;
    int verbose=0, clear=0;
    char keyfile[1000];
    while ((opt=getopt_long(argc,argv,"hk:g:s:f:d:i:vCDQ:K:I:",long_opts,&opti))!=-1) {
        switch (opt) {
            case 'K': // Keyfile
                snprintf(keyfile, sizeof(keyfile), "HASHPIPE_KEYFILE=%s", optarg);
                keyfile[sizeof(keyfile)-1] = '\0';
                putenv(keyfile);
                break;
            case 'I':
                instance_id = atoi(optarg);
                break;
            case 'k':
                key = optarg;
                break;
            case 'Q':
                s = get_status_buffer(instance_id);
                hashpipe_status_lock(s);
                hgets(s->buf, optarg, 80, value);
                hashpipe_status_unlock(s);
                value[80] = '\0';
                printf("%s\n", value);
                break;
            case 'g':
                s = get_status_buffer(instance_id);
                hashpipe_status_lock(s);
                hgetr8(s->buf, optarg, &dbltmp);
                hashpipe_status_unlock(s);
                printf("%g\n", dbltmp);
                break;
            case 's':
                if (key) {
                    s = get_status_buffer(instance_id);
                    hashpipe_status_lock(s);
                    hputs(s->buf, key, optarg);
                    hashpipe_status_unlock(s);
                }
                break;
            case 'f':
                flttmp = atof(optarg);
                if (key) {
                    s = get_status_buffer(instance_id);
                    hashpipe_status_lock(s);
                    hputr4(s->buf, key, flttmp);
                    hashpipe_status_unlock(s);
                }
                break;
            case 'd':
                dbltmp = atof(optarg);
                if (key) {
                    s = get_status_buffer(instance_id);
                    hashpipe_status_lock(s);
                    hputr8(s->buf, key, dbltmp);
                    hashpipe_status_unlock(s);
                }
                break;
            case 'i':
                inttmp = atoi(optarg);
                if (key) {
                    s = get_status_buffer(instance_id);
                    hashpipe_status_lock(s);
                    hputi4(s->buf, key, inttmp);
                    hashpipe_status_unlock(s);
                }
                break;
            case 'D':
                if (key) {
                    s = get_status_buffer(instance_id);
                    hashpipe_status_lock(s);
                    hdel(s->buf, key);
                    hashpipe_status_unlock(s);
                }
                break;
            case 'C':
                clear=1;
                break;
            case 'v':
                verbose=1;
                break;
            case 'h':
                usage();
                return 0;
            case '?': // Command line parsing error
            default:
                usage();
                exit(1);
                break;
        }
    }

    s = get_status_buffer(instance_id);

    /* If verbose, print out buffer */
    if (verbose) { 
        hashpipe_status_lock(s);
        printf("%s\n", s->buf);
        hashpipe_status_unlock(s);
    }

    if (clear) 
        hashpipe_status_clear(s);

    exit(0);
}
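/*
 * Sketch of the get_status_buffer() helper assumed by the option handlers
 * above: attach to the per-instance hashpipe status buffer once, then hand
 * back the same pointer on every call.  This is an illustration, not the
 * original implementation.
 */
static hashpipe_status_t status_buf;
static int status_attached = 0;

static hashpipe_status_t *get_status_buffer(int instance_id)
{
    if (!status_attached) {
        if (hashpipe_status_attach(instance_id, &status_buf) != HASHPIPE_OK) {
            fprintf(stderr, "Error connecting to status buffer for instance %d\n",
                    instance_id);
            exit(1);
        }
        status_attached = 1;
    }
    return &status_buf;
}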
static void *run(void * _args)
{
    // Cast _args
    struct guppi_thread_args *args = (struct guppi_thread_args *)_args;

#ifdef DEBUG_SEMS
    fprintf(stderr, "s/tid %lu/                      FLUFf/\n", pthread_self());
#endif

    THREAD_RUN_BEGIN(args);

    THREAD_RUN_SET_AFFINITY_PRIORITY(args);

    /* Attach to status shared mem area */
    THREAD_RUN_ATTACH_STATUS(args->instance_id, st);

    /* Attach to paper_input_databuf */
    THREAD_RUN_ATTACH_DATABUF(args->instance_id,
        paper_input_databuf, db_in, args->input_buffer);

    /* Attach to paper_gpu_input_databuf */
    THREAD_RUN_ATTACH_DATABUF(args->instance_id,
        paper_gpu_input_databuf, db_out, args->output_buffer);

    // Init status variables
    guppi_status_lock_safe(&st);
    hputi8(st.buf, "FLUFMCNT", 0);
    guppi_status_unlock_safe(&st);

    /* Loop */
    int rv;
    int curblock_in=0;
    int curblock_out=0;

    struct timespec start, finish;

    while (run_threads) {

        // Note waiting status
        guppi_status_lock_safe(&st);
        hputs(st.buf, STATUS_KEY, "waiting");
        guppi_status_unlock_safe(&st);

        // Wait for new input block to be filled
        while ((rv=paper_input_databuf_wait_filled(db_in, curblock_in)) != GUPPI_OK) {
            if (rv==GUPPI_TIMEOUT) {
                guppi_status_lock_safe(&st);
                hputs(st.buf, STATUS_KEY, "blocked_in");
                guppi_status_unlock_safe(&st);
                continue;
            } else {
                guppi_error(__FUNCTION__, "error waiting for filled databuf");
                run_threads=0;
                pthread_exit(NULL);
                break;
            }
        }

        // Wait for new gpu_input block (our output block) to be free
        while ((rv=paper_gpu_input_databuf_wait_free(db_out, curblock_out)) != GUPPI_OK) {
            if (rv==GUPPI_TIMEOUT) {
                guppi_status_lock_safe(&st);
                hputs(st.buf, STATUS_KEY, "blocked gpu input");
                guppi_status_unlock_safe(&st);
                continue;
            } else {
                guppi_error(__FUNCTION__, "error waiting for free databuf");
                run_threads=0;
                pthread_exit(NULL);
                break;
            }
        }

        // Got a new data block, update status
        guppi_status_lock_safe(&st);
        hputs(st.buf, STATUS_KEY, "fluffing");
        hputi4(st.buf, "FLUFBKIN", curblock_in);
        hputu8(st.buf, "FLUFMCNT", db_in->block[curblock_in].header.mcnt);
        guppi_status_unlock_safe(&st);

        // Copy header and call fluff function
        clock_gettime(CLOCK_MONOTONIC, &start);

        memcpy(&db_out->block[curblock_out].header, &db_in->block[curblock_in].header, sizeof(paper_input_header_t));

        paper_fluff(db_in->block[curblock_in].data, db_out->block[curblock_out].data);

        clock_gettime(CLOCK_MONOTONIC, &finish);

        // Note processing time
        guppi_status_lock_safe(&st);
        // Bits per fluff / ns per fluff = Gbps
        hputr4(st.buf, "FLUFGBPS", (float)(8*N_BYTES_PER_BLOCK)/ELAPSED_NS(start,finish));
        guppi_status_unlock_safe(&st);

        // Mark input block as free and advance
        paper_input_databuf_set_free(db_in, curblock_in);
        curblock_in = (curblock_in + 1) % db_in->header.n_block;

        // Mark output block as full and advance
        paper_gpu_input_databuf_set_filled(db_out, curblock_out);
        curblock_out = (curblock_out + 1) % db_out->header.n_block;

        /* Check for cancel */
        pthread_testcancel();
    }
    run_threads=0;

    // Have to close all pushes
    THREAD_RUN_DETACH_DATAUF;
    THREAD_RUN_DETACH_DATAUF;
    THREAD_RUN_DETACH_STATUS;
    THREAD_RUN_END;

    // Thread success!
    return NULL;
}
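/*
 * The timing above relies on an ELAPSED_NS() macro.  A typical definition
 * (an assumption here, not copied from this source tree) is simply the
 * nanosecond difference between two struct timespec values from
 * clock_gettime(CLOCK_MONOTONIC, ...).
 */
#include <stdint.h>

#define ELAPSED_NS(start, stop) \
    (((int64_t)(stop).tv_sec - (start).tv_sec) * 1000000000LL + \
     ((stop).tv_nsec - (start).tv_nsec))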
static void *run(hashpipe_thread_args_t * args)
{
    // Local aliases to shorten access to args fields
    // Our input buffer is a paper_input_databuf
    // Our output buffer is a paper_gpu_input_databuf
    paper_input_databuf_t *db_in = (paper_input_databuf_t *)args->ibuf;
    paper_gpu_input_databuf_t *db_out = (paper_gpu_input_databuf_t *)args->obuf;
    hashpipe_status_t st = args->st;
    const char * status_key = args->thread_desc->skey;

#ifdef DEBUG_SEMS
    fprintf(stderr, "s/tid %lu/                      FLUFf/\n", pthread_self());
#endif

    // Init status variables
    hashpipe_status_lock_safe(&st);
    hputi8(st.buf, "FLUFMCNT", 0);
    hashpipe_status_unlock_safe(&st);

    /* Loop */
    int rv;
    int curblock_in=0;
    int curblock_out=0;
    float gbps, min_gbps = 0.0; // min_gbps stays 0 until FLUFMING is first written

    struct timespec start, finish;

    while (run_threads()) {

        // Note waiting status
        hashpipe_status_lock_safe(&st);
        hputs(st.buf, status_key, "waiting");
        hashpipe_status_unlock_safe(&st);

        // Wait for new input block to be filled
        while ((rv=paper_input_databuf_wait_filled(db_in, curblock_in)) != HASHPIPE_OK) {
            if (rv==HASHPIPE_TIMEOUT) {
                hashpipe_status_lock_safe(&st);
                hputs(st.buf, status_key, "blocked_in");
                hashpipe_status_unlock_safe(&st);
                continue;
            } else {
                hashpipe_error(__FUNCTION__, "error waiting for filled databuf");
                pthread_exit(NULL);
                break;
            }
        }

        // Wait for new gpu_input block (our output block) to be free
        while ((rv=paper_gpu_input_databuf_wait_free(db_out, curblock_out)) != HASHPIPE_OK) {
            if (rv==HASHPIPE_TIMEOUT) {
                hashpipe_status_lock_safe(&st);
                hputs(st.buf, status_key, "blocked gpu input");
                hashpipe_status_unlock_safe(&st);
                continue;
            } else {
                hashpipe_error(__FUNCTION__, "error waiting for free databuf");
                pthread_exit(NULL);
                break;
            }
        }

        // Got a new data block, update status
        hashpipe_status_lock_safe(&st);
        hputs(st.buf, status_key, "fluffing");
        hputi4(st.buf, "FLUFBKIN", curblock_in);
        hputu8(st.buf, "FLUFMCNT", db_in->block[curblock_in].header.mcnt);
        hashpipe_status_unlock_safe(&st);

        // Copy header and call fluff function
        clock_gettime(CLOCK_MONOTONIC, &start);

        memcpy(&db_out->block[curblock_out].header, &db_in->block[curblock_in].header, sizeof(paper_input_header_t));

        paper_fluff(db_in->block[curblock_in].data, db_out->block[curblock_out].data);

        clock_gettime(CLOCK_MONOTONIC, &finish);

        // Note processing time
        hashpipe_status_lock_safe(&st);
        // Bits per fluff / ns per fluff = Gbps
        hgetr4(st.buf, "FLUFMING", &min_gbps);
        gbps = (float)(8*N_BYTES_PER_BLOCK)/ELAPSED_NS(start,finish);
        hputr4(st.buf, "FLUFGBPS", gbps);
        if(min_gbps == 0 || gbps < min_gbps) {
          hputr4(st.buf, "FLUFMING", gbps);
        }
        hashpipe_status_unlock_safe(&st);

        // Mark input block as free and advance
        paper_input_databuf_set_free(db_in, curblock_in);
        curblock_in = (curblock_in + 1) % db_in->header.n_block;

        // Mark output block as full and advance
        paper_gpu_input_databuf_set_filled(db_out, curblock_out);
        curblock_out = (curblock_out + 1) % db_out->header.n_block;

        /* Check for cancel */
        pthread_testcancel();
    }

    // Thread success!
    return NULL;
}
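/*
 * For completeness, a hashpipe thread also needs a descriptor and a
 * constructor that registers it, following the pattern of the CRC and GPU
 * output threads further below.  This sketch is hypothetical: the thread
 * name, status key, and databuf create functions are assumptions.
 */
static hashpipe_thread_desc_t fluff_thread = {
    name: "paper_fluff_thread",
    skey: "FLUFSTAT",
    init: NULL,
    run:  run,
    ibuf_desc: {paper_input_databuf_create},
    obuf_desc: {paper_gpu_input_databuf_create}
};

static __attribute__((constructor)) void ctor()
{
    register_hashpipe_thread(&fluff_thread);
}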
static void *run(hashpipe_thread_args_t * args, int doCPU)
{
    // Local aliases to shorten access to args fields
    paper_gpu_input_databuf_t *db_in = (paper_gpu_input_databuf_t *)args->ibuf;
    paper_output_databuf_t *db_out = (paper_output_databuf_t *)args->obuf;
    hashpipe_status_t st = args->st;
    const char * status_key = args->thread_desc->skey;

#ifdef DEBUG_SEMS
    fprintf(stderr, "s/tid %lu/                      GPU/\n", pthread_self());
#endif

    // Init integration control status variables
    int gpu_dev = 0;
    hashpipe_status_lock_safe(&st);
    hputs(st.buf,  "INTSTAT", "off");
    hputi8(st.buf, "INTSYNC", 0);
    hputi4(st.buf, "INTCOUNT", N_SUB_BLOCKS_PER_INPUT_BLOCK);
    hputi8(st.buf, "GPUDUMPS", 0);
    hgeti4(st.buf, "GPUDEV", &gpu_dev); // No change if not found
    hputi4(st.buf, "GPUDEV", gpu_dev);
    hashpipe_status_unlock_safe(&st);

    /* Loop */
    int rv;
    char integ_status[17];
    uint64_t start_mcount, last_mcount=0;
    uint64_t gpu_dumps=0;
    int int_count = 0; // Number of blocks to integrate per dump (0 until INTCOUNT is read)
    int xgpu_error = 0;
    int curblock_in=0;
    int curblock_out=0;

    struct timespec start, stop;
    uint64_t elapsed_gpu_ns  = 0;
    uint64_t gpu_block_count = 0;

    // Initialize context to point at first input and output memory blocks.
    // This seems redundant since we do this just before calling
    // xgpuCudaXengine, but we need to pass something in for array_h and
    // matrix_x to prevent xgpuInit from allocating memory.
    XGPUContext context;
    context.array_h = (ComplexInput *)db_in->block[0].data;
    context.array_len = (db_in->header.n_block * sizeof(paper_gpu_input_block_t) - sizeof(paper_input_header_t)) / sizeof(ComplexInput);
    context.matrix_h = (Complex *)db_out->block[0].data;
    context.matrix_len = (db_out->header.n_block * sizeof(paper_output_block_t) - sizeof(paper_output_header_t)) / sizeof(Complex);

    xgpu_error = xgpuInit(&context, gpu_dev);
    if (XGPU_OK != xgpu_error) {
        fprintf(stderr, "ERROR: xGPU initialization failed (error code %d)\n", xgpu_error);
        return THREAD_ERROR;
    }

    while (run_threads()) {

        // Note waiting status,
        // query integrating status
        // and, if armed, start count
        hashpipe_status_lock_safe(&st);
        hputs(st.buf, status_key, "waiting");
        hgets(st.buf,  "INTSTAT", 16, integ_status);
        hgeti8(st.buf, "INTSYNC", (long long*)&start_mcount);
        hashpipe_status_unlock_safe(&st);

        // Wait for new input block to be filled
        while ((rv=hashpipe_databuf_wait_filled((hashpipe_databuf_t *)db_in, curblock_in)) != HASHPIPE_OK) {
            if (rv==HASHPIPE_TIMEOUT) {
                hashpipe_status_lock_safe(&st);
                hputs(st.buf, status_key, "blocked_in");
                hashpipe_status_unlock_safe(&st);
                continue;
            } else {
                hashpipe_error(__FUNCTION__, "error waiting for filled databuf");
                pthread_exit(NULL);
                break;
            }
        }

        // Got a new data block, update status and determine how to handle it
        hashpipe_status_lock_safe(&st);
        hputi4(st.buf, "GPUBLKIN", curblock_in);
        hputu8(st.buf, "GPUMCNT", db_in->block[curblock_in].header.mcnt);
        hashpipe_status_unlock_safe(&st);

        // If integration status "off"
        if(!strcmp(integ_status, "off")) {
            // Mark input block as free and advance
            hashpipe_databuf_set_free((hashpipe_databuf_t *)db_in, curblock_in);
            curblock_in = (curblock_in + 1) % db_in->header.n_block;
            // Skip to next input buffer
            continue;
        }

        // If integration status is "start"
        if(!strcmp(integ_status, "start")) {
            // If buffer mcount < start_mcount (i.e. not there yet)
            if(db_in->block[curblock_in].header.mcnt < start_mcount) {
              // Drop input buffer
              // Mark input block as free and advance
              hashpipe_databuf_set_free((hashpipe_databuf_t *)db_in, curblock_in);
              curblock_in = (curblock_in + 1) % db_in->header.n_block;
              // Skip to next input buffer
              continue;
            // Else if mcount == start_mcount (time to start)
            } else if(db_in->block[curblock_in].header.mcnt == start_mcount) {
              // Set integration status to "on"
              // Read integration count (INTCOUNT)
              fprintf(stderr, "--- integration on ---\n");
              strcpy(integ_status, "on");
              hashpipe_status_lock_safe(&st);
              hputs(st.buf,  "INTSTAT", integ_status);
              hgeti4(st.buf, "INTCOUNT", &int_count);
              hashpipe_status_unlock_safe(&st);
              // Compute last mcount
              last_mcount = start_mcount + (int_count-1) * N_SUB_BLOCKS_PER_INPUT_BLOCK;
            // Else (missed starting mcount)
            } else {
              // Handle missed start of integration
              // TODO!
              fprintf(stderr, "--- mcnt=%06lx > start_mcnt=%06lx ---\n",
                  db_in->block[curblock_in].header.mcnt, start_mcount);
            }
        }

        // Integration status is "on" or "stop"

        // Note processing status
        hashpipe_status_lock_safe(&st);
        hputs(st.buf, status_key, "processing gpu");
        hashpipe_status_unlock_safe(&st);


        // Setup for current chunk
        context.input_offset = curblock_in * sizeof(paper_gpu_input_block_t) / sizeof(ComplexInput);
        context.output_offset = curblock_out * sizeof(paper_output_block_t) / sizeof(Complex);

        // Call CUDA X engine function
        int doDump = 0;
        // Dump if this is the last block or we are doing both CPU and GPU
        // (GPU and CPU test mode always dumps every input block)
        if(db_in->block[curblock_in].header.mcnt >= last_mcount || doCPU) {
          doDump = 1;

          // Check whether we missed the end of integration.  If we get a block
          // whose mcnt is greater than last_mcount, then for some reason (e.g.
          // networking problems) we didn't see a block whose mcnt was
          // last_mcount.  This should "never" happen, but it has been seen to
          // occur when the 10 GbE links have many errors.
          if(db_in->block[curblock_in].header.mcnt > last_mcount) {
            // Can't do much error recovery, so just log it.
            fprintf(stderr, "--- mcnt=%06lx > last_mcnt=%06lx ---\n",
                db_in->block[curblock_in].header.mcnt, last_mcount);
          }

          // Wait for new output block to be free
          while ((rv=paper_output_databuf_wait_free(db_out, curblock_out)) != HASHPIPE_OK) {
              if (rv==HASHPIPE_TIMEOUT) {
                  hashpipe_status_lock_safe(&st);
                  hputs(st.buf, status_key, "blocked gpu out");
                  hashpipe_status_unlock_safe(&st);
                  continue;
              } else {
                  hashpipe_error(__FUNCTION__, "error waiting for free databuf");
                  pthread_exit(NULL);
                  break;
              }
          }
        }

        clock_gettime(CLOCK_MONOTONIC, &start);

        xgpuCudaXengine(&context, doDump ? SYNCOP_DUMP : SYNCOP_SYNC_TRANSFER);

        clock_gettime(CLOCK_MONOTONIC, &stop);
        elapsed_gpu_ns += ELAPSED_NS(start, stop);
        gpu_block_count++;

        if(doDump) {
          clock_gettime(CLOCK_MONOTONIC, &start);
          xgpuClearDeviceIntegrationBuffer(&context);
          clock_gettime(CLOCK_MONOTONIC, &stop);
          elapsed_gpu_ns += ELAPSED_NS(start, stop);

          // TODO Maybe need to subtract all or half the integration time here
          // depending on receiver's expectations.
          db_out->block[curblock_out].header.mcnt = last_mcount;
          // If integration status is "stop"
          if(!strcmp(integ_status, "stop")) {
            // Set integration status to "off"
            strcpy(integ_status, "off");
            hashpipe_status_lock_safe(&st);
            hputs(st.buf,  "INTSTAT", integ_status);
            hashpipe_status_unlock_safe(&st);
          } else {
            // Advance last_mcount for end of next integration
            last_mcount += int_count * N_SUB_BLOCKS_PER_INPUT_BLOCK;
          }

          // Mark output block as full and advance
          paper_output_databuf_set_filled(db_out, curblock_out);
          curblock_out = (curblock_out + 1) % db_out->header.n_block;
          // TODO Need to handle or at least check for overflow!

          // Update GPU dump counter and GPU Gbps
          gpu_dumps++;
          hashpipe_status_lock_safe(&st);
          hputi8(st.buf, "GPUDUMPS", gpu_dumps);
          hputr4(st.buf, "GPUGBPS", (float)(8*N_FLUFFED_BYTES_PER_BLOCK*gpu_block_count)/elapsed_gpu_ns);
          hashpipe_status_unlock_safe(&st);

          // Start new average
          elapsed_gpu_ns  = 0;
          gpu_block_count = 0;
        }

        if(doCPU) {

            /* Note waiting status */
            hashpipe_status_lock_safe(&st);
            hputs(st.buf, status_key, "waiting");
            hashpipe_status_unlock_safe(&st);

            // Wait for new output block to be free
            while ((rv=paper_output_databuf_wait_free(db_out, curblock_out)) != HASHPIPE_OK) {
                if (rv==HASHPIPE_TIMEOUT) {
                    hashpipe_status_lock_safe(&st);
                    hputs(st.buf, status_key, "blocked cpu out");
                    hashpipe_status_unlock_safe(&st);
                    continue;
                } else {
                    hashpipe_error(__FUNCTION__, "error waiting for free databuf");
                    pthread_exit(NULL);
                    break;
                }
            }

            // Note "processing cpu" status, current input block
            hashpipe_status_lock_safe(&st);
            hputs(st.buf, status_key, "processing cpu");
            hashpipe_status_unlock_safe(&st);

            /*
             * Call CPU X engine function
             */
            xgpuOmpXengine((Complex *)db_out->block[curblock_out].data, context.array_h);

            // Mark output block as full and advance
            paper_output_databuf_set_filled(db_out, curblock_out);
            curblock_out = (curblock_out + 1) % db_out->header.n_block;
            // TODO Need to handle or at least check for overflow!
        }

        // Mark input block as free and advance
        hashpipe_databuf_set_free((hashpipe_databuf_t *)db_in, curblock_in);
        curblock_in = (curblock_in + 1) % db_in->header.n_block;

        /* Check for cancel */
        pthread_testcancel();
    }

    xgpuFree(&context);

    // Thread success!
    return NULL;
}
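/*
 * The integration state machine above is driven entirely through the status
 * buffer: an external control program writes INTSYNC and flips INTSTAT to
 * "start", which this thread picks up at the top of its loop.  A sketch of
 * such a control call (hypothetical helper; the parameter values are only
 * examples):
 */
void arm_integration(hashpipe_status_t *st, uint64_t start_mcnt, int int_count)
{
    hashpipe_status_lock_safe(st);
    hputi8(st->buf, "INTSYNC",  start_mcnt);  /* mcnt at which to begin     */
    hputi4(st->buf, "INTCOUNT", int_count);   /* input blocks per dump      */
    hputs(st->buf,  "INTSTAT",  "start");     /* request integration start  */
    hashpipe_status_unlock_safe(st);
}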
static void *run(hashpipe_thread_args_t * args)
{
    s6_input_databuf_t *db  = (s6_input_databuf_t *)args->obuf;
    hashpipe_status_t *p_st = &(args->st);

    hashpipe_status_t st = args->st;
    const char * status_key = args->thread_desc->skey;

    //s6_input_block_t fake_data_block;

    /* Main loop */
    int i, rv;
    uint64_t mcnt = 0;
    uint64_t num_coarse_chan = N_COARSE_CHAN;
    uint64_t *data;
    int block_idx = 0;
    int error_count = 0, max_error_count = 0;
    float error, max_error = 0.0;
    int gen_fake = 0;

    hashpipe_status_lock_safe(&st);
    //hashpipe_status_lock_safe(p_st);
    hputi4(st.buf, "NUMCCHAN", N_COARSE_CHAN);
    hputi4(st.buf, "NUMFCHAN", N_FINE_CHAN);
    hputi4(st.buf, "NUMBBEAM", N_BYTES_PER_BEAM);
    hputi4(st.buf, "NUMBBLOC", sizeof(s6_input_block_t));
    hputi4(st.buf, "THRESHLD", POWER_THRESH);
    hgeti4(st.buf, "GENFAKE", &gen_fake);
    hashpipe_status_unlock_safe(&st);
    //hashpipe_status_unlock_safe(p_st);

    time_t t, prior_t;
    prior_t = time(&prior_t);

    while (run_threads()) {

        hashpipe_status_lock_safe(&st);
        //hashpipe_status_lock_safe(p_st);
        hputi4(st.buf, "NETBKOUT", block_idx);
        hputs(st.buf, status_key, "waiting");
        hashpipe_status_unlock_safe(&st);
        //hashpipe_status_unlock_safe(p_st);
 
        t = time(&t);
        fprintf(stderr, "elapsed seconds for block %d : %ld\n", block_idx, t - prior_t);
        prior_t = t;
        // Wait for data
        struct timespec sleep_dur, rem_sleep_dur;
        sleep_dur.tv_sec = 1;
        sleep_dur.tv_nsec = 0;
        //fprintf(stderr, "fake net thread sleeping for %7.5f seconds\n", 
        //        sleep_dur.tv_sec + (double)sleep_dur.tv_nsec/1000000000.0);
        nanosleep(&sleep_dur, &rem_sleep_dur);
	
        /* Wait for new block to be free, then clear it
         * if necessary and fill its header with new values.
         */
        while ((rv=s6_input_databuf_wait_free(db, block_idx)) 
                != HASHPIPE_OK) {
            if (rv==HASHPIPE_TIMEOUT) {
                hashpipe_status_lock_safe(&st);
                hputs(st.buf, status_key, "blocked");
                hashpipe_status_unlock_safe(&st);
                continue;
            } else {
                hashpipe_error(__FUNCTION__, "error waiting for free databuf");
                pthread_exit(NULL);
                break;
            }
        }

        hashpipe_status_lock_safe(&st);
        hputs(st.buf, status_key, "receiving");
        hashpipe_status_unlock_safe(&st);
 
        // populate block header
        db->block[block_idx].header.mcnt = mcnt;
        db->block[block_idx].header.coarse_chan_id = 321;
        db->block[block_idx].header.num_coarse_chan = num_coarse_chan;
        memset(db->block[block_idx].header.missed_pkts, 0, sizeof(uint64_t) * N_BEAM_SLOTS);

        if(gen_fake) {
            gen_fake = 0;
            // gen fake data for all beams, all blocks   
            // TODO vary data by beam
            fprintf(stderr, "generating fake data to block 0 beam 0...");
            gen_fake_data(&(db->block[0].data[0]));
            fprintf(stderr, " done\n");
            fprintf(stderr, "copying to block 0 beam");
            for(int beam_i = 1; beam_i < N_BEAMS; beam_i++) {
                fprintf(stderr, " %d", beam_i);
                memcpy((void *)&db->block[0].data[beam_i*N_BYTES_PER_BEAM/sizeof(uint64_t)], 
                    (void *)&db->block[0].data[0], 
                    N_BYTES_PER_BEAM);
            }
            fprintf(stderr, " done\n");
            fprintf(stderr, "copying to block");
            for(int block_i = 1; block_i < N_INPUT_BLOCKS; block_i++) {
                fprintf(stderr, " %d", block_i);
                memcpy((void *)&db->block[block_i].data[0], 
                    (void *)&db->block[0].data[0], 
                    N_DATA_BYTES_PER_BLOCK);
            }
            fprintf(stderr, " done\n");
        }

        hashpipe_status_lock_safe(&st);
        hputr4(st.buf, "NETMXERR", max_error);
        hputi4(st.buf, "NETERCNT", error_count);
        hputi4(st.buf, "NETMXECT", max_error_count);
        hashpipe_status_unlock_safe(&st);

        // Mark block as full
        s6_input_databuf_set_filled(db, block_idx);

        // Setup for next block
        block_idx = (block_idx + 1) % db->header.n_block;
        mcnt++;
        // uncomment the following to test dynamic setting of num_coarse_chan
        //num_coarse_chan--;

        /* Will exit if thread has been cancelled */
        pthread_testcancel();
    }

    // Thread success!
    return THREAD_OK;
}
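/*
 * A small hypothetical helper making the indexing in the fake-data copy loop
 * above explicit: block data is stored as uint64_t words, so a beam's
 * starting word is its byte offset divided by sizeof(uint64_t).
 */
static inline uint64_t *beam_ptr(s6_input_block_t *blk, int beam_i)
{
    return &blk->data[(size_t)beam_i * N_BYTES_PER_BEAM / sizeof(uint64_t)];
}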
static void *run(hashpipe_thread_args_t * args)
{
    // Local aliases to shorten access to args fields
    // Our output buffer happens to be a paper_input_databuf
    hashpipe_status_t st = args->st;
    const char * status_key = args->thread_desc->skey;

    st_p = &st;	// allow global (this source file) access to the status buffer

    // Get initial value for crc32 function
    uint32_t init_crc = crc32(0,0,0);

    // Flag that holds off the crc thread
    int holdoff = 1;

    // Force ourselves into the hold-off state
    hashpipe_status_lock_safe(&st);
    hputi4(st.buf, "CRCHOLD", 1);
    hashpipe_status_unlock_safe(&st);

    while(holdoff) {
	// We're not in any hurry to startup
	sleep(1);
	hashpipe_status_lock_safe(&st);
	// Look for CRCHOLD value
	hgeti4(st.buf, "CRCHOLD", &holdoff);
	if(!holdoff) {
	    // Done holding, so delete the key
	    hdel(st.buf, "CRCHOLD");
	}
	hashpipe_status_unlock_safe(&st);
    }

    /* Read network params */
    struct hashpipe_udp_params up = {
	.bindhost = "0.0.0.0",
	.bindport = 8511,
	.packet_size = 8200
    };
    hashpipe_status_lock_safe(&st);
    // Get info from status buffer if present (no change if not present)
    hgets(st.buf, "BINDHOST", 80, up.bindhost);
    hgeti4(st.buf, "BINDPORT", &up.bindport);
    // Store bind host/port info etc in status buffer
    hputs(st.buf, "BINDHOST", up.bindhost);
    hputi4(st.buf, "BINDPORT", up.bindport);
    hputu4(st.buf, "CRCPKOK", 0);
    hputu4(st.buf, "CRCPKERR", 0);
    hputs(st.buf, status_key, "running");
    hashpipe_status_unlock_safe(&st);

    struct hashpipe_udp_packet p;

    /* Give all the threads a chance to start before opening network socket */
    sleep(1);


    /* Set up UDP socket */
    int rv = hashpipe_udp_init(&up);
    if (rv!=HASHPIPE_OK) {
        hashpipe_error("paper_crc_thread",
                "Error opening UDP socket.");
        pthread_exit(NULL);
    }
    pthread_cleanup_push((void *)hashpipe_udp_close, &up);

    /* Main loop */
    uint64_t packet_count = 0;
    uint64_t good_count = 0;
    uint64_t error_count = 0;
    uint64_t elapsed_wait_ns = 0;
    uint64_t elapsed_recv_ns = 0;
    uint64_t elapsed_proc_ns = 0;
    float ns_per_wait = 0.0;
    float ns_per_recv = 0.0;
    float ns_per_proc = 0.0;
    struct timespec start, stop;
    struct timespec recv_start, recv_stop;
    packet_header_t hdr;

    while (run_threads()) {

        /* Read packet */
	clock_gettime(CLOCK_MONOTONIC, &recv_start);
	do {
	    clock_gettime(CLOCK_MONOTONIC, &start);
	    p.packet_size = recv(up.sock, p.data, HASHPIPE_MAX_PACKET_SIZE, 0);
	    clock_gettime(CLOCK_MONOTONIC, &recv_stop);
	} while (p.packet_size == -1 && (errno == EAGAIN || errno == EWOULDBLOCK) && run_threads());

	// Break out of loop if stopping
	if(!run_threads()) break;

	// Increment packet count
	packet_count++;

	// Check CRC
        if(crc32(init_crc, (/*const?*/ uint8_t *)p.data, p.packet_size) == 0xffffffff) {
	    // CRC OK! Increment good counter
	    good_count++;
	} else {
	    // CRC error!  Increment error counter
	    error_count++;

	    // Log message
	    get_header(&p, &hdr);
	    hashpipe_warn("paper_crc", "CRC error mcnt %llu ; fid %u ; xid %u",
		    hdr.mcnt, hdr.fid, hdr.xid);
	}

	clock_gettime(CLOCK_MONOTONIC, &stop);
	elapsed_wait_ns += ELAPSED_NS(recv_start, start);
	elapsed_recv_ns += ELAPSED_NS(start, recv_stop);
	elapsed_proc_ns += ELAPSED_NS(recv_stop, stop);

        if(packet_count % 1000 == 0) {
	    // Compute stats
	    get_header(&p, &hdr);
            ns_per_wait = (float)elapsed_wait_ns / packet_count;
            ns_per_recv = (float)elapsed_recv_ns / packet_count;
            ns_per_proc = (float)elapsed_proc_ns / packet_count;

            // Update status
            hashpipe_status_lock_busywait_safe(&st);
            hputu8(st.buf, "CRCMCNT", hdr.mcnt);
	    // Gbps = bits_per_packet / ns_per_packet
	    // (N_BYTES_PER_PACKET excludes header, so +8 for the header)
            hputr4(st.buf, "CRCGBPS", 8*(N_BYTES_PER_PACKET+8)/(ns_per_recv+ns_per_proc));
            hputr4(st.buf, "CRCWATNS", ns_per_wait);
            hputr4(st.buf, "CRCRECNS", ns_per_recv);
            hputr4(st.buf, "CRCPRCNS", ns_per_proc);
	    // TODO Provide some way to recognize request to zero out the
	    // CRCERR and CRCOK fields.
	    hputu8(st.buf, "CRCPKOK",  good_count);
	    hputu8(st.buf, "CRCPKERR", error_count);
            hashpipe_status_unlock_safe(&st);

	    // Start new average
	    elapsed_wait_ns = 0;
	    elapsed_recv_ns = 0;
	    elapsed_proc_ns = 0;
	    packet_count = 0;
        }

        /* Will exit if thread has been cancelled */
        pthread_testcancel();
    }

    /* Have to close all pushes */
    pthread_cleanup_pop(1); /* Closes push(hashpipe_udp_close) */

    return NULL;
}

static hashpipe_thread_desc_t crc_thread = {
    name: "paper_crc_thread",
    skey: "CRCSTAT",
    init: NULL,
    run:  run,
    ibuf_desc: {NULL},
    obuf_desc: {NULL}  // assumed completion of the truncated descriptor: the CRC thread uses no data buffers
};
static void *run(void * _args)
{
    // Cast _args
    struct guppi_thread_args *args = (struct guppi_thread_args *)_args;

    THREAD_RUN_BEGIN(args);

    THREAD_RUN_SET_AFFINITY_PRIORITY(args);

    THREAD_RUN_ATTACH_STATUS(args->instance_id, st);

    // Attach to paper_output_databuf
    THREAD_RUN_ATTACH_DATABUF(args->instance_id,
        paper_output_databuf, db, args->input_buffer);

    // Setup socket and message structures
    int sockfd;
    unsigned int xengine_id = 0;
    struct timespec packet_delay = {
      .tv_sec = 0,
      .tv_nsec = PACKET_DELAY_NS
    };

    guppi_status_lock_safe(&st);
    hgetu4(st.buf, "XID", &xengine_id); // No change if not found
    hputu4(st.buf, "XID", xengine_id);
    hputu4(st.buf, "OUTDUMPS", 0);
    guppi_status_unlock_safe(&st);

    pkt_t pkt;
    pkt.hdr.header = HEADER;
    pkt.hdr.instids = INSTIDS(xengine_id);
    pkt.hdr.pktinfo = PKTINFO(BYTES_PER_PACKET);
    pkt.hdr.heaplen = HEAPLEN;

    // TODO Get catcher hostname and port from somewhere

#ifndef CATCHER_PORT
#define CATCHER_PORT 7148
#endif
#define stringify2(x) #x
#define stringify(x) stringify2(x)

    // Open socket
    sockfd = open_udp_socket("catcher", stringify(CATCHER_PORT));
    if(sockfd == -1) {
        guppi_error(__FUNCTION__, "error opening socket");
        run_threads=0;
        pthread_exit(NULL);
    }

#ifdef TEST_INDEX_CALCS
    int i, j;
    for(i=0; i<32; i++) {
      for(j=i; j<32; j++) {
        regtile_index(2*i, 2*j);
      }
    }
    for(i=0; i<32; i++) {
      for(j=i; j<32; j++) {
        casper_index(2*i, 2*j);
      }
    }
    run_threads=0;
#endif

    /* Main loop */
    int rv;
    int casper_chan, gpu_chan;
    int baseline;
    unsigned int dumps = 0;
    int block_idx = 0;
    struct timespec start, stop;
    signal(SIGINT,cc);
    signal(SIGTERM,cc);
    while (run_threads) {

        guppi_status_lock_safe(&st);
        hputs(st.buf, STATUS_KEY, "waiting");
        guppi_status_unlock_safe(&st);

        // Wait for new block to be filled
        while ((rv=paper_output_databuf_wait_filled(db, block_idx))
                != GUPPI_OK) {
            if (rv==GUPPI_TIMEOUT) {
                guppi_status_lock_safe(&st);
                hputs(st.buf, STATUS_KEY, "blocked");
                guppi_status_unlock_safe(&st);
                continue;
            } else {
                guppi_error(__FUNCTION__, "error waiting for filled databuf");
                run_threads=0;
                pthread_exit(NULL);
                break;
            }
        }

        clock_gettime(CLOCK_MONOTONIC, &start);

        // Note processing status, current input block
        guppi_status_lock_safe(&st);
        hputs(st.buf, STATUS_KEY, "processing");
        hputi4(st.buf, "OUTBLKIN", block_idx);
        guppi_status_unlock_safe(&st);

        // Update header's timestamp for this dump
        pkt.hdr.timestamp = TIMESTAMP(db->block[block_idx].header.mcnt *
            N_TIME_PER_PACKET * 2 * N_CHAN_TOTAL / 128);

        // Init header's offset for this dump
        uint32_t nbytes = 0;
        pkt.hdr.offset = OFFSET(nbytes);

        // Unpack and convert in packet sized chunks
        float * pf_re  = db->block[block_idx].data;
        float * pf_im  = db->block[block_idx].data + xgpu_info.matLength;
        pktdata_t * p_out = pkt.data;
        for(casper_chan=0; casper_chan<N_CHAN_PER_X; casper_chan++) {
          // De-interleave the channels
          gpu_chan = (casper_chan/Nc) + ((casper_chan%Nc)*Nx);
          for(baseline=0; baseline<CASPER_CHAN_LENGTH; baseline++) {
            off_t idx_regtile = idx_map[baseline];
            pktdata_t re = CONVERT(pf_re[gpu_chan*REGTILE_CHAN_LENGTH+idx_regtile]);
            pktdata_t im = CONVERT(pf_im[gpu_chan*REGTILE_CHAN_LENGTH+idx_regtile]);
            *p_out++ = re;
            *p_out++ = -im; // Conjugate data to match downstream expectations
            nbytes += 2*sizeof(pktdata_t);
            if(nbytes % BYTES_PER_PACKET == 0) {
              int bytes_sent = send(sockfd, &pkt, sizeof(pkt.hdr)+BYTES_PER_PACKET, 0);
              if(bytes_sent == -1) {
                // Send all packets even if the catcher is not listening (i.e.
                // we get a connection refused error), but abort sending this
                // dump if we get any other error.
                if(errno != ECONNREFUSED) {
                  perror("send");
                  // Update stats
                  guppi_status_lock_safe(&st);
                  hputu4(st.buf, "OUTDUMPS", ++dumps);
                  hputr4(st.buf, "OUTSECS", 0.0);
                  hputr4(st.buf, "OUTMBPS", 0.0);
                  guppi_status_unlock_safe(&st);
                  // Break out of both for loops
                  goto done_sending;
                }
              } else if(bytes_sent != sizeof(pkt.hdr)+BYTES_PER_PACKET) {
                printf("only sent %d of %lu bytes!!!\n", bytes_sent, sizeof(pkt.hdr)+BYTES_PER_PACKET);
              }

              // Delay to prevent overflowing network TX queue
              nanosleep(&packet_delay, NULL);

              // Setup for next packet
              p_out = pkt.data;
              // Update header's byte_offset for this chunk
              pkt.hdr.offset = OFFSET(nbytes);
            }
          }
        }

        clock_gettime(CLOCK_MONOTONIC, &stop);

        guppi_status_lock_safe(&st);
        hputu4(st.buf, "OUTDUMPS", ++dumps);
        hputr4(st.buf, "OUTSECS", (float)ELAPSED_NS(start,stop)/1e9);
        hputr4(st.buf, "OUTMBPS", (1e3*8*bytes_per_dump)/ELAPSED_NS(start,stop));
        guppi_status_unlock_safe(&st);

done_sending:

        // Mark block as free
        paper_output_databuf_set_free(db, block_idx);

        // Setup for next block
        block_idx = (block_idx + 1) % db->header.n_block;

        /* Will exit if thread has been cancelled */
        pthread_testcancel();
    }

    // Have to close all pushes
    THREAD_RUN_DETACH_DATAUF;
    THREAD_RUN_DETACH_STATUS;
    THREAD_RUN_END;

    // Thread success!
    return NULL;
}

static pipeline_thread_module_t module = {
    name: "paper_gpu_output_thread",
    type: PIPELINE_OUTPUT_THREAD,
    init: init,
    run:  run
};
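/*
 * In this older pipeline framework the module descriptor above still has to
 * be registered at load time.  A sketch of the usual constructor hook; the
 * function name register_pipeline_thread_module is an assumption about this
 * framework's API, shown for illustration only.
 */
static __attribute__((constructor)) void ctor()
{
    register_pipeline_thread_module(&module);
}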
static void *run(void * _args)
{
    // Cast _args
    struct guppi_thread_args *args = (struct guppi_thread_args *)_args;

#ifdef DEBUG_SEMS
    fprintf(stderr, "s/tid %lu/NET/' <<.\n", pthread_self());
#endif

    THREAD_RUN_BEGIN(args);

    THREAD_RUN_SET_AFFINITY_PRIORITY(args);

    THREAD_RUN_ATTACH_STATUS(args->instance_id, st);
    st_p = &st;		// allow global (this source file) access to the status buffer

    /* Attach to paper_input_databuf */
    THREAD_RUN_ATTACH_DATABUF(args->instance_id, paper_input_databuf, db, args->output_buffer);

    /* Read in general parameters */
    struct guppi_params gp;
    struct sdfits pf;
    char status_buf[GUPPI_STATUS_SIZE];
    guppi_status_lock_busywait_safe(st_p);
    memcpy(status_buf, st_p->buf, GUPPI_STATUS_SIZE);
    guppi_status_unlock_safe(st_p);
    guppi_read_obs_params(status_buf, &gp, &pf);
    pthread_cleanup_push((void *)guppi_free_sdfits, &pf);

    /* Read network params */
    struct guppi_udp_params up;
    //guppi_read_net_params(status_buf, &up);
    paper_read_net_params(status_buf, &up);
    // Store bind host/port info etc in status buffer
    guppi_status_lock_busywait_safe(&st);
    hputs(st.buf, "BINDHOST", up.bindhost);
    hputi4(st.buf, "BINDPORT", up.bindport);
    hputu4(st.buf, "MISSEDFE", 0);
    hputu4(st.buf, "MISSEDPK", 0);
    hputs(st.buf, STATUS_KEY, "running");
    guppi_status_unlock_safe(&st);

    struct guppi_udp_packet p;

    /* Give all the threads a chance to start before opening network socket */
    sleep(1);


#ifndef TIMING_TEST
    /* Set up UDP socket */
    int rv = guppi_udp_init(&up);
    if (rv!=GUPPI_OK) {
        guppi_error("guppi_net_thread",
                "Error opening UDP socket.");
        pthread_exit(NULL);
    }
    pthread_cleanup_push((void *)guppi_udp_close, &up);
#endif

    /* Main loop */
    uint64_t packet_count = 0;
    uint64_t elapsed_wait_ns = 0;
    uint64_t elapsed_recv_ns = 0;
    uint64_t elapsed_proc_ns = 0;
    float ns_per_wait = 0.0;
    float ns_per_recv = 0.0;
    float ns_per_proc = 0.0;
    struct timespec start, stop;
    struct timespec recv_start, recv_stop;
    signal(SIGINT,cc);
    while (run_threads) {

#ifndef TIMING_TEST
        /* Read packet */
	clock_gettime(CLOCK_MONOTONIC, &recv_start);
	do {
	    clock_gettime(CLOCK_MONOTONIC, &start);
	    p.packet_size = recv(up.sock, p.data, GUPPI_MAX_PACKET_SIZE, 0);
	    clock_gettime(CLOCK_MONOTONIC, &recv_stop);
	} while (p.packet_size == -1 && (errno == EAGAIN || errno == EWOULDBLOCK) && run_threads);
	if(!run_threads) break;
        if (up.packet_size != p.packet_size) {
            if (p.packet_size != -1) {
                #ifdef DEBUG_NET
                guppi_warn("guppi_net_thread", "Incorrect pkt size");
                #endif
                continue; 
            } else {
                guppi_error("guppi_net_thread", 
                        "guppi_udp_recv returned error");
                perror("guppi_udp_recv");
                pthread_exit(NULL);
            }
        }
#endif
	packet_count++;

        // Copy packet into any blocks where it belongs.
        const uint64_t mcnt = write_paper_packet_to_blocks((paper_input_databuf_t *)db, &p);

	clock_gettime(CLOCK_MONOTONIC, &stop);
	elapsed_wait_ns += ELAPSED_NS(recv_start, start);
	elapsed_recv_ns += ELAPSED_NS(start, recv_stop);
	elapsed_proc_ns += ELAPSED_NS(recv_stop, stop);

        if(mcnt != -1) {
            // Update status
            ns_per_wait = (float)elapsed_wait_ns / packet_count;
            ns_per_recv = (float)elapsed_recv_ns / packet_count;
            ns_per_proc = (float)elapsed_proc_ns / packet_count;
            guppi_status_lock_busywait_safe(&st);
            hputu8(st.buf, "NETMCNT", mcnt);
	    // Gbps = bits_per_packet / ns_per_packet
	    // (N_BYTES_PER_PACKET excludes header, so +8 for the header)
            hputr4(st.buf, "NETGBPS", 8*(N_BYTES_PER_PACKET+8)/(ns_per_recv+ns_per_proc));
            hputr4(st.buf, "NETWATNS", ns_per_wait);
            hputr4(st.buf, "NETRECNS", ns_per_recv);
            hputr4(st.buf, "NETPRCNS", ns_per_proc);
            guppi_status_unlock_safe(&st);
	    // Start new average
	    elapsed_wait_ns = 0;
	    elapsed_recv_ns = 0;
	    elapsed_proc_ns = 0;
	    packet_count = 0;
        }

#if defined TIMING_TEST || defined NET_TIMING_TEST

#define END_LOOP_COUNT (1*1000*1000)
	static int loop_count=0;
	static struct timespec tt_start, tt_stop;
	if(loop_count == 0) {
	    clock_gettime(CLOCK_MONOTONIC, &tt_start);
	}
	//if(loop_count == 1000000) run_threads = 0; 
	if(loop_count == END_LOOP_COUNT) {
	    clock_gettime(CLOCK_MONOTONIC, &tt_stop);
	    int64_t elapsed = ELAPSED_NS(tt_start, tt_stop);
	    printf("processed %d packets in %.6f ms (%.3f us per packet)\n",
		    END_LOOP_COUNT, elapsed/1e6, elapsed/1e3/END_LOOP_COUNT);
	    exit(0);
	}
	loop_count++;
#endif

        /* Will exit if thread has been cancelled */
        pthread_testcancel();
    }

    /* Have to close all pushes */
#ifndef TIMING_TEST
    pthread_cleanup_pop(0); /* Closes push(guppi_udp_close) */
#endif
    pthread_cleanup_pop(0); /* Closes push(guppi_free_sdfits) */
    THREAD_RUN_DETACH_DATAUF;
    THREAD_RUN_DETACH_STATUS;
    THREAD_RUN_END;

    return NULL;
}