int flag_gpu_input_databuf_wait_filled(flag_gpu_input_databuf_t * d, int block_id) { return hashpipe_databuf_wait_filled((hashpipe_databuf_t *)d, block_id); }
int flag_correlator_output_databuf_wait_filled(flag_correlator_output_databuf_t * d, int block_id) { return hashpipe_databuf_wait_filled((hashpipe_databuf_t *)d, block_id); }
static void *run(hashpipe_thread_args_t * args, int doCPU) { // Local aliases to shorten access to args fields paper_gpu_input_databuf_t *db_in = (paper_gpu_input_databuf_t *)args->ibuf; paper_output_databuf_t *db_out = (paper_output_databuf_t *)args->obuf; hashpipe_status_t st = args->st; const char * status_key = args->thread_desc->skey; #ifdef DEBUG_SEMS fprintf(stderr, "s/tid %lu/ GPU/\n", pthread_self()); #endif // Init integration control status variables int gpu_dev = 0; hashpipe_status_lock_safe(&st); hputs(st.buf, "INTSTAT", "off"); hputi8(st.buf, "INTSYNC", 0); hputi4(st.buf, "INTCOUNT", N_SUB_BLOCKS_PER_INPUT_BLOCK); hputi8(st.buf, "GPUDUMPS", 0); hgeti4(st.buf, "GPUDEV", &gpu_dev); // No change if not found hputi4(st.buf, "GPUDEV", gpu_dev); hashpipe_status_unlock_safe(&st); /* Loop */ int rv; char integ_status[17]; uint64_t start_mcount, last_mcount=0; uint64_t gpu_dumps=0; int int_count; // Number of blocks to integrate per dump int xgpu_error = 0; int curblock_in=0; int curblock_out=0; struct timespec start, stop; uint64_t elapsed_gpu_ns = 0; uint64_t gpu_block_count = 0; // Initialize context to point at first input and output memory blocks. // This seems redundant since we do this just before calling // xgpuCudaXengine, but we need to pass something in for array_h and // matrix_x to prevent xgpuInit from allocating memory. XGPUContext context; context.array_h = (ComplexInput *)db_in->block[0].data; context.array_len = (db_in->header.n_block * sizeof(paper_gpu_input_block_t) - sizeof(paper_input_header_t)) / sizeof(ComplexInput); context.matrix_h = (Complex *)db_out->block[0].data; context.matrix_len = (db_out->header.n_block * sizeof(paper_output_block_t) - sizeof(paper_output_header_t)) / sizeof(Complex); xgpu_error = xgpuInit(&context, gpu_dev); if (XGPU_OK != xgpu_error) { fprintf(stderr, "ERROR: xGPU initialization failed (error code %d)\n", xgpu_error); return THREAD_ERROR; } while (run_threads()) { // Note waiting status, // query integrating status // and, if armed, start count hashpipe_status_lock_safe(&st); hputs(st.buf, status_key, "waiting"); hgets(st.buf, "INTSTAT", 16, integ_status); hgeti8(st.buf, "INTSYNC", (long long*)&start_mcount); hashpipe_status_unlock_safe(&st); // Wait for new input block to be filled while ((rv=hashpipe_databuf_wait_filled((hashpipe_databuf_t *)db_in, curblock_in)) != HASHPIPE_OK) { if (rv==HASHPIPE_TIMEOUT) { hashpipe_status_lock_safe(&st); hputs(st.buf, status_key, "blocked_in"); hashpipe_status_unlock_safe(&st); continue; } else { hashpipe_error(__FUNCTION__, "error waiting for filled databuf"); pthread_exit(NULL); break; } } // Got a new data block, update status and determine how to handle it hashpipe_status_lock_safe(&st); hputi4(st.buf, "GPUBLKIN", curblock_in); hputu8(st.buf, "GPUMCNT", db_in->block[curblock_in].header.mcnt); hashpipe_status_unlock_safe(&st); // If integration status "off" if(!strcmp(integ_status, "off")) { // Mark input block as free and advance hashpipe_databuf_set_free((hashpipe_databuf_t *)db_in, curblock_in); curblock_in = (curblock_in + 1) % db_in->header.n_block; // Skip to next input buffer continue; } // If integration status is "start" if(!strcmp(integ_status, "start")) { // If buffer mcount < start_mcount (i.e. not there yet) if(db_in->block[curblock_in].header.mcnt < start_mcount) { // Drop input buffer // Mark input block as free and advance hashpipe_databuf_set_free((hashpipe_databuf_t *)db_in, curblock_in); curblock_in = (curblock_in + 1) % db_in->header.n_block; // Skip to next input buffer continue; // Else if mcount == start_mcount (time to start) } else if(db_in->block[curblock_in].header.mcnt == start_mcount) { // Set integration status to "on" // Read integration count (INTCOUNT) fprintf(stderr, "--- integration on ---\n"); strcpy(integ_status, "on"); hashpipe_status_lock_safe(&st); hputs(st.buf, "INTSTAT", integ_status); hgeti4(st.buf, "INTCOUNT", &int_count); hashpipe_status_unlock_safe(&st); // Compute last mcount last_mcount = start_mcount + (int_count-1) * N_SUB_BLOCKS_PER_INPUT_BLOCK; // Else (missed starting mcount) } else { // Handle missed start of integration // TODO! fprintf(stderr, "--- mcnt=%06lx > start_mcnt=%06lx ---\n", db_in->block[curblock_in].header.mcnt, start_mcount); } } // Integration status is "on" or "stop" // Note processing status hashpipe_status_lock_safe(&st); hputs(st.buf, status_key, "processing gpu"); hashpipe_status_unlock_safe(&st); // Setup for current chunk context.input_offset = curblock_in * sizeof(paper_gpu_input_block_t) / sizeof(ComplexInput); context.output_offset = curblock_out * sizeof(paper_output_block_t) / sizeof(Complex); // Call CUDA X engine function int doDump = 0; // Dump if this is the last block or we are doing both CPU and GPU // (GPU and CPU test mode always dumps every input block) if(db_in->block[curblock_in].header.mcnt >= last_mcount || doCPU) { doDump = 1; // Check whether we missed the end of integration. If we get a block // whose mcnt is greater than last_mcount, then for some reason (e.g. // networking problems) we didn't see a block whose mcnt was // last_mcount. This should "never" happen, but it has been seen to // occur when the 10 GbE links have many errors. if(db_in->block[curblock_in].header.mcnt > last_mcount) { // Can't do much error recovery, so just log it. fprintf(stderr, "--- mcnt=%06lx > last_mcnt=%06lx ---\n", db_in->block[curblock_in].header.mcnt, last_mcount); } // Wait for new output block to be free while ((rv=paper_output_databuf_wait_free(db_out, curblock_out)) != HASHPIPE_OK) { if (rv==HASHPIPE_TIMEOUT) { hashpipe_status_lock_safe(&st); hputs(st.buf, status_key, "blocked gpu out"); hashpipe_status_unlock_safe(&st); continue; } else { hashpipe_error(__FUNCTION__, "error waiting for free databuf"); pthread_exit(NULL); break; } } } clock_gettime(CLOCK_MONOTONIC, &start); xgpuCudaXengine(&context, doDump ? SYNCOP_DUMP : SYNCOP_SYNC_TRANSFER); clock_gettime(CLOCK_MONOTONIC, &stop); elapsed_gpu_ns += ELAPSED_NS(start, stop); gpu_block_count++; if(doDump) { clock_gettime(CLOCK_MONOTONIC, &start); xgpuClearDeviceIntegrationBuffer(&context); clock_gettime(CLOCK_MONOTONIC, &stop); elapsed_gpu_ns += ELAPSED_NS(start, stop); // TODO Maybe need to subtract all or half the integration time here // depending on recevier's expectations. db_out->block[curblock_out].header.mcnt = last_mcount; // If integration status if "stop" if(!strcmp(integ_status, "stop")) { // Set integration status to "off" strcpy(integ_status, "off"); hashpipe_status_lock_safe(&st); hputs(st.buf, "INTSTAT", integ_status); hashpipe_status_unlock_safe(&st); } else { // Advance last_mcount for end of next integration last_mcount += int_count * N_SUB_BLOCKS_PER_INPUT_BLOCK; } // Mark output block as full and advance paper_output_databuf_set_filled(db_out, curblock_out); curblock_out = (curblock_out + 1) % db_out->header.n_block; // TODO Need to handle or at least check for overflow! // Update GPU dump counter and GPU Gbps gpu_dumps++; hashpipe_status_lock_safe(&st); hputi8(st.buf, "GPUDUMPS", gpu_dumps); hputr4(st.buf, "GPUGBPS", (float)(8*N_FLUFFED_BYTES_PER_BLOCK*gpu_block_count)/elapsed_gpu_ns); hashpipe_status_unlock_safe(&st); // Start new average elapsed_gpu_ns = 0; gpu_block_count = 0; } if(doCPU) { /* Note waiting status */ hashpipe_status_lock_safe(&st); hputs(st.buf, status_key, "waiting"); hashpipe_status_unlock_safe(&st); // Wait for new output block to be free while ((rv=paper_output_databuf_wait_free(db_out, curblock_out)) != HASHPIPE_OK) { if (rv==HASHPIPE_TIMEOUT) { hashpipe_status_lock_safe(&st); hputs(st.buf, status_key, "blocked cpu out"); hashpipe_status_unlock_safe(&st); continue; } else { hashpipe_error(__FUNCTION__, "error waiting for free databuf"); pthread_exit(NULL); break; } } // Note "processing cpu" status, current input block hashpipe_status_lock_safe(&st); hputs(st.buf, status_key, "processing cpu"); hashpipe_status_unlock_safe(&st); /* * Call CPU X engine function */ xgpuOmpXengine((Complex *)db_out->block[curblock_out].data, context.array_h); // Mark output block as full and advance paper_output_databuf_set_filled(db_out, curblock_out); curblock_out = (curblock_out + 1) % db_out->header.n_block; // TODO Need to handle or at least check for overflow! } // Mark input block as free and advance hashpipe_databuf_set_free((hashpipe_databuf_t *)db_in, curblock_in); curblock_in = (curblock_in + 1) % db_in->header.n_block; /* Check for cancel */ pthread_testcancel(); } xgpuFree(&context); // Thread success! return NULL; }