/**
 * Look at the packet at the current read position without consuming it.
 *
 * Neither b->start nor any other field of the buffer is modified.
 *
 * @param b  Packet buffer to inspect.
 * @return   Address of b->elems[b->start], or NULL when cb_is_empty(b)
 *           reports that the buffer holds no packets. The packet is not
 *           copied; the pointer refers to storage inside the buffer.
 */
struct rf_packet *cb_peek(struct rf_packet_buffer *b)
{
    if (cb_is_empty(b)) {
        return NULL;
    }

    return &b->elems[b->start];
}
/**
 * Consume the packet at the current read position.
 *
 * On success the read index advances by one slot (wrapping at b->size)
 * and b->count is decremented.
 *
 * @param b  Packet buffer to read from.
 * @return   Address of the slot that was at b->start before the call, or
 *           NULL (buffer left untouched) when cb_is_empty(b) reports that
 *           there is nothing to read. The packet is not copied; the pointer
 *           refers to storage inside the buffer.
 */
struct rf_packet *cb_read(struct rf_packet_buffer *b)
{
    struct rf_packet *head;

    if (cb_is_empty(b)) {
        return NULL;
    }

    head = &b->elems[b->start];

    /* step the read index forward, wrapping around at the end of the storage */
    b->start = (b->start + 1) % b->size;
    b->count--;

    return head;
}
/**
 * Small driver for the circular buffer.
 *
 * Initialises a buffer with a fixed size, writes three times that many
 * elements with increasing values, then reads elements back until the
 * buffer reports empty, printing each value on its own line.
 *
 * @param argc  Unused.
 * @param argv  Unused.
 * @return      Always 0.
 */
int main(int argc, char ** argv)
{
    const int capacity = 10; /* arbitrary size */
    const int writes = 3 * capacity;
    circular_buffer ring;
    elem_type item = {0};

    cbinit(&ring, capacity);

    /* push more elements than the size handed to cbinit */
    item.value = 0;
    while (item.value < writes) {
        cb_write(&ring, &item);
        item.value++;
    }

    /* drain the buffer and print whatever it still holds */
    for (;;) {
        if (cb_is_empty(&ring)) {
            break;
        }
        cb_read(&ring, &item);
        printf("%d\n", item.value);
    }

    cbfree(&ring);
    return 0;
}
/**
 * Processes all buffers on the CPU.
 *
 * Work is only done when tree_record->buffer_full_warning is set, or when no
 * input patterns are left (all test indices have been handed out and the
 * reinsert queue is empty). In that case:
 *
 *   1. every non-empty leaf buffer is drained into one contiguous index array,
 *   2. every drained test index is appended to the reinsert queue,
 *   3. do_bruteforce_all_leaves_cpu() is run on the drained indices,
 *   4. if all test indices have been handed out and fewer than
 *      params->bf_remaining_threshold indices are left in the reinsert queue,
 *      the queue is handed to process_queue_via_brute_force_cpu().
 *
 * If the temporary index arrays cannot be allocated, the problem is reported
 * via PRINT and the process is terminated with EXIT_FAILURE.
 *
 * @param *tree_record Pointer to struct instance storing the model
 * @param *params Pointer to struct instance storing all model parameters
 */
void process_all_buffers_cpu(TREE_RECORD *tree_record, TREE_PARAMETERS *params) {

    INT_TYPE no_input_patterns_left = (tree_record->current_test_index == tree_record->nXtest
            && cb_is_empty(&(tree_record->queue_reinsert)));

    if (tree_record->buffer_full_warning || no_input_patterns_left) {

        START_MY_TIMER(tree_record->timers + 11);

        tree_record->empty_all_buffers_calls++;

        UINT_TYPE leaf_idx;
        UINT_TYPE i;

        // get the total number of indices removed in this round (needed for space allocation)
        INT_TYPE total_number_test_indices_removed = 0;
        for (leaf_idx = 0; leaf_idx < tree_record->n_leaves; leaf_idx++) {
            if (cb_get_number_items(tree_record->buffers[leaf_idx]) > 0) {
                // we can always empty ALL buffers, no overhead!
                total_number_test_indices_removed += cb_get_number_items(
                        tree_record->buffers[leaf_idx]);
            }
        }

        // initialize the arrays that are handed to the brute-force step
        INT_TYPE *test_indices_removed_from_all_buffers = (INT_TYPE *) malloc(
                total_number_test_indices_removed * sizeof(INT_TYPE));
        INT_TYPE *fr_indices = (INT_TYPE *) malloc(
                total_number_test_indices_removed * sizeof(INT_TYPE));
        INT_TYPE *to_indices = (INT_TYPE *) malloc(
                total_number_test_indices_removed * sizeof(INT_TYPE));

        // A NULL result is only an error if we actually asked for memory:
        // malloc(0) is allowed to return NULL and nothing is written in that case.
        if (total_number_test_indices_removed > 0
                && (test_indices_removed_from_all_buffers == NULL
                        || fr_indices == NULL
                        || to_indices == NULL)) {
            PRINT(params)("Could not allocate memory for emptying the leaf buffers!\n");
            free(test_indices_removed_from_all_buffers);
            free(fr_indices);
            free(to_indices);
            exit(EXIT_FAILURE);
        }

        INT_TYPE number_added_test_elements = 0;
        INT_TYPE number_t_indices_in_buffer;

        for (leaf_idx = 0; leaf_idx < tree_record->n_leaves; leaf_idx++) {

            if (cb_get_number_items(tree_record->buffers[leaf_idx]) > 0) {

                // get number of indices that are in the buffer
                number_t_indices_in_buffer = cb_get_number_items(
                        tree_record->buffers[leaf_idx]);

                // read indices from buffer
                cb_read_batch(tree_record->buffers[leaf_idx],
                        test_indices_removed_from_all_buffers + number_added_test_elements,
                        number_t_indices_in_buffer);

                // we generate copies of data items that are needed by ALL KERNELS (to avoid bank conflicts)
                // Thus, we are transferring more elements than needed to the GPU
                for (i = 0; i < number_t_indices_in_buffer; i++) {
                    fr_indices[number_added_test_elements + i] =
                            tree_record->leaves[leaf_idx * LEAF_WIDTH]; // fr_idx
                    to_indices[number_added_test_elements + i] =
                            tree_record->leaves[leaf_idx * LEAF_WIDTH + 1]; // to_idx

                    // reinsert test pattern into test queue: can already be done here, we have to wait at the end once...
                    cb_add_elt(&(tree_record->queue_reinsert),
                            test_indices_removed_from_all_buffers + number_added_test_elements + i);
                }

                // increase number of added elements (needed as offset)
                number_added_test_elements += number_t_indices_in_buffer;
            }
        }

        // all buffers are empty now
        tree_record->buffer_full_warning = 0;

        STOP_MY_TIMER(tree_record->timers + 11);

        START_MY_TIMER(tree_record->timers + 12);
        do_bruteforce_all_leaves_cpu(test_indices_removed_from_all_buffers,
                total_number_test_indices_removed, fr_indices, to_indices,
                tree_record, params);
        STOP_MY_TIMER(tree_record->timers + 12);

        // free memory
        free(test_indices_removed_from_all_buffers);
        free(fr_indices);
        free(to_indices);

        // all buffers are empty now: let's check if enough work is still there for another round!
        INT_TYPE num_elts_in_queue = cb_get_number_items(&(tree_record->queue_reinsert));
        if (tree_record->current_test_index == tree_record->nXtest
                && num_elts_in_queue < params->bf_remaining_threshold) {
            START_MY_TIMER(tree_record->timers + 12);
            process_queue_via_brute_force_cpu(tree_record, params);
            STOP_MY_TIMER(tree_record->timers + 12);
        }
    }
}
/** * Interface (extern): Computes the k nearest neighbors for a given set of test points * stored in *Xtest and stores the results in two arrays *distances and *indices. * * @param *Xtest Pointer to the set of query/test points (stored as FLOAT_TYPE) * @param nXtest The number of query points * @param dXtest The dimension of each query point * @param *distances The distances array (FLOAT_TYPE) used to store the computed distances * @param ndistances The number of query points * @param ddistances The number of distance values for each query point * @param *indices Pointer to arrray storing the indices of the k nearest neighbors for each query point * @param nindices The number of query points * @param dindices The number of indices comptued for each query point * @param *tree_record Pointer to struct storing all relevant information for model * @param *params Pointer to struct containing all relevant parameters * */ void neighbors_extern(FLOAT_TYPE * Xtest, INT_TYPE nXtest, INT_TYPE dXtest, FLOAT_TYPE *distances, INT_TYPE ndistances, INT_TYPE ddistances, INT_TYPE *indices, INT_TYPE nindices, INT_TYPE dindices, TREE_RECORD *tree_record, TREE_PARAMETERS *params) { START_MY_TIMER(tree_record->timers + 1); UINT_TYPE i, j; tree_record->find_leaf_idx_calls = 0; tree_record->empty_all_buffers_calls = 0; tree_record->Xtest = Xtest; tree_record->nXtest = nXtest; tree_record->dist_mins_global = distances; tree_record->idx_mins_global = indices; long device_mem_bytes = tree_record->device_infos.device_mem_bytes; double test_mem_bytes = get_test_tmp_mem_device_bytes(tree_record, params); PRINT(params)("Memory needed for test patterns: %f (GB)\n", test_mem_bytes / MEM_GB); if (test_mem_bytes > device_mem_bytes * params->allowed_test_mem_percent) { PRINT(params)("Too much memory used for test patterns and temporary data!\n"); FREE_OPENCL_DEVICES(tree_record, params); exit(EXIT_FAILURE); } double total_device_bytes = get_total_mem_device_bytes(tree_record, params); 
PRINT(params)("Total memory needed on device: %f (GB)\n", total_device_bytes / MEM_GB); START_MY_TIMER(tree_record->timers + 4); /* ------------------------------------- OPENCL -------------------------------------- */ INIT_ARRAYS(tree_record, params); /* ------------------------------------- OPENCL -------------------------------------- */ // initialize leaf buffer for test queries (circular buffers) tree_record->buffers = (circular_buffer **) malloc(tree_record->n_leaves * sizeof(circular_buffer*)); for (i = 0; i < tree_record->n_leaves; i++) { tree_record->buffers[i] = (circular_buffer *) malloc(sizeof(circular_buffer)); cb_init(tree_record->buffers[i], tree_record->leaves_initial_buffer_sizes); } tree_record->buffer_full_warning = 0; // initialize queue "input" (we can have at most number_test_patterns in there) cb_init(&(tree_record->queue_reinsert), tree_record->nXtest); /* ------------------------------------- OPENCL -------------------------------------- */ START_MY_TIMER(tree_record->timers + 3); ALLOCATE_MEMORY_OPENCL_DEVICES(tree_record, params); STOP_MY_TIMER(tree_record->timers + 3); /* ------------------------------------- OPENCL -------------------------------------- */ UINT_TYPE iter = 0; UINT_TYPE test_printed = 0; // allocate space for the indices added in each round; we cannot have more than original test patterns ... 
INT_TYPE *all_next_indices = (INT_TYPE *) malloc( tree_record->approx_number_of_avail_buffer_slots * sizeof(INT_TYPE)); // allocate space for all return values (by FIND_LEAF_IDX_BATCH) tree_record->leaf_indices_batch_ret_vals = (INT_TYPE *) malloc( tree_record->approx_number_of_avail_buffer_slots * sizeof(INT_TYPE)); UINT_TYPE num_elts_added; tree_record->current_test_index = 0; INT_TYPE reinsert_counter = 0; PRINT(params)("Starting Querying process via buffer tree...\n"); STOP_MY_TIMER(tree_record->timers + 4); START_MY_TIMER(tree_record->timers + 2); do { iter++; // try to get elements from both queues until buffers are full // (each buffer is either empty or has at least space for leaves_buffer_sizes_threshold elements) num_elts_added = 0; // add enough elements to the buffers ("batch filling") while (num_elts_added < tree_record->approx_number_of_avail_buffer_slots && (tree_record->current_test_index < tree_record->nXtest || !cb_is_empty(&(tree_record->queue_reinsert)))) { // we remove indices from both queues here (add one element from each queue, if not empty) if (!cb_is_empty(&(tree_record->queue_reinsert))) { cb_read(&(tree_record->queue_reinsert), all_next_indices + num_elts_added); } else { all_next_indices[num_elts_added] = tree_record->current_test_index; tree_record->current_test_index++; } num_elts_added++; } /* ------------------------------------- OPENCL -------------------------------------- */ FIND_LEAF_IDX_BATCH(all_next_indices, num_elts_added, tree_record->leaf_indices_batch_ret_vals, tree_record, params); /* ------------------------------------- OPENCL -------------------------------------- */ // we have added num_elts_added indices to the all_next_indices array for (j = 0; j < num_elts_added; j++) { INT_TYPE leaf_idx = tree_record->leaf_indices_batch_ret_vals[j]; // if not done: add the index to the appropriate buffer if (leaf_idx != -1) { // enlarge buffer if needed if (cb_is_full(tree_record->buffers[leaf_idx])) { PRINT(params)("Increasing 
buffer size ...\n"); tree_record->buffers[leaf_idx] = cb_double_size(tree_record->buffers[leaf_idx]); } // add next_indices[j] to buffer leaf_idx cb_write(tree_record->buffers[leaf_idx], all_next_indices + j); if (cb_get_number_items(tree_record->buffers[leaf_idx]) >= tree_record->leaves_buffer_sizes_threshold) { tree_record->buffer_full_warning = 1; } } // else: traversal of test pattern has reached root: done! } /* ------------------------------------- OPENCL -------------------------------------- */ PROCESS_ALL_BUFFERS(tree_record, params); /* ------------------------------------- OPENCL -------------------------------------- */ if (tree_record->current_test_index == tree_record->nXtest && !test_printed) { PRINT(params)("All query indices are in the buffer tree now (buffers or reinsert queue)...\n"); test_printed = 1; } } while (tree_record->current_test_index < tree_record->nXtest || !cb_is_empty(&(tree_record->queue_reinsert))); STOP_MY_TIMER(tree_record->timers + 2); START_MY_TIMER(tree_record->timers + 5); /* ------------------------------------- OPENCL -------------------------------------- */ GET_DISTANCES_AND_INDICES(tree_record, params); /* ------------------------------------- OPENCL -------------------------------------- */ // free space generated by testing for (i = 0; i < tree_record->n_leaves; i++) { cb_free(tree_record->buffers[i]); } STOP_MY_TIMER(tree_record->timers + 5); STOP_MY_TIMER(tree_record->timers + 1); PRINT(params)("Buffer full indices (overhead)=%i\n", reinsert_counter); PRINT(params)("\nNumber of iterations in while loop: \t\t\t\t\t\t\t%i\n", iter); PRINT(params)("Number of empty_all_buffers calls: \t\t\t\t\t\t\t%i\n", tree_record->empty_all_buffers_calls); PRINT(params)("Number of find_leaf_idx_calls: \t\t\t\t\t\t\t\t%i\n\n", tree_record->find_leaf_idx_calls); PRINT(params)("Elapsed total time for querying: \t\t\t\t\t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 1)); 
PRINT(params)("-----------------------------------------------------------------------------------------------------------------------------\n"); PRINT(params)("(Overhead) Elapsed time for BEFORE WHILE: \t\t\t\t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 4)); PRINT(params)("(Overhead) -> ALLOCATE_MEMORY_OPENCL_DEVICES: \t\t\t\t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 3)); PRINT(params)( "-----------------------------------------------------------------------------------------------------------------------------\n"); PRINT(params)("Elapsed time in while-loop: \t\t\t\t\t\t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 2)); PRINT(params)("(I) Elapsed time for PROCESS_ALL_BUFFERS: \t\t\t\t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 12)); PRINT(params)("(I.A) Function: retrieve_indices_from_buffers_gpu: \t\t\t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 11)); PRINT(params)("(I.B) Do brute-force (do_brute.../process_buffers_...chunks_gpu : \t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 18)); PRINT(params)("(I.B.1) -> Elapsed time for clEnqueueWriteBuffer (INTERLEAVED): \t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 19)); PRINT(params)("(I.B.1) -> Elapsed time for memcpy (INTERLEAVED): \t\t\t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 21)); PRINT(params)("(I.B.1) -> Elapsed time for waiting for chunk (in seconds): \t\t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 22)); PRINT(params)("(I.B.2) -> Number of copy calls: %i\n", tree_record->counters[0]); if (!training_chunks_inactive(tree_record, params)) { PRINT(params)("(I.B.4) -> Overhead distributing indices to chunks (in seconds): \t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 23)); PRINT(params)("(I.B.5) -> Processing of whole chunk (all three phases, in seconds): \t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 24)); PRINT(params)("(I.B.6) -> Processing of chunk before brute (in seconds): \t\t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 25)); PRINT(params)("(I.B.7) -> Processing of 
chunk after brute (in seconds): \t\t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 26)); PRINT(params)("(I.B.8) -> Processing of chunk after brute, buffer release (in seconds): \t%2.10f\n", GET_MY_TIMER(tree_record->timers + 27)); PRINT(params)("(I.B.9) -> Number of release buffer calls: %i\n", tree_record->counters[0]); } if (USE_GPU) { PRINT(params)("(I.B.3) -> Elapsed time for TEST_SUBSET (in seconds): \t\t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 13)); PRINT(params)("(I.B.4) -> Elapsed time for NN Search (in seconds): \t\t\t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 14)); PRINT(params)("(I.B.5) -> Elapsed time for UPDATE (in seconds): \t\t\t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 15)); PRINT(params)("(I.B.6) -> Elapsed time for OVERHEAD (in seconds): \t\t\t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 12) - GET_MY_TIMER(tree_record->timers + 14) - GET_MY_TIMER(tree_record->timers + 15) - GET_MY_TIMER(tree_record->timers + 13)); } PRINT(params)("(II) FIND_LEAF_IDX_BATCH : \t\t\t\t\t\t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 16)); PRINT(params)("(III) Elapsed time for final brute-force step : \t\t\t\t%2.10f\n\n", GET_MY_TIMER(tree_record->timers + 20)); PRINT(params)("-----------------------------------------------------------------------------------------------------------------------------\n"); PRINT(params)("(DIFF) While - PROCESS_ALL_BUFFERS - FIND_LEAF_IDX_BATCH: \t\t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 2) - GET_MY_TIMER(tree_record->timers + 12) - GET_MY_TIMER(tree_record->timers + 16)); PRINT(params)("(Overhead) Elapsed time for AFTER WHILE : \t\t\t\t\t%2.10f\n", GET_MY_TIMER(tree_record->timers + 5)); PRINT(params)("-----------------------------------------------------------------------------------------------------------------------------\n\n"); PRINT(params)("-----------------------------------------------------------------------------------------------------------------------------\n"); PRINT(params)("QUERY 
RUNTIME: %2.10f ", GET_MY_TIMER(tree_record->timers + 1)); PRINT(params)("PROCESS_ALL_BUFFERS: %2.10f ", GET_MY_TIMER(tree_record->timers + 12)); PRINT(params)("FIND_LEAF_IDX_BATCH: %2.10f ", GET_MY_TIMER(tree_record->timers + 16)); PRINT(params)("WHILE_OVERHEAD: %2.10f ", GET_MY_TIMER(tree_record->timers + 2) - GET_MY_TIMER(tree_record->timers + 12) - GET_MY_TIMER(tree_record->timers + 16)); PRINT(params)("\n"); PRINT(params)("-----------------------------------------------------------------------------------------------------------------------------\n"); // free all allocated memory related to querying for (i = 0; i < tree_record->n_leaves; i++) { free(tree_record->buffers[i]); } free(tree_record->buffers); // free arrays free(tree_record->all_stacks); free(tree_record->all_depths); free(tree_record->all_idxs); free(all_next_indices); free(tree_record->leaf_indices_batch_ret_vals); }