#include <algorithm>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <unordered_set>

/* openFile() and readFile<>() are project helpers declared elsewhere. */
int main(int argc, char* argv[]) {
    std::ifstream in;
    openFile(argv[1], in);
    auto us = readFile<std::unordered_set<std::string>>(in);
    std::cout << "Read " << us.size() << " words from " << argv[1] << ".\n\n";
    std::cout << "Hashtable load factor is: " << us.load_factor() << ".\n";
    auto bc = us.bucket_count();
    std::cout << "Bucket count is: " << bc << ".\n";
    for (std::size_t b = 0; b < bc; ++b) {
        if (us.bucket_size(b)) {
            std::cout << "Bucket " << b << " contains " << us.bucket_size(b) << " items.\n";
            if (us.bucket_size(b) > 1) {
                std::copy(us.cbegin(b), us.cend(b),
                          std::ostream_iterator<std::string>(std::cout, " "));
                std::cout << std::endl;
            }
        }
    }
}
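/*
 * Illustrative sketch (not part of the program above): for any unordered
 * container, load_factor() is defined as size() / bucket_count(), so the two
 * figures printed by the snippet above are directly related. The comparison
 * below is kept approximate on purpose.
 */
#include <cassert>
#include <cmath>
#include <string>
#include <unordered_set>

int main() {
    std::unordered_set<std::string> s{"alpha", "beta", "gamma"};

    // load_factor() == size() / bucket_count(), reported as a float.
    float expected = static_cast<float>(s.size()) / s.bucket_count();
    assert(std::fabs(s.load_factor() - expected) < 1e-6f);
}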
/*
 * Calculates an approximation of the percentile of the original distribution.
 * The quality of the value depends on how much information was lost when creating the histogram.
 * Inside a bucket, we use linear interpolation.
 */
double percentile( double p )
{
    assert( p >= 0.0 && p <= 1.0 && "p must be within [0.0, 1.0]" );

    if ( !num_entries() )
        return 0.0;

    size_t target = static_cast<size_t>( p * num_entries() );

    // Performance optimization: we assume a roughly balanced distribution,
    // so for p <= 0.5 we start from the min counting upwards, otherwise from the max counting downwards.
    if ( p <= 0.5 )
    {
        size_t count = 0;
        for ( size_t i = 0, size = data().size(); i < size; ++i )
        {
            count += data()[ i ];

            if ( count >= target )
            {
                // We reached the target bucket.
                // Calculate the linear interpolation factor x: how far into bucket i the
                // target entry falls, measured from the bucket's lower edge. The cast keeps
                // the division in floating point.
                double x = data()[ i ]
                    ? static_cast<double>( target - ( count - data()[ i ] ) ) / data()[ i ]
                    : 0.0;
                assert( x >= 0.0 && x <= 1.0 );

                return _min + ( i + x ) * bucket_size();
            }
        }
    }
    else
    {
        size_t count = num_entries();
        for ( int i = static_cast<int>( data().size() ) - 1; i >= 0; --i )
        {
            count -= data()[ i ];

            if ( count <= target )
            {
                // We reached the target bucket.
                // Here count holds the number of entries below bucket i, so x is again the
                // fraction of bucket i below the target entry.
                double x = data()[ i ]
                    ? static_cast<double>( target - count ) / data()[ i ]
                    : 0.0;
                assert( x >= 0.0 && x <= 1.0 );

                return _min + ( i + x ) * bucket_size();
            }
        }
    }

    assert( false );
    return 0.0;
}
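/*
 * Illustrative sketch (not from the class above): the same upward scan and
 * in-bucket linear interpolation on a plain histogram, written as a free
 * function so it can be compiled and tested on its own. All names here
 * (toy_percentile, counts, lo, hi) are made up for the example.
 */
#include <cstddef>
#include <vector>

double toy_percentile( const std::vector<std::size_t>& counts, double lo, double hi, double p )
{
    std::size_t total = 0;
    for ( std::size_t c : counts )
        total += c;
    if ( total == 0 )
        return 0.0;

    const double width = ( hi - lo ) / counts.size();
    const std::size_t target = static_cast<std::size_t>( p * total );

    std::size_t count = 0;
    for ( std::size_t i = 0; i < counts.size(); ++i )
    {
        count += counts[ i ];
        if ( count >= target )
        {
            /* Fraction of bucket i below the target entry, assuming a uniform spread. */
            double x = counts[ i ]
                ? static_cast<double>( target - ( count - counts[ i ] ) ) / counts[ i ]
                : 0.0;
            return lo + ( i + x ) * width;
        }
    }
    return hi;
}

/* Example: counts {10, 10} over [0, 100] gives toy_percentile(counts, 0.0, 100.0, 0.5) == 50.0. */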
/* Implement the 'g' graphing command. */
void do_graph1(csv_t *D, int col) {
	/* Fix column number to match array indexing. */
	int array_col = col - 1;
	row_buckets_t graph_buckets;
	int graph_values[GRAPHROWS] = {0};

	/* Determine the min and max of the column. */
	graph_buckets.min = find_min(D, array_col);
	graph_buckets.max = find_max(D, array_col);

	/* Use the min and max to compute the size of the buckets. */
	graph_buckets.bucket_step_size = bucket_size(graph_buckets.max, graph_buckets.min, GRAPHROWS);

	/* Fill an array of buckets, where the value of each index is the lower
	   end of the bucket range. */
	row_bucket_values(&graph_buckets);

	/* Fill an array determining how many values fall in each bucket. */
	fill_buckets(&graph_buckets, D, array_col, graph_values);

	/* Print the graph of bucket quantities per bucket value. */
	print_bucket_graph(&graph_buckets, D->labs[array_col], graph_values);
}
/*
 * Rebuilds the array from the sorted buckets.
 */
static void rebuild_array(struct bucket **done, int *array)
{
	int j;    /* array[] offset. */
	int i, k; /* Loop indexes.   */

#define BUCKETS_PER_CORE (NUM_BUCKETS/NUM_IO_CORES)

	/* Spawn threads. */
	j = 0;
	for (i = 0; i < NUM_IO_CORES; i++)
	{
		tdata[i].args.i0 = i*BUCKETS_PER_CORE;
		tdata[i].args.in = (i + 1)*BUCKETS_PER_CORE;
		tdata[i].args.j0 = j; /* Starting offset in array[] for this thread. */
		tdata[i].args.done = done;
		tdata[i].args.array = array;

		pthread_create(&tdata[i].tid, NULL, thread_main, (void *)&tdata[i]);

		/* Advance the offset past the buckets handled by this thread. */
		for (k = i*BUCKETS_PER_CORE; k < (i + 1)*BUCKETS_PER_CORE; k++)
			j += bucket_size(done[k]);
	}

	/* Join threads. */
	for (i = 0; i < NUM_IO_CORES; i++)
		pthread_join(tdata[i].tid, NULL);
}
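/*
 * Illustrative sketch (not from the benchmark above): each thread's starting
 * offset into array[] is simply the running sum of the sizes of all buckets
 * that precede its range, i.e. a prefix sum over bucket sizes. The names
 * below (sizes, offsets, nbuckets, per_core) are made up for the example.
 */
#include <stddef.h>

void compute_offsets(const int *sizes, int *offsets, int nbuckets, int per_core)
{
	int j = 0;

	for (int i = 0; i < nbuckets/per_core; i++)
	{
		offsets[i] = j; /* First array[] slot written by worker i. */
		for (int k = i*per_core; k < (i + 1)*per_core; k++)
			j += sizes[k];
	}
}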
/* Implement the 'p' plot command to generate a 2D graph showing the
   correlation between two columns. */
void do_graph2(csv_t *D, int col1, int col2) {
	/* Fix columns to match array indexing. */
	int array_col1 = col1 - 1;
	int array_col2 = col2 - 1;
	row_buckets_t vert_buckets;
	col_buckets_t horiz_buckets;

	/* Determine the min and max of the columns. */
	vert_buckets.min = find_min(D, array_col1);
	vert_buckets.max = find_max(D, array_col1);
	horiz_buckets.min = find_min(D, array_col2);
	horiz_buckets.max = find_max(D, array_col2);

	/* Use the min and max to compute the size of the buckets. */
	vert_buckets.bucket_step_size = bucket_size(vert_buckets.max, vert_buckets.min, GRAPHROWS);
	horiz_buckets.bucket_step_size = bucket_size(horiz_buckets.max, horiz_buckets.min, GRAPHCOLS);

	/* Fill an array of buckets, where the value of each index is the lower
	   end of the bucket range. */
	row_bucket_values(&vert_buckets);
	col_bucket_values(&horiz_buckets);

	/* Fill a 2D array with data points in the correct bucketed values. */
	int plot_quantities[GRAPHROWS][GRAPHCOLS] = {{0}};
	fill_plot_array(&vert_buckets, &horiz_buckets, D, array_col1, array_col2, plot_quantities);

	/* Print the 2D plot. */
	print_2d_plot(D, &vert_buckets, array_col1, array_col2, plot_quantities);

	return;
}
/*
 * Thread's main.
 */
static void *thread_main(void *args)
{
	int i, j;        /* Loop indexes.  */
	struct tdata *t; /* Thread's data. */

	t = args;

	/* Rebuild array. */
	j = t->args.j0;
	for (i = t->args.i0; i < t->args.in; i++)
	{
		bucket_merge(t->args.done[i], &t->args.array[j]);
		j += bucket_size(t->args.done[i]);
	}

	pthread_exit(NULL);
	return (NULL);
}
bucket_t* bucket_split(bucket_t* bucket1, bool (*split_funct)(void* data, unsigned int key), void* data)
{
	assert(bucket1 != NULL && split_funct != NULL);

	unsigned int i;
	unsigned int curr_node_size;
	bool error = false;
	bucket_t* bucket2 = NULL;
	bucket_t** dst_bucket_ptr = NULL;
	bucket_t* src_bucket = NULL;
	bucket_node_t* src_bucket_node = NULL;

	/* Allocate memory for the new bucket */
	if(!error && (bucket2 = bucket_init(bucket1->node_capacity)) == NULL)
		error = true;

	/* Make a copy of the old bucket's head */
	if(!error && (src_bucket = bucket_init(bucket1->node_capacity)) == NULL)
		error = true;

	if(!error)
	{
		/* Move internal data from the old bucket's head to the new bucket head.
		   No deep copy is done, the chain is moved too. */
		*src_bucket = *bucket1;

		/* Reset the given bucket's head */
		*bucket1 = *bucket2;

		/* Initialize src_bucket_node */
		src_bucket_node = src_bucket->chain;
	}

	/* For every bucket_node in the chain... */
	while(!error && src_bucket_node != NULL)
	{
		/* Get the current bucket_node's size */
		curr_node_size = bucket_node_size(src_bucket, src_bucket_node);

		/* For every entry in the bucket_node... */
		for(i = 0; !error && i < curr_node_size; i++)
		{
			/* Call split_funct to get the target bucket */
			if(!(*split_funct)(data, src_bucket_node->key[i]))
				dst_bucket_ptr = &bucket1;
			else
				dst_bucket_ptr = &bucket2;

			/* Append to the appropriate new bucket (entries are already sorted) */
			if(!bucket_node_insert(*dst_bucket_ptr, bucket_size(*dst_bucket_ptr),
			                       src_bucket_node->key[i], src_bucket_node->value[i]))
				error = true;
		}

		src_bucket_node = src_bucket_node->overflow;
	}

	if(error)
	{
		/* Only restore if the old bucket's head was actually moved into src_bucket */
		if(src_bucket != NULL)
		{
			/* Restore the old bucket in its original condition */
			*bucket1 = *src_bucket;

			/* Prevent old bucket chain from being destroyed */
			src_bucket->chain = NULL;
		}

		/* Free allocated memory */
		bucket_destroy(&bucket2);
	}

	/* Destroy the old bucket */
	bucket_destroy(&src_bucket);

	return bucket2;
}
/*
 * Bucket-sort algorithm.
 */
extern void bucketsort(int *array, int n)
{
	int max;                  /* Maximum number.      */
	int i, j;                 /* Loop indexes.        */
	int range;                /* Bucket range.        */
	struct minibucket *minib; /* Working mini-bucket. */
	struct message *msg;      /* Working message.     */
	struct bucket **todo;     /* Todo buckets.        */
	struct bucket **done;     /* Done buckets.        */
	uint64_t start, end;      /* Timers.              */

	/* Setup slaves. */
	open_noc_connectors();
	spawn_slaves();
	sync_slaves();

	todo = smalloc(NUM_BUCKETS*sizeof(struct bucket *));
	done = smalloc(NUM_BUCKETS*sizeof(struct bucket *));
	for (i = 0; i < NUM_BUCKETS; i++)
	{
		done[i] = bucket_create();
		todo[i] = bucket_create();
	}

	/* Find max number in the array. */
	start = timer_get();
	max = INT_MIN;
	for (i = 0; i < n; i++)
	{
		/* Found. */
		if (array[i] > max)
			max = array[i];
	}

	/* Distribute numbers. */
	range = max/NUM_BUCKETS;
	for (i = 0; i < n; i++)
	{
		j = array[i]/range;
		if (j >= NUM_BUCKETS)
			j = NUM_BUCKETS - 1;
		bucket_insert(&todo[j], array[i]);
	}
	end = timer_get();
	master += timer_diff(start, end);

	/* Sort buckets. */
	j = 0;
	for (i = 0; i < NUM_BUCKETS; i++)
	{
		while (bucket_size(todo[i]) > 0)
		{
			minib = bucket_pop(todo[i]);

			/* Send message. */
			msg = message_create(SORTWORK, i, minib->size);
			message_send(outfd[j], msg);
			message_destroy(msg);

			/* Send data. */
			communication += data_send(outfd[j], minib->elements, minib->size*sizeof(int));
			minibucket_destroy(minib);

			j++;

			/*
			 * Slave processes are busy.
			 * So let's wait for results.
			 */
			if (j == nclusters)
			{
				/* Receive results. */
				for (/* NOOP */ ; j > 0; j--)
				{
					/* Receive message. */
					msg = message_receive(infd[nclusters - j]);

					/* Receive mini-bucket. */
					minib = minibucket_create();
					minib->size = msg->u.sortresult.size;
					communication += data_receive(infd[nclusters - j], minib->elements, minib->size*sizeof(int));

					bucket_push(done[msg->u.sortresult.id], minib);

					message_destroy(msg);
				}
			}
		}
	}

	/* Receive results. */
	for (/* NOOP */ ; j > 0; j--)
	{
		/* Receive message. */
		msg = message_receive(infd[j - 1]);

		/* Receive bucket. */
		minib = minibucket_create();
		minib->size = msg->u.sortresult.size;
		communication += data_receive(infd[j - 1], minib->elements, minib->size*sizeof(int));

		bucket_push(done[msg->u.sortresult.id], minib);

		message_destroy(msg);
	}

	start = timer_get();
	rebuild_array(done, array);
	end = timer_get();
	master += timer_diff(start, end);

	/* House keeping. */
	for (i = 0; i < NUM_BUCKETS; i++)
	{
		bucket_destroy(todo[i]);
		bucket_destroy(done[i]);
	}
	free(done);
	free(todo);
	join_slaves();
	close_noc_connectors();
}
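/*
 * Illustrative sketch (not from the benchmark above): how the distribution
 * step maps a value to a bucket index. With range = max/NUM_BUCKETS, the
 * value max itself would land on index NUM_BUCKETS (one past the end), which
 * is why the clamp to NUM_BUCKETS - 1 is needed. Names here are made up.
 */
#include <stdio.h>

#define TOY_BUCKETS 4

int toy_bucket_index(int value, int max)
{
	int range = max/TOY_BUCKETS;
	int j = value/range;

	if (j >= TOY_BUCKETS)
		j = TOY_BUCKETS - 1;

	return j;
}

int main(void)
{
	/* With max = 100: range = 25, so 0..24 -> 0, 25..49 -> 1, ..., and 100 -> 3 (clamped). */
	printf("%d %d %d\n", toy_bucket_index(10, 100), toy_bucket_index(99, 100), toy_bucket_index(100, 100));
	return 0;
}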