/* * Generates uniformly random keys [0, MAX_KEY_VAL] on each rank using the time and rank * number as a seed */ static KEY_TYPE * make_input(void) { timer_start(&timers[TIMER_INPUT]); KEY_TYPE * restrict const my_keys = malloc(NUM_KEYS_PER_PE * sizeof(KEY_TYPE)); pcg32_random_t rng = seed_my_rank(); for(uint64_t i = 0; i < NUM_KEYS_PER_PE; ++i) { my_keys[i] = pcg32_boundedrand_r(&rng, MAX_KEY_VAL); } timer_stop(&timers[TIMER_INPUT]); #ifdef DEBUG wait_my_turn(); char msg[1024]; const int my_rank = shmem_my_pe(); sprintf(msg,"Rank %d: Initial Keys: ", my_rank); for(uint64_t i = 0; i < NUM_KEYS_PER_PE; ++i){ if(i < PRINT_MAX) sprintf(msg + strlen(msg),"%d ", my_keys[i]); } sprintf(msg + strlen(msg),"\n"); printf("%s",msg); fflush(stdout); my_turn_complete(); #endif return my_keys; }
/*
 * Counts the occurrences of each key in my bucket.
 *
 * The count for a key is stored at index (key - my_min_key), where
 * my_min_key = my_rank * BUCKET_WIDTH, so that indexing starts at 0.
 *
 * my_bucket_keys: the my_bucket_size unsorted keys of this PE's bucket; each
 *                 is asserted to lie in
 *                 [my_rank * BUCKET_WIDTH, (my_rank+1) * BUCKET_WIDTH)
 *
 * Only the counting loop is charged to TIMER_SORT; the allocation is not.
 *
 * Returns a zero-initialised, heap-allocated array of BUCKET_WIDTH counts.
 * It is not freed in this function, so ownership passes to the caller.
 */
static int * count_local_keys(KEY_TYPE const * const my_bucket_keys)
{
  // calloc hands back zeroed storage, replacing the malloc + memset pair.
  int * const my_local_key_counts = calloc(BUCKET_WIDTH, sizeof *my_local_key_counts);
  assert(my_local_key_counts);

  timer_start(&timers[TIMER_SORT]);

  const int my_rank = shmem_my_pe();
  const int my_min_key = my_rank * BUCKET_WIDTH;

#ifdef ISX_PROFILING
  unsigned long long start = current_time_ns();
#endif

  // Count the occurrences of each key in my bucket
  for(long long int i = 0; i < my_bucket_size; ++i){
    const unsigned int key_index = my_bucket_keys[i] - my_min_key;

    assert(my_bucket_keys[i] >= my_min_key);
    assert(key_index < BUCKET_WIDTH);

    my_local_key_counts[key_index]++;
  }

#ifdef ISX_PROFILING
  unsigned long long end = current_time_ns();
  if (shmem_my_pe() == 0) {
    // Explicit casts keep every argument in step with its conversion
    // specifier regardless of how the globals are declared.
    printf("Counting local took %llu ns, my_bucket_size = %lld, BUCKET_WIDTH = "
           "%llu\n", end - start, (long long)my_bucket_size,
           (unsigned long long)BUCKET_WIDTH);
  }
#endif

  timer_stop(&timers[TIMER_SORT]);

#ifdef DEBUG
  wait_my_turn();
  char msg[4096] = "";
  size_t used = 0;
  int n = snprintf(msg, sizeof msg,
                   "Rank %d: Bucket Size %lld | Local Key Counts:",
                   my_rank, (long long)my_bucket_size);
  if(n > 0) {
    used = (size_t)n;
  }
  // Append at most PRINT_MAX counts and stop once the buffer is full.
  for(uint64_t i = 0; i < BUCKET_WIDTH && i < PRINT_MAX && used < sizeof msg; ++i){
    n = snprintf(msg + used, sizeof msg - used, "%d ", my_local_key_counts[i]);
    if(n < 0) {
      break;
    }
    used += (size_t)n;
  }
  printf("%s\n", msg);
  fflush(stdout);
  my_turn_complete();
#endif

  return my_local_key_counts;
}
/*
 * Scatters this PE's keys into a new array in which keys of the same bucket
 * are stored contiguously. Keys within a bucket are not sorted.
 *
 * my_keys:              the NUM_KEYS_PER_PE input keys (read only)
 * local_bucket_offsets: per-bucket write cursor. The entry for a key's bucket
 *                       (key / BUCKET_WIDTH) is post-incremented for every
 *                       key placed, so this array is modified by the call.
 *
 * Only the scatter loop is charged to TIMER_BUCKETIZE.
 *
 * Returns a malloc'd array of NUM_KEYS_PER_PE keys; it is not freed here.
 */
static KEY_TYPE * bucketize_local_keys(KEY_TYPE const * const my_keys,
                                       int * const local_bucket_offsets)
{
  KEY_TYPE * const bucketed = malloc(NUM_KEYS_PER_PE * sizeof(KEY_TYPE));
  assert(bucketed);

  timer_start(&timers[TIMER_BUCKETIZE]);

#ifdef ISX_PROFILING
  unsigned long long t_begin = current_time_ns();
#endif

  for(uint64_t k = 0; k < NUM_KEYS_PER_PE; ++k){
    const KEY_TYPE key = my_keys[k];
    const uint32_t bucket = key / BUCKET_WIDTH;
    int * const cursor = &local_bucket_offsets[bucket];

    assert(*cursor >= 0);
    // Claim the next free slot of this bucket and advance its cursor.
    const uint32_t slot = (*cursor)++;
    assert(slot < NUM_KEYS_PER_PE);

    bucketed[slot] = key;
  }

#ifdef ISX_PROFILING
  unsigned long long t_end = current_time_ns();
  if (shmem_my_pe() == 0) {
    printf("Bucketizing took %llu ns\n", t_end - t_begin);
  }
#endif

  timer_stop(&timers[TIMER_BUCKETIZE]);

#ifdef DEBUG
  wait_my_turn();
  char msg[1024];
  const int my_rank = shmem_my_pe();
  // 'tail' always points at the terminating NUL of the text built so far.
  char * tail = msg;
  tail += sprintf(tail,"Rank %d: local bucketed keys: ", my_rank);
  for(uint64_t k = 0; k < NUM_KEYS_PER_PE && k < PRINT_MAX; ++k){
    tail += sprintf(tail,"%d ", bucketed[k]);
  }
  sprintf(tail,"\n");
  printf("%s",msg);
  fflush(stdout);
  my_turn_complete();
#endif

  return bucketed;
}
/*
 * Tallies how many of this PE's keys fall into each bucket.
 *
 * my_keys: the NUM_KEYS_PER_PE input keys; a key belongs to bucket
 *          (key / BUCKET_WIDTH)
 *
 * The counters are prepared with init_array() (defined elsewhere; presumably
 * it zero-fills the array -- confirm) before the tally loop runs. Both steps
 * are charged to TIMER_BCOUNT.
 *
 * Returns a malloc'd array of NUM_BUCKETS counters; it is not freed here.
 */
static int * count_local_bucket_sizes(KEY_TYPE const * const my_keys)
{
  int * const sizes = malloc(NUM_BUCKETS * sizeof(int));
  assert(sizes);

  timer_start(&timers[TIMER_BCOUNT]);

  init_array(sizes, NUM_BUCKETS);

#ifdef ISX_PROFILING
  unsigned long long t_begin = current_time_ns();
#endif

  for(uint64_t k = 0; k < NUM_KEYS_PER_PE; ++k){
    const uint32_t bucket = my_keys[k] / BUCKET_WIDTH;
    sizes[bucket] += 1;
  }

#ifdef ISX_PROFILING
  unsigned long long t_end = current_time_ns();
  if (shmem_my_pe() == 0) {
    printf("Counting local bucket sizes took %llu ns\n", t_end - t_begin);
  }
#endif

  timer_stop(&timers[TIMER_BCOUNT]);

#ifdef DEBUG
  wait_my_turn();
  char msg[1024];
  const int my_rank = shmem_my_pe();
  // 'tail' always points at the terminating NUL of the text built so far.
  char * tail = msg;
  tail += sprintf(tail,"Rank %d: local bucket sizes: ", my_rank);
  for(uint64_t b = 0; b < NUM_BUCKETS && b < PRINT_MAX; ++b){
    tail += sprintf(tail,"%d ", sizes[b]);
  }
  sprintf(tail,"\n");
  printf("%s",msg);
  fflush(stdout);
  my_turn_complete();
#endif

  return sizes;
}
/* * Generates uniformly random keys [0, MAX_KEY_VAL] on each rank using the time and rank * number as a seed */ static KEY_TYPE * make_input(void) { timer_start(&timers[TIMER_INPUT]); KEY_TYPE * const my_keys = malloc(NUM_KEYS_PER_PE * sizeof(KEY_TYPE)); assert(my_keys); pcg32_random_t rng = seed_my_rank(); #ifdef ISX_PROFILING unsigned long long start = current_time_ns(); #endif for(uint64_t i = 0; i < NUM_KEYS_PER_PE; ++i) { my_keys[i] = pcg32_boundedrand_r(&rng, MAX_KEY_VAL); } #ifdef ISX_PROFILING unsigned long long end = current_time_ns(); if (shmem_my_pe() == 0) printf("Making input took %llu ns\n", end - start); #endif timer_stop(&timers[TIMER_INPUT]); #ifdef DEBUG wait_my_turn(); char msg[1024]; const int my_rank = shmem_my_pe(); sprintf(msg,"Rank %d: Initial Keys: ", my_rank); for(uint64_t i = 0; i < NUM_KEYS_PER_PE; ++i){ if(i < PRINT_MAX) sprintf(msg + strlen(msg),"%d ", my_keys[i]); } sprintf(msg + strlen(msg),"\n"); printf("%s",msg); fflush(stdout); my_turn_complete(); #endif return my_keys; }
/*
 * Turns per-bucket sizes into per-bucket start offsets (exclusive prefix sum):
 * offset[0] = 0 and offset[i] = offset[i-1] + local_bucket_sizes[i-1].
 *
 * local_bucket_sizes: NUM_BUCKETS bucket sizes (read only)
 * send_offsets:       out parameter; *send_offsets receives a second,
 *                     separately malloc'd array holding the same offsets.
 *                     Per the original note, the returned array is modified
 *                     later by the bucketize step, so this copy preserves the
 *                     starting offsets for the key exchange.
 *
 * The second allocation and the scan are charged to TIMER_BOFFSET.
 *
 * Returns a malloc'd array of NUM_BUCKETS offsets; neither array is freed here.
 */
static int * compute_local_bucket_offsets(int const * const local_bucket_sizes,
                                          int ** send_offsets)
{
  int * const offsets = malloc(NUM_BUCKETS * sizeof(int));
  assert(offsets);

  timer_start(&timers[TIMER_BOFFSET]);

  int * const saved = malloc(NUM_BUCKETS * sizeof(int));
  *send_offsets = saved;
  assert(saved);

  // Carry the running total forward instead of re-reading the previous entry.
  int running = 0;
  offsets[0] = running;
  saved[0] = running;
  for(uint64_t b = 1; b < NUM_BUCKETS; b++){
    running += local_bucket_sizes[b-1];
    offsets[b] = running;
    saved[b] = running;
  }

  timer_stop(&timers[TIMER_BOFFSET]);

#ifdef DEBUG
  wait_my_turn();
  char msg[1024];
  const int my_rank = shmem_my_pe();
  // 'tail' always points at the terminating NUL of the text built so far.
  char * tail = msg;
  tail += sprintf(tail,"Rank %d: local bucket offsets: ", my_rank);
  for(uint64_t b = 0; b < NUM_BUCKETS && b < PRINT_MAX; ++b){
    tail += sprintf(tail,"%d ", offsets[b]);
  }
  sprintf(tail,"\n");
  printf("%s",msg);
  fflush(stdout);
  my_turn_complete();
#endif

  return offsets;
}
/*
 * Each PE sends the contents of its local buckets to the PE that owns that
 * bucket.
 *
 * send_offsets:           start index of each bucket in my_local_bucketed_keys
 * local_bucket_sizes:     number of keys this PE holds for each bucket
 * my_local_bucketed_keys: this PE's keys grouped by bucket
 *
 * For every destination, space is reserved with a fetch-and-add of the send
 * size on that PE's receive_offset; the value returned is the index in the
 * destination's my_bucket_keys at which this PE's keys are written. The
 * bucket owned by this PE is copied with memcpy, all others with
 * shmem_int_put.
 *
 * Destination order is selected at compile time: permute_array[i] with
 * PERMUTE, i with INCAST, otherwise (my_rank + i) % NUM_PES.
 *
 * The whole exchange is charged to TIMER_ATA_KEYS, and the number of keys
 * sent to other PEs is recorded with timer_count().
 *
 * NOTE(review): no synchronization is performed here unless BARRIER_ATA is
 * defined; presumably the caller synchronizes before reading my_bucket_keys
 * -- confirm.
 *
 * Returns my_bucket_keys, the file-level receive buffer (not allocated here).
 */
static KEY_TYPE * exchange_keys(int const * const send_offsets,
                                int const * const local_bucket_sizes,
                                KEY_TYPE const * const my_local_bucketed_keys)
{
  timer_start(&timers[TIMER_ATA_KEYS]);

  const int my_rank = shmem_my_pe();
  unsigned int total_keys_sent = 0;

  // Keys destined for local key buffer can be written with memcpy
  const int my_self_offset = send_offsets[my_rank];
  const int my_self_size = local_bucket_sizes[my_rank];
  const long long int write_offset_into_self = shmem_longlong_fadd(
          &receive_offset, (long long int)my_self_size, my_rank);

  assert((unsigned long long)write_offset_into_self +
         (unsigned long long)my_self_size <= KEY_BUFFER_SIZE);
  // Check the read side as well, mirroring the check done for remote puts.
  assert((unsigned long long)my_self_offset +
         (unsigned long long)my_self_size <= NUM_KEYS_PER_PE);

  memcpy(&my_bucket_keys[write_offset_into_self],
         &my_local_bucketed_keys[my_self_offset],
         my_self_size * sizeof(KEY_TYPE));

  for(uint64_t i = 0; i < NUM_PES; ++i){

#ifdef PERMUTE
    const int target_pe = permute_array[i];
#elif INCAST
    const int target_pe = i;
#else
    const int target_pe = (my_rank + i) % NUM_PES;
#endif

    // Local keys already written with memcpy
    if(target_pe == my_rank){
      continue;
    }

    const int read_offset_from_self = send_offsets[target_pe];
    const int my_send_size = local_bucket_sizes[target_pe];

    // Reserve my_send_size slots in the target's receive buffer.
    const long long int write_offset_into_target = shmem_longlong_fadd(
            &receive_offset, (long long int)my_send_size, target_pe);

#ifdef DEBUG
    printf("Rank: %d Target: %d Offset into target: %lld Offset into myself: %d Send Size: %d\n",
           my_rank, target_pe, write_offset_into_target, read_offset_from_self, my_send_size);
#endif

    assert((unsigned long long)write_offset_into_target +
           (unsigned long long)my_send_size <= KEY_BUFFER_SIZE);
    assert((unsigned long long)read_offset_from_self +
           (unsigned long long)my_send_size <= NUM_KEYS_PER_PE);

    shmem_int_put(&(my_bucket_keys[write_offset_into_target]),
                  &(my_local_bucketed_keys[read_offset_from_self]),
                  my_send_size,
                  target_pe);

    total_keys_sent += my_send_size;
  }

#ifdef BARRIER_ATA
  SHMEM_BARRIER_AT_EXCHANGE;
#endif

  timer_stop(&timers[TIMER_ATA_KEYS]);
  timer_count(&timers[TIMER_ATA_KEYS], total_keys_sent);

#ifdef DEBUG
  wait_my_turn();
  char msg[1024] = "";
  size_t used = 0;
  int n = snprintf(msg, sizeof msg,
                   "Rank %d: Bucket Size %lld | Total Keys Sent: %u | Keys after exchange:",
                   my_rank, receive_offset, total_keys_sent);
  if(n > 0) {
    used = (size_t)n;
  }
  // Append at most PRINT_MAX keys and stop once the buffer is full.
  for(long long int i = 0; i < receive_offset && i < PRINT_MAX && used < sizeof msg; ++i){
    n = snprintf(msg + used, sizeof msg - used, "%d ", my_bucket_keys[i]);
    if(n < 0) {
      break;
    }
    used += (size_t)n;
  }
  printf("%s\n", msg);
  fflush(stdout);
  my_turn_complete();
#endif

  return my_bucket_keys;
}